POSIX regex seems to work correctly with a UTF-8 locale. I wrote a simple test (see below) and used it to match a string containing Cyrillic characters against the regular expression "[[:alpha:]]" (for example), and everything works fine.
Note: The main thing to remember is that the regular expression functions depend on the current locale. Therefore you must call setlocale() before calling regcomp() and regexec().
#include <sys/types.h>
#include <string.h>
#include <regex.h>
#include <stdio.h>
#include <locale.h>

/*
 * Match a string against a POSIX regular expression under the system locale.
 *
 * Usage: prog regex string
 *
 * argv[1] is compiled with regcomp() using flags 0 (i.e. without
 * REG_EXTENDED) and executed against argv[2]. For the whole match (index 0)
 * and for every parenthesized subexpression that took part in the match, one
 * line "index: start-end: 'text'" is printed, where start/end are BYTE
 * offsets into argv[2].
 *
 * Returns 1 on a usage error or if the pattern fails to compile, 0 otherwise
 * (including when the string does not match).
 */
int main(int argc, char **argv)
{
    enum { MAX_MATCHES = 10, TEXT_BUF_SIZE = 256 };

    regex_t reg;
    regmatch_t matches[MAX_MATCHES];
    int ret;

    if (argc != 3) {
        fprintf(stderr, "Usage: %s regex string\n", argv[0]);
        return 1;
    }

    /* Use system locale instead of default "C"; character classes such as
     * [[:alpha:]] are interpreted according to the locale. */
    setlocale(LC_ALL, "");

    ret = regcomp(&reg, argv[1], 0);
    if (ret != 0) {
        char errbuf[TEXT_BUF_SIZE];

        regerror(ret, &reg, errbuf, sizeof(errbuf));
        fprintf(stderr, "regcomp() error (%d): %s\n", ret, errbuf);
        return 1;
    }

    if (regexec(&reg, argv[2], MAX_MATCHES, matches, 0) == 0) {
        /* Entry 0 is the whole match; entries 1..re_nsub are the groups. */
        size_t count = reg.re_nsub + 1;

        if (count > MAX_MATCHES)
            count = MAX_MATCHES;

        for (size_t i = 0; i < count; i++) {
            char text[TEXT_BUF_SIZE];
            size_t len;

            /* A group that did not take part in the match has offset -1;
             * later groups may still be set, so skip rather than stop. */
            if (matches[i].rm_so == -1)
                continue;

            len = (size_t)(matches[i].rm_eo - matches[i].rm_so);
            if (len >= sizeof(text)) {
                /* regoff_t is not necessarily int; print it as long. */
                fprintf(stderr, "match (%ld-%ld) is too long (%ld)\n",
                        (long)matches[i].rm_so, (long)matches[i].rm_eo,
                        (long)len);
                continue;
            }

            memcpy(text, argv[2] + matches[i].rm_so, len);
            text[len] = '\0';
            printf("%zu: %ld-%ld: '%s'\n", i,
                   (long)matches[i].rm_so, (long)matches[i].rm_eo, text);
        }
    }

    regfree(&reg);
    return 0;
}
Usage example:
$ locale
LANG=ru_RU.UTF-8
LC_CTYPE="ru_RU.UTF-8"
LC_COLLATE="ru_RU.UTF-8"
... (skip)
LC_ALL=
$ ./reg '[[:alpha:]]' ' 359 я'
0: 5-7: 'я'
$
The matched result is two bytes long (offsets 5-7) because each Cyrillic letter occupies two bytes in UTF-8; the offsets reported by regexec() are byte offsets, not character offsets.
praetorian droid
source share