/* Regular expression tests.
   Copyright (C) 2003 Free Software Foundation, Inc.
   This file is part of the GNU C Library.
   Contributed by Jakub Jelinek <jakub (at) redhat.com>, 2003.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, write to the Free
   Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
   02110-1301 USA.  */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

/* getline and getopt_long are not part of ISO C; make sure their
   declarations are visible even when the build does not already
   request the GNU feature set.  */
#ifndef _GNU_SOURCE
# define _GNU_SOURCE 1
#endif

#include <sys/types.h>
#ifdef HAVE_MCHECK_H
#include <mcheck.h>
#endif
#include <errno.h>
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <locale.h>
#include <getopt.h>

/* Rewrite, in place, the placeholder letters used by the test file:
   N becomes a newline, T a tab, S a space and Z a NUL byte.  Scanning
   resumes after each replaced byte, so after a Z the search continues
   with the bytes that follow the freshly written NUL.  */
static void
replace_special_chars (char *str)
{
  for (; (str = strpbrk (str, "NTSZ")) != NULL; ++str)
    switch (*str)
      {
      case 'N': *str = '\n'; break;
      case 'T': *str = '\t'; break;
      case 'S': *str = ' '; break;
      case 'Z': *str = '\0'; break;
      }
}

/* Convert, in place, the word-boundary notation [[:<:]] / [[:>:]]
   into the \< / \> notation.  The string only ever shrinks.  */
static void
glibc_re_syntax (char *str)
{
  /* END points one past the terminating NUL, so the memmove below
     carries the terminator along with the tail.  */
  char *p, *end = strchr (str, '\0') + 1;

  /* Replace [[:<:]] with \< and [[:>:]] with \>.  */
  for (p = str; (p = strstr (p, "[[:")) != NULL; )
    if ((p[3] == '<' || p[3] == '>') && strncmp (p + 4, ":]]", 3) == 0)
      {
	p[0] = '\\';
	p[1] = p[3];
	/* Seven bytes were replaced by two: close the five byte gap.  */
	memmove (p + 2, p + 7, end - p - 7);
	end -= 5;
	p += 2;
      }
    else
      p += 3;
}

/* Store at DST the two byte UTF-8 sequence that stands in for the
   ASCII letter C and return the position after it.  For any C outside
   a-d/A-D nothing is stored and DST is returned unchanged.  */
static char *
mb_replace (char *dst, const char c)
{
  switch (c)
    {
    /* Replace a with \'a and A with \'A.  */
    case 'a':
      *dst++ = '\xc3';
      *dst++ = '\xa1';
      break;
    case 'A':
      *dst++ = '\xc3';
      *dst++ = '\x81';
      break;
    /* Replace b with \v{c} and B with \v{C}.  */
    case 'b':
      *dst++ = '\xc4';
      *dst++ = '\x8d';
      break;
    case 'B':
      *dst++ = '\xc4';
      *dst++ = '\x8c';
      break;
    /* Replace c with \v{d} and C with \v{D}.  */
    case 'c':
      *dst++ = '\xc4';
      *dst++ = '\x8f';
      break;
    case 'C':
      *dst++ = '\xc4';
      *dst++ = '\x8e';
      break;
    /* Replace d with \'e and D with \'E.  */
    case 'd':
      *dst++ = '\xc3';
      *dst++ = '\xa9';
      break;
    case 'D':
      *dst++ = '\xc3';
      *dst++ = '\x89';
      break;
    }
  return dst;
}

/* Return a newly allocated copy of STR in which every character that
   occurs in LETTERS has been passed through mb_replace.  Returns NULL
   when STR is NULL or when the allocation fails; the caller frees the
   result.  */
static char *
mb_frob_string (const char *str, const char *letters)
{
  char *ret, *dst;
  const char *src;

  if (str == NULL)
    return NULL;

  /* Each input byte expands to at most two output bytes.  */
  ret = malloc (2 * strlen (str) + 1);
  if (ret == NULL)
    return NULL;

  for (src = str, dst = ret; *src; ++src)
    if (strchr (letters, *src))
      dst = mb_replace (dst, *src);
    else
      *dst++ = *src;
  *dst = '\0';
  return ret;
}

/* Like mb_frob_string, but don't replace anything between
   [: and :], [. and .] or [= and =].  */

static char *
mb_frob_pattern (const char *str, const char *letters)
{
  char *ret, *dst;
  const char *src;
  int in_class = 0;

  if (str == NULL)
    return NULL;

  /* Each input byte expands to at most two output bytes.  */
  ret = malloc (2 * strlen (str) + 1);
  if (ret == NULL)
    return NULL;

  for (src = str, dst = ret; *src; ++src)
    if (!in_class && strchr (letters, *src))
      dst = mb_replace (dst, *src);
    else
      {
	/* strchr also matches the terminating NUL of its first argument,
	   so rule out a '[' that is the last character explicitly.  */
	if (!in_class && *src == '[' && src[1] != '\0'
	    && strchr (":.=", src[1]))
	  in_class = 1;
	/* IN_CLASS is only set after a '[' was seen, hence src[-1] is
	   inside the string here.  */
	else if (in_class && *src == ']' && strchr (":.=", src[-1]))
	  in_class = 0;
	*dst++ = *src;
      }
  *dst = '\0';
  return ret;
}

/* Compare sub-match IDX of RM, taken relative to STRING, with the
   expectation MATCH:
     "-"      the sub-match must not have participated,
     "@text"  the sub-match must be empty and STRING must continue with
	      TEXT at that offset (with empty TEXT, the offset must be
	      the end of STRING),
     other    the matched text must equal MATCH exactly.
   FAIL prefixes every diagnostic.  Returns 0 on success, 1 after
   printing a diagnostic.  */
static int
check_match (regmatch_t *rm, int idx, const char *string,
	     const char *match, const char *fail)
{
  size_t len;

  if (match[0] == '-' && match[1] == '\0')
    {
      if (rm[idx].rm_so == -1 && rm[idx].rm_eo == -1)
	return 0;
      printf ("%s rm[%d] unexpectedly matched\n", fail, idx);
      return 1;
    }

  if (rm[idx].rm_so == -1 || rm[idx].rm_eo == -1)
    {
      printf ("%s rm[%d] unexpectedly did not match\n", fail, idx);
      return 1;
    }

  if (match[0] == '@')
    {
      if (rm[idx].rm_so != rm[idx].rm_eo)
	{
	  printf ("%s rm[%d] not empty\n", fail, idx);
	  return 1;
	}

      /* With nothing after the '@' compare a single byte, which is
	 the NUL terminator of MATCH.  */
      len = strlen (match + 1);
      if (strncmp (string + rm[idx].rm_so, match + 1, len ? len : 1))
	{
	  printf ("%s rm[%d] not matching %s\n", fail, idx, match);
	  return 1;
	}
      return 0;
    }

  /* Make the conversion to an unsigned length explicit; both offsets
     were checked against -1 above.  */
  len = (size_t) (rm[idx].rm_eo - rm[idx].rm_so);
  if (len != strlen (match)
      || strncmp (string + rm[idx].rm_so, match, len))
    {
      printf ("%s rm[%d] not matching %s\n", fail, idx, match);
      return 1;
    }

  return 0;
}

/* Run one test case.  PATTERN is compiled with CFLAGS.

   If EFLAGS is -1 the compilation is expected to fail and STRING is
   the name of the expected error code without its REG_ prefix.

   Otherwise STRING is matched with EFLAGS.  EXPECT describes the
   overall match (NULL means that no match is expected) and MATCHES is
   a comma separated list of expectations for sub-matches 1 to 9, or
   NULL to skip the sub-match checks.  Both use the notation
   understood by check_match.  MATCHES is modified temporarily while
   it is split but is restored before returning.

   FAIL prefixes every diagnostic.  Returns 0 on success, 1 on
   failure.  */
static int
test (const char *pattern, int cflags, const char *string, int eflags,
      char *expect, char *matches, const char *fail)
{
  regex_t re;
  regmatch_t rm[10];
  int n, ret = 0;

  n = regcomp (&re, pattern, cflags);
  if (n != 0)
    {
      char buf[500];
      if (eflags == -1)
	{
	  static const struct { reg_errcode_t code; const char *name; }
	  codes []
#define C(x) { REG_##x, #x }
	    = { C(NOERROR), C(NOMATCH), C(BADPAT), C(ECOLLATE),
		C(ECTYPE), C(EESCAPE), C(ESUBREG), C(EBRACK),
		C(EPAREN), C(EBRACE), C(BADBR), C(ERANGE),
		C(ESPACE), C(BADRPT) };
#undef C

	  size_t i;
	  for (i = 0; i < sizeof (codes) / sizeof (codes[0]); ++i)
	    if (n == (int) codes[i].code)
	      {
		if (strcmp (string, codes[i].name))
		  {
		    printf ("%s regcomp returned REG_%s (expected REG_%s)\n",
			    fail, codes[i].name, string);
		    return 1;
		  }
		return 0;
	      }

	  printf ("%s regcomp return value REG_%d\n", fail, n);
	  return 1;
	}

      regerror (n, &re, buf, sizeof (buf));
      printf ("%s regcomp failed: %s\n", fail, buf);
      return 1;
    }

  if (eflags == -1)
    {
      regfree (&re);

      /* The test case file assumes something only guaranteed by the
	 rxspencer regex implementation.  Namely that for empty
	 expressions regcomp() return REG_EMPTY.  This is not the case
	 for us and so we ignore this error.  */
      if (strcmp (string, "EMPTY") == 0)
	return 0;

      printf ("%s regcomp unexpectedly succeeded\n", fail);
      return 1;
    }

  if (regexec (&re, string, 10, rm, eflags))
    {
      regfree (&re);
      if (expect == NULL)
	return 0;
      printf ("%s regexec failed\n", fail);
      return 1;
    }

  regfree (&re);

  if (expect == NULL)
    {
      printf ("%s regexec unexpectedly succeeded\n", fail);
      return 1;
    }

  /* Without sub-match reporting there is nothing more to compare.  */
  if (cflags & REG_NOSUB)
    return 0;

  ret = check_match (rm, 0, string, expect, fail);
  if (matches == NULL)
    return ret;

  for (n = 1; ret == 0 && n < 10; ++n)
    {
      char *p = NULL;

      /* Cut the current item out of the comma separated list.  */
      if (matches)
	{
	  p = strchr (matches, ',');
	  if (p != NULL)
	    *p = '\0';
	}
      /* Once the list is used up the remaining sub-matches must not
	 have participated.  */
      ret = check_match (rm, n, string, matches ? matches : "-", fail);
      if (p)
	{
	  /* Undo the split so that the caller can reuse MATCHES.  */
	  *p = ',';
	  matches = p + 1;
	}
      else
	matches = NULL;
    }

  return ret;
}

/* Run one test case after substituting two byte UTF-8 sequences for
   the characters listed in LETTERS in the pattern, the expectations
   and, unless EFLAGS is -1 (STRING is an error code name then), the
   string.  The remaining arguments and the return value are those of
   test.  */
static int
mb_test (const char *pattern, int cflags, const char *string, int eflags,
	 char *expect, const char *matches, const char *letters,
	 const char *fail)
{
  char *pattern_mb = mb_frob_pattern (pattern, letters);
  const char *string_mb
    = eflags == -1 ? string : mb_frob_string (string, letters);
  char *expect_mb = mb_frob_string (expect, letters);
  char *matches_mb = mb_frob_string (matches, letters);
  int ret = 0;

  /* EXPECT and MATCHES may legitimately be NULL, in which case their
     converted counterparts are NULL as well.  */
  if (!pattern_mb || !string_mb
      || (expect && !expect_mb) || (matches && !matches_mb))
    {
      /* Portable replacement for the glibc-only "%m" conversion, with
	 a newline so the next line of output starts on its own line.  */
      printf ("%s %s\n", fail, strerror (errno));
      ret = 1;
    }
  else
    ret = test (pattern_mb, cflags, string_mb, eflags, expect_mb,
		matches_mb, fail);

  free (matches_mb);
  free (expect_mb);
  /* STRING_MB aliases STRING when no conversion was made.  */
  if (string_mb != string)
    free ((char *) string_mb);
  free (pattern_mb);
  return ret;
}

/* Run mb_test for every non-empty combination of the letter pairs
   a/A, b/B, c/C and d/D.  A combination is skipped when one of its
   selected pairs occurs in neither PATTERN nor STRING.  Returns
   non-zero if any of the runs failed.  */
static int
mb_tests (const char *pattern, int cflags, const char *string, int eflags,
	  char *expect, const char *matches)
{
  int ret = 0;
  int i;
  /* LETTERS holds up to four pairs plus the terminator; FAIL is sized
     for "UTF-8 " + eight letters + " FAIL" + terminator.  */
  char letters[9], fail[20];

  /* The tests aren't supposed to work with xdigit, since a-dA-D are
     hex digits while \'a \'A \v{c}\v{C}\v{d}\v{D}\'e \'E are not.  */
  if (strstr (pattern, "[:xdigit:]"))
    return 0;

  /* XXX: regex ATM handles only single byte equivalence classes.  */
  if (strstr (pattern, "[[=b=]]"))
    return 0;

  /* Bit 0 of I selects a/A, bit 1 b/B, bit 2 c/C and bit 3 d/D.  */
  for (i = 1; i < 16; ++i)
    {
      char *p = letters;
      if (i & 1)
	{
	  if (!strchr (pattern, 'a') && !strchr (string, 'a')
	      && !strchr (pattern, 'A') && !strchr (string, 'A'))
	    continue;
	  *p++ = 'a', *p++ = 'A';
	}
      if (i & 2)
	{
	  if (!strchr (pattern, 'b') && !strchr (string, 'b')
	      && !strchr (pattern, 'B') && !strchr (string, 'B'))
	    continue;
	  *p++ = 'b', *p++ = 'B';
	}
      if (i & 4)
	{
	  if (!strchr (pattern, 'c') && !strchr (string, 'c')
	      && !strchr (pattern, 'C') && !strchr (string, 'C'))
	    continue;
	  *p++ = 'c', *p++ = 'C';
	}
      if (i & 8)
	{
	  if (!strchr (pattern, 'd') && !strchr (string, 'd')
	      && !strchr (pattern, 'D') && !strchr (string, 'D'))
	    continue;
	  *p++ = 'd', *p++ = 'D';
	}
      *p++ = '\0';
      snprintf (fail, sizeof (fail), "UTF-8 %s FAIL", letters);
      ret |= mb_test (pattern, cflags, string, eflags, expect, matches,
		      letters, fail);
    }
  return ret;
}

/* Usage: PROGRAM [--utf8] TESTFILE

   Every line of TESTFILE that is neither empty nor starts with '#'
   consists of tab separated fields: pattern, flags, string and,
   optionally, the expected match and the expected sub-matches.  Each
   test line is echoed and then run in the C locale; with --utf8 it is
   additionally run in the cs_CZ.UTF-8 locale, both unchanged and with
   multibyte substitutions.  The exit status is 0 if everything
   passed and 1 otherwise.  */
int
main (int argc, char **argv)
{
  int ret = 0;
  char *line = NULL;
  size_t line_len = 0;
  ssize_t len;
  FILE *f;
  static int test_utf8 = 0;
  static const struct option options[] =
  {
    {"utf8",	no_argument,	&test_utf8,	1},
    {NULL,	0,		NULL,		0 }
  };

#ifdef HAVE_MCHECK_H
  mtrace ();
#endif

  /* The only option just sets a flag, so there is nothing to do per
     option.  */
  while (getopt_long (argc, argv, "", options, NULL) >= 0)
    continue;

  if (optind + 1 != argc)
    {
      fprintf (stderr, "Missing test filename\n");
      return 1;
    }

  f = fopen (argv[optind], "r");
  if (f == NULL)
    {
      fprintf (stderr, "Couldn't open %s\n", argv[optind]);
      return 1;
    }

  while ((len = getline (&line, &line_len, f)) > 0)
    {
      char *pattern, *flagstr, *string, *expect, *matches, *p;
      int cflags = REG_EXTENDED, eflags = 0, try_bre_ere = 0;

      if (line[len - 1] == '\n')
	line[len - 1] = '\0';

      /* Skip comments and empty lines.  */
      if (*line == '#' || *line == '\0')
	continue;

      puts (line);
      fflush (stdout);

      pattern = strtok (line, "\t");
      if (pattern == NULL)
	continue;

      /* A field consisting of "" denotes the empty string.  */
      if (strcmp (pattern, "\"\"") == 0)
	pattern += 2;

      flagstr = strtok (NULL, "\t");
      if (flagstr == NULL)
	continue;

      string = strtok (NULL, "\t");
      if (string == NULL)
	continue;

      if (strcmp (string, "\"\"") == 0)
	string += 2;

      for (p = flagstr; *p; ++p)
	switch (*p)
	  {
	  case '-':
	    break;
	  case 'b':
	    cflags &= ~REG_EXTENDED;
	    break;
	  case '&':
	    try_bre_ere = 1;
	    break;
	  case 'C':
	    /* regcomp is expected to fail, see test.  */
	    eflags = -1;
	    break;
	  case 'i':
	    cflags |= REG_ICASE;
	    break;
	  case 's':
	    cflags |= REG_NOSUB;
	    break;
	  case 'n':
	    cflags |= REG_NEWLINE;
	    break;
	  case '^':
	    eflags |= REG_NOTBOL;
	    break;
	  case '$':
	    eflags |= REG_NOTEOL;
	    break;
	  case 'm':
	  case 'p':
	  case '#':
	    /* Not supported.  */
	    flagstr = NULL;
	    break;
	  default:
	    /* Any other flag character is ignored.  */
	    break;
	  }

      if (flagstr == NULL)
	continue;

      replace_special_chars (pattern);
      glibc_re_syntax (pattern);
      /* With the C flag STRING names an error code and must be left
	 alone.  */
      if (eflags != -1)
	replace_special_chars (string);

      expect = strtok (NULL, "\t");
      matches = NULL;
      if (expect != NULL)
	{
	  replace_special_chars (expect);
	  matches = strtok (NULL, "\t");
	  if (matches != NULL)
	    replace_special_chars (matches);
	}

      if (setlocale (LC_ALL, "C") == NULL)
	{
	  puts ("setlocale C failed");
	  ret = 1;
	}
      if (test (pattern, cflags, string, eflags, expect, matches, "FAIL")
	  || (try_bre_ere
	      && test (pattern, cflags & ~REG_EXTENDED, string, eflags,
		       expect, matches, "FAIL")))
	ret = 1;
      else if (test_utf8)
	{
	  if (setlocale (LC_ALL, "cs_CZ.UTF-8") == NULL)
	    {
	      puts ("setlocale cs_CZ.UTF-8 failed");
	      ret = 1;
	    }
	  else if (test (pattern, cflags, string, eflags, expect, matches,
			 "UTF-8 FAIL")
		   || (try_bre_ere
		       && test (pattern, cflags & ~REG_EXTENDED, string,
				eflags, expect, matches, "UTF-8 FAIL")))
	    ret = 1;
	  else if (mb_tests (pattern, cflags, string, eflags, expect, matches)
		   || (try_bre_ere
		       && mb_tests (pattern, cflags & ~REG_EXTENDED, string,
				    eflags, expect, matches)))
	    ret = 1;
	}
    }

  /* getline reports end of file and read errors alike; do not let a
     truncated run pass as a success.  */
  if (ferror (f))
    {
      fprintf (stderr, "Error reading %s\n", argv[optind]);
      ret = 1;
    }

  free (line);
  fclose (f);
  return ret;
}