<html>
<head>
<title>pcredemo specification</title>
</head>
<body bgcolor="#FFFFFF" text="#00005A" link="#0066FF" alink="#3399FF" vlink="#2222BB">
<h1>pcredemo man page</h1>
<p>
Return to the <a href="index.html">PCRE index page</a>.
</p>
<p>
This page is part of the PCRE HTML documentation. It was generated automatically
from the original man page. If there is any nonsense in it, please consult the
man page, in case the conversion went wrong.
<br>
<ul>
</ul>
<PRE>
/*************************************************
*           PCRE DEMONSTRATION PROGRAM           *
*************************************************/

/* This is a demonstration program to illustrate the most straightforward ways
of calling the PCRE regular expression library from a C program. See the
pcresample documentation for a short discussion ("man pcresample" if you have
the PCRE man pages installed).

In Unix-like environments, if PCRE is installed in your standard system
libraries, you should be able to compile this program using this command:

gcc -Wall pcredemo.c -lpcre -o pcredemo

If PCRE is not installed in a standard place, it is likely to be installed with
support for the pkg-config mechanism. If you have pkg-config, you can compile
this program using this command:

gcc -Wall pcredemo.c `pkg-config --cflags --libs libpcre` -o pcredemo

If you do not have pkg-config, you may have to use this:

gcc -Wall pcredemo.c -I/usr/local/include -L/usr/local/lib \
  -R/usr/local/lib -lpcre -o pcredemo

Replace "/usr/local/include" and "/usr/local/lib" with wherever the include and
library files for PCRE are installed on your system. Only some operating
systems (e.g. Solaris) use the -R option.
46 47 Building under Windows: 48 49 If you want to statically link this program against a non-dll .a file, you must 50 define PCRE_STATIC before including pcre.h, otherwise the pcre_malloc() and 51 pcre_free() exported functions will be declared __declspec(dllimport), with 52 unwanted results. So in this environment, uncomment the following line. */ 53 54 /* #define PCRE_STATIC */ 55 56 #include <stdio.h> 57 #include <string.h> 58 #include <pcre.h> 59 60 #define OVECCOUNT 30 /* should be a multiple of 3 */ 61 62 63 int main(int argc, char **argv) 64 { 65 pcre *re; 66 const char *error; 67 char *pattern; 68 char *subject; 69 unsigned char *name_table; 70 unsigned int option_bits; 71 int erroffset; 72 int find_all; 73 int crlf_is_newline; 74 int namecount; 75 int name_entry_size; 76 int ovector[OVECCOUNT]; 77 int subject_length; 78 int rc, i; 79 int utf8; 80 81 82 /************************************************************************** 83 * First, sort out the command line. There is only one possible option at * 84 * the moment, "-g" to request repeated matching to find all occurrences, * 85 * like Perl's /g option. We set the variable find_all to a non-zero value * 86 * if the -g option is present. Apart from that, there must be exactly two * 87 * arguments. * 88 **************************************************************************/ 89 90 find_all = 0; 91 for (i = 1; i < argc; i++) 92 { 93 if (strcmp(argv[i], "-g") == 0) find_all = 1; 94 else break; 95 } 96 97 /* After the options, we require exactly two arguments, which are the pattern, 98 and the subject string. 
*/ 99 100 if (argc - i != 2) 101 { 102 printf("Two arguments required: a regex and a subject string\n"); 103 return 1; 104 } 105 106 pattern = argv[i]; 107 subject = argv[i+1]; 108 subject_length = (int)strlen(subject); 109 110 111 /************************************************************************* 112 * Now we are going to compile the regular expression pattern, and handle * 113 * and errors that are detected. * 114 *************************************************************************/ 115 116 re = pcre_compile( 117 pattern, /* the pattern */ 118 0, /* default options */ 119 &error, /* for error message */ 120 &erroffset, /* for error offset */ 121 NULL); /* use default character tables */ 122 123 /* Compilation failed: print the error message and exit */ 124 125 if (re == NULL) 126 { 127 printf("PCRE compilation failed at offset %d: %s\n", erroffset, error); 128 return 1; 129 } 130 131 132 /************************************************************************* 133 * If the compilation succeeded, we call PCRE again, in order to do a * 134 * pattern match against the subject string. This does just ONE match. If * 135 * further matching is needed, it will be done below. 
* 136 *************************************************************************/ 137 138 rc = pcre_exec( 139 re, /* the compiled pattern */ 140 NULL, /* no extra data - we didn't study the pattern */ 141 subject, /* the subject string */ 142 subject_length, /* the length of the subject */ 143 0, /* start at offset 0 in the subject */ 144 0, /* default options */ 145 ovector, /* output vector for substring information */ 146 OVECCOUNT); /* number of elements in the output vector */ 147 148 /* Matching failed: handle error cases */ 149 150 if (rc < 0) 151 { 152 switch(rc) 153 { 154 case PCRE_ERROR_NOMATCH: printf("No match\n"); break; 155 /* 156 Handle other special cases if you like 157 */ 158 default: printf("Matching error %d\n", rc); break; 159 } 160 pcre_free(re); /* Release memory used for the compiled pattern */ 161 return 1; 162 } 163 164 /* Match succeded */ 165 166 printf("\nMatch succeeded at offset %d\n", ovector[0]); 167 168 169 /************************************************************************* 170 * We have found the first match within the subject string. If the output * 171 * vector wasn't big enough, say so. Then output any substrings that were * 172 * captured. * 173 *************************************************************************/ 174 175 /* The output vector wasn't big enough */ 176 177 if (rc == 0) 178 { 179 rc = OVECCOUNT/3; 180 printf("ovector only has room for %d captured substrings\n", rc - 1); 181 } 182 183 /* Show substrings stored in the output vector by number. Obviously, in a real 184 application you might want to do things other than print them. */ 185 186 for (i = 0; i < rc; i++) 187 { 188 char *substring_start = subject + ovector[2*i]; 189 int substring_length = ovector[2*i+1] - ovector[2*i]; 190 printf("%2d: %.*s\n", i, substring_length, substring_start); 191 } 192 193 194 /************************************************************************** 195 * That concludes the basic part of this demonstration program. 
We have * 196 * compiled a pattern, and performed a single match. The code that follows * 197 * shows first how to access named substrings, and then how to code for * 198 * repeated matches on the same subject. * 199 **************************************************************************/ 200 201 /* See if there are any named substrings, and if so, show them by name. First 202 we have to extract the count of named parentheses from the pattern. */ 203 204 (void)pcre_fullinfo( 205 re, /* the compiled pattern */ 206 NULL, /* no extra data - we didn't study the pattern */ 207 PCRE_INFO_NAMECOUNT, /* number of named substrings */ 208 &namecount); /* where to put the answer */ 209 210 if (namecount <= 0) printf("No named substrings\n"); else 211 { 212 unsigned char *tabptr; 213 printf("Named substrings\n"); 214 215 /* Before we can access the substrings, we must extract the table for 216 translating names to numbers, and the size of each entry in the table. */ 217 218 (void)pcre_fullinfo( 219 re, /* the compiled pattern */ 220 NULL, /* no extra data - we didn't study the pattern */ 221 PCRE_INFO_NAMETABLE, /* address of the table */ 222 &name_table); /* where to put the answer */ 223 224 (void)pcre_fullinfo( 225 re, /* the compiled pattern */ 226 NULL, /* no extra data - we didn't study the pattern */ 227 PCRE_INFO_NAMEENTRYSIZE, /* size of each entry in the table */ 228 &name_entry_size); /* where to put the answer */ 229 230 /* Now we can scan the table and, for each entry, print the number, the name, 231 and the substring itself. 
*/ 232 233 tabptr = name_table; 234 for (i = 0; i < namecount; i++) 235 { 236 int n = (tabptr[0] << 8) | tabptr[1]; 237 printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2, 238 ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]); 239 tabptr += name_entry_size; 240 } 241 } 242 243 244 /************************************************************************* 245 * If the "-g" option was given on the command line, we want to continue * 246 * to search for additional matches in the subject string, in a similar * 247 * way to the /g option in Perl. This turns out to be trickier than you * 248 * might think because of the possibility of matching an empty string. * 249 * What happens is as follows: * 250 * * 251 * If the previous match was NOT for an empty string, we can just start * 252 * the next match at the end of the previous one. * 253 * * 254 * If the previous match WAS for an empty string, we can't do that, as it * 255 * would lead to an infinite loop. Instead, a special call of pcre_exec() * 256 * is made with the PCRE_NOTEMPTY_ATSTART and PCRE_ANCHORED flags set. * 257 * The first of these tells PCRE that an empty string at the start of the * 258 * subject is not a valid match; other possibilities must be tried. The * 259 * second flag restricts PCRE to one match attempt at the initial string * 260 * position. If this match succeeds, an alternative to the empty string * 261 * match has been found, and we can print it and proceed round the loop, * 262 * advancing by the length of whatever was found. If this match does not * 263 * succeed, we still stay in the loop, advancing by just one character. * 264 * In UTF-8 mode, which can be set by (*UTF8) in the pattern, this may be * 265 * more than one byte. * 266 * * 267 * However, there is a complication concerned with newlines. When the * 268 * newline convention is such that CRLF is a valid newline, we must * 269 * advance by two characters rather than one. 
The newline convention can * 270 * be set in the regex by (*CR), etc.; if not, we must find the default. * 271 *************************************************************************/ 272 273 if (!find_all) /* Check for -g */ 274 { 275 pcre_free(re); /* Release the memory used for the compiled pattern */ 276 return 0; /* Finish unless -g was given */ 277 } 278 279 /* Before running the loop, check for UTF-8 and whether CRLF is a valid newline 280 sequence. First, find the options with which the regex was compiled; extract 281 the UTF-8 state, and mask off all but the newline options. */ 282 283 (void)pcre_fullinfo(re, NULL, PCRE_INFO_OPTIONS, &option_bits); 284 utf8 = option_bits & PCRE_UTF8; 285 option_bits &= PCRE_NEWLINE_CR|PCRE_NEWLINE_LF|PCRE_NEWLINE_CRLF| 286 PCRE_NEWLINE_ANY|PCRE_NEWLINE_ANYCRLF; 287 288 /* If no newline options were set, find the default newline convention from the 289 build configuration. */ 290 291 if (option_bits == 0) 292 { 293 int d; 294 (void)pcre_config(PCRE_CONFIG_NEWLINE, &d); 295 /* Note that these values are always the ASCII ones, even in 296 EBCDIC environments. CR = 13, NL = 10. */ 297 option_bits = (d == 13)? PCRE_NEWLINE_CR : 298 (d == 10)? PCRE_NEWLINE_LF : 299 (d == (13<<8 | 10))? PCRE_NEWLINE_CRLF : 300 (d == -2)? PCRE_NEWLINE_ANYCRLF : 301 (d == -1)? PCRE_NEWLINE_ANY : 0; 302 } 303 304 /* See if CRLF is a valid newline sequence. */ 305 306 crlf_is_newline = 307 option_bits == PCRE_NEWLINE_ANY || 308 option_bits == PCRE_NEWLINE_CRLF || 309 option_bits == PCRE_NEWLINE_ANYCRLF; 310 311 /* Loop for second and subsequent matches */ 312 313 for (;;) 314 { 315 int options = 0; /* Normally no options */ 316 int start_offset = ovector[1]; /* Start at end of previous match */ 317 318 /* If the previous match was for an empty string, we are finished if we are 319 at the end of the subject. Otherwise, arrange to run another match at the 320 same point to see if a non-empty match can be found. 
*/ 321 322 if (ovector[0] == ovector[1]) 323 { 324 if (ovector[0] == subject_length) break; 325 options = PCRE_NOTEMPTY_ATSTART | PCRE_ANCHORED; 326 } 327 328 /* Run the next matching operation */ 329 330 rc = pcre_exec( 331 re, /* the compiled pattern */ 332 NULL, /* no extra data - we didn't study the pattern */ 333 subject, /* the subject string */ 334 subject_length, /* the length of the subject */ 335 start_offset, /* starting offset in the subject */ 336 options, /* options */ 337 ovector, /* output vector for substring information */ 338 OVECCOUNT); /* number of elements in the output vector */ 339 340 /* This time, a result of NOMATCH isn't an error. If the value in "options" 341 is zero, it just means we have found all possible matches, so the loop ends. 342 Otherwise, it means we have failed to find a non-empty-string match at a 343 point where there was a previous empty-string match. In this case, we do what 344 Perl does: advance the matching position by one character, and continue. We 345 do this by setting the "end of previous match" offset, because that is picked 346 up at the top of the loop as the point at which to start again. 347 348 There are two complications: (a) When CRLF is a valid newline sequence, and 349 the current position is just before it, advance by an extra byte. (b) 350 Otherwise we must ensure that we skip an entire UTF-8 character if we are in 351 UTF-8 mode. */ 352 353 if (rc == PCRE_ERROR_NOMATCH) 354 { 355 if (options == 0) break; /* All matches found */ 356 ovector[1] = start_offset + 1; /* Advance one byte */ 357 if (crlf_is_newline && /* If CRLF is newline & */ 358 start_offset < subject_length - 1 && /* we are at CRLF, */ 359 subject[start_offset] == '\r' && 360 subject[start_offset + 1] == '\n') 361 ovector[1] += 1; /* Advance by one more. */ 362 else if (utf8) /* Otherwise, ensure we */ 363 { /* advance a whole UTF-8 */ 364 while (ovector[1] < subject_length) /* character. 
*/ 365 { 366 if ((subject[ovector[1]] & 0xc0) != 0x80) break; 367 ovector[1] += 1; 368 } 369 } 370 continue; /* Go round the loop again */ 371 } 372 373 /* Other matching errors are not recoverable. */ 374 375 if (rc < 0) 376 { 377 printf("Matching error %d\n", rc); 378 pcre_free(re); /* Release memory used for the compiled pattern */ 379 return 1; 380 } 381 382 /* Match succeded */ 383 384 printf("\nMatch succeeded again at offset %d\n", ovector[0]); 385 386 /* The match succeeded, but the output vector wasn't big enough. */ 387 388 if (rc == 0) 389 { 390 rc = OVECCOUNT/3; 391 printf("ovector only has room for %d captured substrings\n", rc - 1); 392 } 393 394 /* As before, show substrings stored in the output vector by number, and then 395 also any named substrings. */ 396 397 for (i = 0; i < rc; i++) 398 { 399 char *substring_start = subject + ovector[2*i]; 400 int substring_length = ovector[2*i+1] - ovector[2*i]; 401 printf("%2d: %.*s\n", i, substring_length, substring_start); 402 } 403 404 if (namecount <= 0) printf("No named substrings\n"); else 405 { 406 unsigned char *tabptr = name_table; 407 printf("Named substrings\n"); 408 for (i = 0; i < namecount; i++) 409 { 410 int n = (tabptr[0] << 8) | tabptr[1]; 411 printf("(%d) %*s: %.*s\n", n, name_entry_size - 3, tabptr + 2, 412 ovector[2*n+1] - ovector[2*n], subject + ovector[2*n]); 413 tabptr += name_entry_size; 414 } 415 } 416 } /* End of loop to find second and subsequent matches */ 417 418 printf("\n"); 419 pcre_free(re); /* Release memory used for the compiled pattern */ 420 return 0; 421 } 422 423 /* End of pcredemo.c */ 424 <p> 425 Return to the <a href="index.html">PCRE index page</a>. 426 </p> 427