1 /************************************************************************** 2 * 3 * Copyright (C) 2002, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 * 6 *************************************************************************** 7 */ 8 9 // 10 // ugrep - an ICU sample program illustrating the use of ICU Regular Expressions. 11 // 12 // The use of the ICU Regex API all occurs within the main() 13 // function. The rest of the code deals with with opening files, 14 // encoding conversions, printing results, etc. 15 // 16 // This is not a full-featured grep program. The command line options 17 // have been kept to a minimum to avoid complicating the sample code. 18 // 19 20 21 22 #include <stdio.h> 23 #include <stdlib.h> 24 #include <string.h> 25 26 #include "unicode/utypes.h" 27 #include "unicode/ustring.h" 28 #include "unicode/regex.h" 29 #include "unicode/ucnv.h" 30 #include "unicode/uclean.h" 31 32 33 // 34 // The following variables contain paramters that may be set from the command line. 35 // 36 const char *pattern = NULL; // The regular expression 37 int firstFileNum; // argv index of the first file name 38 UBool displayFileName = FALSE; 39 UBool displayLineNum = FALSE; 40 41 42 // 43 // Info regarding the file currently being processed 44 // 45 const char *fileName; 46 int fileLen; // Length, in UTF-16 Code Units. 47 48 UChar *ucharBuf = 0; // Buffer, holds converted file. (Simple minded program, always reads 49 // the whole file at once. 50 51 char *charBuf = 0; // Buffer, for original, unconverted file data. 52 53 54 // 55 // Info regarding the line currently being processed 56 // 57 int lineStart; // Index of first char of the current line in the file buffer 58 int lineEnd; // Index of char following the new line sequence for the current line 59 int lineNum; 60 61 // 62 // Converter, used on output to convert Unicode data back to char * 63 // so that it will display in non-Unicode terminal windows. 64 // 65 UConverter *outConverter = 0; 66 67 // 68 // Function forward declarations 69 // 70 void processOptions(int argc, const char **argv); 71 void nextLine(int start); 72 void printMatch(); 73 void printUsage(); 74 void readFile(const char *name); 75 76 77 78 //------------------------------------------------------------------------------------------ 79 // 80 // main for ugrep 81 // 82 // Structurally, all use of the ICU Regular Expression API is in main(), 83 // and all of the supporting stuff necessary to make a running program, but 84 // not directly related to regular expressions, is factored out into these other 85 // functions. 86 // 87 //------------------------------------------------------------------------------------------ 88 int main(int argc, const char** argv) { 89 UBool matchFound = FALSE; 90 91 // 92 // Process the commmand line options. 93 // 94 processOptions(argc, argv); 95 96 // 97 // Create a RegexPattern object from the user supplied pattern string. 98 // 99 UErrorCode status = U_ZERO_ERROR; // All ICU operations report success or failure 100 // in a status variable. 101 102 UParseError parseErr; // In the event of a syntax error in the regex pattern, 103 // this struct will contain the position of the 104 // error. 105 106 RegexPattern *rePat = RegexPattern::compile(pattern, parseErr, status); 107 // Note that C++ is doing an automatic conversion 108 // of the (char *) pattern to a temporary 109 // UnicodeString object. 110 if (U_FAILURE(status)) { 111 fprintf(stderr, "ugrep: error in pattern: \"%s\" at position %d\n", 112 u_errorName(status), parseErr.offset); 113 exit(-1); 114 } 115 116 // 117 // Create a RegexMatcher from the newly created pattern. 118 // 119 UnicodeString empty; 120 RegexMatcher *matcher = rePat->matcher(empty, status); 121 if (U_FAILURE(status)) { 122 fprintf(stderr, "ugrep: error in creating RegexMatcher: \"%s\"\n", 123 u_errorName(status)); 124 exit(-1); 125 } 126 127 // 128 // Loop, processing each of the input files. 129 // 130 for (int fileNum=firstFileNum; fileNum < argc; fileNum++) { 131 readFile(argv[fileNum]); 132 133 // 134 // Loop through the lines of a file, trying to match the regex pattern on each. 135 // 136 for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) { 137 UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart); 138 matcher->reset(s); 139 if (matcher->find()) { 140 matchFound = TRUE; 141 printMatch(); 142 } 143 } 144 } 145 146 // 147 // Clean up 148 // 149 delete matcher; 150 delete rePat; 151 free(ucharBuf); 152 free(charBuf); 153 ucnv_close(outConverter); 154 155 u_cleanup(); // shut down ICU, release any cached data it owns. 156 157 return matchFound? 0: 1; 158 } 159 160 161 162 //------------------------------------------------------------------------------------------ 163 // 164 // doOptions Run through the command line options, and set 165 // the global variables accordingly. 166 // 167 // exit without returning if an error occured and 168 // ugrep should not proceed further. 169 // 170 //------------------------------------------------------------------------------------------ 171 void processOptions(int argc, const char **argv) { 172 int optInd; 173 UBool doUsage = FALSE; 174 UBool doVersion = FALSE; 175 const char *arg; 176 177 178 for(optInd = 1; optInd < argc; ++optInd) { 179 arg = argv[optInd]; 180 181 /* version info */ 182 if(strcmp(arg, "-V") == 0 || strcmp(arg, "--version") == 0) { 183 doVersion = TRUE; 184 } 185 /* usage info */ 186 else if(strcmp(arg, "--help") == 0) { 187 doUsage = TRUE; 188 } 189 else if(strcmp(arg, "-n") == 0 || strcmp(arg, "--line-number") == 0) { 190 displayLineNum = TRUE; 191 } 192 /* POSIX.1 says all arguments after -- are not options */ 193 else if(strcmp(arg, "--") == 0) { 194 /* skip the -- */ 195 ++optInd; 196 break; 197 } 198 /* unrecognized option */ 199 else if(strncmp(arg, "-", strlen("-")) == 0) { 200 printf("ugrep: invalid option -- %s\n", arg+1); 201 doUsage = TRUE; 202 } 203 /* done with options */ 204 else { 205 break; 206 } 207 } 208 209 if (doUsage) { 210 printUsage(); 211 exit(0); 212 } 213 214 if (doVersion) { 215 printf("ugrep version 0.01\n"); 216 if (optInd == argc) { 217 exit(0); 218 } 219 } 220 221 int remainingArgs = argc-optInd; // pattern file ... 222 if (remainingArgs < 2) { 223 fprintf(stderr, "ugrep: files or pattern are missing.\n"); 224 printUsage(); 225 exit(1); 226 } 227 228 if (remainingArgs > 2) { 229 // More than one file to be processed. Display file names with match output. 230 displayFileName = TRUE; 231 } 232 233 pattern = argv[optInd]; 234 firstFileNum = optInd+1; 235 } 236 237 //------------------------------------------------------------------------------------------ 238 // 239 // printUsage 240 // 241 //------------------------------------------------------------------------------------------ 242 void printUsage() { 243 printf("ugrep [options] pattern file...\n" 244 " -V or --version display version information\n" 245 " --help display this help and exit\n" 246 " -- stop further option processing\n" 247 "-n, --line-number Prefix each line of output with the line number within its input file.\n" 248 ); 249 exit(0); 250 } 251 252 //------------------------------------------------------------------------------------------ 253 // 254 // readFile Read a file into memory, and convert it to Unicode. 255 // 256 // Since this is just a demo program, take the simple minded approach 257 // of always reading the whole file at once. No intelligent buffering 258 // is done. 259 // 260 //------------------------------------------------------------------------------------------ 261 void readFile(const char *name) { 262 263 // 264 // Initialize global file variables 265 // 266 fileName = name; 267 fileLen = 0; // zero length prevents processing in case of errors. 268 269 270 // 271 // Open the file and determine its size. 272 // 273 FILE *file = fopen(name, "rb"); 274 if (file == 0 ) { 275 fprintf(stderr, "ugrep: Could not open file \"%s\"\n", fileName); 276 return; 277 } 278 fseek(file, 0, SEEK_END); 279 int rawFileLen = ftell(file); 280 fseek(file, 0, SEEK_SET); 281 282 283 // 284 // Read in the file 285 // 286 charBuf = (char *)realloc(charBuf, rawFileLen+1); // Need error checking... 287 int t = fread(charBuf, 1, rawFileLen, file); 288 if (t != rawFileLen) { 289 fprintf(stderr, "Error reading file \"%s\"\n", fileName); 290 return; 291 } 292 charBuf[rawFileLen]=0; 293 fclose(file); 294 295 // 296 // Look for a Unicode Signature (BOM) in the data 297 // 298 int32_t signatureLength; 299 const char * charDataStart = charBuf; 300 UErrorCode status = U_ZERO_ERROR; 301 const char* encoding = ucnv_detectUnicodeSignature( 302 charDataStart, rawFileLen, &signatureLength, &status); 303 if (U_FAILURE(status)) { 304 fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n", 305 u_errorName(status)); 306 return; 307 } 308 if(encoding!=NULL ){ 309 charDataStart += signatureLength; 310 rawFileLen -= signatureLength; 311 } 312 313 // 314 // Open a converter to take the file to UTF-16 315 // 316 UConverter* conv; 317 conv = ucnv_open(encoding, &status); 318 if (U_FAILURE(status)) { 319 fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status)); 320 return; 321 } 322 323 // 324 // Convert the file data to UChar. 325 // Preflight first to determine required buffer size. 326 // 327 uint32_t destCap = ucnv_toUChars(conv, 328 NULL, // dest, 329 0, // destCapacity, 330 charDataStart, 331 rawFileLen, 332 &status); 333 if (status != U_BUFFER_OVERFLOW_ERROR) { 334 fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 335 return; 336 }; 337 338 status = U_ZERO_ERROR; 339 ucharBuf = (UChar *)realloc(ucharBuf, (destCap+1) * sizeof(UChar)); 340 ucnv_toUChars(conv, 341 ucharBuf, // dest, 342 destCap+1, 343 charDataStart, 344 rawFileLen, 345 &status); 346 if (U_FAILURE(status)) { 347 fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 348 return; 349 }; 350 ucnv_close(conv); 351 352 // 353 // Successful conversion. Set the global size variables so that 354 // the rest of the processing will proceed for this file. 355 // 356 fileLen = destCap; 357 } 358 359 360 361 362 363 //------------------------------------------------------------------------------------------ 364 // 365 // nextLine Advance the line index variables, starting at the 366 // specified position in the input file buffer, by 367 // scanning forwrd until the next end-of-line. 368 // 369 // Need to take into account all of the possible Unicode 370 // line ending sequences. 371 // 372 //------------------------------------------------------------------------------------------ 373 void nextLine(int startPos) { 374 if (startPos == 0) { 375 lineNum = 0; 376 } else { 377 lineNum++; 378 } 379 lineStart = lineEnd = startPos; 380 381 for (;;) { 382 if (lineEnd >= fileLen) { 383 return; 384 } 385 UChar c = ucharBuf[lineEnd]; 386 lineEnd++; 387 if (c == 0x0a || // Line Feed 388 c == 0x0c || // Form Feed 389 c == 0x0d || // Carriage Return 390 c == 0x85 || // Next Line 391 c == 0x2028 || // Line Separator 392 c == 0x2029) // Paragraph separator 393 { 394 break; 395 } 396 } 397 398 // Check for CR/LF sequence, and advance over the LF if we're in the middle of one. 399 if (lineEnd < fileLen && 400 ucharBuf[lineEnd-1] == 0x0d && 401 ucharBuf[lineEnd] == 0x0a) 402 { 403 lineEnd++; 404 } 405 } 406 407 408 //------------------------------------------------------------------------------------------ 409 // 410 // printMatch Called when a matching line has been located. 411 // Print out the line from the file with the match, after 412 // converting it back to the default code page. 413 // 414 //------------------------------------------------------------------------------------------ 415 void printMatch() { 416 char buf[2000]; 417 UErrorCode status = U_ZERO_ERROR; 418 419 // If we haven't already created a converter for output, do it now. 420 if (outConverter == 0) { 421 outConverter = ucnv_open(NULL, &status); 422 if (U_FAILURE(status)) { 423 fprintf(stderr, "ugrep: Error opening default converter: \"%s\"\n", 424 u_errorName(status)); 425 exit(-1); 426 } 427 }; 428 429 // Convert the line to be printed back to the default 8 bit code page. 430 // If the line is too long for our buffer, just truncate it. 431 ucnv_fromUChars(outConverter, 432 buf, // destination buffer for conversion 433 sizeof(buf), // capacity of destination buffer 434 &ucharBuf[lineStart], // Input to conversion 435 lineEnd-lineStart, // number of UChars to convert 436 &status); 437 buf[sizeof(buf)-1] = 0; // Add null for use in case of too long lines. 438 // The converter null-terminates its output unless 439 // the buffer completely fills. 440 441 if (displayFileName) { 442 printf("%s:", fileName); 443 } 444 if (displayLineNum) { 445 printf("%d:", lineNum); 446 } 447 printf("%s", buf); 448 } 449 450