1 /************************************************************************** 2 * 3 * Copyright (C) 2002-2010, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 * 6 *************************************************************************** 7 */ 8 9 // 10 // ugrep - an ICU sample program illustrating the use of ICU Regular Expressions. 11 // 12 // The use of the ICU Regex API all occurs within the main() 13 // function. The rest of the code deals with with opening files, 14 // encoding conversions, printing results, etc. 15 // 16 // This is not a full-featured grep program. The command line options 17 // have been kept to a minimum to avoid complicating the sample code. 18 // 19 20 21 22 #include <stdio.h> 23 #include <stdlib.h> 24 #include <string.h> 25 26 #include "unicode/utypes.h" 27 #include "unicode/ustring.h" 28 #include "unicode/regex.h" 29 #include "unicode/ucnv.h" 30 #include "unicode/uclean.h" 31 32 33 // 34 // The following variables contain paramters that may be set from the command line. 35 // 36 const char *pattern = NULL; // The regular expression 37 int firstFileNum; // argv index of the first file name 38 UBool displayFileName = FALSE; 39 UBool displayLineNum = FALSE; 40 41 42 // 43 // Info regarding the file currently being processed 44 // 45 const char *fileName; 46 int fileLen; // Length, in UTF-16 Code Units. 47 48 UChar *ucharBuf = 0; // Buffer, holds converted file. (Simple minded program, always reads 49 // the whole file at once. 50 51 char *charBuf = 0; // Buffer, for original, unconverted file data. 52 53 54 // 55 // Info regarding the line currently being processed 56 // 57 int lineStart; // Index of first char of the current line in the file buffer 58 int lineEnd; // Index of char following the new line sequence for the current line 59 int lineNum; 60 61 // 62 // Converter, used on output to convert Unicode data back to char * 63 // so that it will display in non-Unicode terminal windows. 64 // 65 UConverter *outConverter = 0; 66 67 // 68 // Function forward declarations 69 // 70 void processOptions(int argc, const char **argv); 71 void nextLine(int start); 72 void printMatch(); 73 void printUsage(); 74 void readFile(const char *name); 75 76 77 78 //------------------------------------------------------------------------------------------ 79 // 80 // main for ugrep 81 // 82 // Structurally, all use of the ICU Regular Expression API is in main(), 83 // and all of the supporting stuff necessary to make a running program, but 84 // not directly related to regular expressions, is factored out into these other 85 // functions. 86 // 87 //------------------------------------------------------------------------------------------ 88 int main(int argc, const char** argv) { 89 UBool matchFound = FALSE; 90 91 // 92 // Process the commmand line options. 93 // 94 processOptions(argc, argv); 95 96 // 97 // Create a RegexPattern object from the user supplied pattern string. 98 // 99 UErrorCode status = U_ZERO_ERROR; // All ICU operations report success or failure 100 // in a status variable. 101 102 UParseError parseErr; // In the event of a syntax error in the regex pattern, 103 // this struct will contain the position of the 104 // error. 105 106 RegexPattern *rePat = RegexPattern::compile(pattern, parseErr, status); 107 // Note that C++ is doing an automatic conversion 108 // of the (char *) pattern to a temporary 109 // UnicodeString object. 110 if (U_FAILURE(status)) { 111 fprintf(stderr, "ugrep: error in pattern: \"%s\" at position %d\n", 112 u_errorName(status), parseErr.offset); 113 exit(-1); 114 } 115 116 // 117 // Create a RegexMatcher from the newly created pattern. 118 // 119 UnicodeString empty; 120 RegexMatcher *matcher = rePat->matcher(empty, status); 121 if (U_FAILURE(status)) { 122 fprintf(stderr, "ugrep: error in creating RegexMatcher: \"%s\"\n", 123 u_errorName(status)); 124 exit(-1); 125 } 126 127 // 128 // Loop, processing each of the input files. 129 // 130 for (int fileNum=firstFileNum; fileNum < argc; fileNum++) { 131 readFile(argv[fileNum]); 132 133 // 134 // Loop through the lines of a file, trying to match the regex pattern on each. 135 // 136 for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) { 137 UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart); 138 matcher->reset(s); 139 if (matcher->find()) { 140 matchFound = TRUE; 141 printMatch(); 142 } 143 } 144 } 145 146 // 147 // Clean up 148 // 149 delete matcher; 150 delete rePat; 151 free(ucharBuf); 152 free(charBuf); 153 ucnv_close(outConverter); 154 155 u_cleanup(); // shut down ICU, release any cached data it owns. 156 157 return matchFound? 0: 1; 158 } 159 160 161 162 //------------------------------------------------------------------------------------------ 163 // 164 // doOptions Run through the command line options, and set 165 // the global variables accordingly. 166 // 167 // exit without returning if an error occured and 168 // ugrep should not proceed further. 169 // 170 //------------------------------------------------------------------------------------------ 171 void processOptions(int argc, const char **argv) { 172 int optInd; 173 UBool doUsage = FALSE; 174 UBool doVersion = FALSE; 175 const char *arg; 176 177 178 for(optInd = 1; optInd < argc; ++optInd) { 179 arg = argv[optInd]; 180 181 /* version info */ 182 if(strcmp(arg, "-V") == 0 || strcmp(arg, "--version") == 0) { 183 doVersion = TRUE; 184 } 185 /* usage info */ 186 else if(strcmp(arg, "--help") == 0) { 187 doUsage = TRUE; 188 } 189 else if(strcmp(arg, "-n") == 0 || strcmp(arg, "--line-number") == 0) { 190 displayLineNum = TRUE; 191 } 192 /* POSIX.1 says all arguments after -- are not options */ 193 else if(strcmp(arg, "--") == 0) { 194 /* skip the -- */ 195 ++optInd; 196 break; 197 } 198 /* unrecognized option */ 199 else if(strncmp(arg, "-", strlen("-")) == 0) { 200 printf("ugrep: invalid option -- %s\n", arg+1); 201 doUsage = TRUE; 202 } 203 /* done with options */ 204 else { 205 break; 206 } 207 } 208 209 if (doUsage) { 210 printUsage(); 211 exit(0); 212 } 213 214 if (doVersion) { 215 printf("ugrep version 0.01\n"); 216 if (optInd == argc) { 217 exit(0); 218 } 219 } 220 221 int remainingArgs = argc-optInd; // pattern file ... 222 if (remainingArgs < 2) { 223 fprintf(stderr, "ugrep: files or pattern are missing.\n"); 224 printUsage(); 225 exit(1); 226 } 227 228 if (remainingArgs > 2) { 229 // More than one file to be processed. Display file names with match output. 230 displayFileName = TRUE; 231 } 232 233 pattern = argv[optInd]; 234 firstFileNum = optInd+1; 235 } 236 237 //------------------------------------------------------------------------------------------ 238 // 239 // printUsage 240 // 241 //------------------------------------------------------------------------------------------ 242 void printUsage() { 243 printf("ugrep [options] pattern file...\n" 244 " -V or --version display version information\n" 245 " --help display this help and exit\n" 246 " -- stop further option processing\n" 247 "-n, --line-number Prefix each line of output with the line number within its input file.\n" 248 ); 249 exit(0); 250 } 251 252 //------------------------------------------------------------------------------------------ 253 // 254 // readFile Read a file into memory, and convert it to Unicode. 255 // 256 // Since this is just a demo program, take the simple minded approach 257 // of always reading the whole file at once. No intelligent buffering 258 // is done. 259 // 260 //------------------------------------------------------------------------------------------ 261 void readFile(const char *name) { 262 263 // 264 // Initialize global file variables 265 // 266 fileName = name; 267 fileLen = 0; // zero length prevents processing in case of errors. 268 269 270 // 271 // Open the file and determine its size. 272 // 273 FILE *file = fopen(name, "rb"); 274 if (file == 0 ) { 275 fprintf(stderr, "ugrep: Could not open file \"%s\"\n", fileName); 276 return; 277 } 278 fseek(file, 0, SEEK_END); 279 int rawFileLen = ftell(file); 280 fseek(file, 0, SEEK_SET); 281 282 283 // 284 // Read in the file 285 // 286 charBuf = (char *)realloc(charBuf, rawFileLen+1); // Need error checking... 287 int t = fread(charBuf, 1, rawFileLen, file); 288 if (t != rawFileLen) { 289 fprintf(stderr, "Error reading file \"%s\"\n", fileName); 290 fclose(file); 291 return; 292 } 293 charBuf[rawFileLen]=0; 294 fclose(file); 295 296 // 297 // Look for a Unicode Signature (BOM) in the data 298 // 299 int32_t signatureLength; 300 const char * charDataStart = charBuf; 301 UErrorCode status = U_ZERO_ERROR; 302 const char* encoding = ucnv_detectUnicodeSignature( 303 charDataStart, rawFileLen, &signatureLength, &status); 304 if (U_FAILURE(status)) { 305 fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n", 306 u_errorName(status)); 307 return; 308 } 309 if(encoding!=NULL ){ 310 charDataStart += signatureLength; 311 rawFileLen -= signatureLength; 312 } 313 314 // 315 // Open a converter to take the file to UTF-16 316 // 317 UConverter* conv; 318 conv = ucnv_open(encoding, &status); 319 if (U_FAILURE(status)) { 320 fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status)); 321 return; 322 } 323 324 // 325 // Convert the file data to UChar. 326 // Preflight first to determine required buffer size. 327 // 328 uint32_t destCap = ucnv_toUChars(conv, 329 NULL, // dest, 330 0, // destCapacity, 331 charDataStart, 332 rawFileLen, 333 &status); 334 if (status != U_BUFFER_OVERFLOW_ERROR) { 335 fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 336 return; 337 }; 338 339 status = U_ZERO_ERROR; 340 ucharBuf = (UChar *)realloc(ucharBuf, (destCap+1) * sizeof(UChar)); 341 ucnv_toUChars(conv, 342 ucharBuf, // dest, 343 destCap+1, 344 charDataStart, 345 rawFileLen, 346 &status); 347 if (U_FAILURE(status)) { 348 fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 349 return; 350 }; 351 ucnv_close(conv); 352 353 // 354 // Successful conversion. Set the global size variables so that 355 // the rest of the processing will proceed for this file. 356 // 357 fileLen = destCap; 358 } 359 360 361 362 363 364 //------------------------------------------------------------------------------------------ 365 // 366 // nextLine Advance the line index variables, starting at the 367 // specified position in the input file buffer, by 368 // scanning forwrd until the next end-of-line. 369 // 370 // Need to take into account all of the possible Unicode 371 // line ending sequences. 372 // 373 //------------------------------------------------------------------------------------------ 374 void nextLine(int startPos) { 375 if (startPos == 0) { 376 lineNum = 0; 377 } else { 378 lineNum++; 379 } 380 lineStart = lineEnd = startPos; 381 382 for (;;) { 383 if (lineEnd >= fileLen) { 384 return; 385 } 386 UChar c = ucharBuf[lineEnd]; 387 lineEnd++; 388 if (c == 0x0a || // Line Feed 389 c == 0x0c || // Form Feed 390 c == 0x0d || // Carriage Return 391 c == 0x85 || // Next Line 392 c == 0x2028 || // Line Separator 393 c == 0x2029) // Paragraph separator 394 { 395 break; 396 } 397 } 398 399 // Check for CR/LF sequence, and advance over the LF if we're in the middle of one. 400 if (lineEnd < fileLen && 401 ucharBuf[lineEnd-1] == 0x0d && 402 ucharBuf[lineEnd] == 0x0a) 403 { 404 lineEnd++; 405 } 406 } 407 408 409 //------------------------------------------------------------------------------------------ 410 // 411 // printMatch Called when a matching line has been located. 412 // Print out the line from the file with the match, after 413 // converting it back to the default code page. 414 // 415 //------------------------------------------------------------------------------------------ 416 void printMatch() { 417 char buf[2000]; 418 UErrorCode status = U_ZERO_ERROR; 419 420 // If we haven't already created a converter for output, do it now. 421 if (outConverter == 0) { 422 outConverter = ucnv_open(NULL, &status); 423 if (U_FAILURE(status)) { 424 fprintf(stderr, "ugrep: Error opening default converter: \"%s\"\n", 425 u_errorName(status)); 426 exit(-1); 427 } 428 }; 429 430 // Convert the line to be printed back to the default 8 bit code page. 431 // If the line is too long for our buffer, just truncate it. 432 ucnv_fromUChars(outConverter, 433 buf, // destination buffer for conversion 434 sizeof(buf), // capacity of destination buffer 435 &ucharBuf[lineStart], // Input to conversion 436 lineEnd-lineStart, // number of UChars to convert 437 &status); 438 buf[sizeof(buf)-1] = 0; // Add null for use in case of too long lines. 439 // The converter null-terminates its output unless 440 // the buffer completely fills. 441 442 if (displayFileName) { 443 printf("%s:", fileName); 444 } 445 if (displayLineNum) { 446 printf("%d:", lineNum); 447 } 448 printf("%s", buf); 449 } 450 451