Home | History | Annotate | Download | only in ugrep
      1 /**************************************************************************
      2 *
      3 *   Copyright (C) 2002-2010, International Business Machines
      4 *   Corporation and others.  All Rights Reserved.
      5 *
      6 ***************************************************************************
      7 */
      8 
      9 //
     10 //   ugrep  - an ICU sample program illustrating the use of ICU Regular Expressions.
     11 //
     12 //            The use of the ICU Regex API all occurs within the main()
     13 //            function.  The rest of the code deals with opening files,
     14 //            encoding conversions, printing results, etc.
     15 //
     16 //            This is not a full-featured grep program.  The command line options
     17 //            have been kept to a minimum to avoid complicating the sample code.
     18 //
     19 
     20 
     21 
     22 #include <stdio.h>
     23 #include <stdlib.h>
     24 #include <string.h>
     25 
     26 #include "unicode/utypes.h"
     27 #include "unicode/ustring.h"
     28 #include "unicode/regex.h"
     29 #include "unicode/ucnv.h"
     30 #include "unicode/uclean.h"
     31 
     32 
     33 //
     34 //  The following variables contain paramters that may be set from the command line.
     35 //
     36 const char *pattern = NULL;     // The regular expression
     37 int        firstFileNum;        //  argv index of the first file name
     38 UBool      displayFileName = FALSE;
     39 UBool      displayLineNum  = FALSE;
     40 
     41 
     42 //
     43 //  Info regarding the file currently being processed
     44 //
     45 const char *fileName;
     46 int         fileLen;              // Length, in UTF-16 Code Units.
     47 
     48 UChar      *ucharBuf = 0;         // Buffer, holds converted file.  (Simple minded program, always reads
     49                                   //   the whole file at once.
     50 
     51 char       *charBuf = 0;          // Buffer, for original, unconverted file data.
     52 
     53 
     54 //
     55 //  Info regarding the line currently being processed
     56 //
     57 int      lineStart;     // Index of first char of the current line in the file buffer
     58 int      lineEnd;       // Index of char following the new line sequence for the current line
     59 int      lineNum;
     60 
     61 //
     62 //  Converter, used on output to convert Unicode data back to char *
     63 //             so that it will display in non-Unicode terminal windows.
     64 //
     65 UConverter  *outConverter = 0;
     66 
     67 //
     68 //  Function forward declarations
     69 //
     70 void processOptions(int argc, const char **argv);
     71 void nextLine(int start);
     72 void printMatch();
     73 void printUsage();
     74 void readFile(const char *name);
     75 
     76 
     77 
     78 //------------------------------------------------------------------------------------------
     79 //
     80 //   main          for ugrep
     81 //
     82 //           Structurally, all use of the ICU Regular Expression API is in main(),
     83 //           and all of the supporting stuff necessary to make a running program, but
     84 //           not directly related to regular expressions, is factored out into these other
     85 //           functions.
     86 //
     87 //------------------------------------------------------------------------------------------
     88 int main(int argc, const char** argv) {
     89     UBool     matchFound = FALSE;
     90 
     91     //
     92     //  Process the commmand line options.
     93     //
     94     processOptions(argc, argv);
     95 
     96     //
     97     // Create a RegexPattern object from the user supplied pattern string.
     98     //
     99     UErrorCode status = U_ZERO_ERROR;   // All ICU operations report success or failure
    100                                         //   in a status variable.
    101 
    102     UParseError    parseErr;            // In the event of a syntax error in the regex pattern,
    103                                         //   this struct will contain the position of the
    104                                         //   error.
    105 
    106     RegexPattern  *rePat = RegexPattern::compile(pattern, parseErr, status);
    107                                         // Note that C++ is doing an automatic conversion
    108                                         //  of the (char *) pattern to a temporary
    109                                         //  UnicodeString object.
    110     if (U_FAILURE(status)) {
    111         fprintf(stderr, "ugrep:  error in pattern: \"%s\" at position %d\n",
    112             u_errorName(status), parseErr.offset);
    113         exit(-1);
    114     }
    115 
    116     //
    117     // Create a RegexMatcher from the newly created pattern.
    118     //
    119     UnicodeString empty;
    120     RegexMatcher *matcher = rePat->matcher(empty, status);
    121     if (U_FAILURE(status)) {
    122         fprintf(stderr, "ugrep:  error in creating RegexMatcher: \"%s\"\n",
    123             u_errorName(status));
    124         exit(-1);
    125     }
    126 
    127     //
    128     // Loop, processing each of the input files.
    129     //
    130     for (int fileNum=firstFileNum; fileNum < argc; fileNum++) {
    131         readFile(argv[fileNum]);
    132 
    133         //
    134         //  Loop through the lines of a file, trying to match the regex pattern on each.
    135         //
    136         for (nextLine(0); lineStart<fileLen; nextLine(lineEnd)) {
    137             UnicodeString s(FALSE, ucharBuf+lineStart, lineEnd-lineStart);
    138             matcher->reset(s);
    139             if (matcher->find()) {
    140                 matchFound = TRUE;
    141                 printMatch();
    142             }
    143         }
    144     }
    145 
    146     //
    147     //  Clean up
    148     //
    149     delete matcher;
    150     delete rePat;
    151     free(ucharBuf);
    152     free(charBuf);
    153     ucnv_close(outConverter);
    154 
    155     u_cleanup();       // shut down ICU, release any cached data it owns.
    156 
    157     return matchFound? 0: 1;
    158 }
    159 
    160 
    161 
    162 //------------------------------------------------------------------------------------------
    163 //
    164 //   doOptions          Run through the command line options, and set
    165 //                      the global variables accordingly.
    166 //
    167 //                      exit without returning if an error occured and
    168 //                      ugrep should not proceed further.
    169 //
    170 //------------------------------------------------------------------------------------------
    171 void processOptions(int argc, const char **argv) {
    172     int            optInd;
    173     UBool          doUsage   = FALSE;
    174     UBool          doVersion = FALSE;
    175     const char    *arg;
    176 
    177 
    178     for(optInd = 1; optInd < argc; ++optInd) {
    179         arg = argv[optInd];
    180 
    181         /* version info */
    182         if(strcmp(arg, "-V") == 0 || strcmp(arg, "--version") == 0) {
    183             doVersion = TRUE;
    184         }
    185         /* usage info */
    186         else if(strcmp(arg, "--help") == 0) {
    187             doUsage = TRUE;
    188         }
    189         else if(strcmp(arg, "-n") == 0 || strcmp(arg, "--line-number") == 0) {
    190             displayLineNum = TRUE;
    191         }
    192         /* POSIX.1 says all arguments after -- are not options */
    193         else if(strcmp(arg, "--") == 0) {
    194             /* skip the -- */
    195             ++optInd;
    196             break;
    197         }
    198         /* unrecognized option */
    199         else if(strncmp(arg, "-", strlen("-")) == 0) {
    200             printf("ugrep: invalid option -- %s\n", arg+1);
    201             doUsage = TRUE;
    202         }
    203         /* done with options */
    204         else {
    205             break;
    206         }
    207     }
    208 
    209     if (doUsage) {
    210         printUsage();
    211         exit(0);
    212     }
    213 
    214     if (doVersion) {
    215         printf("ugrep version 0.01\n");
    216         if (optInd == argc) {
    217             exit(0);
    218         }
    219     }
    220 
    221     int  remainingArgs = argc-optInd;     // pattern file ...
    222     if (remainingArgs < 2) {
    223         fprintf(stderr, "ugrep:  files or pattern are missing.\n");
    224         printUsage();
    225         exit(1);
    226     }
    227 
    228     if (remainingArgs > 2) {
    229         // More than one file to be processed.   Display file names with match output.
    230         displayFileName = TRUE;
    231     }
    232 
    233     pattern      = argv[optInd];
    234     firstFileNum = optInd+1;
    235 }
    236 
    237 //------------------------------------------------------------------------------------------
    238 //
    239 //   printUsage
    240 //
    241 //------------------------------------------------------------------------------------------
    242 void printUsage() {
    243     printf("ugrep [options] pattern file...\n"
    244         "     -V or --version     display version information\n"
    245         "     --help              display this help and exit\n"
    246         "     --                  stop further option processing\n"
    247         "-n,  --line-number       Prefix each line of output with the line number within its input file.\n"
    248         );
    249     exit(0);
    250 }
    251 
    252 //------------------------------------------------------------------------------------------
    253 //
    254 //    readFile          Read a file into memory, and convert it to Unicode.
    255 //
    256 //                      Since this is just a demo program, take the simple minded approach
    257 //                      of always reading the whole file at once.  No intelligent buffering
    258 //                      is done.
    259 //
    260 //------------------------------------------------------------------------------------------
    261 void readFile(const char *name) {
    262 
    263     //
    264     //  Initialize global file variables
    265     //
    266     fileName = name;
    267     fileLen  = 0;      // zero length prevents processing in case of errors.
    268 
    269 
    270     //
    271     //  Open the file and determine its size.
    272     //
    273     FILE *file = fopen(name, "rb");
    274     if (file == 0 ) {
    275         fprintf(stderr, "ugrep: Could not open file \"%s\"\n", fileName);
    276         return;
    277     }
    278     fseek(file, 0, SEEK_END);
    279     int rawFileLen = ftell(file);
    280     fseek(file, 0, SEEK_SET);
    281 
    282 
    283     //
    284     //   Read in the file
    285     //
    286     charBuf    = (char *)realloc(charBuf, rawFileLen+1);   // Need error checking...
    287     int t = fread(charBuf, 1, rawFileLen, file);
    288     if (t != rawFileLen)  {
    289         fprintf(stderr, "Error reading file \"%s\"\n", fileName);
    290         fclose(file);
    291         return;
    292     }
    293     charBuf[rawFileLen]=0;
    294     fclose(file);
    295 
    296     //
    297     // Look for a Unicode Signature (BOM) in the data
    298     //
    299     int32_t        signatureLength;
    300     const char *   charDataStart = charBuf;
    301     UErrorCode     status        = U_ZERO_ERROR;
    302     const char*    encoding      = ucnv_detectUnicodeSignature(
    303                            charDataStart, rawFileLen, &signatureLength, &status);
    304     if (U_FAILURE(status)) {
    305         fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_detectUnicodeSignature()\n",
    306             u_errorName(status));
    307         return;
    308     }
    309     if(encoding!=NULL ){
    310         charDataStart  += signatureLength;
    311         rawFileLen     -= signatureLength;
    312     }
    313 
    314     //
    315     // Open a converter to take the file to UTF-16
    316     //
    317     UConverter* conv;
    318     conv = ucnv_open(encoding, &status);
    319     if (U_FAILURE(status)) {
    320         fprintf(stderr, "ugrep: ICU Error \"%s\" from ucnv_open()\n", u_errorName(status));
    321         return;
    322     }
    323 
    324     //
    325     // Convert the file data to UChar.
    326     //  Preflight first to determine required buffer size.
    327     //
    328     uint32_t destCap = ucnv_toUChars(conv,
    329                        NULL,           //  dest,
    330                        0,              //  destCapacity,
    331                        charDataStart,
    332                        rawFileLen,
    333                        &status);
    334     if (status != U_BUFFER_OVERFLOW_ERROR) {
    335         fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
    336         return;
    337     };
    338 
    339     status = U_ZERO_ERROR;
    340     ucharBuf = (UChar *)realloc(ucharBuf, (destCap+1) * sizeof(UChar));
    341     ucnv_toUChars(conv,
    342         ucharBuf,           //  dest,
    343         destCap+1,
    344         charDataStart,
    345         rawFileLen,
    346         &status);
    347     if (U_FAILURE(status)) {
    348         fprintf(stderr, "ugrep: ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status));
    349         return;
    350     };
    351     ucnv_close(conv);
    352 
    353     //
    354     //  Successful conversion.  Set the global size variables so that
    355     //     the rest of the processing will proceed for this file.
    356     //
    357     fileLen = destCap;
    358 }
    359 
    360 
    361 
    362 
    363 
    364 //------------------------------------------------------------------------------------------
    365 //
    366 //   nextLine           Advance the line index variables, starting at the
    367 //                      specified position in the input file buffer, by
    368 //                      scanning forwrd until the next end-of-line.
    369 //
    370 //                      Need to take into account all of the possible Unicode
    371 //                      line ending sequences.
    372 //
    373 //------------------------------------------------------------------------------------------
    374 void nextLine(int  startPos) {
    375     if (startPos == 0) {
    376         lineNum = 0;
    377     } else {
    378         lineNum++;
    379     }
    380     lineStart = lineEnd = startPos;
    381 
    382     for (;;) {
    383         if (lineEnd >= fileLen) {
    384             return;
    385         }
    386         UChar c = ucharBuf[lineEnd];
    387         lineEnd++;
    388         if (c == 0x0a   ||       // Line Feed
    389             c == 0x0c   ||       // Form Feed
    390             c == 0x0d   ||       // Carriage Return
    391             c == 0x85   ||       // Next Line
    392             c == 0x2028 ||       // Line Separator
    393             c == 0x2029)         // Paragraph separator
    394         {
    395             break;
    396         }
    397     }
    398 
    399     // Check for CR/LF sequence, and advance over the LF if we're in the middle of one.
    400     if (lineEnd < fileLen           &&
    401         ucharBuf[lineEnd-1] == 0x0d &&
    402         ucharBuf[lineEnd]   == 0x0a)
    403     {
    404         lineEnd++;
    405     }
    406 }
    407 
    408 
    409 //------------------------------------------------------------------------------------------
    410 //
    411 //   printMatch         Called when a matching line has been located.
    412 //                      Print out the line from the file with the match, after
    413 //                         converting it back to the default code page.
    414 //
    415 //------------------------------------------------------------------------------------------
    416 void printMatch() {
    417     char                buf[2000];
    418     UErrorCode         status       = U_ZERO_ERROR;
    419 
    420     // If we haven't already created a converter for output, do it now.
    421     if (outConverter == 0) {
    422         outConverter = ucnv_open(NULL, &status);
    423         if (U_FAILURE(status)) {
    424             fprintf(stderr, "ugrep:  Error opening default converter: \"%s\"\n",
    425                 u_errorName(status));
    426             exit(-1);
    427         }
    428     };
    429 
    430     // Convert the line to be printed back to the default 8 bit code page.
    431     //   If the line is too long for our buffer, just truncate it.
    432     ucnv_fromUChars(outConverter,
    433                     buf,                   // destination buffer for conversion
    434                     sizeof(buf),           // capacity of destination buffer
    435                     &ucharBuf[lineStart],   // Input to conversion
    436                     lineEnd-lineStart,     // number of UChars to convert
    437                     &status);
    438     buf[sizeof(buf)-1] = 0;                // Add null for use in case of too long lines.
    439                                            // The converter null-terminates its output unless
    440                                            //   the buffer completely fills.
    441 
    442     if (displayFileName) {
    443         printf("%s:", fileName);
    444     }
    445     if (displayLineNum) {
    446         printf("%d:", lineNum);
    447     }
    448     printf("%s", buf);
    449 }
    450 
    451