Home | History | Annotate | Download | only in i18n
      1 //
      2 //  file:  repattrn.cpp
      3 //
      4 /*
      5 ***************************************************************************
      6 *   Copyright (C) 2002-2008 International Business Machines Corporation   *
      7 *   and others. All rights reserved.                                      *
      8 ***************************************************************************
      9 */
     10 
     11 #include "unicode/utypes.h"
     12 
     13 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
     14 
     15 #include "unicode/regex.h"
     16 #include "unicode/uclean.h"
     17 #include "uassert.h"
     18 #include "uvector.h"
     19 #include "uvectr32.h"
     20 #include "regexcmp.h"
     21 #include "regeximp.h"
     22 #include "regexst.h"
     23 
     24 U_NAMESPACE_BEGIN
     25 
     26 //--------------------------------------------------------------------------
     27 //
     28 //    RegexPattern    Default Constructor
     29 //
     30 //--------------------------------------------------------------------------
     31 RegexPattern::RegexPattern() {
     32     UErrorCode status = U_ZERO_ERROR;
     33     u_init(&status);
     34     // Init all of this instances data.
     35     init();
     36 
     37     // Lazy init of all shared global sets.
     38     RegexStaticSets::initGlobals(&fDeferredStatus);
     39 }
     40 
     41 
     42 //--------------------------------------------------------------------------
     43 //
     44 //   Copy Constructor        Note:  This is a rather inefficient implementation,
     45 //                                  but it probably doesn't matter.
     46 //
     47 //--------------------------------------------------------------------------
     48 RegexPattern::RegexPattern(const RegexPattern &other) :  UObject(other) {
     49     init();
     50     *this = other;
     51 }
     52 
     53 
     54 
     55 //--------------------------------------------------------------------------
     56 //
      57 //    Assignment Operator
     58 //
     59 //--------------------------------------------------------------------------
     60 RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
     61     if (this == &other) {
     62         // Source and destination are the same.  Don't do anything.
     63         return *this;
     64     }
     65 
     66     // Clean out any previous contents of object being assigned to.
     67     zap();
     68 
     69     // Give target object a default initialization
     70     init();
     71 
     72     // Copy simple fields
     73     fPattern          = other.fPattern;
     74     fFlags            = other.fFlags;
     75     fLiteralText      = other.fLiteralText;
     76     fDeferredStatus   = other.fDeferredStatus;
     77     fMinMatchLen      = other.fMinMatchLen;
     78     fFrameSize        = other.fFrameSize;
     79     fDataSize         = other.fDataSize;
     80     fMaxCaptureDigits = other.fMaxCaptureDigits;
     81     fStaticSets       = other.fStaticSets;
     82     fStaticSets8      = other.fStaticSets8;
     83 
     84     fStartType        = other.fStartType;
     85     fInitialStringIdx = other.fInitialStringIdx;
     86     fInitialStringLen = other.fInitialStringLen;
     87     *fInitialChars    = *other.fInitialChars;
     88     fInitialChar      = other.fInitialChar;
     89     *fInitialChars8   = *other.fInitialChars8;
     90 
     91     //  Copy the pattern.  It's just values, nothing deep to copy.
     92     fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus);
     93     fGroupMap->assign(*other.fGroupMap, fDeferredStatus);
     94 
     95     //  Copy the Unicode Sets.
     96     //    Could be made more efficient if the sets were reference counted and shared,
     97     //    but I doubt that pattern copying will be particularly common.
     98     //    Note:  init() already added an empty element zero to fSets
     99     int32_t i;
    100     int32_t  numSets = other.fSets->size();
    101     fSets8 = new Regex8BitSet[numSets];
    102     if (fSets8 == NULL) {
    103     	fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    104     	return *this;
    105     }
    106     for (i=1; i<numSets; i++) {
    107         if (U_FAILURE(fDeferredStatus)) {
    108             return *this;
    109         }
    110         UnicodeSet *sourceSet = (UnicodeSet *)other.fSets->elementAt(i);
    111         UnicodeSet *newSet    = new UnicodeSet(*sourceSet);
    112         if (newSet == NULL) {
    113             fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    114             break;
    115         }
    116         fSets->addElement(newSet, fDeferredStatus);
    117         fSets8[i] = other.fSets8[i];
    118     }
    119 
    120     return *this;
    121 }
    122 
    123 
    124 //--------------------------------------------------------------------------
    125 //
    126 //    init        Shared initialization for use by constructors.
    127 //                Bring an uninitialized RegexPattern up to a default state.
    128 //
    129 //--------------------------------------------------------------------------
    130 void RegexPattern::init() {
    131     fPattern.remove();
    132     fFlags            = 0;
    133     fCompiledPat      = 0;
    134     fLiteralText.remove();
    135     fSets             = NULL;
    136     fSets8            = NULL;
    137     fDeferredStatus   = U_ZERO_ERROR;
    138     fMinMatchLen      = 0;
    139     fFrameSize        = 0;
    140     fDataSize         = 0;
    141     fGroupMap         = NULL;
    142     fMaxCaptureDigits = 1;
    143     fStaticSets       = NULL;
    144     fStaticSets8      = NULL;
    145     fStartType        = START_NO_INFO;
    146     fInitialStringIdx = 0;
    147     fInitialStringLen = 0;
    148     fInitialChars     = NULL;
    149     fInitialChar      = 0;
    150     fInitialChars8    = NULL;
    151 
    152     fCompiledPat      = new UVector32(fDeferredStatus);
    153     fGroupMap         = new UVector32(fDeferredStatus);
    154     fSets             = new UVector(fDeferredStatus);
    155     fInitialChars     = new UnicodeSet;
    156     fInitialChars8    = new Regex8BitSet;
    157     if (U_FAILURE(fDeferredStatus)) {
    158         return;
    159     }
    160     if (fCompiledPat == NULL  || fGroupMap == NULL || fSets == NULL ||
    161         fInitialChars == NULL || fInitialChars8 == NULL) {
    162         fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
    163         return;
    164     }
    165 
    166     // Slot zero of the vector of sets is reserved.  Fill it here.
    167     fSets->addElement((int32_t)0, fDeferredStatus);
    168 }
    169 
    170 
    171 //--------------------------------------------------------------------------
    172 //
    173 //   zap            Delete everything owned by this RegexPattern.
    174 //
    175 //--------------------------------------------------------------------------
    176 void RegexPattern::zap() {
    177     delete fCompiledPat;
    178     fCompiledPat = NULL;
    179     int i;
    180     for (i=1; i<fSets->size(); i++) {
    181         UnicodeSet *s;
    182         s = (UnicodeSet *)fSets->elementAt(i);
    183         if (s != NULL) {
    184             delete s;
    185         }
    186     }
    187     delete fSets;
    188     fSets = NULL;
    189     delete[] fSets8;
    190     fSets8 = NULL;
    191     delete fGroupMap;
    192     fGroupMap = NULL;
    193     delete fInitialChars;
    194     fInitialChars = NULL;
    195     delete fInitialChars8;
    196     fInitialChars8 = NULL;
    197 }
    198 
    199 
    200 //--------------------------------------------------------------------------
    201 //
    202 //   Destructor
    203 //
    204 //--------------------------------------------------------------------------
    205 RegexPattern::~RegexPattern() {
    206     zap();
    207 }
    208 
    209 
    210 //--------------------------------------------------------------------------
    211 //
    212 //   Clone
    213 //
    214 //--------------------------------------------------------------------------
    215 RegexPattern  *RegexPattern::clone() const {
    216     RegexPattern  *copy = new RegexPattern(*this);
    217     return copy;
    218 }
    219 
    220 
    221 //--------------------------------------------------------------------------
    222 //
     223 //   operator ==   (comparison)    Consider two patterns to be == if the
    224 //                                 pattern strings and the flags are the same.
    225 //
    226 //--------------------------------------------------------------------------
    227 UBool   RegexPattern::operator ==(const RegexPattern &other) const {
    228     UBool r = this->fFlags    == other.fFlags &&
    229               this->fPattern  == other.fPattern &&
    230               this->fDeferredStatus == other.fDeferredStatus;
    231     return r;
    232 }
    233 
    234 //---------------------------------------------------------------------
    235 //
    236 //   compile
    237 //
    238 //---------------------------------------------------------------------
    239 RegexPattern * U_EXPORT2
    240 RegexPattern::compile(const UnicodeString &regex,
    241                       uint32_t             flags,
    242                       UParseError          &pe,
    243                       UErrorCode           &status)
    244 {
    245 
    246     if (U_FAILURE(status)) {
    247         return NULL;
    248     }
    249 
    250     const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS |
    251                               UREGEX_DOTALL   | UREGEX_MULTILINE        | UREGEX_UWORD |
    252                               UREGEX_ERROR_ON_UNKNOWN_ESCAPES           | UREGEX_UNIX_LINES;
    253 
    254     if ((flags & ~allFlags) != 0) {
    255         status = U_REGEX_INVALID_FLAG;
    256         return NULL;
    257     }
    258 
    259     if ((flags & UREGEX_CANON_EQ) != 0) {
    260         status = U_REGEX_UNIMPLEMENTED;
    261         return NULL;
    262     }
    263 
    264     RegexPattern *This = new RegexPattern;
    265     if (This == NULL) {
    266         status = U_MEMORY_ALLOCATION_ERROR;
    267         return NULL;
    268     }
    269     if (U_FAILURE(This->fDeferredStatus)) {
    270         status = This->fDeferredStatus;
    271         delete This;
    272         return NULL;
    273     }
    274     This->fFlags = flags;
    275 
    276     RegexCompile     compiler(This, status);
    277     compiler.compile(regex, pe, status);
    278 
    279     if (U_FAILURE(status)) {
    280         delete This;
    281         This = NULL;
    282     }
    283 
    284     return This;
    285 }
    286 
    287 //
    288 //   compile with default flags.
    289 //
    290 RegexPattern * U_EXPORT2
    291 RegexPattern::compile(const UnicodeString &regex,
    292                       UParseError         &pe,
    293                       UErrorCode          &err)
    294 {
    295     return compile(regex, 0, pe, err);
    296 }
    297 
    298 
    299 
    300 //
    301 //   compile with no UParseErr parameter.
    302 //
    303 RegexPattern * U_EXPORT2
    304 RegexPattern::compile( const UnicodeString &regex,
    305         uint32_t             flags,
    306         UErrorCode           &err)
    307 {
    308     UParseError pe;
    309     return compile(regex, flags, pe, err);
    310 }
    311 
    312 
    313 
    314 //---------------------------------------------------------------------
    315 //
    316 //   flags
    317 //
    318 //---------------------------------------------------------------------
    319 uint32_t RegexPattern::flags() const {
    320     return fFlags;
    321 }
    322 
    323 
    324 //---------------------------------------------------------------------
    325 //
    326 //   matcher(UnicodeString, err)
    327 //
    328 //---------------------------------------------------------------------
    329 RegexMatcher *RegexPattern::matcher(const UnicodeString &input,
    330                                     UErrorCode          &status)  const {
    331     RegexMatcher    *retMatcher = matcher(status);
    332     retMatcher->fDeferredStatus = status;
    333     if (retMatcher != NULL) {
    334         retMatcher->reset(input);
    335     }
    336     return retMatcher;
    337 }
    338 
    339 #if 0
    340 RegexMatcher *RegexPattern::matcher(const UChar * /*input*/,
    341                                     UErrorCode          &status)  const
    342 {
    343     /* This should never get called. The API with UnicodeString should be called instead. */
    344     if (U_SUCCESS(status)) {
    345         status = U_UNSUPPORTED_ERROR;
    346     }
    347     return NULL;
    348 }
    349 #endif
    350 
    351 //---------------------------------------------------------------------
    352 //
    353 //   matcher(status)
    354 //
    355 //---------------------------------------------------------------------
    356 RegexMatcher *RegexPattern::matcher(UErrorCode &status)  const {
    357     RegexMatcher    *retMatcher = NULL;
    358 
    359     if (U_FAILURE(status)) {
    360         return NULL;
    361     }
    362     if (U_FAILURE(fDeferredStatus)) {
    363         status = fDeferredStatus;
    364         return NULL;
    365     }
    366 
    367     retMatcher = new RegexMatcher(this);
    368     if (retMatcher == NULL) {
    369         status = U_MEMORY_ALLOCATION_ERROR;
    370         return NULL;
    371     }
    372     return retMatcher;
    373 }
    374 
    375 
    376 
    377 //---------------------------------------------------------------------
    378 //
    379 //   matches        Convenience function to test for a match, starting
    380 //                  with a pattern string and a data string.
    381 //
    382 //---------------------------------------------------------------------
    383 UBool U_EXPORT2 RegexPattern::matches(const UnicodeString   &regex,
    384               const UnicodeString   &input,
    385                     UParseError     &pe,
    386                     UErrorCode      &status) {
    387 
    388     if (U_FAILURE(status)) {return FALSE;}
    389 
    390     UBool         retVal;
    391     RegexPattern *pat     = NULL;
    392     RegexMatcher *matcher = NULL;
    393 
    394     pat     = RegexPattern::compile(regex, 0, pe, status);
    395     matcher = pat->matcher(input, status);
    396     retVal  = matcher->matches(status);
    397 
    398     delete matcher;
    399     delete pat;
    400     return retVal;
    401 }
    402 
    403 
    404 
    405 
    406 //---------------------------------------------------------------------
    407 //
    408 //   pattern
    409 //
    410 //---------------------------------------------------------------------
    411 UnicodeString RegexPattern::pattern() const {
    412     return fPattern;
    413 }
    414 
    415 
    416 
    417 
    418 //---------------------------------------------------------------------
    419 //
    420 //   split
    421 //
    422 //---------------------------------------------------------------------
    423 int32_t  RegexPattern::split(const UnicodeString &input,
    424         UnicodeString    dest[],
    425         int32_t          destCapacity,
    426         UErrorCode       &status) const
    427 {
    428     if (U_FAILURE(status)) {
    429         return 0;
    430     };
    431 
    432     RegexMatcher  m(this);
    433     int32_t r = 0;
    434     // Check m's status to make sure all is ok.
    435     if (U_SUCCESS(m.fDeferredStatus)) {
    436     	r = m.split(input, dest, destCapacity, status);
    437     }
    438     return r;
    439 }
    440 
    441 
    442 
    443 //---------------------------------------------------------------------
    444 //
    445 //   dump    Output the compiled form of the pattern.
    446 //           Debugging function only.
    447 //
    448 //---------------------------------------------------------------------
    449 #if defined(REGEX_DEBUG)
    450 void   RegexPattern::dumpOp(int32_t index) const {
    451     static const char * const opNames[] = {URX_OPCODE_NAMES};
    452     int32_t op          = fCompiledPat->elementAti(index);
    453     int32_t val         = URX_VAL(op);
    454     int32_t type        = URX_TYPE(op);
    455     int32_t pinnedType  = type;
    456     if ((uint32_t)pinnedType >= sizeof(opNames)/sizeof(char *)) {
    457         pinnedType = 0;
    458     }
    459 
    460     REGEX_DUMP_DEBUG_PRINTF(("%4d   %08x    %-15s  ", index, op, opNames[pinnedType]));
    461     switch (type) {
    462     case URX_NOP:
    463     case URX_DOTANY:
    464     case URX_DOTANY_ALL:
    465     case URX_FAIL:
    466     case URX_CARET:
    467     case URX_DOLLAR:
    468     case URX_BACKSLASH_G:
    469     case URX_BACKSLASH_X:
    470     case URX_END:
    471     case URX_DOLLAR_M:
    472     case URX_CARET_M:
    473         // Types with no operand field of interest.
    474         break;
    475 
    476     case URX_RESERVED_OP:
    477     case URX_START_CAPTURE:
    478     case URX_END_CAPTURE:
    479     case URX_STATE_SAVE:
    480     case URX_JMP:
    481     case URX_JMP_SAV:
    482     case URX_JMP_SAV_X:
    483     case URX_BACKSLASH_B:
    484     case URX_BACKSLASH_BU:
    485     case URX_BACKSLASH_D:
    486     case URX_BACKSLASH_Z:
    487     case URX_STRING_LEN:
    488     case URX_CTR_INIT:
    489     case URX_CTR_INIT_NG:
    490     case URX_CTR_LOOP:
    491     case URX_CTR_LOOP_NG:
    492     case URX_RELOC_OPRND:
    493     case URX_STO_SP:
    494     case URX_LD_SP:
    495     case URX_BACKREF:
    496     case URX_STO_INP_LOC:
    497     case URX_JMPX:
    498     case URX_LA_START:
    499     case URX_LA_END:
    500     case URX_BACKREF_I:
    501     case URX_LB_START:
    502     case URX_LB_CONT:
    503     case URX_LB_END:
    504     case URX_LBN_CONT:
    505     case URX_LBN_END:
    506     case URX_LOOP_C:
    507     case URX_LOOP_DOT_I:
    508         // types with an integer operand field.
    509         REGEX_DUMP_DEBUG_PRINTF(("%d", val));
    510         break;
    511 
    512     case URX_ONECHAR:
    513     case URX_ONECHAR_I:
    514         REGEX_DUMP_DEBUG_PRINTF(("%c", val<256?val:'?'));
    515         break;
    516 
    517     case URX_STRING:
    518     case URX_STRING_I:
    519         {
    520             int32_t lengthOp       = fCompiledPat->elementAti(index+1);
    521             U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN);
    522             int32_t length = URX_VAL(lengthOp);
    523             int32_t i;
    524             for (i=val; i<val+length; i++) {
    525                 UChar c = fLiteralText[i];
    526                 if (c < 32 || c >= 256) {c = '.';}
    527                 REGEX_DUMP_DEBUG_PRINTF(("%c", c));
    528             }
    529         }
    530         break;
    531 
    532     case URX_SETREF:
    533     case URX_LOOP_SR_I:
    534         {
    535             UnicodeString s;
    536             UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val);
    537             set->toPattern(s, TRUE);
    538             for (int32_t i=0; i<s.length(); i++) {
    539                 REGEX_DUMP_DEBUG_PRINTF(("%c", s.charAt(i)));
    540             }
    541         }
    542         break;
    543 
    544     case URX_STATIC_SETREF:
    545     case URX_STAT_SETREF_N:
    546         {
    547             UnicodeString s;
    548             if (val & URX_NEG_SET) {
    549                 REGEX_DUMP_DEBUG_PRINTF(("NOT "));
    550                 val &= ~URX_NEG_SET;
    551             }
    552             UnicodeSet *set = fStaticSets[val];
    553             set->toPattern(s, TRUE);
    554             for (int32_t i=0; i<s.length(); i++) {
    555                 REGEX_DUMP_DEBUG_PRINTF(("%c", s.charAt(i)));
    556             }
    557         }
    558         break;
    559 
    560 
    561     default:
    562         REGEX_DUMP_DEBUG_PRINTF(("??????"));
    563         break;
    564     }
    565     REGEX_DUMP_DEBUG_PRINTF(("\n"));
    566 }
    567 #endif
    568 
    569 
    570 #if defined(REGEX_DEBUG)
    571 U_CAPI void  U_EXPORT2
    572 RegexPatternDump(const RegexPattern *This) {
    573     int      index;
    574     int      i;
    575 
    576     REGEX_DUMP_DEBUG_PRINTF(("Original Pattern:  "));
    577     for (i=0; i<This->fPattern.length(); i++) {
    578         REGEX_DUMP_DEBUG_PRINTF(("%c", This->fPattern.charAt(i)));
    579     }
    580     REGEX_DUMP_DEBUG_PRINTF(("\n"));
    581     REGEX_DUMP_DEBUG_PRINTF(("   Min Match Length:  %d\n", This->fMinMatchLen));
    582     REGEX_DUMP_DEBUG_PRINTF(("   Match Start Type:  %s\n", START_OF_MATCH_STR(This->fStartType)));
    583     if (This->fStartType == START_STRING) {
    584         REGEX_DUMP_DEBUG_PRINTF(("    Initial match sting: \""));
    585         for (i=This->fInitialStringIdx; i<This->fInitialStringIdx+This->fInitialStringLen; i++) {
    586             REGEX_DUMP_DEBUG_PRINTF(("%c", This->fLiteralText[i]));   // TODO:  non-printables, surrogates.
    587         }
    588 
    589     } else if (This->fStartType == START_SET) {
    590         int32_t numSetChars = This->fInitialChars->size();
    591         if (numSetChars > 20) {
    592             numSetChars = 20;
    593         }
    594         REGEX_DUMP_DEBUG_PRINTF(("     Match First Chars : "));
    595         for (i=0; i<numSetChars; i++) {
    596             UChar32 c = This->fInitialChars->charAt(i);
    597             if (0x20<c && c <0x7e) {
    598                 REGEX_DUMP_DEBUG_PRINTF(("%c ", c));
    599             } else {
    600                 REGEX_DUMP_DEBUG_PRINTF(("%#x ", c));
    601             }
    602         }
    603         if (numSetChars < This->fInitialChars->size()) {
    604             REGEX_DUMP_DEBUG_PRINTF((" ..."));
    605         }
    606         REGEX_DUMP_DEBUG_PRINTF(("\n"));
    607 
    608     } else if (This->fStartType == START_CHAR) {
    609         REGEX_DUMP_DEBUG_PRINTF(("    First char of Match : "));
    610         if (0x20 < This->fInitialChar && This->fInitialChar<0x7e) {
    611                 REGEX_DUMP_DEBUG_PRINTF(("%c\n", This->fInitialChar));
    612             } else {
    613                 REGEX_DUMP_DEBUG_PRINTF(("%#x\n", This->fInitialChar));
    614             }
    615     }
    616 
    617     REGEX_DUMP_DEBUG_PRINTF(("\nIndex   Binary     Type             Operand\n" \
    618            "-------------------------------------------\n"));
    619     for (index = 0; index<This->fCompiledPat->size(); index++) {
    620         This->dumpOp(index);
    621     }
    622     REGEX_DUMP_DEBUG_PRINTF(("\n\n"));
    623 }
    624 #endif
    625 
    626 
    627 
    628 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern)
    629 
    630 U_NAMESPACE_END
    631 #endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
    632