Home | History | Annotate | Download | only in regex-re2
      1 This is a dump from Google's source control system of the change
      2 that removed UCS-2 support from RE2.  As the explanation below
      3 says, UCS-2 mode is fundamentally at odds with things like ^ and $,
      4 so it never really worked very well.  But if you are interested in using
      5 it without those operators, it did work for that.  It assumed that the
      6 UCS-2 data was in the native host byte order.
      7 
      8 If you are interested in adding UCS-2 mode back, this patch might
      9 be a good starting point.
     10 
     11 
     12 Change 12780686 by rsc@rsc-re2 on 2009/09/16 15:30:15
     13 
     14 	Retire UCS-2 mode.
     15 	
     16 	I added it as an experiment for V8, but it
     17 	requires 2-byte lookahead to do completely,
     18 	and RE2 has 1-byte lookahead (enough for UTF-8)
     19 	as a fairly deep fundamental assumption,
     20 	so it did not support ^ or $.
     21 
     22 ==== re2/bitstate.cc#2 - re2/bitstate.cc#3 ====
     23 re2/bitstate.cc#2:314,321 - re2/bitstate.cc#3:314,319
     24       cap_[0] = p;
     25       if (TrySearch(prog_->start(), p))  // Match must be leftmost; done.
     26         return true;
     27 -     if (prog_->flags() & Regexp::UCS2)
     28 -       p++;
     29     }
     30     return false;
     31   }
     32 ==== re2/compile.cc#17 - re2/compile.cc#18 ====
     33 re2/compile.cc#17:95,101 - re2/compile.cc#18:95,100
     34   // Input encodings.
     35   enum Encoding {
     36     kEncodingUTF8 = 1,  // UTF-8 (0-10FFFF)
     37 -   kEncodingUCS2,     // UCS-2 (0-FFFF), native byte order
     38     kEncodingLatin1,    // Latin1 (0-FF)
     39   };
     40   
     41 re2/compile.cc#17:168,176 - re2/compile.cc#18:167,172
     42     void AddRuneRangeLatin1(Rune lo, Rune hi, bool foldcase);
     43     void AddRuneRangeUTF8(Rune lo, Rune hi, bool foldcase);
     44     void Add_80_10ffff();
     45 -   void AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase);
     46 -   void AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
     47 -                    uint8 lo2, uint8 hi2, bool fold2);
     48   
     49     // New suffix that matches the byte range lo-hi, then goes to next.
     50     Inst* RuneByteSuffix(uint8 lo, uint8 hi, bool foldcase, Inst* next);
     51 re2/compile.cc#17:475,481 - re2/compile.cc#18:471,477
     52   
     53   // Converts rune range lo-hi into a fragment that recognizes
     54   // the bytes that would make up those runes in the current
     55 - // encoding (Latin 1, UTF-8, or UCS-2).
     56 + // encoding (Latin 1 or UTF-8).
     57   // This lets the machine work byte-by-byte even when
     58   // using multibyte encodings.
     59   
     60 re2/compile.cc#17:488,496 - re2/compile.cc#18:484,489
     61       case kEncodingLatin1:
     62         AddRuneRangeLatin1(lo, hi, foldcase);
     63         break;
     64 -     case kEncodingUCS2:
     65 -       AddRuneRangeUCS2(lo, hi, foldcase);
     66 -       break;
     67     }
     68   }
     69   
     70 re2/compile.cc#17:503,581 - re2/compile.cc#18:496,501
     71     AddSuffix(RuneByteSuffix(lo, hi, foldcase, NULL));
     72   }
     73   
     74 - // Test whether 16-bit values are big or little endian.
     75 - static bool BigEndian() {
     76 -   union {
     77 -     char byte[2];
     78 -     int16 endian;
     79 -   } u;
     80 - 
     81 -   u.byte[0] = 1;
     82 -   u.byte[1] = 2;
     83 -   return u.endian == 0x0102;
     84 - }
     85 - 
     86 - void Compiler::AddUCS2Pair(uint8 lo1, uint8 hi1, bool fold1,
     87 -                            uint8 lo2, uint8 hi2, bool fold2) {
     88 -   Inst* ip;
     89 -   if (reversed_) {
     90 -     ip = RuneByteSuffix(lo1, hi1, fold1, NULL);
     91 -     ip = RuneByteSuffix(lo2, hi2, fold2, ip);
     92 -   } else {
     93 -     ip = RuneByteSuffix(lo2, hi2, fold2, NULL);
     94 -     ip = RuneByteSuffix(lo1, hi1, fold1, ip);
     95 -   }
     96 -   AddSuffix(ip);
     97 - }
     98 - 
     99 - void Compiler::AddRuneRangeUCS2(Rune lo, Rune hi, bool foldcase) {
    100 -   if (lo > hi || lo > 0xFFFF)
    101 -     return;
    102 -   if (hi > 0xFFFF)
    103 -     hi = 0xFFFF;
    104 - 
    105 -   // We'll assemble a pattern assuming big endian.
    106 -   // If the machine isn't, tell Cat to reverse its arguments.
    107 -   bool oldreversed = reversed_;
    108 -   if (!BigEndian()) {
    109 -     reversed_ = !oldreversed;
    110 -   }
    111 - 
    112 -   // Split into bytes.
    113 -   int lo1 = lo >> 8;
    114 -   int lo2 = lo & 0xFF;
    115 -   int hi1 = hi >> 8;
    116 -   int hi2 = hi & 0xFF;
    117 - 
    118 -   if (lo1 == hi1) {
    119 -     // Easy case: high bits are same in both.
    120 -     // Only do ASCII case folding on the second byte if the top byte is 00.
    121 -     AddUCS2Pair(lo1, lo1, false, lo2, hi2, lo1==0 && foldcase);
    122 -   } else {
    123 -     // Harder case: different second byte ranges depending on first byte.
    124 - 
    125 -     // Initial fragment.
    126 -     if (lo2 > 0) {
    127 -       AddUCS2Pair(lo1, lo1, false, lo2, 0xFF, lo1==0 && foldcase);
    128 -       lo1++;
    129 -     }
    130 - 
    131 -     // Trailing fragment.
    132 -     if (hi2 < 0xFF) {
    133 -       AddUCS2Pair(hi1, hi1, false, 0, hi2, false);
    134 -       hi1--;
    135 -     }
    136 - 
    137 -     // Inner ranges.
    138 -     if (lo1 <= hi1) {
    139 -       AddUCS2Pair(lo1, hi1, false, 0, 0xFF, false);
    140 -     }
    141 -   }
    142 - 
    143 -   // Restore reverse setting.
    144 -   reversed_ = oldreversed;
    145 - }
    146 - 
    147   // Table describing how to make a UTF-8 matching machine
    148   // for the rune range 80-10FFFF (Runeself-Runemax).
    149   // This range happens frequently enough (for example /./ and /[^a-z]/)
    150 re2/compile.cc#17:707,716 - re2/compile.cc#18:627,634
    151   
    152   Frag Compiler::Literal(Rune r, bool foldcase) {
    153     switch (encoding_) {
    154 -     default:  // UCS-2 or something new
    155 -       BeginRange();
    156 -       AddRuneRange(r, r, foldcase);
    157 -       return EndRange();
    158 +     default:
    159 +       return kNullFrag;
    160   
    161       case kEncodingLatin1:
    162         return ByteRange(r, r, foldcase);
    163 re2/compile.cc#17:927,934 - re2/compile.cc#18:845,850
    164   
    165     if (re->parse_flags() & Regexp::Latin1)
    166       c.encoding_ = kEncodingLatin1;
    167 -   else if (re->parse_flags() & Regexp::UCS2)
    168 -     c.encoding_ = kEncodingUCS2;
    169     c.reversed_ = reversed;
    170     if (max_mem <= 0) {
    171       c.max_inst_ = 100000;  // more than enough
    172 re2/compile.cc#17:983,993 - re2/compile.cc#18:899,905
    173       c.prog_->set_start_unanchored(c.prog_->start());
    174     } else {
    175       Frag dot;
    176 -     if (c.encoding_ == kEncodingUCS2) {
    177 -       dot = c.Cat(c.ByteRange(0x00, 0xFF, false), c.ByteRange(0x00, 0xFF, false));
    178 -     } else {
    179 -       dot = c.ByteRange(0x00, 0xFF, false);
    180 -     }
    181 +     dot = c.ByteRange(0x00, 0xFF, false);
    182       Frag dotloop = c.Star(dot, true);
    183       Frag unanchored = c.Cat(dotloop, all);
    184       c.prog_->set_start_unanchored(unanchored.begin);
    185 ==== re2/nfa.cc#8 - re2/nfa.cc#9 ====
    186 re2/nfa.cc#8:426,432 - re2/nfa.cc#9:426,431
    187     const char* bp = context.begin();
    188     int c = -1;
    189     int wasword = 0;
    190 -   bool ucs2 = prog_->flags() & Regexp::UCS2;
    191   
    192     if (text.begin() > context.begin()) {
    193       c = text.begin()[-1] & 0xFF;
    194 re2/nfa.cc#8:492,498 - re2/nfa.cc#9:491,497
    195         // If there's a required first byte for an unanchored search
    196         // and we're not in the middle of any possible matches,
    197         // use memchr to search for the byte quickly.
    198 -       if (!ucs2 && !anchored && first_byte_ >= 0 && runq->size() == 0 &&
    199 +       if (!anchored && first_byte_ >= 0 && runq->size() == 0 &&
    200             p < text.end() && (p[0] & 0xFF) != first_byte_) {
    201           p = reinterpret_cast<const char*>(memchr(p, first_byte_,
    202                                                    text.end() - p));
    203 re2/nfa.cc#8:505,526 - re2/nfa.cc#9:504,514
    204           flag = Prog::EmptyFlags(context, p);
    205         }
    206   
    207 -       // In UCS-2 mode, if we need to start a new thread,
    208 -       // make sure to do it on an even boundary.
    209 -       if(ucs2 && runq->size() == 0 &&
    210 -           (p - context.begin()) % 2 && p < text.end()) {
    211 -         p++;
    212 -         flag = Prog::EmptyFlags(context, p);
    213 -       }
    214 - 
    215         // Steal match storage (cleared but unused as of yet)
    216         // temporarily to hold match boundaries for new thread.
    217 -       // In UCS-2 mode, only start the thread on a 2-byte boundary.
    218 -       if(!ucs2 || (p - context.begin()) % 2 == 0) {
    219 -         match_[0] = p;
    220 -         AddToThreadq(runq, start_, flag, p, match_);
    221 -         match_[0] = NULL;
    222 -       }
    223 +       match_[0] = p;
    224 +       AddToThreadq(runq, start_, flag, p, match_);
    225 +       match_[0] = NULL;
    226       }
    227   
    228       // If all the threads have died, stop early.
    229 ==== re2/parse.cc#22 - re2/parse.cc#23 ====
    230 re2/parse.cc#22:160,167 - re2/parse.cc#23:160,165
    231       status_(status), stacktop_(NULL), ncap_(0) {
    232     if (flags_ & Latin1)
    233       rune_max_ = 0xFF;
    234 -   else if (flags & UCS2)
    235 -     rune_max_ = 0xFFFF;
    236     else
    237       rune_max_ = Runemax;
    238   }
    239 re2/parse.cc#22:365,387 - re2/parse.cc#23:363,374
    240   bool Regexp::ParseState::PushCarat() {
    241     if (flags_ & OneLine) {
    242       return PushSimpleOp(kRegexpBeginText);
    243 -   } else {
    244 -     if (flags_ & UCS2) {
    245 -       status_->set_code(kRegexpUnsupported);
    246 -       status_->set_error_arg("multiline ^ in UCS-2 mode");
    247 -       return false;
    248 -     }
    249 -     return PushSimpleOp(kRegexpBeginLine);
    250     }
    251 +   return PushSimpleOp(kRegexpBeginLine);
    252   }
    253   
    254   // Pushes a \b or \B onto the stack.
    255   bool Regexp::ParseState::PushWordBoundary(bool word) {
    256 -   if (flags_ & UCS2) {
    257 -     status_->set_code(kRegexpUnsupported);
    258 -     status_->set_error_arg("\\b or \\B in UCS-2 mode");
    259 -     return false;
    260 -   }
    261     if (word)
    262       return PushSimpleOp(kRegexpWordBoundary);
    263     return PushSimpleOp(kRegexpNoWordBoundary);
    264 re2/parse.cc#22:397,407 - re2/parse.cc#23:384,389
    265       bool ret = PushSimpleOp(kRegexpEndText);
    266       flags_ = oflags;
    267       return ret;
    268 -   }
    269 -   if (flags_ & UCS2) {
    270 -     status_->set_code(kRegexpUnsupported);
    271 -     status_->set_error_arg("multiline $ in UCS-2 mode");
    272 -     return false;
    273     }
    274     return PushSimpleOp(kRegexpEndLine);
    275   }
    276 ==== re2/re2.cc#34 - re2/re2.cc#35 ====
    277 re2/re2.cc#34:79,86 - re2/re2.cc#35:79,84
    278         return RE2::ErrorBadUTF8;
    279       case re2::kRegexpBadNamedCapture:
    280         return RE2::ErrorBadNamedCapture;
    281 -     case re2::kRegexpUnsupported:
    282 -       return RE2::ErrorUnsupported;
    283     }
    284     return RE2::ErrorInternal;
    285   }
    286 re2/re2.cc#34:122,130 - re2/re2.cc#35:120,125
    287         break;
    288       case RE2::Options::EncodingLatin1:
    289         flags |= Regexp::Latin1;
    290 -       break;
    291 -     case RE2::Options::EncodingUCS2:
    292 -       flags |= Regexp::UCS2;
    293         break;
    294     }
    295   
    296 ==== re2/re2.h#36 - re2/re2.h#37 ====
    297 re2/re2.h#36:246,252 - re2/re2.h#37:246,251
    298       ErrorBadUTF8,            // invalid UTF-8 in regexp
    299       ErrorBadNamedCapture,    // bad named capture group
    300       ErrorPatternTooLarge,    // pattern too large (compile failed)
    301 -     ErrorUnsupported,        // unsupported feature (in UCS-2 mode)
    302     };
    303   
    304     // Predefined common options.
    305 re2/re2.h#36:570,576 - re2/re2.h#37:569,574
    306   
    307       enum Encoding {
    308         EncodingUTF8 = 1,
    309 -       EncodingUCS2,      // 16-bit Unicode 0-FFFF only
    310         EncodingLatin1
    311       };
    312   
    313 ==== re2/regexp.cc#15 - re2/regexp.cc#16 ====
    314 re2/regexp.cc#15:324,333 - re2/regexp.cc#16:324,329
    315   // the regexp that remains after the prefix.  The prefix might
    316   // be ASCII case-insensitive.
    317   bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
    318 -   // Don't even bother for UCS-2; it's time to throw that code away.
    319 -   if (parse_flags_ & UCS2)
    320 -     return false;
    321 - 
    322     // No need for a walker: the regexp must be of the form
    323     // 1. some number of ^ anchors
    324     // 2. a literal char or string
    325 ==== re2/regexp.h#20 - re2/regexp.h#21 ====
    326 re2/regexp.h#20:187,193 - re2/regexp.h#21:187,192
    327     kRegexpBadPerlOp,          // bad perl operator
    328     kRegexpBadUTF8,            // invalid UTF-8 in regexp
    329     kRegexpBadNamedCapture,    // bad named capture
    330 -   kRegexpUnsupported,        // unsupported operator
    331   };
    332   
    333   // Error status for certain operations.
    334 re2/regexp.h#20:307,316 - re2/regexp.h#21:306,314
    335                              //   \Q and \E to disable/enable metacharacters
    336                              //   (?P<name>expr) for named captures
    337                              //   \C to match any single byte
    338 -     UCS2         = 1<<10,  // Text is in UCS-2, regexp is in UTF-8.
    339 -     UnicodeGroups = 1<<11, // Allow \p{Han} for Unicode Han group
    340 +     UnicodeGroups = 1<<10, // Allow \p{Han} for Unicode Han group
    341                              //   and \P{Han} for its negation.
    342 -     NeverNL      = 1<<12,  // Never match NL, even if the regexp mentions
    343 +     NeverNL      = 1<<11,  // Never match NL, even if the regexp mentions
    344                              //   it explicitly.
    345   
    346       // As close to Perl as we can get.
    347 ==== re2/testing/backtrack.cc#4 - re2/testing/backtrack.cc#5 ====
    348 re2/testing/backtrack.cc#4:134,141 - re2/testing/backtrack.cc#5:134,139
    349       cap_[0] = p;
    350       if (Visit(prog_->start(), p))  // Match must be leftmost; done.
    351         return true;
    352 -     if (prog_->flags() & Regexp::UCS2)
    353 -       p++;
    354     }
    355     return false;
    356   }
    357 ==== re2/testing/tester.cc#12 - re2/testing/tester.cc#13 ====
    358 re2/testing/tester.cc#12:144,154 - re2/testing/tester.cc#13:144,152
    359   static ParseMode parse_modes[] = {
    360     { single_line,                   "single-line"          },
    361     { single_line|Regexp::Latin1,    "single-line, latin1"  },
    362 -   { single_line|Regexp::UCS2,     "single-line, ucs2"   },
    363     { multi_line,                    "multiline"            },
    364     { multi_line|Regexp::NonGreedy,  "multiline, nongreedy" },
    365     { multi_line|Regexp::Latin1,     "multiline, latin1"    },
    366 -   { multi_line|Regexp::UCS2,      "multiline, ucs2"     },
    367   };
    368   
    369   static string FormatMode(Regexp::ParseFlags flags) {
    370 re2/testing/tester.cc#12:179,189 - re2/testing/tester.cc#13:177,185
    371     RegexpStatus status;
    372     regexp_ = Regexp::Parse(regexp_str, flags, &status);
    373     if (regexp_ == NULL) {
    374 -     if (status.code() != kRegexpUnsupported) {
    375 -       LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
    376 -                 << " mode: " << FormatMode(flags);
    377 -       error_ = true;
    378 -     }
    379 +     LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
    380 +               << " mode: " << FormatMode(flags);
    381 +     error_ = true;
    382       return;
    383     }
    384     prog_ = regexp_->CompileToProg(0);
    385 re2/testing/tester.cc#12:230,237 - re2/testing/tester.cc#13:226,231
    386       RE2::Options options;
    387       if (flags & Regexp::Latin1)
    388         options.set_encoding(RE2::Options::EncodingLatin1);
    389 -     else if (flags & Regexp::UCS2)
    390 -       options.set_encoding(RE2::Options::EncodingUCS2);
    391       if (kind_ == Prog::kLongestMatch)
    392         options.set_longest_match(true);
    393       re2_ = new RE2(re, options);
    394 re2/testing/tester.cc#12:281,379 - re2/testing/tester.cc#13:275,280
    395       delete re2_;
    396   }
    397   
    398 - // Converts UTF-8 string in text into UCS-2 string in new_text.
    399 - static bool ConvertUTF8ToUCS2(const StringPiece& text, StringPiece* new_text) {
    400 -   const char* p = text.begin();
    401 -   const char* ep = text.end();
    402 -   uint16* q = new uint16[ep - p];
    403 -   uint16* q0 = q;
    404 - 
    405 -   int n;
    406 -   Rune r;
    407 -   for (; p < ep; p += n) {
    408 -     if (!fullrune(p, ep - p)) {
    409 -       delete[] q0;
    410 -       return false;
    411 -     }
    412 -     n = chartorune(&r, p);
    413 -     if (r > 0xFFFF) {
    414 -       delete[] q0;
    415 -       return false;
    416 -     }
    417 -     *q++ = r;
    418 -   }
    419 -   *new_text = StringPiece(reinterpret_cast<char*>(q0), 2*(q - q0));
    420 -   return true;
    421 - }
    422 - 
    423 - // Rewrites *sp from being a pointer into text8 (UTF-8)
    424 - // to being a pointer into text16 (equivalent text but in UCS-2).
    425 - static void AdjustUTF8ToUCS2(const StringPiece& text8, const StringPiece& text16,
    426 -                               StringPiece *sp) {
    427 -   if (sp->begin() == NULL && text8.begin() != NULL)
    428 -     return;
    429 - 
    430 -   int nrune = 0;
    431 -   int n;
    432 -   Rune r;
    433 -   const char* p = text8.begin();
    434 -   const char* ep = text8.end();
    435 -   const char* spbegin = NULL;
    436 -   const char* spend = NULL;
    437 -   for (;;) {
    438 -     if (p == sp->begin())
    439 -       spbegin = text16.begin() + sizeof(uint16)*nrune;
    440 -     if (p == sp->end())
    441 -       spend = text16.begin() + sizeof(uint16)*nrune;
    442 -     if (p >= ep)
    443 -       break;
    444 -     n = chartorune(&r, p);
    445 -     p += n;
    446 -     nrune++;
    447 -   }
    448 -   if (spbegin == NULL || spend == NULL) {
    449 -     LOG(FATAL) << "Error in AdjustUTF8ToUCS2 "
    450 -                << CEscape(text8) << " "
    451 -                << (int)(sp->begin() - text8.begin()) << " "
    452 -                << (int)(sp->end() - text8.begin());
    453 -   }
    454 -   *sp = StringPiece(spbegin, spend - spbegin);
    455 - }
    456 - 
    457 - // Rewrites *sp from being a pointer into text16 (UCS-2)
    458 - // to being a pointer into text8 (equivalent text but in UTF-8).
    459 - static void AdjustUCS2ToUTF8(const StringPiece& text16, const StringPiece& text8,
    460 -                               StringPiece* sp) {
    461 -   if (sp->begin() == NULL)
    462 -     return;
    463 - 
    464 -   int nrune = 0;
    465 -   int n;
    466 -   Rune r;
    467 -   const char* p = text8.begin();
    468 -   const char* ep = text8.end();
    469 -   const char* spbegin = NULL;
    470 -   const char* spend = NULL;
    471 -   for (;;) {
    472 -     if (nrune == (sp->begin() - text16.begin())/2)
    473 -       spbegin = p;
    474 -     if (nrune == (sp->end() - text16.begin())/2)
    475 -       spend = p;
    476 -     if (p >= ep)
    477 -       break;
    478 -     n = chartorune(&r, p);
    479 -     p += n;
    480 -     nrune++;
    481 -   }
    482 -   if (text8.begin() != NULL && (spbegin == NULL || spend == NULL)) {
    483 -     LOG(FATAL) << "Error in AdjustUCS2ToUTF8 "
    484 -                << CEscape(text16) << " "
    485 -                << (int)(sp->begin() - text16.begin()) << " "
    486 -                << (int)(sp->end() - text16.begin());
    487 -   }
    488 -   *sp = StringPiece(spbegin, spend - spbegin);
    489 - }
    490 - 
    491   // Runs a single search using the named engine type.
    492   // This interface hides all the irregularities of the various
    493   // engine interfaces from the rest of this file.
    494 re2/testing/tester.cc#12:393,411 - re2/testing/tester.cc#13:294,300
    495   
    496     StringPiece text = orig_text;
    497     StringPiece context = orig_context;
    498 -   bool ucs2 = false;
    499   
    500 -   if ((flags() & Regexp::UCS2) && type != kEnginePCRE) {
    501 -     if (!ConvertUTF8ToUCS2(orig_context, &context)) {
    502 -       result->skipped = true;
    503 -       return;
    504 -     }
    505 - 
    506 -     // Rewrite context to refer to new text.
    507 -     AdjustUTF8ToUCS2(orig_context, context, &text);
    508 -     ucs2 = true;
    509 -   }
    510 - 
    511     switch (type) {
    512       default:
    513         LOG(FATAL) << "Bad RunSearch type: " << (int)type;
    514 re2/testing/tester.cc#12:557,577 - re2/testing/tester.cc#13:446,451
    515       }
    516     }
    517   
    518 -   // If we did UCS-2 matching, rewrite the matches to refer
    519 -   // to the original UTF-8 text.
    520 -   if (ucs2) {
    521 -     if (result->matched) {
    522 -       if (result->have_submatch0) {
    523 -         AdjustUCS2ToUTF8(context, orig_context, &result->submatch[0]);
    524 -       } else if (result->have_submatch) {
    525 -         for (int i = 0; i < nsubmatch; i++) {
    526 -           AdjustUCS2ToUTF8(context, orig_context, &result->submatch[i]);
    527 -         }
    528 -       }
    529 -     }
    530 -     delete[] context.begin();
    531 -   }
    532 - 
    533     if (!result->matched)
    534       memset(result->submatch, 0, sizeof result->submatch);
    535   }
    536 re2/testing/tester.cc#12:596,617 - re2/testing/tester.cc#13:470,475
    537     return true;
    538   }
    539   
    540 - // Check whether text uses only Unicode points <= 0xFFFF
    541 - // (in the BMP).
    542 - static bool IsBMP(const StringPiece& text) {
    543 -   const char* p = text.begin();
    544 -   const char* ep = text.end();
    545 -   while (p < ep) {
    546 -     if (!fullrune(p, ep - p))
    547 -       return false;
    548 -     Rune r;
    549 -     p += chartorune(&r, p);
    550 -     if (r > 0xFFFF)
    551 -       return false;
    552 -   }
    553 -   return true;
    554 - }
    555 - 
    556   // Runs a single test.
    557   bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
    558                              Prog::Anchor anchor) {
    559 re2/testing/tester.cc#12:619,625 - re2/testing/tester.cc#13:477,483
    560     Result correct;
    561     RunSearch(kEngineBacktrack, text, context, anchor, &correct);
    562     if (correct.skipped) {
    563 -     if (regexp_ == NULL || !IsBMP(context))  // okay to skip in UCS-2 mode
    564 +     if (regexp_ == NULL)
    565         return true;
    566       LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
    567                  << " " << FormatMode(flags_);
    568