Home | History | Annotate | Download | only in testdata
      1 # This set of tests is for UTF-8 support and Unicode property support, with
      2 # relevance only for the 8-bit library.
      3 
      4 # The next 4 patterns have UTF-8 errors
      5 
      6 /[]/utf
      7 
      8 //utf
      9 
     10 /xxx/utf
     11 
     12 //utf
     13 
     14 # Now test subjects
     15 
     16 /badutf/utf
     17 \= Expect UTF-8 errors
     18     X\xdf
     19     XX\xef
     20     XXX\xef\x80
     21     X\xf7
     22     XX\xf7\x80
     23     XXX\xf7\x80\x80
     24     \xfb
     25     \xfb\x80
     26     \xfb\x80\x80
     27     \xfb\x80\x80\x80
     28     \xfd
     29     \xfd\x80
     30     \xfd\x80\x80
     31     \xfd\x80\x80\x80
     32     \xfd\x80\x80\x80\x80
     33     \xdf\x7f
     34     \xef\x7f\x80
     35     \xef\x80\x7f
     36     \xf7\x7f\x80\x80
     37     \xf7\x80\x7f\x80
     38     \xf7\x80\x80\x7f
     39     \xfb\x7f\x80\x80\x80
     40     \xfb\x80\x7f\x80\x80
     41     \xfb\x80\x80\x7f\x80
     42     \xfb\x80\x80\x80\x7f
     43     \xfd\x7f\x80\x80\x80\x80
     44     \xfd\x80\x7f\x80\x80\x80
     45     \xfd\x80\x80\x7f\x80\x80
     46     \xfd\x80\x80\x80\x7f\x80
     47     \xfd\x80\x80\x80\x80\x7f
     48     \xed\xa0\x80
     49     \xc0\x8f
     50     \xe0\x80\x8f
     51     \xf0\x80\x80\x8f
     52     \xf8\x80\x80\x80\x8f
     53     \xfc\x80\x80\x80\x80\x8f
     54     \x80
     55     \xfe
     56     \xff
     57 
     58 /badutf/utf
     59 \= Expect UTF-8 errors
     60     XX\xfb\x80\x80\x80\x80
     61     XX\xfd\x80\x80\x80\x80\x80
     62     XX\xf7\xbf\xbf\xbf
     63 
     64 /shortutf/utf
     65 \= Expect UTF-8 errors
     66     XX\xdf\=ph
     67     XX\xef\=ph
     68     XX\xef\x80\=ph
     69     \xf7\=ph
     70     \xf7\x80\=ph
     71     \xf7\x80\x80\=ph
     72     \xfb\=ph
     73     \xfb\x80\=ph
     74     \xfb\x80\x80\=ph
     75     \xfb\x80\x80\x80\=ph
     76     \xfd\=ph
     77     \xfd\x80\=ph
     78     \xfd\x80\x80\=ph
     79     \xfd\x80\x80\x80\=ph
     80     \xfd\x80\x80\x80\x80\=ph
     81 
     82 /anything/utf
     83 \= Expect UTF-8 errors
     84     X\xc0\x80
     85     XX\xc1\x8f
     86     XXX\xe0\x9f\x80
     87     \xf0\x8f\x80\x80
     88     \xf8\x87\x80\x80\x80
     89     \xfc\x83\x80\x80\x80\x80
     90     \xfe\x80\x80\x80\x80\x80
     91     \xff\x80\x80\x80\x80\x80
     92     \xf8\x88\x80\x80\x80
     93     \xf9\x87\x80\x80\x80
     94     \xfc\x84\x80\x80\x80\x80
     95     \xfd\x83\x80\x80\x80\x80
     96 \= Expect no match
     97     \xc3\x8f
     98     \xe0\xaf\x80
     99     \xe1\x80\x80
    100     \xf0\x9f\x80\x80
    101     \xf1\x8f\x80\x80
    102     \xf8\x88\x80\x80\x80\=no_utf_check
    103     \xf9\x87\x80\x80\x80\=no_utf_check
    104     \xfc\x84\x80\x80\x80\x80\=no_utf_check
    105     \xfd\x83\x80\x80\x80\x80\=no_utf_check
    106     
    107 # Similar tests with offsets
    108 
    109 /badutf/utf
    110 \= Expect UTF-8 errors
    111     X\xdfabcd
    112     X\xdfabcd\=offset=1
    113 \= Expect no match
    114     X\xdfabcd\=offset=2
    115 
    116 /(?<=x)badutf/utf
    117 \= Expect UTF-8 errors
    118     X\xdfabcd
    119     X\xdfabcd\=offset=1
    120     X\xdfabcd\=offset=2
    121     X\xdfabcd\xdf\=offset=3
    122 \= Expect no match
    123     X\xdfabcd\=offset=3
    124 
    125 /(?<=xx)badutf/utf
    126 \= Expect UTF-8 errors
    127     X\xdfabcd
    128     X\xdfabcd\=offset=1
    129     X\xdfabcd\=offset=2
    130     X\xdfabcd\=offset=3
    131 
    132 /(?<=xxxx)badutf/utf
    133 \= Expect UTF-8 errors
    134     X\xdfabcd
    135     X\xdfabcd\=offset=1
    136     X\xdfabcd\=offset=2
    137     X\xdfabcd\=offset=3
    138     X\xdfabc\xdf\=offset=6
    139     X\xdfabc\xdf\=offset=7
    140 \= Expect no match
    141     X\xdfabcd\=offset=6
    142  
    143 /\x{100}/IB,utf
    144 
    145 /\x{1000}/IB,utf
    146 
    147 /\x{10000}/IB,utf
    148 
    149 /\x{100000}/IB,utf
    150 
    151 /\x{10ffff}/IB,utf
    152 
    153 /[\x{ff}]/IB,utf
    154 
    155 /[\x{100}]/IB,utf
    156 
    157 /\x80/IB,utf
    158 
    159 /\xff/IB,utf
    160 
    161 /\x{D55c}\x{ad6d}\x{C5B4}/IB,utf
    162     \x{D55c}\x{ad6d}\x{C5B4}
    163 
    164 /\x{65e5}\x{672c}\x{8a9e}/IB,utf
    165     \x{65e5}\x{672c}\x{8a9e}
    166 
    167 /\x{80}/IB,utf
    168 
    169 /\x{084}/IB,utf
    170 
    171 /\x{104}/IB,utf
    172 
    173 /\x{861}/IB,utf
    174 
    175 /\x{212ab}/IB,utf
    176 
    177 /[^ab\xC0-\xF0]/IB,utf
    178     \x{f1}
    179     \x{bf}
    180     \x{100}
    181     \x{1000}
    182 \= Expect no match
    183     \x{c0}
    184     \x{f0}
    185 
    186 /{3,4}/IB,utf
    187   \x{100}\x{100}\x{100}\x{100\x{100}
    188 
    189 /(\x{100}+|x)/IB,utf
    190 
    191 /(\x{100}*a|x)/IB,utf
    192 
    193 /(\x{100}{0,2}a|x)/IB,utf
    194 
    195 /(\x{100}{1,2}a|x)/IB,utf
    196 
    197 /\x{100}/IB,utf
    198 
    199 /a\x{100}\x{101}*/IB,utf
    200 
    201 /a\x{100}\x{101}+/IB,utf
    202 
    203 /[^\x{c4}]/IB
    204 
    205 /[\x{100}]/IB,utf
    206     \x{100}
    207     Z\x{100}
    208     \x{100}Z
    209 
    210 /[\xff]/IB,utf
    211     >\x{ff}<
    212 
    213 /[^\xff]/IB,utf
    214 
    215 /\x{100}abc(xyz(?1))/IB,utf
    216 
    217 /\777/I,utf
    218   \x{1ff}
    219   \777
    220 
    221 /\x{100}+\x{200}/IB,utf
    222 
    223 /\x{100}+X/IB,utf
    224 
    225 /^[\Q\E-\Q\E/B,utf
    226 
    227 # This tests the stricter UTF-8 check according to RFC 3629.
    228 
    229 /X/utf
    230 \= Expect UTF-8 errors
    231     \x{d800}
    232     \x{da00}
    233     \x{dfff}
    234     \x{110000}
    235     \x{2000000}
    236     \x{7fffffff}
    237 \= Expect no match
    238     \x{d800}\=no_utf_check
    239     \x{da00}\=no_utf_check
    240     \x{dfff}\=no_utf_check
    241     \x{110000}\=no_utf_check
    242     \x{2000000}\=no_utf_check
    243     \x{7fffffff}\=no_utf_check
    244 
    245 /(*UTF8)\x{1234}/
    246     abcd\x{1234}pqr
    247 
    248 /(*CRLF)(*UTF)(*BSR_UNICODE)a\Rb/I
    249 
    250 /\h/I,utf
    251     ABC\x{09}
    252     ABC\x{20}
    253     ABC\x{a0}
    254     ABC\x{1680}
    255     ABC\x{180e}
    256     ABC\x{2000}
    257     ABC\x{202f}
    258     ABC\x{205f}
    259     ABC\x{3000}
    260 
    261 /\v/I,utf
    262     ABC\x{0a}
    263     ABC\x{0b}
    264     ABC\x{0c}
    265     ABC\x{0d}
    266     ABC\x{85}
    267     ABC\x{2028}
    268 
    269 /\h*A/I,utf
    270     CDBABC
    271 
    272 /\v+A/I,utf
    273 
    274 /\s?xxx\s/I,utf
    275 
    276 /\sxxx\s/I,utf,tables=2
    277     AB\x{85}xxx\x{a0}XYZ
    278     AB\x{a0}xxx\x{85}XYZ
    279 
    280 /\S \S/I,utf,tables=2
    281     \x{a2} \x{84}
    282     A Z
    283 
    284 /a+/utf
    285     a\x{123}aa\=offset=1
    286     a\x{123}aa\=offset=3
    287     a\x{123}aa\=offset=4
    288 \= Expect bad offset value
    289     a\x{123}aa\=offset=6
    290 \= Expect bad UTF-8 offset     
    291     a\x{123}aa\=offset=2
    292 \= Expect no match
    293     a\x{123}aa\=offset=5
    294 
    295 /\x{1234}+/Ii,utf
    296 
    297 /\x{1234}+?/Ii,utf
    298 
    299 /\x{1234}++/Ii,utf
    300 
    301 /\x{1234}{2}/Ii,utf
    302 
    303 /[^\x{c4}]/IB,utf
    304 
    305 /X+\x{200}/IB,utf
    306 
    307 /\R/I,utf
    308 
    309 /\777/IB,utf
    310 
    311 /\w+\x{C4}/B,utf
    312     a\x{C4}\x{C4}
    313 
    314 /\w+\x{C4}/B,utf,tables=2
    315     a\x{C4}\x{C4}
    316 
    317 /\W+\x{C4}/B,utf
    318     !\x{C4}
    319 
    320 /\W+\x{C4}/B,utf,tables=2
    321     !\x{C4}
    322 
    323 /\W+\x{A1}/B,utf
    324     !\x{A1}
    325 
    326 /\W+\x{A1}/B,utf,tables=2
    327     !\x{A1}
    328 
    329 /X\s+\x{A0}/B,utf
    330     X\x20\x{A0}\x{A0}
    331 
    332 /X\s+\x{A0}/B,utf,tables=2
    333     X\x20\x{A0}\x{A0}
    334 
    335 /\S+\x{A0}/B,utf
    336     X\x{A0}\x{A0}
    337 
    338 /\S+\x{A0}/B,utf,tables=2
    339     X\x{A0}\x{A0}
    340 
    341 /\x{a0}+\s!/B,utf
    342     \x{a0}\x20!
    343 
    344 /\x{a0}+\s!/B,utf,tables=2
    345     \x{a0}\x20!
    346 
    347 /A/utf
    348   \x{ff000041}
    349   \x{7f000041}
    350 
    351 /(*UTF8)abc/never_utf
    352 
    353 /abc/utf,never_utf
    354 
    355 /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf
    356 
    357 /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf
    358 
    359 /AB\x{1fb0}/IB,utf
    360 
    361 /AB\x{1fb0}/IBi,utf
    362 
    363 /\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf
    364     \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}
    365     \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f}
    366 
    367 /[]/Bi,utf
    368 
    369 /[^]/Bi,utf
    370 
    371 /\h/I
    372 
    373 /\v/I
    374 
    375 /\R/I
    376 
    377 /[[:blank:]]/B,ucp
    378 
    379 /\x{212a}+/Ii,utf
    380     KKkk\x{212a}
    381 
    382 /s+/Ii,utf
    383     SSss\x{17f}
    384 
    385 /\x{100}*A/IB,utf
    386     A
    387 
    388 /\x{100}*\d(?R)/IB,utf
    389 
    390 /[Z\x{100}]/IB,utf
    391     Z\x{100}
    392     \x{100}
    393     \x{100}Z
    394 
    395 /[z-\x{100}]/IB,utf
    396 
    397 /[z\Qa-d]\E]/IB,utf
    398     \x{100}
    399      
    400 
    401 /[ab\x{100}]abc(xyz(?1))/IB,utf
    402 
    403 /\x{100}*\s/IB,utf
    404 
    405 /\x{100}*\d/IB,utf
    406 
    407 /\x{100}*\w/IB,utf
    408 
    409 /\x{100}*\D/IB,utf
    410 
    411 /\x{100}*\S/IB,utf
    412 
    413 /\x{100}*\W/IB,utf
    414 
    415 /[\x{105}-\x{109}]/IBi,utf
    416     \x{104}
    417     \x{105}
    418     \x{109}  
    419 \= Expect no match
    420     \x{100}
    421     \x{10a} 
    422     
    423 /[z-\x{100}]/IBi,utf
    424     Z
    425     z
    426     \x{39c}
    427     \x{178}
    428     |
    429     \x{80}
    430     \x{ff}
    431     \x{100}
    432     \x{101} 
    433 \= Expect no match
    434     \x{102}
    435     Y
    436     y           
    437 
    438 /[z-\x{100}]/IBi,utf
    439 
    440 /\x{3a3}B/IBi,utf
    441 
    442 /abc/utf,replace=
    443     abc
    444 
    445 /(?<=(a)(?-1))x/I,utf
    446     a\x80zx\=offset=3
    447 
    448 /[\W\p{Any}]/B
    449     abc
    450     123 
    451 
    452 /[\W\pL]/B
    453     abc
    454 \= Expect no match
    455     123     
    456 
    457 /(*:*++++++++++++''''''''''''''''''''+''+++'+++x+++++++++++++++++++++++++++++++++++(++++++++++++++++++++:++++++%++:''''''''''''''''''''''''+++++++++++++++++++++++++++++++++++++++++++++++++++++-++++++++k+++++++''''+++'+++++++++++++++++++++++''''++++++++++++':)/utf
    458 
    459 /[\s[:^ascii:]]/B,ucp
    460 
    461 # A special extra option allows escaped surrogate code points in 8-bit mode,
    462 # but subjects containing them must not be UTF-checked.
    463 
    464 /\x{d800}/I,utf,allow_surrogate_escapes
    465     \x{d800}\=no_utf_check
    466 
    467 /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes
    468     \x{dfff}\x{df01}\=no_utf_check
    469     
    470 # This has different starting code units in 8-bit mode. 
    471 
    472 /^[^ab]/IB,utf
    473     c
    474     \x{ff}
    475     \x{100}
    476 \= Expect no match
    477     aaa
    478 
    479 # End of testinput10
    480