1 # This set of tests is for UTF-8 support and Unicode property support, with 2 # relevance only for the 8-bit library. 3 4 # The next 4 patterns have UTF-8 errors 5 6 /[]/utf 7 8 //utf 9 10 /xxx/utf 11 12 //utf 13 14 # Now test subjects 15 16 /badutf/utf 17 \= Expect UTF-8 errors 18 X\xdf 19 XX\xef 20 XXX\xef\x80 21 X\xf7 22 XX\xf7\x80 23 XXX\xf7\x80\x80 24 \xfb 25 \xfb\x80 26 \xfb\x80\x80 27 \xfb\x80\x80\x80 28 \xfd 29 \xfd\x80 30 \xfd\x80\x80 31 \xfd\x80\x80\x80 32 \xfd\x80\x80\x80\x80 33 \xdf\x7f 34 \xef\x7f\x80 35 \xef\x80\x7f 36 \xf7\x7f\x80\x80 37 \xf7\x80\x7f\x80 38 \xf7\x80\x80\x7f 39 \xfb\x7f\x80\x80\x80 40 \xfb\x80\x7f\x80\x80 41 \xfb\x80\x80\x7f\x80 42 \xfb\x80\x80\x80\x7f 43 \xfd\x7f\x80\x80\x80\x80 44 \xfd\x80\x7f\x80\x80\x80 45 \xfd\x80\x80\x7f\x80\x80 46 \xfd\x80\x80\x80\x7f\x80 47 \xfd\x80\x80\x80\x80\x7f 48 \xed\xa0\x80 49 \xc0\x8f 50 \xe0\x80\x8f 51 \xf0\x80\x80\x8f 52 \xf8\x80\x80\x80\x8f 53 \xfc\x80\x80\x80\x80\x8f 54 \x80 55 \xfe 56 \xff 57 58 /badutf/utf 59 \= Expect UTF-8 errors 60 XX\xfb\x80\x80\x80\x80 61 XX\xfd\x80\x80\x80\x80\x80 62 XX\xf7\xbf\xbf\xbf 63 64 /shortutf/utf 65 \= Expect UTF-8 errors 66 XX\xdf\=ph 67 XX\xef\=ph 68 XX\xef\x80\=ph 69 \xf7\=ph 70 \xf7\x80\=ph 71 \xf7\x80\x80\=ph 72 \xfb\=ph 73 \xfb\x80\=ph 74 \xfb\x80\x80\=ph 75 \xfb\x80\x80\x80\=ph 76 \xfd\=ph 77 \xfd\x80\=ph 78 \xfd\x80\x80\=ph 79 \xfd\x80\x80\x80\=ph 80 \xfd\x80\x80\x80\x80\=ph 81 82 /anything/utf 83 \= Expect UTF-8 errors 84 X\xc0\x80 85 XX\xc1\x8f 86 XXX\xe0\x9f\x80 87 \xf0\x8f\x80\x80 88 \xf8\x87\x80\x80\x80 89 \xfc\x83\x80\x80\x80\x80 90 \xfe\x80\x80\x80\x80\x80 91 \xff\x80\x80\x80\x80\x80 92 \xf8\x88\x80\x80\x80 93 \xf9\x87\x80\x80\x80 94 \xfc\x84\x80\x80\x80\x80 95 \xfd\x83\x80\x80\x80\x80 96 \= Expect no match 97 \xc3\x8f 98 \xe0\xaf\x80 99 \xe1\x80\x80 100 \xf0\x9f\x80\x80 101 \xf1\x8f\x80\x80 102 \xf8\x88\x80\x80\x80\=no_utf_check 103 \xf9\x87\x80\x80\x80\=no_utf_check 104 \xfc\x84\x80\x80\x80\x80\=no_utf_check 105 
\xfd\x83\x80\x80\x80\x80\=no_utf_check 106 107 # Similar tests with offsets 108 109 /badutf/utf 110 \= Expect UTF-8 errors 111 X\xdfabcd 112 X\xdfabcd\=offset=1 113 \= Expect no match 114 X\xdfabcd\=offset=2 115 116 /(?<=x)badutf/utf 117 \= Expect UTF-8 errors 118 X\xdfabcd 119 X\xdfabcd\=offset=1 120 X\xdfabcd\=offset=2 121 X\xdfabcd\xdf\=offset=3 122 \= Expect no match 123 X\xdfabcd\=offset=3 124 125 /(?<=xx)badutf/utf 126 \= Expect UTF-8 errors 127 X\xdfabcd 128 X\xdfabcd\=offset=1 129 X\xdfabcd\=offset=2 130 X\xdfabcd\=offset=3 131 132 /(?<=xxxx)badutf/utf 133 \= Expect UTF-8 errors 134 X\xdfabcd 135 X\xdfabcd\=offset=1 136 X\xdfabcd\=offset=2 137 X\xdfabcd\=offset=3 138 X\xdfabc\xdf\=offset=6 139 X\xdfabc\xdf\=offset=7 140 \= Expect no match 141 X\xdfabcd\=offset=6 142 143 /\x{100}/IB,utf 144 145 /\x{1000}/IB,utf 146 147 /\x{10000}/IB,utf 148 149 /\x{100000}/IB,utf 150 151 /\x{10ffff}/IB,utf 152 153 /[\x{ff}]/IB,utf 154 155 /[\x{100}]/IB,utf 156 157 /\x80/IB,utf 158 159 /\xff/IB,utf 160 161 /\x{D55c}\x{ad6d}\x{C5B4}/IB,utf 162 \x{D55c}\x{ad6d}\x{C5B4} 163 164 /\x{65e5}\x{672c}\x{8a9e}/IB,utf 165 \x{65e5}\x{672c}\x{8a9e} 166 167 /\x{80}/IB,utf 168 169 /\x{084}/IB,utf 170 171 /\x{104}/IB,utf 172 173 /\x{861}/IB,utf 174 175 /\x{212ab}/IB,utf 176 177 /[^ab\xC0-\xF0]/IB,utf 178 \x{f1} 179 \x{bf} 180 \x{100} 181 \x{1000} 182 \= Expect no match 183 \x{c0} 184 \x{f0} 185 186 /{3,4}/IB,utf 187 \x{100}\x{100}\x{100}\x{100\x{100} 188 189 /(\x{100}+|x)/IB,utf 190 191 /(\x{100}*a|x)/IB,utf 192 193 /(\x{100}{0,2}a|x)/IB,utf 194 195 /(\x{100}{1,2}a|x)/IB,utf 196 197 /\x{100}/IB,utf 198 199 /a\x{100}\x{101}*/IB,utf 200 201 /a\x{100}\x{101}+/IB,utf 202 203 /[^\x{c4}]/IB 204 205 /[\x{100}]/IB,utf 206 \x{100} 207 Z\x{100} 208 \x{100}Z 209 210 /[\xff]/IB,utf 211 >\x{ff}< 212 213 /[^\xff]/IB,utf 214 215 /\x{100}abc(xyz(?1))/IB,utf 216 217 /\777/I,utf 218 \x{1ff} 219 \777 220 221 /\x{100}+\x{200}/IB,utf 222 223 /\x{100}+X/IB,utf 224 225 /^[\Q\E-\Q\E/B,utf 226 227 # This tests the 
stricter UTF-8 check according to RFC 3629. 228 229 /X/utf 230 \= Expect UTF-8 errors 231 \x{d800} 232 \x{da00} 233 \x{dfff} 234 \x{110000} 235 \x{2000000} 236 \x{7fffffff} 237 \= Expect no match 238 \x{d800}\=no_utf_check 239 \x{da00}\=no_utf_check 240 \x{dfff}\=no_utf_check 241 \x{110000}\=no_utf_check 242 \x{2000000}\=no_utf_check 243 \x{7fffffff}\=no_utf_check 244 245 /(*UTF8)\x{1234}/ 246 abcd\x{1234}pqr 247 248 /(*CRLF)(*UTF)(*BSR_UNICODE)a\Rb/I 249 250 /\h/I,utf 251 ABC\x{09} 252 ABC\x{20} 253 ABC\x{a0} 254 ABC\x{1680} 255 ABC\x{180e} 256 ABC\x{2000} 257 ABC\x{202f} 258 ABC\x{205f} 259 ABC\x{3000} 260 261 /\v/I,utf 262 ABC\x{0a} 263 ABC\x{0b} 264 ABC\x{0c} 265 ABC\x{0d} 266 ABC\x{85} 267 ABC\x{2028} 268 269 /\h*A/I,utf 270 CDBABC 271 272 /\v+A/I,utf 273 274 /\s?xxx\s/I,utf 275 276 /\sxxx\s/I,utf,tables=2 277 AB\x{85}xxx\x{a0}XYZ 278 AB\x{a0}xxx\x{85}XYZ 279 280 /\S \S/I,utf,tables=2 281 \x{a2} \x{84} 282 A Z 283 284 /a+/utf 285 a\x{123}aa\=offset=1 286 a\x{123}aa\=offset=3 287 a\x{123}aa\=offset=4 288 \= Expect bad offset value 289 a\x{123}aa\=offset=6 290 \= Expect bad UTF-8 offset 291 a\x{123}aa\=offset=2 292 \= Expect no match 293 a\x{123}aa\=offset=5 294 295 /\x{1234}+/Ii,utf 296 297 /\x{1234}+?/Ii,utf 298 299 /\x{1234}++/Ii,utf 300 301 /\x{1234}{2}/Ii,utf 302 303 /[^\x{c4}]/IB,utf 304 305 /X+\x{200}/IB,utf 306 307 /\R/I,utf 308 309 /\777/IB,utf 310 311 /\w+\x{C4}/B,utf 312 a\x{C4}\x{C4} 313 314 /\w+\x{C4}/B,utf,tables=2 315 a\x{C4}\x{C4} 316 317 /\W+\x{C4}/B,utf 318 !\x{C4} 319 320 /\W+\x{C4}/B,utf,tables=2 321 !\x{C4} 322 323 /\W+\x{A1}/B,utf 324 !\x{A1} 325 326 /\W+\x{A1}/B,utf,tables=2 327 !\x{A1} 328 329 /X\s+\x{A0}/B,utf 330 X\x20\x{A0}\x{A0} 331 332 /X\s+\x{A0}/B,utf,tables=2 333 X\x20\x{A0}\x{A0} 334 335 /\S+\x{A0}/B,utf 336 X\x{A0}\x{A0} 337 338 /\S+\x{A0}/B,utf,tables=2 339 X\x{A0}\x{A0} 340 341 /\x{a0}+\s!/B,utf 342 \x{a0}\x20! 343 344 /\x{a0}+\s!/B,utf,tables=2 345 \x{a0}\x20! 
346 347 /A/utf 348 \x{ff000041} 349 \x{7f000041} 350 351 /(*UTF8)abc/never_utf 352 353 /abc/utf,never_utf 354 355 /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IBi,utf 356 357 /A\x{391}\x{10427}\x{ff3a}\x{1fb0}/IB,utf 358 359 /AB\x{1fb0}/IB,utf 360 361 /AB\x{1fb0}/IBi,utf 362 363 /\x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f}/Ii,utf 364 \x{401}\x{420}\x{421}\x{422}\x{423}\x{424}\x{425}\x{426}\x{427}\x{428}\x{429}\x{42a}\x{42b}\x{42c}\x{42d}\x{42e}\x{42f} 365 \x{451}\x{440}\x{441}\x{442}\x{443}\x{444}\x{445}\x{446}\x{447}\x{448}\x{449}\x{44a}\x{44b}\x{44c}\x{44d}\x{44e}\x{44f} 366 367 /[]/Bi,utf 368 369 /[^]/Bi,utf 370 371 /\h/I 372 373 /\v/I 374 375 /\R/I 376 377 /[[:blank:]]/B,ucp 378 379 /\x{212a}+/Ii,utf 380 KKkk\x{212a} 381 382 /s+/Ii,utf 383 SSss\x{17f} 384 385 /\x{100}*A/IB,utf 386 A 387 388 /\x{100}*\d(?R)/IB,utf 389 390 /[Z\x{100}]/IB,utf 391 Z\x{100} 392 \x{100} 393 \x{100}Z 394 395 /[z-\x{100}]/IB,utf 396 397 /[z\Qa-d]\E]/IB,utf 398 \x{100} 399 400 401 /[ab\x{100}]abc(xyz(?1))/IB,utf 402 403 /\x{100}*\s/IB,utf 404 405 /\x{100}*\d/IB,utf 406 407 /\x{100}*\w/IB,utf 408 409 /\x{100}*\D/IB,utf 410 411 /\x{100}*\S/IB,utf 412 413 /\x{100}*\W/IB,utf 414 415 /[\x{105}-\x{109}]/IBi,utf 416 \x{104} 417 \x{105} 418 \x{109} 419 \= Expect no match 420 \x{100} 421 \x{10a} 422 423 /[z-\x{100}]/IBi,utf 424 Z 425 z 426 \x{39c} 427 \x{178} 428 | 429 \x{80} 430 \x{ff} 431 \x{100} 432 \x{101} 433 \= Expect no match 434 \x{102} 435 Y 436 y 437 438 /[z-\x{100}]/IBi,utf 439 440 /\x{3a3}B/IBi,utf 441 442 /abc/utf,replace= 443 abc 444 445 /(?<=(a)(?-1))x/I,utf 446 a\x80zx\=offset=3 447 448 /[\W\p{Any}]/B 449 abc 450 123 451 452 /[\W\pL]/B 453 abc 454 \= Expect no match 455 123 456 457 
/(*:*++++++++++++''''''''''''''''''''+''+++'+++x+++++++++++++++++++++++++++++++++++(++++++++++++++++++++:++++++%++:''''''''''''''''''''''''+++++++++++++++++++++++++++++++++++++++++++++++++++++-++++++++k+++++++''''+++'+++++++++++++++++++++++''''++++++++++++':)/utf 458 459 /[\s[:^ascii:]]/B,ucp 460 461 # A special extra option allows escaped surrogate code points in 8-bit mode, 462 # but subjects containing them must not be UTF-checked. 463 464 /\x{d800}/I,utf,allow_surrogate_escapes 465 \x{d800}\=no_utf_check 466 467 /\udfff\o{157401}/utf,alt_bsux,allow_surrogate_escapes 468 \x{dfff}\x{df01}\=no_utf_check 469 470 # This has different starting code units in 8-bit mode. 471 472 /^[^ab]/IB,utf 473 c 474 \x{ff} 475 \x{100} 476 \= Expect no match 477 aaa 478 479 # End of testinput10 480