Home | History | Annotate | Download | only in testdata
      1 <?xml version="1.0" encoding="UTF-8"?>
      2 
      3 <!-- Copyright (C) 2016 and later: Unicode, Inc. and others. License & terms of use: http://www.unicode.org/copyright.html -->
      4 <!-- Copyright (c) 2007-2009 IBM Corporation and others. All rights reserved -->
      5 
      6 <!-- Test data file for string search  -->
      7 <!DOCTYPE stringsearch-tests [
      8 <!ELEMENT stringsearch-tests (test-case+)>
      9 <!ATTLIST stringsearch-tests debug IDREF #IMPLIED >
     10 <!ELEMENT test-case (pattern, pre?, m?, post?)>
     11 <!ATTLIST test-case 
     12           id ID #REQUIRED
     13           locale CDATA "en" 
     14           strength (PRIMARY | SECONDARY | TERTIARY | QUATERNARY | IDENTICAL) "TERTIARY" 
     15           norm (ON | OFF) "OFF"
     16           alternate_handling (NON_IGNORABLE | SHIFTED) "NON_IGNORABLE"
     17           >
     18 
     19 <!ELEMENT pattern (#PCDATA)>
     20 <!ELEMENT pre  (#PCDATA)>
     21 <!ELEMENT m    (#PCDATA)>
     22 <!ELEMENT post (#PCDATA)>
     23 ]>
     24 
     25 <stringsearch-tests>
     26   <!-- debug="test11"     (for copying into the above element)  -->
     27     
     28     <!-- Very simple match  -->
     29     <test-case id="test01" >
     30        <pattern>abc</pattern>
     31        <pre>xxx</pre><m>abc</m><post>yyy</post>
     32     </test-case>
     33     
     34     <!-- Very simple no-match  -->
     35     <test-case id="test02" >
     36        <pattern>abc</pattern>
     37        <pre>xxx</pre><post>yyy</post>
     38     </test-case>
     39 
     40     <!-- Match after several near-misses. -->
     41     <test-case id="test03" >
     42        <pattern>string</pattern>
     43        <pre>silly spring stling strxng strilg strinx stri</pre><m>string</m><post> fling</post>
     44     </test-case>
     45     
     46     <test-case id="test04" strength="PRIMARY" >
     47        <pattern>FUSS</pattern>
     48        <pre>abc</pre><m>fuss</m><post>sss</post>
     49     </test-case>
     50     
     51     <test-case id="test05" strength="PRIMARY" >
     52        <pattern>FUSS</pattern>
     53        <pre>abc</pre><m>fu</m><post>sss</post>
     54     </test-case>
     55 
     56   <test-case id="test05.5" strength="PRIMARY" >
     57     <pattern>fuss</pattern>
     58     <pre>a </pre>
     59     <m>fu</m>
     60     <post>ball table</post>
     61   </test-case>
     62 
     63   <test-case id="test06" strength="PRIMARY" >
     64       <pattern>fu</pattern>
     65        <pre>abc</pre><m>fuss</m><post>xyz</post>
     66     </test-case>
     67     
     68     <test-case id="test07" strength="SECONDARY" >
     69       <pattern>fu</pattern>
     70       <pre>abcfussxyz</pre>
     71     </test-case>
     72     
     73     <test-case id="test08" strength="PRIMARY" >
     74       <pattern>fus</pattern>
     75       <pre>abcfu</pre><post>xyz</post>
     76     </test-case>
     77     
     78     <!-- A good match following an initial match that failed because
     79          of not ending on a character boundary -->
     80     <test-case id="test09" strength="PRIMARY">
     81       <pattern>fus</pattern>
     82       <pre>fu  </pre><m>fus</m><post>sss</post>
     83     </test-case>
     84 
     85 
     86     <!-- Test cases from usrchdat.c  BREAKITERATOREXACT -->
     87 
     88     <test-case id="test10" strength="TERTIARY">
     89       <pattern>fox</pattern>
     90       <m>fox</m><post>y fox</post>
     91     </test-case>
     92 
     93     <test-case id="test11" strength="PRIMARY" locale="de_DE@collation=phonebook">
     94       <pattern>toe</pattern>
     95       <pre>This is a </pre><m>T</m><post>ne</post>
     96     </test-case>
     97     
     98     <test-case id="test11a" strength="SECONDARY" locale="de_DE@collation=phonebook">
     99       <pattern>toe</pattern>
    100       <pre>This is a </pre><post>Tne</post>
    101     </test-case>
    102     
    103     <test-case id="test12" strength="TERTIARY">
    104       <pattern>e</pattern>
    105       <pre>tsting that  dos not match </pre><m>e</m><post></post>
    106     </test-case>
    107     
    108     <test-case id="test13" strength="PRIMARY" locale="fr">
    109       <pattern>e</pattern>
    110       <pre></pre><m></m><post></post>
    111     </test-case>
    112     
    113     <test-case id="test14" strength="PRIMARY" locale="fr">
    114       <pattern>O</pattern>
    115       <pre>C</pre><m>O\u0302</m><post>T</post>
    116     </test-case>
    117 
    118 
    119     <!-- Test cases from usrchdat.c  STRENGTH -->
    120 
    121 
    122     <test-case id="test15" strength="PRIMARY" locale="en">
    123       <pattern>fox</pattern>
    124       <pre>The quick brown </pre><m>fox</m><post> jumps over the lazy foxes</post>
    125     </test-case>
    126     
    127     <test-case id="test16" strength="PRIMARY" locale="fr">
    128       <pattern>peche</pattern>
    129       <pre>blackbirds pat </pre><m>p\u00E9ch\u00E9</m><post> </post>
    130     </test-case>
    131     
    132     <test-case id="test17" strength="PRIMARY" locale="fr">
    133       <pattern>peche</pattern>
    134       <pre>blackbirds pat </pre><m>p\u00EAche</m><post> </post>
    135     </test-case>
    136     
    137     <test-case id="test18" strength="PRIMARY" locale="fr">
    138       <pattern>peche</pattern>
    139       <pre>blackbirds pat </pre><m>p\u00E9che</m><post>r </post>
    140     </test-case>
    141     
    142     <test-case id="test19" strength="PRIMARY" locale="fr">
    143       <pattern>peche</pattern>
    144       <pre>blackbirds pat </pre><m>p\u00EAche</m><post>r </post>
    145     </test-case>
    146     
    147     <test-case id="test20" strength="PRIMARY" locale="es">
    148       <pattern>channel</pattern>
    149       <pre>A </pre><m>channel</m><post>, </post>
    150     </test-case>
    151     
    152     <test-case id="test21" strength="PRIMARY" locale="es">
    153       <pattern>channel</pattern>
    154       <pre>A </pre><m>CHANNEL</m><post>, </post>
    155     </test-case>
    156     
    157     <test-case id="test22" strength="PRIMARY" locale="es">
    158       <pattern>channel</pattern>
    159       <pre>A </pre><m>Channel</m><post>s, </post>
    160     </test-case>
    161     
    162     <test-case id="test23" strength="PRIMARY" locale="es">
    163       <pattern>channel</pattern>
    164       <pre>A </pre><m>channel</m><post>... </post>
    165     </test-case>
    166     
    167     <test-case id="test24" strength="TERTIARY" locale="en">
    168       <pattern>A\u0300</pattern>
    169       <pre>A miss, and then </pre><m>\u00c0</m><post> should match but not A"</post>
    170     </test-case>
    171     
    172     <!-- TODO:  In the original test data, this test matched at IDENTICAL strength.
    173                 Doesn't seem right.  The characters are different.
    174                 -->
    175     <test-case id="test24a" strength="IDENTICAL" locale="en">
    176       <pattern>A\u0300</pattern>
    177       <pre>At IDENTICAL, shoud this match?  </pre><m>\u00c0</m><post></post>
    178     </test-case>
    179 
    180   <test-case id="test24b" strength="IDENTICAL" alternate_handling="SHIFTED" locale="en">
    181     <pattern>A\u0300</pattern>
    182     <pre>At IDENTICAL, shoud this match?  </pre>
    183     <m>\u00c0</m>
    184     <post></post>
    185   </test-case>
    186 
    187   <test-case id="test25" strength="SECONDARY" locale="en">
    188       <pattern></pattern>
    189       <pre>12</pre><m></m><post> </post>
    190     </test-case>
    191     
    192     <test-case id="test26" strength="SECONDARY" locale="en">
    193       <pattern>A</pattern>
    194       <pre>12</pre><m>a</m><post>...</post>
    195     </test-case>
    196 
    197 
    198     <!--  Test Cases from usrchdat.c,  VARIABLE -->
    199     <test-case id="test27" strength="TERTIARY" locale="en">
    200       <pattern>blackbird</pattern>
    201       <pre>black-bird </pre><m>blackbird</m><post>...</post>
    202     </test-case>
    203 
    204     <test-case id="test28" strength="TERTIARY" locale="en">
    205       <pattern>go</pattern>
    206       <pre> on</pre>
    207     </test-case>
    208 
    209     <!-- TODO:  this gives a U_ILLEGAL_ARGUMENT error when opening
    210                 the UStringSearch.  How did the original test run? -->
    211     <!--
    212     <test-case id="test29" strength="PRIMARY" locale="en">
    213       <pattern>  </pattern>
    214       <pre></pre><m></m><post>abc</post>
    215     </test-case>
    216     -->
    217 
    218     <test-case id="test30" strength="SECONDARY" locale="en">
    219       <pattern>abc</pattern>
    220       <pre>  a bc   ab c    a  bc     ab  c"</pre>
    221     </test-case>
    222 
    223     <test-case id="test31" strength="SECONDARY" locale="en">
    224       <pattern>abc</pattern>
    225       <pre>           ---------------</pre>
    226     </test-case>
    227 
    228 
    229     <!--  Normalization test cases from usrchdat.c  -->
    230     <test-case id="test32" strength="TERTIARY"  norm="ON">
    231       <pattern>a\u0325\u0300</pattern>
    232       <pre></pre><m>a\u0300\u0325</m>
    233     </test-case>
    234 
    235 
    236     <test-case id="test32a" strength="TERTIARY"  norm="OFF">
    237       <pattern>a\u0325\u0300</pattern>
    238       <pre>a\u0300\u0325</pre>
    239     </test-case>
    240 
    241 
    242     <!-- COMPOSITEBOUNDARIES from usrchdat.c
    243          Boundaries are not identical to the original test data because
    244          of matching only full combining sequences
    245     -->
    246     <test-case id="test40" strength="TERTIARY">
    247       <pattern>A</pattern>
    248       <pre></pre>   <!-- \u00C0 -->
    249     </test-case>
    250     
    251     <test-case id="test41" strength="TERTIARY">
    252       <pattern>A</pattern>
    253       <pre></pre><m>A</m><post>C</post>
    254     </test-case>
    255     
    256     <test-case id="test42" strength="TERTIARY">
    257       <pattern>A\u030A</pattern>
    258       <pre>\u01FA</pre>
    259     </test-case>
    260 
    261 
    262 
    263     <!-- SUPPLEMENTARYCANONICAL from usrchdat.c  -->
    264     <test-case id="test50" strength="TERTIARY">
    265       <pattern>\uD800\uDC00</pattern>
    266       <pre>abc \uD802\uDC00 \uD800\uDC01 \uD801\uDC00 </pre><m>\uD800\uDC00</m>
    267       <post>abc abc\uD800\uDC00 \uD800\uD800\uDC00 \uD800\uDC00\uDC00</post>
    268     </test-case>
    269     
    270     <test-case id="test51" strength="TERTIARY">
    271       <pattern>\\uD834\\uDDB9</pattern>
    272       <pre>and</pre><m>\\uD834\\uDDB9</m><post>this sentence</post>
    273     </test-case>
    274 
    275     <test-case id="test52" strength="TERTIARY">
    276       <pattern> \\uD834\\uDDB9 </pattern>
    277       <pre>and</pre><m> \\uD834\\uDDB9 </m><post>this sentence</post>
    278     </test-case>
    279     
    280     <test-case id="test53" strength="TERTIARY">
    281       <pattern>-\\uD834\\uDDB9-</pattern>
    282       <pre>and</pre><m>-\\uD834\\uDDB9-</m><post>this sentence</post>
    283     </test-case>
    284     
    285     <test-case id="test54" strength="TERTIARY">
    286       <pattern>,\\uD834\\uDDB9,</pattern>
    287       <pre>and</pre><m>,\\uD834\\uDDB9,</m><post>this sentence</post>
    288     </test-case>
    289     
    290     <test-case id="test55" strength="TERTIARY">
    291       <pattern>?\\uD834\\uDDB9?</pattern>
    292       <pre>and</pre><m>?\\uD834\\uDDB9?</m><post>this sentence</post>
    293     </test-case>
    294     
    295 
    296     <!-- Long combining sequences  -->
    297     <!-- Backwards search fails because the pattern ends with ignorables
    298     <test-case id="test60" strength="PRIMARY">
    299       <pattern>A\u0301\u0301\u0301\u0301</pattern>
    300       <m>A\u0301\u0301\u0301\u0301\u0301</m>
    301     </test-case>
    302     -->
    303 
    304     <test-case id="test61" strength="TERTIARY">
    305       <pattern>A\u0301\u0301\u0301\u0301</pattern>
    306           <pre>A\u0301\u0301\u0301\u0301\u0301</pre>
    307     </test-case>
    308     
    309     <test-case id="test62" strength="TERTIARY">
    310       <pattern>A\u0301\u0301\u0301\u0301</pattern>
    311             <m>A\u0301\u0301\u0301\u0301</m>
    312     </test-case>
    313 
    314     <!-- stand-alone combining marks don't match attached marks  -->
    315     <test-case id="test63" strength="TERTIARY">
    316       <pattern>\u0301</pattern>
    317       <pre>A\u0301\u0301\u0301\u0301</pre>
    318     </test-case>
    319     
    320     <test-case id="test64" strength="TERTIARY">
    321       <pattern>\u0301</pattern>
    322       <post>\u0301\u0301\u0301\u0301</post>
    323     </test-case>
    324 
    325   <!-- stand-alone combining mark does match an un-attached combining mark -->
    326     <test-case id="test65" strength="TERTIARY">
    327        <pattern>\u0301</pattern>
    328        <m>\u0301</m><post>A\u0301\u0301</post>
    329     </test-case>
    330 
    331     <test-case id="test66" strength="TERTIARY">
    332        <pattern>\u0301</pattern>
    333        <m>\u0301</m>
    334     </test-case>
    335           
    336     <!-- stand-alone combining marks at end of the target text -->
    337     <test-case id="test67" strength="TERTIARY">
    338        <pattern>\u0301</pattern>
    339        <pre>abcd\r</pre><m>\u0301</m>
    340     </test-case>
    341 
    342       <!-- attached combining marks at end of the target text, no match -->
    343     <test-case id="test68" strength="TERTIARY">
    344        <pattern>\u0301</pattern>
    345        <pre>abcd\u0301</pre>
    346     </test-case>
    347 
    348 
    349 
    350    <!-- no match within expansions at the start -->
    351     <test-case id="test70" strength="PRIMARY">
    352       <pattern>Eligature</pattern>
    353       <pre>ligature</pre>
    354     </test-case>
    355 
    356     <test-case id="test71" strength="PRIMARY">
    357       <pattern>AEligature</pattern>
    358       <m>ligature</m>
    359     </test-case>
    360 
    361     <test-case id="test72" strength="PRIMARY">
    362         <pattern>AEligature</pattern>
    363         <m>ligature</m>
    364     </test-case>
    365     
    366     <!-- unattached combining Tilde will not match a Tilde that is
    367          part of a composed N-with-tilde (\u00D1)  -->
    368     <test-case id="test73" strength="SECONDARY">
    369         <pattern>\u0303</pattern>  <!-- combining tilde -->
    370         <pre>&#x0d;</pre><m>\u0303</m>
    371     </test-case>
    372     
    373     <test-case id="test74" strength="SECONDARY">
    374         <pattern>\u0303</pattern>  <!-- combining tilde -->
    375         <pre> &#x0d;</pre><m>\u0303</m><post>a</post>
    376     </test-case>
    377 
    378   <test-case id="test75" strength="TERTIARY" locale="fr">
    379     <pattern>\u00EA</pattern>
    380     <pre>p</pre><m>\u00EA</m><post>che</post>
    381   </test-case>
    382 
    383   <test-case id="test76" strength="TERTIARY" locale="fr">
    384     <pattern>\u00EA</pattern>
    385     <pre>p</pre><m>e\u0302</m><post>che</post>
    386   </test-case>
    387 
    388   <test-case id="test77" strength="TERTIARY" locale="fr">
    389     <pattern>e\u0302</pattern>
    390     <pre>p</pre><m>\u00EA</m><post>che</post>
    391   </test-case>
    392 
    393   <!-- Test cases from ticket:5382 -->
    394   <test-case id="test78" strength="SECONDARY" locale="hu_HU">
    395     <pattern>\u0170</pattern>
    396     <m>\u0171</m>
    397     <post>12</post>
    398   </test-case>
    399 
    400   <test-case id="test79" strength="SECONDARY" locale="hu_HU">
    401     <pattern>\u0170</pattern>
    402     <pre>1</pre>
    403     <m>\u0171</m>
    404     <post>2</post>
    405   </test-case>
    406 
    407   <test-case id="test80" strength="SECONDARY" locale="hu_HU">
    408     <pattern>\u0170</pattern>
    409     <pre>12</pre>
    410     <m>\u0171</m>
    411   </test-case>
    412   
    413   <!-- Test cases from ticket:5959 -->
    414   <test-case id="test81" strength="SECONDARY">
    415     <pattern>\u2166</pattern>
    416     <m>VII</m>
    417   </test-case>
    418 
    419   <test-case id="test82" strength="SECONDARY">
    420     <pattern>VII</pattern>
    421     <m>\u2166</m>
    422   </test-case>
    423 
    424   <test-case id="test83" strength="IDENTICAL" alternate_handling="SHIFTED" locale="en">
    425     <pattern>Universal Declaration of Human Rights</pattern>
    426     <pre>Proclaims this </pre><m>Universal Declaration of Human Rights</m><post> as a common standard of achievement for all peoples and all nations</post>
    427   </test-case>
    428 
    429   <test-case id="test83b" strength="TERTIARY" alternate_handling="SHIFTED" locale="en">
    430     <pattern>Universal Declaration of Human Rights</pattern>
    431     <pre>Proclaims this </pre>
    432     <m>Universal-Declaration-of-Human-Rights</m>
    433     <post> as a common standard of achievement for all peoples and all nations</post>
    434   </test-case>
    435 
    436   <test-case id="test84" strength="TERTIARY" locale="en">
    437     <pattern>\u05E9\u0591\u05E9</pattern>
    438     <m>\u05E9\u0592\u05E9</m>
    439   </test-case>
    440 
    441   <test-case id="test84b" strength="IDENTICAL" locale="en">
    442     <pattern>\u05E9\u0591\u05E9</pattern>
    443     <pre>\u05E9\u0592\u05E9</pre>
    444   </test-case>
    445 </stringsearch-tests>
    446   
    447