Home | History | Annotate | Download | only in tests
      1 # This file is derived from 
      2 #
      3 #    http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
      4 #    
      5 # Which was created by Markus Kuhn <mkuhn@acm.org> - 2000-09-02
      6 #
      7 # Lines beginning with # and blank lines are ignored.
      8 #
      9 # Beyond that, this file consists of a series of test cases. Each test case consists of
     10 # 2 or 3 lines:
     11 #
     12 #  1. A UTF-8 string
     13 #  2. A status
     14 #      VALID      : The string is a valid UTF-8 representation of valid Unicode
     15 #      INCOMPLETE : The string has a partial character at the end
     16 #      NOTUNICODE : The string is valid UTF-8, but the characters represented
     17 #                   are not valid Unicode (e.g. surrogates, U+FFFE/U+FFFF, or values above U+10FFFF)
     18 #      OVERLONG   : The string includes overlong sequences
     19 #      MALFORMED  : The string is not valid UTF-8
     20 #  3. If the status is VALID or NOTUNICODE, the UCS-4 representation of the string,
     21 #     as a series of hex numbers.
     22 
     23 # 1  Some correct UTF-8 text
     24 
     25 VALID
     26 03ba 1f79 03c3 03bc 03b5
     27 
     28 # 2.1  First possible sequence of a certain length
     29 #
     30 # FIXME - handle NULLS?
     31 #
     32 # [ NULL BYTE ]
     33 #VALID
     34 #0000
     35 
     36 
     37 VALID
     38 0080
     39 
     40 
     41 VALID
     42 0800
     43 
     44 
     45 VALID
     46 00010000
     47 
     48 
     49 NOTUNICODE
     50 00200000
     51 
     52 
     53 NOTUNICODE
     54 04000000
     55 
     56 
     57 VALID
     58 0000007f
     59 
     60 
     61 VALID
     62 000007ff
     63 
     64 
     65 NOTUNICODE
     66 0000ffff
     67 
     68 
     69 NOTUNICODE
     70 001fffff
     71 
     72 
     73 NOTUNICODE
     74 03ffffff
     75 
     76 
     77 NOTUNICODE
     78 7fffffff
     79 
     80 # 2.3  Other boundary conditions
     81 
     82 
     83 VALID
     84 d7ff
     85 
     86 
     87 VALID
     88 e000
     89 
     90 
     91 VALID
     92 fffd
     93 
     94 
     95 VALID
     96 0010fffd
     97 
     98 
     99 NOTUNICODE
    100 0010ffff
    101 
    102 
    103 NOTUNICODE
    104 00110000
    105 
    106 # 3.1  Unexpected continuation bytes
    107 
    108 
    109 MALFORMED
    110 
    111 MALFORMED
    112 
    113 MALFORMED
    114 
    115 MALFORMED
    116 
    117 MALFORMED
    118 
    119 MALFORMED
    120 
    121 MALFORMED
    122 
    123 MALFORMED
    124 
    125 MALFORMED
    126 
    127 # 3.2  Lonely start characters
    128 
    129                                 
    130 MALFORMED
    131                 
    132 MALFORMED
    133         
    134 MALFORMED
    135     
    136 MALFORMED
    137   
    138 MALFORMED
    139 
    140 # 3.3  Sequences with last continuation byte missing
    141 
    142 
    143 INCOMPLETE
    144 
    145 INCOMPLETE
    146 
    147 INCOMPLETE
    148 
    149 INCOMPLETE
    150 
    151 INCOMPLETE
    152 
    153 INCOMPLETE
    154 
    155 INCOMPLETE
    156 
    157 INCOMPLETE
    158 
    159 INCOMPLETE
    160 
    161 INCOMPLETE
    162 
    163 # 3.4  Concatenation of incomplete sequences
    164 
    165 
    166 MALFORMED
    167 
    168 # 3.5  Impossible bytes
    169 
    170 
    171 MALFORMED
    172 
    173 MALFORMED
    174 
    175 MALFORMED
    176 
    177 #  Examples of an overlong ASCII character
    178 
    179 
    180 OVERLONG
    181 
    182 OVERLONG
    183 
    184 OVERLONG
    185 
    186 OVERLONG
    187 
    188 OVERLONG
    189 
    190 #  Maximum overlong sequences
    191 
    192 
    193 OVERLONG
    194 
    195 OVERLONG
    196 
    197 OVERLONG
    198 
    199 OVERLONG
    200 
    201 OVERLONG
    202 
    203 # Overlong representation of the NUL character
    204 
    205 
    206 OVERLONG
    207 
    208 OVERLONG
    209 
    210 OVERLONG
    211 
    212 OVERLONG
    213 
    214 OVERLONG
    215 
    216 # Illegal code positions
    217 
    218 # Single UTF-16 surrogates
    219 
    220 
    221 NOTUNICODE
    222 d800
    223 
    224 
    225 NOTUNICODE
    226 db7f
    227 
    228 
    229 NOTUNICODE
    230 db80
    231 
    232 
    233 NOTUNICODE
    234 dbff
    235 
    236 
    237 NOTUNICODE
    238 dc00
    239 
    240 
    241 NOTUNICODE
    242 df80
    243 
    244 
    245 NOTUNICODE
    246 dfff
    247 
    248 # Paired UTF-16 surrogates
    249 
    250 
    251 NOTUNICODE
    252 d800 dc00
    253 
    254 
    255 NOTUNICODE
    256 d800 dfff
    257 
    258 
    259 NOTUNICODE
    260 db7f dc00
    261 
    262 
    263 NOTUNICODE
    264 db7f dfff
    265 
    266 
    267 NOTUNICODE
    268 db80 dc00
    269 
    270 
    271 NOTUNICODE
    272 db80 dfff
    273 
    274 
    275 NOTUNICODE
    276 dbff dc00
    277 
    278 
    279 NOTUNICODE
    280 dbff dfff
    281 
    282 # Other illegal code positions
    283 
    284 
    285 NOTUNICODE
    286 fffe
    287 
    288 
    289 NOTUNICODE
    290 ffff
    291 
    292 ################
    293 #
    294 # Some more tests, not from Markus Kuhn's file
    295 #
    296 
    297 # Mixed plane 0 and higher planes
    298 
    299 ABC
    300 VALID
    301 41 00010000 42 10fffd 43
    302