Home | History | Annotate | Download | only in expat
      1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
      2    See the file COPYING for copying permission.
      3 */
      4 
      5 #ifndef IS_INVALID_CHAR
      6 #define IS_INVALID_CHAR(enc, ptr, n) (0)
      7 #endif
      8 
      9 #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
     10     case BT_LEAD ## n: \
     11       if (end - ptr < n) \
     12         return XML_TOK_PARTIAL_CHAR; \
     13       if (IS_INVALID_CHAR(enc, ptr, n)) { \
     14         *(nextTokPtr) = (ptr); \
     15         return XML_TOK_INVALID; \
     16       } \
     17       ptr += n; \
     18       break;
     19 
/* Expands to the switch arms that reject malformed input: the three
   INVALID_LEAD_CASE arms (2-, 3- and 4-byte sequences) followed by arms
   for BT_NONXML, BT_MALFORM and BT_TRAIL, which store ptr in *nextTokPtr
   and return XML_TOK_INVALID.  Valid multi-byte sequences are skipped
   (ptr advanced) by the lead-byte arms. */
#define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_NONXML: \
  case BT_MALFORM: \
  case BT_TRAIL: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;
     29 
     30 #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
     31    case BT_LEAD ## n: \
     32      if (end - ptr < n) \
     33        return XML_TOK_PARTIAL_CHAR; \
     34      if (!IS_NAME_CHAR(enc, ptr, n)) { \
     35        *nextTokPtr = ptr; \
     36        return XML_TOK_INVALID; \
     37      } \
     38      ptr += n; \
     39      break;
     40 
     41 #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
     42   case BT_NONASCII: \
     43     if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
     44       *nextTokPtr = ptr; \
     45       return XML_TOK_INVALID; \
     46     } \
     47   case BT_NMSTRT: \
     48   case BT_HEX: \
     49   case BT_DIGIT: \
     50   case BT_NAME: \
     51   case BT_MINUS: \
     52     ptr += MINBPC(enc); \
     53     break; \
     54   CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
     55   CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
     56   CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
     57 
     58 #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
     59    case BT_LEAD ## n: \
     60      if (end - ptr < n) \
     61        return XML_TOK_PARTIAL_CHAR; \
     62      if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
     63        *nextTokPtr = ptr; \
     64        return XML_TOK_INVALID; \
     65      } \
     66      ptr += n; \
     67      break;
     68 
     69 #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
     70   case BT_NONASCII: \
     71     if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
     72       *nextTokPtr = ptr; \
     73       return XML_TOK_INVALID; \
     74     } \
     75   case BT_NMSTRT: \
     76   case BT_HEX: \
     77     ptr += MINBPC(enc); \
     78     break; \
     79   CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
     80   CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
     81   CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
     82 
     83 #ifndef PREFIX
     84 #define PREFIX(ident) ident
     85 #endif
     86 
     87 /* ptr points to character following "<!-" */
     88 
     89 static int PTRCALL
     90 PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
     91                     const char *end, const char **nextTokPtr)
     92 {
     93   if (ptr != end) {
     94     if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
     95       *nextTokPtr = ptr;
     96       return XML_TOK_INVALID;
     97     }
     98     ptr += MINBPC(enc);
     99     while (ptr != end) {
    100       switch (BYTE_TYPE(enc, ptr)) {
    101       INVALID_CASES(ptr, nextTokPtr)
    102       case BT_MINUS:
    103         if ((ptr += MINBPC(enc)) == end)
    104           return XML_TOK_PARTIAL;
    105         if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
    106           if ((ptr += MINBPC(enc)) == end)
    107             return XML_TOK_PARTIAL;
    108           if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
    109             *nextTokPtr = ptr;
    110             return XML_TOK_INVALID;
    111           }
    112           *nextTokPtr = ptr + MINBPC(enc);
    113           return XML_TOK_COMMENT;
    114         }
    115         break;
    116       default:
    117         ptr += MINBPC(enc);
    118         break;
    119       }
    120     }
    121   }
    122   return XML_TOK_PARTIAL;
    123 }
    124 
    125 /* ptr points to character following "<!" */
    126 
    127 static int PTRCALL
    128 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
    129                  const char *end, const char **nextTokPtr)
    130 {
    131   if (ptr == end)
    132     return XML_TOK_PARTIAL;
    133   switch (BYTE_TYPE(enc, ptr)) {
    134   case BT_MINUS:
    135     return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    136   case BT_LSQB:
    137     *nextTokPtr = ptr + MINBPC(enc);
    138     return XML_TOK_COND_SECT_OPEN;
    139   case BT_NMSTRT:
    140   case BT_HEX:
    141     ptr += MINBPC(enc);
    142     break;
    143   default:
    144     *nextTokPtr = ptr;
    145     return XML_TOK_INVALID;
    146   }
    147   while (ptr != end) {
    148     switch (BYTE_TYPE(enc, ptr)) {
    149     case BT_PERCNT:
    150       if (ptr + MINBPC(enc) == end)
    151         return XML_TOK_PARTIAL;
    152       /* don't allow <!ENTITY% foo "whatever"> */
    153       switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
    154       case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
    155         *nextTokPtr = ptr;
    156         return XML_TOK_INVALID;
    157       }
    158       /* fall through */
    159     case BT_S: case BT_CR: case BT_LF:
    160       *nextTokPtr = ptr;
    161       return XML_TOK_DECL_OPEN;
    162     case BT_NMSTRT:
    163     case BT_HEX:
    164       ptr += MINBPC(enc);
    165       break;
    166     default:
    167       *nextTokPtr = ptr;
    168       return XML_TOK_INVALID;
    169     }
    170   }
    171   return XML_TOK_PARTIAL;
    172 }
    173 
    174 static int PTRCALL
    175 PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr,
    176                       const char *end, int *tokPtr)
    177 {
    178   int upper = 0;
    179   *tokPtr = XML_TOK_PI;
    180   if (end - ptr != MINBPC(enc)*3)
    181     return 1;
    182   switch (BYTE_TO_ASCII(enc, ptr)) {
    183   case ASCII_x:
    184     break;
    185   case ASCII_X:
    186     upper = 1;
    187     break;
    188   default:
    189     return 1;
    190   }
    191   ptr += MINBPC(enc);
    192   switch (BYTE_TO_ASCII(enc, ptr)) {
    193   case ASCII_m:
    194     break;
    195   case ASCII_M:
    196     upper = 1;
    197     break;
    198   default:
    199     return 1;
    200   }
    201   ptr += MINBPC(enc);
    202   switch (BYTE_TO_ASCII(enc, ptr)) {
    203   case ASCII_l:
    204     break;
    205   case ASCII_L:
    206     upper = 1;
    207     break;
    208   default:
    209     return 1;
    210   }
    211   if (upper)
    212     return 0;
    213   *tokPtr = XML_TOK_XML_DECL;
    214   return 1;
    215 }
    216 
    217 /* ptr points to character following "<?" */
    218 
    219 static int PTRCALL
    220 PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
    221                const char *end, const char **nextTokPtr)
    222 {
    223   int tok;
    224   const char *target = ptr;
    225   if (ptr == end)
    226     return XML_TOK_PARTIAL;
    227   switch (BYTE_TYPE(enc, ptr)) {
    228   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
    229   default:
    230     *nextTokPtr = ptr;
    231     return XML_TOK_INVALID;
    232   }
    233   while (ptr != end) {
    234     switch (BYTE_TYPE(enc, ptr)) {
    235     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    236     case BT_S: case BT_CR: case BT_LF:
    237       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
    238         *nextTokPtr = ptr;
    239         return XML_TOK_INVALID;
    240       }
    241       ptr += MINBPC(enc);
    242       while (ptr != end) {
    243         switch (BYTE_TYPE(enc, ptr)) {
    244         INVALID_CASES(ptr, nextTokPtr)
    245         case BT_QUEST:
    246           ptr += MINBPC(enc);
    247           if (ptr == end)
    248             return XML_TOK_PARTIAL;
    249           if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
    250             *nextTokPtr = ptr + MINBPC(enc);
    251             return tok;
    252           }
    253           break;
    254         default:
    255           ptr += MINBPC(enc);
    256           break;
    257         }
    258       }
    259       return XML_TOK_PARTIAL;
    260     case BT_QUEST:
    261       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
    262         *nextTokPtr = ptr;
    263         return XML_TOK_INVALID;
    264       }
    265       ptr += MINBPC(enc);
    266       if (ptr == end)
    267         return XML_TOK_PARTIAL;
    268       if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
    269         *nextTokPtr = ptr + MINBPC(enc);
    270         return tok;
    271       }
    272       /* fall through */
    273     default:
    274       *nextTokPtr = ptr;
    275       return XML_TOK_INVALID;
    276     }
    277   }
    278   return XML_TOK_PARTIAL;
    279 }
    280 
    281 static int PTRCALL
    282 PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr,
    283                          const char *end, const char **nextTokPtr)
    284 {
    285   static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
    286                                      ASCII_T, ASCII_A, ASCII_LSQB };
    287   int i;
    288   /* CDATA[ */
    289   if (end - ptr < 6 * MINBPC(enc))
    290     return XML_TOK_PARTIAL;
    291   for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
    292     if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
    293       *nextTokPtr = ptr;
    294       return XML_TOK_INVALID;
    295     }
    296   }
    297   *nextTokPtr = ptr;
    298   return XML_TOK_CDATA_SECT_OPEN;
    299 }
    300 
/* Returns the next token inside a CDATA section, starting at ptr.

   Returns
     XML_TOK_NONE              if ptr == end
     XML_TOK_CDATA_SECT_CLOSE  for "]]>", *nextTokPtr just past it
     XML_TOK_DATA_NEWLINE      for CR, LF or CR LF, *nextTokPtr past it
     XML_TOK_DATA_CHARS        for a run of other characters;
                               *nextTokPtr marks where the run stops
     XML_TOK_INVALID           if the first character is rejected by
                               INVALID_CASES
     XML_TOK_PARTIAL           if more input is needed to decide
   (INVALID_CASES may also return XML_TOK_PARTIAL_CHAR). */
static int PTRCALL
PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
                        const char *end, const char **nextTokPtr)
{
  if (ptr == end)
    return XML_TOK_NONE;
  /* For multi-byte code units, round the input length down to a whole
     number of units so the scans below never see a split unit.  The
     mask arithmetic relies on MINBPC(enc) being a power of two. */
  if (MINBPC(enc) > 1) {
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  /* The first character decides the token kind. */
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_RSQB:
    ptr += MINBPC(enc);
    if (ptr == end)
      return XML_TOK_PARTIAL;
    /* a single ']' is ordinary data */
    if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
      break;
    ptr += MINBPC(enc);
    if (ptr == end)
      return XML_TOK_PARTIAL;
    if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" without '>': step back so the second ']' is examined
         again by the data loop below */
      ptr -= MINBPC(enc);
      break;
    }
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_CDATA_SECT_CLOSE;
  case BT_CR:
    ptr += MINBPC(enc);
    if (ptr == end)
      return XML_TOK_PARTIAL;
    /* CR LF counts as a single newline token */
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC(enc);
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DATA_NEWLINE;
  INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC(enc);
    break;
  }
  /* Extend the data run until a character that must start its own
     token (or cannot be consumed) is reached. */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
/* Multi-byte lead byte: an incomplete or invalid sequence ends the
   run here instead of raising an error. */
#define LEAD_CASE(n) \
    case BT_LEAD ## n: \
      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
        *nextTokPtr = ptr; \
        return XML_TOK_DATA_CHARS; \
      } \
      ptr += n; \
      break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
    case BT_RSQB:
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
    376 
    377 /* ptr points to character following "</" */
    378 
    379 static int PTRCALL
    380 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
    381                    const char *end, const char **nextTokPtr)
    382 {
    383   if (ptr == end)
    384     return XML_TOK_PARTIAL;
    385   switch (BYTE_TYPE(enc, ptr)) {
    386   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
    387   default:
    388     *nextTokPtr = ptr;
    389     return XML_TOK_INVALID;
    390   }
    391   while (ptr != end) {
    392     switch (BYTE_TYPE(enc, ptr)) {
    393     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    394     case BT_S: case BT_CR: case BT_LF:
    395       for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
    396         switch (BYTE_TYPE(enc, ptr)) {
    397         case BT_S: case BT_CR: case BT_LF:
    398           break;
    399         case BT_GT:
    400           *nextTokPtr = ptr + MINBPC(enc);
    401           return XML_TOK_END_TAG;
    402         default:
    403           *nextTokPtr = ptr;
    404           return XML_TOK_INVALID;
    405         }
    406       }
    407       return XML_TOK_PARTIAL;
    408 #ifdef XML_NS
    409     case BT_COLON:
    410       /* no need to check qname syntax here,
    411          since end-tag must match exactly */
    412       ptr += MINBPC(enc);
    413       break;
    414 #endif
    415     case BT_GT:
    416       *nextTokPtr = ptr + MINBPC(enc);
    417       return XML_TOK_END_TAG;
    418     default:
    419       *nextTokPtr = ptr;
    420       return XML_TOK_INVALID;
    421     }
    422   }
    423   return XML_TOK_PARTIAL;
    424 }
    425 
    426 /* ptr points to character following "&#X" */
    427 
    428 static int PTRCALL
    429 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
    430                        const char *end, const char **nextTokPtr)
    431 {
    432   if (ptr != end) {
    433     switch (BYTE_TYPE(enc, ptr)) {
    434     case BT_DIGIT:
    435     case BT_HEX:
    436       break;
    437     default:
    438       *nextTokPtr = ptr;
    439       return XML_TOK_INVALID;
    440     }
    441     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
    442       switch (BYTE_TYPE(enc, ptr)) {
    443       case BT_DIGIT:
    444       case BT_HEX:
    445         break;
    446       case BT_SEMI:
    447         *nextTokPtr = ptr + MINBPC(enc);
    448         return XML_TOK_CHAR_REF;
    449       default:
    450         *nextTokPtr = ptr;
    451         return XML_TOK_INVALID;
    452       }
    453     }
    454   }
    455   return XML_TOK_PARTIAL;
    456 }
    457 
    458 /* ptr points to character following "&#" */
    459 
    460 static int PTRCALL
    461 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
    462                     const char *end, const char **nextTokPtr)
    463 {
    464   if (ptr != end) {
    465     if (CHAR_MATCHES(enc, ptr, ASCII_x))
    466       return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    467     switch (BYTE_TYPE(enc, ptr)) {
    468     case BT_DIGIT:
    469       break;
    470     default:
    471       *nextTokPtr = ptr;
    472       return XML_TOK_INVALID;
    473     }
    474     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
    475       switch (BYTE_TYPE(enc, ptr)) {
    476       case BT_DIGIT:
    477         break;
    478       case BT_SEMI:
    479         *nextTokPtr = ptr + MINBPC(enc);
    480         return XML_TOK_CHAR_REF;
    481       default:
    482         *nextTokPtr = ptr;
    483         return XML_TOK_INVALID;
    484       }
    485     }
    486   }
    487   return XML_TOK_PARTIAL;
    488 }
    489 
    490 /* ptr points to character following "&" */
    491 
    492 static int PTRCALL
    493 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
    494                 const char **nextTokPtr)
    495 {
    496   if (ptr == end)
    497     return XML_TOK_PARTIAL;
    498   switch (BYTE_TYPE(enc, ptr)) {
    499   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
    500   case BT_NUM:
    501     return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    502   default:
    503     *nextTokPtr = ptr;
    504     return XML_TOK_INVALID;
    505   }
    506   while (ptr != end) {
    507     switch (BYTE_TYPE(enc, ptr)) {
    508     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    509     case BT_SEMI:
    510       *nextTokPtr = ptr + MINBPC(enc);
    511       return XML_TOK_ENTITY_REF;
    512     default:
    513       *nextTokPtr = ptr;
    514       return XML_TOK_INVALID;
    515     }
    516   }
    517   return XML_TOK_PARTIAL;
    518 }
    519 
/* Scans the attributes of a start tag; ptr points to the character
   following the first character of the first attribute name.

   Each iteration of the outer loop consumes name characters; once '='
   (optionally preceded by white space) is reached, the quoted value
   and whatever follows it are handled inside the BT_EQUALS arm.

   Returns
     XML_TOK_START_TAG_WITH_ATTS      *nextTokPtr just past '>'
     XML_TOK_EMPTY_ELEMENT_WITH_ATTS  *nextTokPtr just past "/>"
     XML_TOK_INVALID                  *nextTokPtr at the offending
                                      character
     XML_TOK_PARTIAL                  the input ends first
   A non-positive result from scanRef inside an attribute value is
   returned unchanged (the case macros may also return
   XML_TOK_PARTIAL_CHAR). */

static int PTRCALL
PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
                 const char **nextTokPtr)
{
#ifdef XML_NS
  /* set once the current attribute name has contained a ':' */
  int hadColon = 0;
#endif
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#ifdef XML_NS
    case BT_COLON:
      /* at most one ':' per name, and a name-start character must
         follow it */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#endif
    case BT_S: case BT_CR: case BT_LF:
      /* white space after the name: skip it; only '=' may follow */
      for (;;) {
        int t;

        ptr += MINBPC(enc);
        if (ptr == end)
          return XML_TOK_PARTIAL;
        t = BYTE_TYPE(enc, ptr);
        if (t == BT_EQUALS)
          break;
        switch (t) {
        case BT_S:
        case BT_LF:
        case BT_CR:
          break;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
    /* fall through */
    case BT_EQUALS:
      {
        /* byte type of the opening quote (BT_QUOT or BT_APOS); the
           value ends at the next character of the same type */
        int open;
#ifdef XML_NS
        hadColon = 0;
#endif
        /* skip white space between '=' and the opening quote */
        for (;;) {
          ptr += MINBPC(enc);
          if (ptr == end)
            return XML_TOK_PARTIAL;
          open = BYTE_TYPE(enc, ptr);
          if (open == BT_QUOT || open == BT_APOS)
            break;
          switch (open) {
          case BT_S:
          case BT_LF:
          case BT_CR:
            break;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
        }
        ptr += MINBPC(enc);
        /* in attribute value */
        for (;;) {
          int t;
          if (ptr == end)
            return XML_TOK_PARTIAL;
          t = BYTE_TYPE(enc, ptr);
          if (t == open)
            break;
          switch (t) {
          INVALID_CASES(ptr, nextTokPtr)
          case BT_AMP:
            {
              /* scanRef reports its stopping position through &ptr,
                 so a successful scan leaves ptr after the reference */
              int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
              if (tok <= 0) {
                if (tok == XML_TOK_INVALID)
                  *nextTokPtr = ptr;
                return tok;
              }
              break;
            }
          case BT_LT:
            /* '<' is not allowed inside an attribute value */
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          default:
            ptr += MINBPC(enc);
            break;
          }
        }
        /* step past the closing quote and look at what follows it */
        ptr += MINBPC(enc);
        if (ptr == end)
          return XML_TOK_PARTIAL;
        switch (BYTE_TYPE(enc, ptr)) {
        case BT_S:
        case BT_CR:
        case BT_LF:
          break;
        case BT_SOL:
          goto sol;
        case BT_GT:
          goto gt;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
        /* ptr points to the white space character that follows the
           closing quote; skip white space up to the next attribute
           name or the end of the tag */
        for (;;) {
          ptr += MINBPC(enc);
          if (ptr == end)
            return XML_TOK_PARTIAL;
          switch (BYTE_TYPE(enc, ptr)) {
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
          case BT_S: case BT_CR: case BT_LF:
            continue;
          case BT_GT:
          gt:
            *nextTokPtr = ptr + MINBPC(enc);
            return XML_TOK_START_TAG_WITH_ATTS;
          case BT_SOL:
          sol:
            ptr += MINBPC(enc);
            if (ptr == end)
              return XML_TOK_PARTIAL;
            if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
              *nextTokPtr = ptr;
              return XML_TOK_INVALID;
            }
            *nextTokPtr = ptr + MINBPC(enc);
            return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
          /* reached only via the CHECK_NMSTRT_CASES arms: the first
             character of the next attribute name has been consumed */
          break;
        }
        /* leave the outer switch; the outer loop scans the rest of
           the next attribute name */
        break;
      }
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
    678 
/* Scans markup in content; ptr points to the character following "<".

   Dispatch on the next character:
     "!-"  -> scanComment
     "!["  -> scanCdataSection
     "?"   -> scanPi
     "/"   -> scanEndTag
     name-start character -> a start tag, scanned here; once white
     space is followed by an attribute name, scanAtts takes over.

   Results produced directly by this function:
     XML_TOK_START_TAG_NO_ATTS      *nextTokPtr just past '>'
     XML_TOK_EMPTY_ELEMENT_NO_ATTS  *nextTokPtr just past "/>"
     XML_TOK_INVALID                *nextTokPtr at the offending
                                    character
     XML_TOK_PARTIAL                the input ends first
   (the case macros may also return XML_TOK_PARTIAL_CHAR). */

static int PTRCALL
PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
               const char **nextTokPtr)
{
#ifdef XML_NS
  /* set once the element name has contained a ':'; initialised below,
     after the dispatch switch */
  int hadColon;
#endif
  if (ptr == end)
    return XML_TOK_PARTIAL;
  switch (BYTE_TYPE(enc, ptr)) {
  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  case BT_EXCL:
    if ((ptr += MINBPC(enc)) == end)
      return XML_TOK_PARTIAL;
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_MINUS:
      return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    case BT_LSQB:
      return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
                                      end, nextTokPtr);
    }
    /* "<!" followed by anything else is rejected here */
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_QUEST:
    return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_SOL:
    return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
#ifdef XML_NS
  hadColon = 0;
#endif
  /* we have a start-tag */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#ifdef XML_NS
    case BT_COLON:
      /* at most one ':' per name, and a name-start character must
         follow it */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#endif
    case BT_S: case BT_CR: case BT_LF:
      {
        /* white space after the element name: what follows is the end
           of the tag or the first attribute */
        ptr += MINBPC(enc);
        while (ptr != end) {
          switch (BYTE_TYPE(enc, ptr)) {
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
          case BT_GT:
            goto gt;
          case BT_SOL:
            goto sol;
          case BT_S: case BT_CR: case BT_LF:
            ptr += MINBPC(enc);
            continue;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
          /* reached only via the CHECK_NMSTRT_CASES arms: ptr is now
             past the first character of an attribute name */
          return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
        }
        return XML_TOK_PARTIAL;
      }
    case BT_GT:
    gt:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_START_TAG_NO_ATTS;
    case BT_SOL:
    sol:
      /* '/' must be immediately followed by '>' */
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
    780 
/* Returns the next token of element content, starting at ptr.

   Returns
     XML_TOK_NONE           if ptr == end
     (result of scanLt)     when the first character is '<'
     (result of scanRef)    when the first character is '&'
     XML_TOK_DATA_NEWLINE   for CR, LF or CR LF, *nextTokPtr past it
     XML_TOK_TRAILING_CR    for a CR that is the last available
                            character
     XML_TOK_TRAILING_RSQB  for "]" or "]]" at the very end of the
                            available input
     XML_TOK_INVALID        for "]]>" (*nextTokPtr at the '>') or a
                            first character rejected by INVALID_CASES
     XML_TOK_DATA_CHARS     for a run of other characters;
                            *nextTokPtr marks where the run stops
     XML_TOK_PARTIAL        if less than one whole code unit is
                            available
   (INVALID_CASES may also return XML_TOK_PARTIAL_CHAR). */
static int PTRCALL
PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
                   const char **nextTokPtr)
{
  if (ptr == end)
    return XML_TOK_NONE;
  /* For multi-byte code units, round the input length down to a whole
     number of units so the scans below never see a split unit.  The
     mask arithmetic relies on MINBPC(enc) being a power of two. */
  if (MINBPC(enc) > 1) {
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  /* The first character decides the token kind. */
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_LT:
    return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_AMP:
    return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_CR:
    ptr += MINBPC(enc);
    if (ptr == end)
      return XML_TOK_TRAILING_CR;
    /* CR LF counts as a single newline token */
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC(enc);
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DATA_NEWLINE;
  case BT_RSQB:
    ptr += MINBPC(enc);
    if (ptr == end)
      return XML_TOK_TRAILING_RSQB;
    /* a single ']' is ordinary data */
    if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
      break;
    ptr += MINBPC(enc);
    if (ptr == end)
      return XML_TOK_TRAILING_RSQB;
    if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" without '>': step back so the second ']' is examined
         again by the data loop below */
      ptr -= MINBPC(enc);
      break;
    }
    /* "]]>" is rejected in content */
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC(enc);
    break;
  }
  /* Extend the data run until a character that must start its own
     token (or cannot be consumed) is reached. */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
/* Multi-byte lead byte: an incomplete or invalid sequence ends the
   run here instead of raising an error. */
#define LEAD_CASE(n) \
    case BT_LEAD ## n: \
      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
        *nextTokPtr = ptr; \
        return XML_TOK_DATA_CHARS; \
      } \
      ptr += n; \
      break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_RSQB:
      /* ']' stays part of the run unless it starts "]]>" or there is
         too little input left to tell */
      if (ptr + MINBPC(enc) != end) {
         if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
           ptr += MINBPC(enc);
           break;
         }
         if (ptr + 2*MINBPC(enc) != end) {
           if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
             ptr += MINBPC(enc);
             break;
           }
           /* "]]>" inside the run: report the '>' as invalid */
           *nextTokPtr = ptr + 2*MINBPC(enc);
           return XML_TOK_INVALID;
         }
      }
      /* fall through */
    case BT_AMP:
    case BT_LT:
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
    877 
    878 /* ptr points to character following "%" */
    879 
    880 static int PTRCALL
    881 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
    882                     const char **nextTokPtr)
    883 {
    884   if (ptr == end)
    885     return -XML_TOK_PERCENT;
    886   switch (BYTE_TYPE(enc, ptr)) {
    887   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
    888   case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
    889     *nextTokPtr = ptr;
    890     return XML_TOK_PERCENT;
    891   default:
    892     *nextTokPtr = ptr;
    893     return XML_TOK_INVALID;
    894   }
    895   while (ptr != end) {
    896     switch (BYTE_TYPE(enc, ptr)) {
    897     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    898     case BT_SEMI:
    899       *nextTokPtr = ptr + MINBPC(enc);
    900       return XML_TOK_PARAM_ENTITY_REF;
    901     default:
    902       *nextTokPtr = ptr;
    903       return XML_TOK_INVALID;
    904     }
    905   }
    906   return XML_TOK_PARTIAL;
    907 }
    908 
    909 static int PTRCALL
    910 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
    911                       const char **nextTokPtr)
    912 {
    913   if (ptr == end)
    914     return XML_TOK_PARTIAL;
    915   switch (BYTE_TYPE(enc, ptr)) {
    916   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
    917   default:
    918     *nextTokPtr = ptr;
    919     return XML_TOK_INVALID;
    920   }
    921   while (ptr != end) {
    922     switch (BYTE_TYPE(enc, ptr)) {
    923     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    924     case BT_CR: case BT_LF: case BT_S:
    925     case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
    926       *nextTokPtr = ptr;
    927       return XML_TOK_POUND_NAME;
    928     default:
    929       *nextTokPtr = ptr;
    930       return XML_TOK_INVALID;
    931     }
    932   }
    933   return -XML_TOK_POUND_NAME;
    934 }
    935 
    936 static int PTRCALL
    937 PREFIX(scanLit)(int open, const ENCODING *enc,
    938                 const char *ptr, const char *end,
    939                 const char **nextTokPtr)
    940 {
    941   while (ptr != end) {
    942     int t = BYTE_TYPE(enc, ptr);
    943     switch (t) {
    944     INVALID_CASES(ptr, nextTokPtr)
    945     case BT_QUOT:
    946     case BT_APOS:
    947       ptr += MINBPC(enc);
    948       if (t != open)
    949         break;
    950       if (ptr == end)
    951         return -XML_TOK_LITERAL;
    952       *nextTokPtr = ptr;
    953       switch (BYTE_TYPE(enc, ptr)) {
    954       case BT_S: case BT_CR: case BT_LF:
    955       case BT_GT: case BT_PERCNT: case BT_LSQB:
    956         return XML_TOK_LITERAL;
    957       default:
    958         return XML_TOK_INVALID;
    959       }
    960     default:
    961       ptr += MINBPC(enc);
    962       break;
    963     }
    964   }
    965   return XML_TOK_PARTIAL;
    966 }
    967 
/* Tokenizer for the prolog: scans one token starting at ptr.

   On success the token code is returned and *nextTokPtr is set to the
   first byte after the token.  XML_TOK_NONE is returned for empty input,
   XML_TOK_PARTIAL / XML_TOK_PARTIAL_CHAR when more input is needed, and
   XML_TOK_INVALID (with *nextTokPtr on the offending character) for
   malformed input.  A negated token code means the input ended at a point
   where the token seen so far might still continue.
*/
static int PTRCALL
PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
                  const char **nextTokPtr)
{
  int tok;
  if (ptr == end)
    return XML_TOK_NONE;
  /* For multi-byte code units, trim `end` so that only whole units are
     scanned; this lets the rest of the function compare ptr with end
     using ==.  The mask arithmetic presumes MINBPC(enc) is a power of
     two. */
  if (MINBPC(enc) > 1) {
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  /* Dispatch on the first character.  Punctuation tokens return directly;
     name-like starts set `tok` and fall out of the switch into the
     name-scanning loop below. */
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_QUOT:
    return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_APOS:
    return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_LT:
    {
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_EXCL:
        return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
      case BT_QUEST:
        return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
      case BT_NMSTRT:
      case BT_HEX:
      case BT_NONASCII:
      case BT_LEAD2:
      case BT_LEAD3:
      case BT_LEAD4:
        /* '<' followed by a possible name start: report the position of
           the '<' itself rather than consuming anything. */
        *nextTokPtr = ptr - MINBPC(enc);
        return XML_TOK_INSTANCE_START;
      }
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  case BT_CR:
    if (ptr + MINBPC(enc) == end) {
      *nextTokPtr = end;
      /* indicate that this might be part of a CR/LF pair */
      return -XML_TOK_PROLOG_S;
    }
    /* fall through */
  case BT_S: case BT_LF:
    /* Consume a run of whitespace as a single XML_TOK_PROLOG_S token. */
    for (;;) {
      ptr += MINBPC(enc);
      if (ptr == end)
        break;
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_S: case BT_LF:
        break;
      case BT_CR:
        /* don't split CR/LF pair */
        if (ptr + MINBPC(enc) != end)
          break;
        /* fall through */
      default:
        *nextTokPtr = ptr;
        return XML_TOK_PROLOG_S;
      }
    }
    *nextTokPtr = ptr;
    return XML_TOK_PROLOG_S;
  case BT_PERCNT:
    return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_COMMA:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_COMMA;
  case BT_LSQB:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OPEN_BRACKET;
  case BT_RSQB:
    /* Either a lone ']' or the three-character sequence "]]>". */
    ptr += MINBPC(enc);
    if (ptr == end)
      return -XML_TOK_CLOSE_BRACKET;
    if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
      if (ptr + MINBPC(enc) == end)
        return XML_TOK_PARTIAL;
      if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
        *nextTokPtr = ptr + 2*MINBPC(enc);
        return XML_TOK_COND_SECT_CLOSE;
      }
    }
    *nextTokPtr = ptr;
    return XML_TOK_CLOSE_BRACKET;
  case BT_LPAR:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OPEN_PAREN;
  case BT_RPAR:
    /* ')' may carry a '*', '?' or '+' suffix; otherwise it must be
       followed by one of the delimiters listed below. */
    ptr += MINBPC(enc);
    if (ptr == end)
      return -XML_TOK_CLOSE_PAREN;
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_AST:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_ASTERISK;
    case BT_QUEST:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_QUESTION;
    case BT_PLUS:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_PLUS;
    case BT_CR: case BT_LF: case BT_S:
    case BT_GT: case BT_COMMA: case BT_VERBAR:
    case BT_RPAR:
      *nextTokPtr = ptr;
      return XML_TOK_CLOSE_PAREN;
    }
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_VERBAR:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OR;
  case BT_GT:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DECL_CLOSE;
  case BT_NUM:
    return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
/* Multi-byte first character: a name-start character begins a NAME, any
   other name character begins an NMTOKEN, anything else is invalid. */
#define LEAD_CASE(n) \
  case BT_LEAD ## n: \
    if (end - ptr < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
      ptr += n; \
      tok = XML_TOK_NAME; \
      break; \
    } \
    if (IS_NAME_CHAR(enc, ptr, n)) { \
      ptr += n; \
      tok = XML_TOK_NMTOKEN; \
      break; \
    } \
    *nextTokPtr = ptr; \
    return XML_TOK_INVALID;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
  case BT_NMSTRT:
  case BT_HEX:
    tok = XML_TOK_NAME;
    ptr += MINBPC(enc);
    break;
  case BT_DIGIT:
  case BT_NAME:
  case BT_MINUS:
#ifdef XML_NS
  case BT_COLON:
#endif
    tok = XML_TOK_NMTOKEN;
    ptr += MINBPC(enc);
    break;
  case BT_NONASCII:
    if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC(enc);
      tok = XML_TOK_NAME;
      break;
    }
    if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC(enc);
      tok = XML_TOK_NMTOKEN;
      break;
    }
    /* fall through */
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  /* Scan the rest of a NAME / NMTOKEN.  `tok` holds the classification so
     far and is what gets returned when a delimiter is reached. */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_GT: case BT_RPAR: case BT_COMMA:
    case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
    case BT_S: case BT_CR: case BT_LF:
      *nextTokPtr = ptr;
      return tok;
#ifdef XML_NS
    case BT_COLON:
      /* First colon in a NAME: it becomes a PREFIXED_NAME provided a name
         character follows, otherwise it is demoted to NMTOKEN.  A second
         colon also demotes to NMTOKEN. */
      ptr += MINBPC(enc);
      switch (tok) {
      case XML_TOK_NAME:
        if (ptr == end)
          return XML_TOK_PARTIAL;
        tok = XML_TOK_PREFIXED_NAME;
        switch (BYTE_TYPE(enc, ptr)) {
        CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
        default:
          tok = XML_TOK_NMTOKEN;
          break;
        }
        break;
      case XML_TOK_PREFIXED_NAME:
        tok = XML_TOK_NMTOKEN;
        break;
      }
      break;
#endif
    /* An occurrence suffix ('+', '*', '?') is consumed as part of the
       token, but is rejected after an NMTOKEN. */
    case BT_PLUS:
      if (tok == XML_TOK_NMTOKEN)  {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_PLUS;
    case BT_AST:
      if (tok == XML_TOK_NMTOKEN)  {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_ASTERISK;
    case BT_QUEST:
      if (tok == XML_TOK_NMTOKEN)  {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_QUESTION;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  /* Input ended inside the name: report the token negated. */
  return -tok;
}
   1198 
   1199 static int PTRCALL
   1200 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
   1201                           const char *end, const char **nextTokPtr)
   1202 {
   1203   const char *start;
   1204   if (ptr == end)
   1205     return XML_TOK_NONE;
   1206   start = ptr;
   1207   while (ptr != end) {
   1208     switch (BYTE_TYPE(enc, ptr)) {
   1209 #define LEAD_CASE(n) \
   1210     case BT_LEAD ## n: ptr += n; break;
   1211     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
   1212 #undef LEAD_CASE
   1213     case BT_AMP:
   1214       if (ptr == start)
   1215         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
   1216       *nextTokPtr = ptr;
   1217       return XML_TOK_DATA_CHARS;
   1218     case BT_LT:
   1219       /* this is for inside entity references */
   1220       *nextTokPtr = ptr;
   1221       return XML_TOK_INVALID;
   1222     case BT_LF:
   1223       if (ptr == start) {
   1224         *nextTokPtr = ptr + MINBPC(enc);
   1225         return XML_TOK_DATA_NEWLINE;
   1226       }
   1227       *nextTokPtr = ptr;
   1228       return XML_TOK_DATA_CHARS;
   1229     case BT_CR:
   1230       if (ptr == start) {
   1231         ptr += MINBPC(enc);
   1232         if (ptr == end)
   1233           return XML_TOK_TRAILING_CR;
   1234         if (BYTE_TYPE(enc, ptr) == BT_LF)
   1235           ptr += MINBPC(enc);
   1236         *nextTokPtr = ptr;
   1237         return XML_TOK_DATA_NEWLINE;
   1238       }
   1239       *nextTokPtr = ptr;
   1240       return XML_TOK_DATA_CHARS;
   1241     case BT_S:
   1242       if (ptr == start) {
   1243         *nextTokPtr = ptr + MINBPC(enc);
   1244         return XML_TOK_ATTRIBUTE_VALUE_S;
   1245       }
   1246       *nextTokPtr = ptr;
   1247       return XML_TOK_DATA_CHARS;
   1248     default:
   1249       ptr += MINBPC(enc);
   1250       break;
   1251     }
   1252   }
   1253   *nextTokPtr = ptr;
   1254   return XML_TOK_DATA_CHARS;
   1255 }
   1256 
   1257 static int PTRCALL
   1258 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
   1259                        const char *end, const char **nextTokPtr)
   1260 {
   1261   const char *start;
   1262   if (ptr == end)
   1263     return XML_TOK_NONE;
   1264   start = ptr;
   1265   while (ptr != end) {
   1266     switch (BYTE_TYPE(enc, ptr)) {
   1267 #define LEAD_CASE(n) \
   1268     case BT_LEAD ## n: ptr += n; break;
   1269     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
   1270 #undef LEAD_CASE
   1271     case BT_AMP:
   1272       if (ptr == start)
   1273         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
   1274       *nextTokPtr = ptr;
   1275       return XML_TOK_DATA_CHARS;
   1276     case BT_PERCNT:
   1277       if (ptr == start) {
   1278         int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
   1279                                        end, nextTokPtr);
   1280         return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
   1281       }
   1282       *nextTokPtr = ptr;
   1283       return XML_TOK_DATA_CHARS;
   1284     case BT_LF:
   1285       if (ptr == start) {
   1286         *nextTokPtr = ptr + MINBPC(enc);
   1287         return XML_TOK_DATA_NEWLINE;
   1288       }
   1289       *nextTokPtr = ptr;
   1290       return XML_TOK_DATA_CHARS;
   1291     case BT_CR:
   1292       if (ptr == start) {
   1293         ptr += MINBPC(enc);
   1294         if (ptr == end)
   1295           return XML_TOK_TRAILING_CR;
   1296         if (BYTE_TYPE(enc, ptr) == BT_LF)
   1297           ptr += MINBPC(enc);
   1298         *nextTokPtr = ptr;
   1299         return XML_TOK_DATA_NEWLINE;
   1300       }
   1301       *nextTokPtr = ptr;
   1302       return XML_TOK_DATA_CHARS;
   1303     default:
   1304       ptr += MINBPC(enc);
   1305       break;
   1306     }
   1307   }
   1308   *nextTokPtr = ptr;
   1309   return XML_TOK_DATA_CHARS;
   1310 }
   1311 
   1312 #ifdef XML_DTD
   1313 
   1314 static int PTRCALL
   1315 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
   1316                          const char *end, const char **nextTokPtr)
   1317 {
   1318   int level = 0;
   1319   if (MINBPC(enc) > 1) {
   1320     size_t n = end - ptr;
   1321     if (n & (MINBPC(enc) - 1)) {
   1322       n &= ~(MINBPC(enc) - 1);
   1323       end = ptr + n;
   1324     }
   1325   }
   1326   while (ptr != end) {
   1327     switch (BYTE_TYPE(enc, ptr)) {
   1328     INVALID_CASES(ptr, nextTokPtr)
   1329     case BT_LT:
   1330       if ((ptr += MINBPC(enc)) == end)
   1331         return XML_TOK_PARTIAL;
   1332       if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
   1333         if ((ptr += MINBPC(enc)) == end)
   1334           return XML_TOK_PARTIAL;
   1335         if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
   1336           ++level;
   1337           ptr += MINBPC(enc);
   1338         }
   1339       }
   1340       break;
   1341     case BT_RSQB:
   1342       if ((ptr += MINBPC(enc)) == end)
   1343         return XML_TOK_PARTIAL;
   1344       if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
   1345         if ((ptr += MINBPC(enc)) == end)
   1346           return XML_TOK_PARTIAL;
   1347         if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
   1348           ptr += MINBPC(enc);
   1349           if (level == 0) {
   1350             *nextTokPtr = ptr;
   1351             return XML_TOK_IGNORE_SECT;
   1352           }
   1353           --level;
   1354         }
   1355       }
   1356       break;
   1357     default:
   1358       ptr += MINBPC(enc);
   1359       break;
   1360     }
   1361   }
   1362   return XML_TOK_PARTIAL;
   1363 }
   1364 
   1365 #endif /* XML_DTD */
   1366 
   1367 static int PTRCALL
   1368 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
   1369                    const char **badPtr)
   1370 {
   1371   ptr += MINBPC(enc);
   1372   end -= MINBPC(enc);
   1373   for (; ptr != end; ptr += MINBPC(enc)) {
   1374     switch (BYTE_TYPE(enc, ptr)) {
   1375     case BT_DIGIT:
   1376     case BT_HEX:
   1377     case BT_MINUS:
   1378     case BT_APOS:
   1379     case BT_LPAR:
   1380     case BT_RPAR:
   1381     case BT_PLUS:
   1382     case BT_COMMA:
   1383     case BT_SOL:
   1384     case BT_EQUALS:
   1385     case BT_QUEST:
   1386     case BT_CR:
   1387     case BT_LF:
   1388     case BT_SEMI:
   1389     case BT_EXCL:
   1390     case BT_AST:
   1391     case BT_PERCNT:
   1392     case BT_NUM:
   1393 #ifdef XML_NS
   1394     case BT_COLON:
   1395 #endif
   1396       break;
   1397     case BT_S:
   1398       if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
   1399         *badPtr = ptr;
   1400         return 0;
   1401       }
   1402       break;
   1403     case BT_NAME:
   1404     case BT_NMSTRT:
   1405       if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
   1406         break;
   1407     default:
   1408       switch (BYTE_TO_ASCII(enc, ptr)) {
   1409       case 0x24: /* $ */
   1410       case 0x40: /* @ */
   1411         break;
   1412       default:
   1413         *badPtr = ptr;
   1414         return 0;
   1415       }
   1416       break;
   1417     }
   1418   }
   1419   return 1;
   1420 }
   1421 
/* This must only be called for a well-formed start-tag or empty
   element tag.  Returns the number of attributes.  Pointers to the
   first attsMax attributes are stored in atts.

   The scan has no end pointer: it stops at the first BT_GT or BT_SOL
   character found outside an attribute value, which is why the input
   has to be well-formed.  Attributes beyond attsMax are still counted
   but nothing is written for them.

   For each stored attribute, `name`, `valuePtr` and `valueEnd` are set,
   and `normalized` is left at 1 only if the value contains no '&', no
   CR/LF, and no BT_S character that is at the start of the value, is not
   an ASCII space, or is followed by another space or by the closing
   quote.
*/

static int PTRCALL
PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
                int attsMax, ATTRIBUTE *atts)
{
  /* Starts in inName, so the name the scan begins in (presumably the
     element name) is never recorded as an attribute name. */
  enum { other, inName, inValue } state = inName;
  int nAtts = 0;
  int open = 0; /* defined when state == inValue;
                   initialization just to shut up compilers */

  /* The loop increment supplies one MINBPC step per iteration; cases that
     consume more adjust ptr themselves. */
  for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
    switch (BYTE_TYPE(enc, ptr)) {
/* Records the start of an attribute name when a name character is met
   between attributes. */
#define START_NAME \
      if (state == other) { \
        if (nAtts < attsMax) { \
          atts[nAtts].name = ptr; \
          atts[nAtts].normalized = 1; \
        } \
        state = inName; \
      }
/* Multi-byte character: skip the bytes the loop increment will not. */
#define LEAD_CASE(n) \
    case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_NONASCII:
    case BT_NMSTRT:
    case BT_HEX:
      START_NAME
      break;
#undef START_NAME
    case BT_QUOT:
      /* Outside a value this opens one; inside, it closes the value only
         if the value was opened with the same kind of quote. */
      if (state != inValue) {
        if (nAtts < attsMax)
          atts[nAtts].valuePtr = ptr + MINBPC(enc);
        state = inValue;
        open = BT_QUOT;
      }
      else if (open == BT_QUOT) {
        state = other;
        if (nAtts < attsMax)
          atts[nAtts].valueEnd = ptr;
        nAtts++;
      }
      break;
    case BT_APOS:
      /* Same handling as BT_QUOT, for apostrophe-delimited values. */
      if (state != inValue) {
        if (nAtts < attsMax)
          atts[nAtts].valuePtr = ptr + MINBPC(enc);
        state = inValue;
        open = BT_APOS;
      }
      else if (open == BT_APOS) {
        state = other;
        if (nAtts < attsMax)
          atts[nAtts].valueEnd = ptr;
        nAtts++;
      }
      break;
    case BT_AMP:
      /* A reference means the value is not already normalized. */
      if (nAtts < attsMax)
        atts[nAtts].normalized = 0;
      break;
    case BT_S:
      if (state == inName)
        state = other;
      /* Inside a value, whitespace clears `normalized` when it is the
         first character of the value, is not an ASCII space, or is
         followed by another space or by the closing quote. */
      else if (state == inValue
               && nAtts < attsMax
               && atts[nAtts].normalized
               && (ptr == atts[nAtts].valuePtr
                   || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
                   || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
                   || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
        atts[nAtts].normalized = 0;
      break;
    case BT_CR: case BT_LF:
      /* This case ensures that the first attribute name is counted
         Apart from that we could just change state on the quote. */
      if (state == inName)
        state = other;
      else if (state == inValue && nAtts < attsMax)
        atts[nAtts].normalized = 0;
      break;
    case BT_GT:
    case BT_SOL:
      /* End of the tag, unless the character is inside a value. */
      if (state != inValue)
        return nAtts;
      break;
    default:
      break;
    }
  }
  /* not reached */
}
   1519 
   1520 static int PTRFASTCALL
   1521 PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
   1522 {
   1523   int result = 0;
   1524   /* skip &# */
   1525   ptr += 2*MINBPC(enc);
   1526   if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
   1527     for (ptr += MINBPC(enc);
   1528          !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
   1529          ptr += MINBPC(enc)) {
   1530       int c = BYTE_TO_ASCII(enc, ptr);
   1531       switch (c) {
   1532       case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
   1533       case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
   1534         result <<= 4;
   1535         result |= (c - ASCII_0);
   1536         break;
   1537       case ASCII_A: case ASCII_B: case ASCII_C:
   1538       case ASCII_D: case ASCII_E: case ASCII_F:
   1539         result <<= 4;
   1540         result += 10 + (c - ASCII_A);
   1541         break;
   1542       case ASCII_a: case ASCII_b: case ASCII_c:
   1543       case ASCII_d: case ASCII_e: case ASCII_f:
   1544         result <<= 4;
   1545         result += 10 + (c - ASCII_a);
   1546         break;
   1547       }
   1548       if (result >= 0x110000)
   1549         return -1;
   1550     }
   1551   }
   1552   else {
   1553     for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
   1554       int c = BYTE_TO_ASCII(enc, ptr);
   1555       result *= 10;
   1556       result += (c - ASCII_0);
   1557       if (result >= 0x110000)
   1558         return -1;
   1559     }
   1560   }
   1561   return checkCharRefNumber(result);
   1562 }
   1563 
   1564 static int PTRCALL
   1565 PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
   1566                              const char *end)
   1567 {
   1568   switch ((end - ptr)/MINBPC(enc)) {
   1569   case 2:
   1570     if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
   1571       switch (BYTE_TO_ASCII(enc, ptr)) {
   1572       case ASCII_l:
   1573         return ASCII_LT;
   1574       case ASCII_g:
   1575         return ASCII_GT;
   1576       }
   1577     }
   1578     break;
   1579   case 3:
   1580     if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
   1581       ptr += MINBPC(enc);
   1582       if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
   1583         ptr += MINBPC(enc);
   1584         if (CHAR_MATCHES(enc, ptr, ASCII_p))
   1585           return ASCII_AMP;
   1586       }
   1587     }
   1588     break;
   1589   case 4:
   1590     switch (BYTE_TO_ASCII(enc, ptr)) {
   1591     case ASCII_q:
   1592       ptr += MINBPC(enc);
   1593       if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
   1594         ptr += MINBPC(enc);
   1595         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
   1596           ptr += MINBPC(enc);
   1597           if (CHAR_MATCHES(enc, ptr, ASCII_t))
   1598             return ASCII_QUOT;
   1599         }
   1600       }
   1601       break;
   1602     case ASCII_a:
   1603       ptr += MINBPC(enc);
   1604       if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
   1605         ptr += MINBPC(enc);
   1606         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
   1607           ptr += MINBPC(enc);
   1608           if (CHAR_MATCHES(enc, ptr, ASCII_s))
   1609             return ASCII_APOS;
   1610         }
   1611       }
   1612       break;
   1613     }
   1614   }
   1615   return 0;
   1616 }
   1617 
   1618 static int PTRCALL
   1619 PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
   1620 {
   1621   for (;;) {
   1622     switch (BYTE_TYPE(enc, ptr1)) {
   1623 #define LEAD_CASE(n) \
   1624     case BT_LEAD ## n: \
   1625       if (*ptr1++ != *ptr2++) \
   1626         return 0;
   1627     LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
   1628 #undef LEAD_CASE
   1629       /* fall through */
   1630       if (*ptr1++ != *ptr2++)
   1631         return 0;
   1632       break;
   1633     case BT_NONASCII:
   1634     case BT_NMSTRT:
   1635 #ifdef XML_NS
   1636     case BT_COLON:
   1637 #endif
   1638     case BT_HEX:
   1639     case BT_DIGIT:
   1640     case BT_NAME:
   1641     case BT_MINUS:
   1642       if (*ptr2++ != *ptr1++)
   1643         return 0;
   1644       if (MINBPC(enc) > 1) {
   1645         if (*ptr2++ != *ptr1++)
   1646           return 0;
   1647         if (MINBPC(enc) > 2) {
   1648           if (*ptr2++ != *ptr1++)
   1649             return 0;
   1650           if (MINBPC(enc) > 3) {
   1651             if (*ptr2++ != *ptr1++)
   1652               return 0;
   1653           }
   1654         }
   1655       }
   1656       break;
   1657     default:
   1658       if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
   1659         return 1;
   1660       switch (BYTE_TYPE(enc, ptr2)) {
   1661       case BT_LEAD2:
   1662       case BT_LEAD3:
   1663       case BT_LEAD4:
   1664       case BT_NONASCII:
   1665       case BT_NMSTRT:
   1666 #ifdef XML_NS
   1667       case BT_COLON:
   1668 #endif
   1669       case BT_HEX:
   1670       case BT_DIGIT:
   1671       case BT_NAME:
   1672       case BT_MINUS:
   1673         return 0;
   1674       default:
   1675         return 1;
   1676       }
   1677     }
   1678   }
   1679   /* not reached */
   1680 }
   1681 
   1682 static int PTRCALL
   1683 PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
   1684                          const char *end1, const char *ptr2)
   1685 {
   1686   for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
   1687     if (ptr1 == end1)
   1688       return 0;
   1689     if (!CHAR_MATCHES(enc, ptr1, *ptr2))
   1690       return 0;
   1691   }
   1692   return ptr1 == end1;
   1693 }
   1694 
   1695 static int PTRFASTCALL
   1696 PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
   1697 {
   1698   const char *start = ptr;
   1699   for (;;) {
   1700     switch (BYTE_TYPE(enc, ptr)) {
   1701 #define LEAD_CASE(n) \
   1702     case BT_LEAD ## n: ptr += n; break;
   1703     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
   1704 #undef LEAD_CASE
   1705     case BT_NONASCII:
   1706     case BT_NMSTRT:
   1707 #ifdef XML_NS
   1708     case BT_COLON:
   1709 #endif
   1710     case BT_HEX:
   1711     case BT_DIGIT:
   1712     case BT_NAME:
   1713     case BT_MINUS:
   1714       ptr += MINBPC(enc);
   1715       break;
   1716     default:
   1717       return (int)(ptr - start);
   1718     }
   1719   }
   1720 }
   1721 
   1722 static const char * PTRFASTCALL
   1723 PREFIX(skipS)(const ENCODING *enc, const char *ptr)
   1724 {
   1725   for (;;) {
   1726     switch (BYTE_TYPE(enc, ptr)) {
   1727     case BT_LF:
   1728     case BT_CR:
   1729     case BT_S:
   1730       ptr += MINBPC(enc);
   1731       break;
   1732     default:
   1733       return ptr;
   1734     }
   1735   }
   1736 }
   1737 
   1738 static void PTRCALL
   1739 PREFIX(updatePosition)(const ENCODING *enc,
   1740                        const char *ptr,
   1741                        const char *end,
   1742                        POSITION *pos)
   1743 {
   1744   while (ptr < end) {
   1745     switch (BYTE_TYPE(enc, ptr)) {
   1746 #define LEAD_CASE(n) \
   1747     case BT_LEAD ## n: \
   1748       ptr += n; \
   1749       break;
   1750     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
   1751 #undef LEAD_CASE
   1752     case BT_LF:
   1753       pos->columnNumber = (XML_Size)-1;
   1754       pos->lineNumber++;
   1755       ptr += MINBPC(enc);
   1756       break;
   1757     case BT_CR:
   1758       pos->lineNumber++;
   1759       ptr += MINBPC(enc);
   1760       if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
   1761         ptr += MINBPC(enc);
   1762       pos->columnNumber = (XML_Size)-1;
   1763       break;
   1764     default:
   1765       ptr += MINBPC(enc);
   1766       break;
   1767     }
   1768     pos->columnNumber++;
   1769   }
   1770 }
   1771 
/* Drop the file-local helper macros.  Presumably this lets the file be
   included more than once with different PREFIX definitions -- confirm
   against the including file.
   NOTE(review): INVALID_LEAD_CASE is defined at the top of this file but
   is not undefined here. */
#undef DO_LEAD_CASE
#undef MULTIBYTE_CASES
#undef INVALID_CASES
#undef CHECK_NAME_CASE
#undef CHECK_NAME_CASES
#undef CHECK_NMSTRT_CASE
#undef CHECK_NMSTRT_CASES
   1779 
   1780