Home | History | Annotate | Download | only in lib
      1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
      2    See the file COPYING for copying permission.
      3 */
      4 
      5 /* This file is included! */
      6 #ifdef XML_TOK_IMPL_C
      7 
      8 #ifndef IS_INVALID_CHAR
      9 #define IS_INVALID_CHAR(enc, ptr, n) (0)
     10 #endif
     11 
     12 #define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
     13     case BT_LEAD ## n: \
     14       if (end - ptr < n) \
     15         return XML_TOK_PARTIAL_CHAR; \
     16       if (IS_INVALID_CHAR(enc, ptr, n)) { \
     17         *(nextTokPtr) = (ptr); \
     18         return XML_TOK_INVALID; \
     19       } \
     20       ptr += n; \
     21       break;
     22 
     23 #define INVALID_CASES(ptr, nextTokPtr) \
     24   INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
     25   INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
     26   INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
     27   case BT_NONXML: \
     28   case BT_MALFORM: \
     29   case BT_TRAIL: \
     30     *(nextTokPtr) = (ptr); \
     31     return XML_TOK_INVALID;
     32 
     33 #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
     34    case BT_LEAD ## n: \
     35      if (end - ptr < n) \
     36        return XML_TOK_PARTIAL_CHAR; \
     37      if (!IS_NAME_CHAR(enc, ptr, n)) { \
     38        *nextTokPtr = ptr; \
     39        return XML_TOK_INVALID; \
     40      } \
     41      ptr += n; \
     42      break;
     43 
     44 #define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
     45   case BT_NONASCII: \
     46     if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
     47       *nextTokPtr = ptr; \
     48       return XML_TOK_INVALID; \
     49     } \
     50   case BT_NMSTRT: \
     51   case BT_HEX: \
     52   case BT_DIGIT: \
     53   case BT_NAME: \
     54   case BT_MINUS: \
     55     ptr += MINBPC(enc); \
     56     break; \
     57   CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
     58   CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
     59   CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
     60 
     61 #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
     62    case BT_LEAD ## n: \
     63      if (end - ptr < n) \
     64        return XML_TOK_PARTIAL_CHAR; \
     65      if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
     66        *nextTokPtr = ptr; \
     67        return XML_TOK_INVALID; \
     68      } \
     69      ptr += n; \
     70      break;
     71 
     72 #define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
     73   case BT_NONASCII: \
     74     if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
     75       *nextTokPtr = ptr; \
     76       return XML_TOK_INVALID; \
     77     } \
     78   case BT_NMSTRT: \
     79   case BT_HEX: \
     80     ptr += MINBPC(enc); \
     81     break; \
     82   CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
     83   CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
     84   CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
     85 
     86 #ifndef PREFIX
     87 #define PREFIX(ident) ident
     88 #endif
     89 
     90 /* ptr points to character following "<!-" */
     91 
     92 static int PTRCALL
     93 PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
     94                     const char *end, const char **nextTokPtr)
     95 {
     96   if (ptr != end) {
     97     if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
     98       *nextTokPtr = ptr;
     99       return XML_TOK_INVALID;
    100     }
    101     ptr += MINBPC(enc);
    102     while (ptr != end) {
    103       switch (BYTE_TYPE(enc, ptr)) {
    104       INVALID_CASES(ptr, nextTokPtr)
    105       case BT_MINUS:
    106         if ((ptr += MINBPC(enc)) == end)
    107           return XML_TOK_PARTIAL;
    108         if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
    109           if ((ptr += MINBPC(enc)) == end)
    110             return XML_TOK_PARTIAL;
    111           if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
    112             *nextTokPtr = ptr;
    113             return XML_TOK_INVALID;
    114           }
    115           *nextTokPtr = ptr + MINBPC(enc);
    116           return XML_TOK_COMMENT;
    117         }
    118         break;
    119       default:
    120         ptr += MINBPC(enc);
    121         break;
    122       }
    123     }
    124   }
    125   return XML_TOK_PARTIAL;
    126 }
    127 
    128 /* ptr points to character following "<!" */
    129 
    130 static int PTRCALL
    131 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
    132                  const char *end, const char **nextTokPtr)
    133 {
    134   if (ptr == end)
    135     return XML_TOK_PARTIAL;
    136   switch (BYTE_TYPE(enc, ptr)) {
    137   case BT_MINUS:
    138     return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    139   case BT_LSQB:
    140     *nextTokPtr = ptr + MINBPC(enc);
    141     return XML_TOK_COND_SECT_OPEN;
    142   case BT_NMSTRT:
    143   case BT_HEX:
    144     ptr += MINBPC(enc);
    145     break;
    146   default:
    147     *nextTokPtr = ptr;
    148     return XML_TOK_INVALID;
    149   }
    150   while (ptr != end) {
    151     switch (BYTE_TYPE(enc, ptr)) {
    152     case BT_PERCNT:
    153       if (ptr + MINBPC(enc) == end)
    154         return XML_TOK_PARTIAL;
    155       /* don't allow <!ENTITY% foo "whatever"> */
    156       switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
    157       case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
    158         *nextTokPtr = ptr;
    159         return XML_TOK_INVALID;
    160       }
    161       /* fall through */
    162     case BT_S: case BT_CR: case BT_LF:
    163       *nextTokPtr = ptr;
    164       return XML_TOK_DECL_OPEN;
    165     case BT_NMSTRT:
    166     case BT_HEX:
    167       ptr += MINBPC(enc);
    168       break;
    169     default:
    170       *nextTokPtr = ptr;
    171       return XML_TOK_INVALID;
    172     }
    173   }
    174   return XML_TOK_PARTIAL;
    175 }
    176 
    177 static int PTRCALL
    178 PREFIX(checkPiTarget)(const ENCODING *enc, const char *ptr,
    179                       const char *end, int *tokPtr)
    180 {
    181   int upper = 0;
    182   *tokPtr = XML_TOK_PI;
    183   if (end - ptr != MINBPC(enc)*3)
    184     return 1;
    185   switch (BYTE_TO_ASCII(enc, ptr)) {
    186   case ASCII_x:
    187     break;
    188   case ASCII_X:
    189     upper = 1;
    190     break;
    191   default:
    192     return 1;
    193   }
    194   ptr += MINBPC(enc);
    195   switch (BYTE_TO_ASCII(enc, ptr)) {
    196   case ASCII_m:
    197     break;
    198   case ASCII_M:
    199     upper = 1;
    200     break;
    201   default:
    202     return 1;
    203   }
    204   ptr += MINBPC(enc);
    205   switch (BYTE_TO_ASCII(enc, ptr)) {
    206   case ASCII_l:
    207     break;
    208   case ASCII_L:
    209     upper = 1;
    210     break;
    211   default:
    212     return 1;
    213   }
    214   if (upper)
    215     return 0;
    216   *tokPtr = XML_TOK_XML_DECL;
    217   return 1;
    218 }
    219 
    220 /* ptr points to character following "<?" */
    221 
    222 static int PTRCALL
    223 PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
    224                const char *end, const char **nextTokPtr)
    225 {
    226   int tok;
    227   const char *target = ptr;
    228   if (ptr == end)
    229     return XML_TOK_PARTIAL;
    230   switch (BYTE_TYPE(enc, ptr)) {
    231   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
    232   default:
    233     *nextTokPtr = ptr;
    234     return XML_TOK_INVALID;
    235   }
    236   while (ptr != end) {
    237     switch (BYTE_TYPE(enc, ptr)) {
    238     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    239     case BT_S: case BT_CR: case BT_LF:
    240       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
    241         *nextTokPtr = ptr;
    242         return XML_TOK_INVALID;
    243       }
    244       ptr += MINBPC(enc);
    245       while (ptr != end) {
    246         switch (BYTE_TYPE(enc, ptr)) {
    247         INVALID_CASES(ptr, nextTokPtr)
    248         case BT_QUEST:
    249           ptr += MINBPC(enc);
    250           if (ptr == end)
    251             return XML_TOK_PARTIAL;
    252           if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
    253             *nextTokPtr = ptr + MINBPC(enc);
    254             return tok;
    255           }
    256           break;
    257         default:
    258           ptr += MINBPC(enc);
    259           break;
    260         }
    261       }
    262       return XML_TOK_PARTIAL;
    263     case BT_QUEST:
    264       if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
    265         *nextTokPtr = ptr;
    266         return XML_TOK_INVALID;
    267       }
    268       ptr += MINBPC(enc);
    269       if (ptr == end)
    270         return XML_TOK_PARTIAL;
    271       if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
    272         *nextTokPtr = ptr + MINBPC(enc);
    273         return tok;
    274       }
    275       /* fall through */
    276     default:
    277       *nextTokPtr = ptr;
    278       return XML_TOK_INVALID;
    279     }
    280   }
    281   return XML_TOK_PARTIAL;
    282 }
    283 
    284 static int PTRCALL
    285 PREFIX(scanCdataSection)(const ENCODING *enc, const char *ptr,
    286                          const char *end, const char **nextTokPtr)
    287 {
    288   static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
    289                                      ASCII_T, ASCII_A, ASCII_LSQB };
    290   int i;
    291   /* CDATA[ */
    292   if (end - ptr < 6 * MINBPC(enc))
    293     return XML_TOK_PARTIAL;
    294   for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
    295     if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
    296       *nextTokPtr = ptr;
    297       return XML_TOK_INVALID;
    298     }
    299   }
    300   *nextTokPtr = ptr;
    301   return XML_TOK_CDATA_SECT_OPEN;
    302 }
    303 
    304 static int PTRCALL
    305 PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
    306                         const char *end, const char **nextTokPtr)
    307 {
    308   if (ptr == end)
    309     return XML_TOK_NONE;
    310   if (MINBPC(enc) > 1) {
    311     size_t n = end - ptr;
    312     if (n & (MINBPC(enc) - 1)) {
    313       n &= ~(MINBPC(enc) - 1);
    314       if (n == 0)
    315         return XML_TOK_PARTIAL;
    316       end = ptr + n;
    317     }
    318   }
    319   switch (BYTE_TYPE(enc, ptr)) {
    320   case BT_RSQB:
    321     ptr += MINBPC(enc);
    322     if (ptr == end)
    323       return XML_TOK_PARTIAL;
    324     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
    325       break;
    326     ptr += MINBPC(enc);
    327     if (ptr == end)
    328       return XML_TOK_PARTIAL;
    329     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
    330       ptr -= MINBPC(enc);
    331       break;
    332     }
    333     *nextTokPtr = ptr + MINBPC(enc);
    334     return XML_TOK_CDATA_SECT_CLOSE;
    335   case BT_CR:
    336     ptr += MINBPC(enc);
    337     if (ptr == end)
    338       return XML_TOK_PARTIAL;
    339     if (BYTE_TYPE(enc, ptr) == BT_LF)
    340       ptr += MINBPC(enc);
    341     *nextTokPtr = ptr;
    342     return XML_TOK_DATA_NEWLINE;
    343   case BT_LF:
    344     *nextTokPtr = ptr + MINBPC(enc);
    345     return XML_TOK_DATA_NEWLINE;
    346   INVALID_CASES(ptr, nextTokPtr)
    347   default:
    348     ptr += MINBPC(enc);
    349     break;
    350   }
    351   while (ptr != end) {
    352     switch (BYTE_TYPE(enc, ptr)) {
    353 #define LEAD_CASE(n) \
    354     case BT_LEAD ## n: \
    355       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
    356         *nextTokPtr = ptr; \
    357         return XML_TOK_DATA_CHARS; \
    358       } \
    359       ptr += n; \
    360       break;
    361     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
    362 #undef LEAD_CASE
    363     case BT_NONXML:
    364     case BT_MALFORM:
    365     case BT_TRAIL:
    366     case BT_CR:
    367     case BT_LF:
    368     case BT_RSQB:
    369       *nextTokPtr = ptr;
    370       return XML_TOK_DATA_CHARS;
    371     default:
    372       ptr += MINBPC(enc);
    373       break;
    374     }
    375   }
    376   *nextTokPtr = ptr;
    377   return XML_TOK_DATA_CHARS;
    378 }
    379 
    380 /* ptr points to character following "</" */
    381 
    382 static int PTRCALL
    383 PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
    384                    const char *end, const char **nextTokPtr)
    385 {
    386   if (ptr == end)
    387     return XML_TOK_PARTIAL;
    388   switch (BYTE_TYPE(enc, ptr)) {
    389   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
    390   default:
    391     *nextTokPtr = ptr;
    392     return XML_TOK_INVALID;
    393   }
    394   while (ptr != end) {
    395     switch (BYTE_TYPE(enc, ptr)) {
    396     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    397     case BT_S: case BT_CR: case BT_LF:
    398       for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
    399         switch (BYTE_TYPE(enc, ptr)) {
    400         case BT_S: case BT_CR: case BT_LF:
    401           break;
    402         case BT_GT:
    403           *nextTokPtr = ptr + MINBPC(enc);
    404           return XML_TOK_END_TAG;
    405         default:
    406           *nextTokPtr = ptr;
    407           return XML_TOK_INVALID;
    408         }
    409       }
    410       return XML_TOK_PARTIAL;
    411 #ifdef XML_NS
    412     case BT_COLON:
    413       /* no need to check qname syntax here,
    414          since end-tag must match exactly */
    415       ptr += MINBPC(enc);
    416       break;
    417 #endif
    418     case BT_GT:
    419       *nextTokPtr = ptr + MINBPC(enc);
    420       return XML_TOK_END_TAG;
    421     default:
    422       *nextTokPtr = ptr;
    423       return XML_TOK_INVALID;
    424     }
    425   }
    426   return XML_TOK_PARTIAL;
    427 }
    428 
    429 /* ptr points to character following "&#X" */
    430 
    431 static int PTRCALL
    432 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
    433                        const char *end, const char **nextTokPtr)
    434 {
    435   if (ptr != end) {
    436     switch (BYTE_TYPE(enc, ptr)) {
    437     case BT_DIGIT:
    438     case BT_HEX:
    439       break;
    440     default:
    441       *nextTokPtr = ptr;
    442       return XML_TOK_INVALID;
    443     }
    444     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
    445       switch (BYTE_TYPE(enc, ptr)) {
    446       case BT_DIGIT:
    447       case BT_HEX:
    448         break;
    449       case BT_SEMI:
    450         *nextTokPtr = ptr + MINBPC(enc);
    451         return XML_TOK_CHAR_REF;
    452       default:
    453         *nextTokPtr = ptr;
    454         return XML_TOK_INVALID;
    455       }
    456     }
    457   }
    458   return XML_TOK_PARTIAL;
    459 }
    460 
    461 /* ptr points to character following "&#" */
    462 
    463 static int PTRCALL
    464 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
    465                     const char *end, const char **nextTokPtr)
    466 {
    467   if (ptr != end) {
    468     if (CHAR_MATCHES(enc, ptr, ASCII_x))
    469       return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    470     switch (BYTE_TYPE(enc, ptr)) {
    471     case BT_DIGIT:
    472       break;
    473     default:
    474       *nextTokPtr = ptr;
    475       return XML_TOK_INVALID;
    476     }
    477     for (ptr += MINBPC(enc); ptr != end; ptr += MINBPC(enc)) {
    478       switch (BYTE_TYPE(enc, ptr)) {
    479       case BT_DIGIT:
    480         break;
    481       case BT_SEMI:
    482         *nextTokPtr = ptr + MINBPC(enc);
    483         return XML_TOK_CHAR_REF;
    484       default:
    485         *nextTokPtr = ptr;
    486         return XML_TOK_INVALID;
    487       }
    488     }
    489   }
    490   return XML_TOK_PARTIAL;
    491 }
    492 
    493 /* ptr points to character following "&" */
    494 
    495 static int PTRCALL
    496 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
    497                 const char **nextTokPtr)
    498 {
    499   if (ptr == end)
    500     return XML_TOK_PARTIAL;
    501   switch (BYTE_TYPE(enc, ptr)) {
    502   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
    503   case BT_NUM:
    504     return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    505   default:
    506     *nextTokPtr = ptr;
    507     return XML_TOK_INVALID;
    508   }
    509   while (ptr != end) {
    510     switch (BYTE_TYPE(enc, ptr)) {
    511     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    512     case BT_SEMI:
    513       *nextTokPtr = ptr + MINBPC(enc);
    514       return XML_TOK_ENTITY_REF;
    515     default:
    516       *nextTokPtr = ptr;
    517       return XML_TOK_INVALID;
    518     }
    519   }
    520   return XML_TOK_PARTIAL;
    521 }
    522 
    523 /* ptr points to character following first character of attribute name */
    524 
    525 static int PTRCALL
    526 PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
    527                  const char **nextTokPtr)
    528 {
    529 #ifdef XML_NS
    530   int hadColon = 0;
    531 #endif
    532   while (ptr != end) {
    533     switch (BYTE_TYPE(enc, ptr)) {
    534     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    535 #ifdef XML_NS
    536     case BT_COLON:
    537       if (hadColon) {
    538         *nextTokPtr = ptr;
    539         return XML_TOK_INVALID;
    540       }
    541       hadColon = 1;
    542       ptr += MINBPC(enc);
    543       if (ptr == end)
    544         return XML_TOK_PARTIAL;
    545       switch (BYTE_TYPE(enc, ptr)) {
    546       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
    547       default:
    548         *nextTokPtr = ptr;
    549         return XML_TOK_INVALID;
    550       }
    551       break;
    552 #endif
    553     case BT_S: case BT_CR: case BT_LF:
    554       for (;;) {
    555         int t;
    556 
    557         ptr += MINBPC(enc);
    558         if (ptr == end)
    559           return XML_TOK_PARTIAL;
    560         t = BYTE_TYPE(enc, ptr);
    561         if (t == BT_EQUALS)
    562           break;
    563         switch (t) {
    564         case BT_S:
    565         case BT_LF:
    566         case BT_CR:
    567           break;
    568         default:
    569           *nextTokPtr = ptr;
    570           return XML_TOK_INVALID;
    571         }
    572       }
    573     /* fall through */
    574     case BT_EQUALS:
    575       {
    576         int open;
    577 #ifdef XML_NS
    578         hadColon = 0;
    579 #endif
    580         for (;;) {
    581           ptr += MINBPC(enc);
    582           if (ptr == end)
    583             return XML_TOK_PARTIAL;
    584           open = BYTE_TYPE(enc, ptr);
    585           if (open == BT_QUOT || open == BT_APOS)
    586             break;
    587           switch (open) {
    588           case BT_S:
    589           case BT_LF:
    590           case BT_CR:
    591             break;
    592           default:
    593             *nextTokPtr = ptr;
    594             return XML_TOK_INVALID;
    595           }
    596         }
    597         ptr += MINBPC(enc);
    598         /* in attribute value */
    599         for (;;) {
    600           int t;
    601           if (ptr == end)
    602             return XML_TOK_PARTIAL;
    603           t = BYTE_TYPE(enc, ptr);
    604           if (t == open)
    605             break;
    606           switch (t) {
    607           INVALID_CASES(ptr, nextTokPtr)
    608           case BT_AMP:
    609             {
    610               int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
    611               if (tok <= 0) {
    612                 if (tok == XML_TOK_INVALID)
    613                   *nextTokPtr = ptr;
    614                 return tok;
    615               }
    616               break;
    617             }
    618           case BT_LT:
    619             *nextTokPtr = ptr;
    620             return XML_TOK_INVALID;
    621           default:
    622             ptr += MINBPC(enc);
    623             break;
    624           }
    625         }
    626         ptr += MINBPC(enc);
    627         if (ptr == end)
    628           return XML_TOK_PARTIAL;
    629         switch (BYTE_TYPE(enc, ptr)) {
    630         case BT_S:
    631         case BT_CR:
    632         case BT_LF:
    633           break;
    634         case BT_SOL:
    635           goto sol;
    636         case BT_GT:
    637           goto gt;
    638         default:
    639           *nextTokPtr = ptr;
    640           return XML_TOK_INVALID;
    641         }
    642         /* ptr points to closing quote */
    643         for (;;) {
    644           ptr += MINBPC(enc);
    645           if (ptr == end)
    646             return XML_TOK_PARTIAL;
    647           switch (BYTE_TYPE(enc, ptr)) {
    648           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
    649           case BT_S: case BT_CR: case BT_LF:
    650             continue;
    651           case BT_GT:
    652           gt:
    653             *nextTokPtr = ptr + MINBPC(enc);
    654             return XML_TOK_START_TAG_WITH_ATTS;
    655           case BT_SOL:
    656           sol:
    657             ptr += MINBPC(enc);
    658             if (ptr == end)
    659               return XML_TOK_PARTIAL;
    660             if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
    661               *nextTokPtr = ptr;
    662               return XML_TOK_INVALID;
    663             }
    664             *nextTokPtr = ptr + MINBPC(enc);
    665             return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
    666           default:
    667             *nextTokPtr = ptr;
    668             return XML_TOK_INVALID;
    669           }
    670           break;
    671         }
    672         break;
    673       }
    674     default:
    675       *nextTokPtr = ptr;
    676       return XML_TOK_INVALID;
    677     }
    678   }
    679   return XML_TOK_PARTIAL;
    680 }
    681 
    682 /* ptr points to character following "<" */
    683 
    684 static int PTRCALL
    685 PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
    686                const char **nextTokPtr)
    687 {
    688 #ifdef XML_NS
    689   int hadColon;
    690 #endif
    691   if (ptr == end)
    692     return XML_TOK_PARTIAL;
    693   switch (BYTE_TYPE(enc, ptr)) {
    694   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
    695   case BT_EXCL:
    696     if ((ptr += MINBPC(enc)) == end)
    697       return XML_TOK_PARTIAL;
    698     switch (BYTE_TYPE(enc, ptr)) {
    699     case BT_MINUS:
    700       return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    701     case BT_LSQB:
    702       return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
    703                                       end, nextTokPtr);
    704     }
    705     *nextTokPtr = ptr;
    706     return XML_TOK_INVALID;
    707   case BT_QUEST:
    708     return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    709   case BT_SOL:
    710     return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    711   default:
    712     *nextTokPtr = ptr;
    713     return XML_TOK_INVALID;
    714   }
    715 #ifdef XML_NS
    716   hadColon = 0;
    717 #endif
    718   /* we have a start-tag */
    719   while (ptr != end) {
    720     switch (BYTE_TYPE(enc, ptr)) {
    721     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    722 #ifdef XML_NS
    723     case BT_COLON:
    724       if (hadColon) {
    725         *nextTokPtr = ptr;
    726         return XML_TOK_INVALID;
    727       }
    728       hadColon = 1;
    729       ptr += MINBPC(enc);
    730       if (ptr == end)
    731         return XML_TOK_PARTIAL;
    732       switch (BYTE_TYPE(enc, ptr)) {
    733       CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
    734       default:
    735         *nextTokPtr = ptr;
    736         return XML_TOK_INVALID;
    737       }
    738       break;
    739 #endif
    740     case BT_S: case BT_CR: case BT_LF:
    741       {
    742         ptr += MINBPC(enc);
    743         while (ptr != end) {
    744           switch (BYTE_TYPE(enc, ptr)) {
    745           CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
    746           case BT_GT:
    747             goto gt;
    748           case BT_SOL:
    749             goto sol;
    750           case BT_S: case BT_CR: case BT_LF:
    751             ptr += MINBPC(enc);
    752             continue;
    753           default:
    754             *nextTokPtr = ptr;
    755             return XML_TOK_INVALID;
    756           }
    757           return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
    758         }
    759         return XML_TOK_PARTIAL;
    760       }
    761     case BT_GT:
    762     gt:
    763       *nextTokPtr = ptr + MINBPC(enc);
    764       return XML_TOK_START_TAG_NO_ATTS;
    765     case BT_SOL:
    766     sol:
    767       ptr += MINBPC(enc);
    768       if (ptr == end)
    769         return XML_TOK_PARTIAL;
    770       if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
    771         *nextTokPtr = ptr;
    772         return XML_TOK_INVALID;
    773       }
    774       *nextTokPtr = ptr + MINBPC(enc);
    775       return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
    776     default:
    777       *nextTokPtr = ptr;
    778       return XML_TOK_INVALID;
    779     }
    780   }
    781   return XML_TOK_PARTIAL;
    782 }
    783 
    784 static int PTRCALL
    785 PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
    786                    const char **nextTokPtr)
    787 {
    788   if (ptr == end)
    789     return XML_TOK_NONE;
    790   if (MINBPC(enc) > 1) {
    791     size_t n = end - ptr;
    792     if (n & (MINBPC(enc) - 1)) {
    793       n &= ~(MINBPC(enc) - 1);
    794       if (n == 0)
    795         return XML_TOK_PARTIAL;
    796       end = ptr + n;
    797     }
    798   }
    799   switch (BYTE_TYPE(enc, ptr)) {
    800   case BT_LT:
    801     return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    802   case BT_AMP:
    803     return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    804   case BT_CR:
    805     ptr += MINBPC(enc);
    806     if (ptr == end)
    807       return XML_TOK_TRAILING_CR;
    808     if (BYTE_TYPE(enc, ptr) == BT_LF)
    809       ptr += MINBPC(enc);
    810     *nextTokPtr = ptr;
    811     return XML_TOK_DATA_NEWLINE;
    812   case BT_LF:
    813     *nextTokPtr = ptr + MINBPC(enc);
    814     return XML_TOK_DATA_NEWLINE;
    815   case BT_RSQB:
    816     ptr += MINBPC(enc);
    817     if (ptr == end)
    818       return XML_TOK_TRAILING_RSQB;
    819     if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
    820       break;
    821     ptr += MINBPC(enc);
    822     if (ptr == end)
    823       return XML_TOK_TRAILING_RSQB;
    824     if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
    825       ptr -= MINBPC(enc);
    826       break;
    827     }
    828     *nextTokPtr = ptr;
    829     return XML_TOK_INVALID;
    830   INVALID_CASES(ptr, nextTokPtr)
    831   default:
    832     ptr += MINBPC(enc);
    833     break;
    834   }
    835   while (ptr != end) {
    836     switch (BYTE_TYPE(enc, ptr)) {
    837 #define LEAD_CASE(n) \
    838     case BT_LEAD ## n: \
    839       if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
    840         *nextTokPtr = ptr; \
    841         return XML_TOK_DATA_CHARS; \
    842       } \
    843       ptr += n; \
    844       break;
    845     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
    846 #undef LEAD_CASE
    847     case BT_RSQB:
    848       if (ptr + MINBPC(enc) != end) {
    849          if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
    850            ptr += MINBPC(enc);
    851            break;
    852          }
    853          if (ptr + 2*MINBPC(enc) != end) {
    854            if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
    855              ptr += MINBPC(enc);
    856              break;
    857            }
    858            *nextTokPtr = ptr + 2*MINBPC(enc);
    859            return XML_TOK_INVALID;
    860          }
    861       }
    862       /* fall through */
    863     case BT_AMP:
    864     case BT_LT:
    865     case BT_NONXML:
    866     case BT_MALFORM:
    867     case BT_TRAIL:
    868     case BT_CR:
    869     case BT_LF:
    870       *nextTokPtr = ptr;
    871       return XML_TOK_DATA_CHARS;
    872     default:
    873       ptr += MINBPC(enc);
    874       break;
    875     }
    876   }
    877   *nextTokPtr = ptr;
    878   return XML_TOK_DATA_CHARS;
    879 }
    880 
    881 /* ptr points to character following "%" */
    882 
    883 static int PTRCALL
    884 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
    885                     const char **nextTokPtr)
    886 {
    887   if (ptr == end)
    888     return XML_TOK_PARTIAL;
    889   switch (BYTE_TYPE(enc, ptr)) {
    890   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
    891   case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
    892     *nextTokPtr = ptr;
    893     return XML_TOK_PERCENT;
    894   default:
    895     *nextTokPtr = ptr;
    896     return XML_TOK_INVALID;
    897   }
    898   while (ptr != end) {
    899     switch (BYTE_TYPE(enc, ptr)) {
    900     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    901     case BT_SEMI:
    902       *nextTokPtr = ptr + MINBPC(enc);
    903       return XML_TOK_PARAM_ENTITY_REF;
    904     default:
    905       *nextTokPtr = ptr;
    906       return XML_TOK_INVALID;
    907     }
    908   }
    909   return XML_TOK_PARTIAL;
    910 }
    911 
    912 static int PTRCALL
    913 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
    914                       const char **nextTokPtr)
    915 {
    916   if (ptr == end)
    917     return XML_TOK_PARTIAL;
    918   switch (BYTE_TYPE(enc, ptr)) {
    919   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
    920   default:
    921     *nextTokPtr = ptr;
    922     return XML_TOK_INVALID;
    923   }
    924   while (ptr != end) {
    925     switch (BYTE_TYPE(enc, ptr)) {
    926     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    927     case BT_CR: case BT_LF: case BT_S:
    928     case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
    929       *nextTokPtr = ptr;
    930       return XML_TOK_POUND_NAME;
    931     default:
    932       *nextTokPtr = ptr;
    933       return XML_TOK_INVALID;
    934     }
    935   }
    936   return -XML_TOK_POUND_NAME;
    937 }
    938 
    939 static int PTRCALL
    940 PREFIX(scanLit)(int open, const ENCODING *enc,
    941                 const char *ptr, const char *end,
    942                 const char **nextTokPtr)
    943 {
    944   while (ptr != end) {
    945     int t = BYTE_TYPE(enc, ptr);
    946     switch (t) {
    947     INVALID_CASES(ptr, nextTokPtr)
    948     case BT_QUOT:
    949     case BT_APOS:
    950       ptr += MINBPC(enc);
    951       if (t != open)
    952         break;
    953       if (ptr == end)
    954         return -XML_TOK_LITERAL;
    955       *nextTokPtr = ptr;
    956       switch (BYTE_TYPE(enc, ptr)) {
    957       case BT_S: case BT_CR: case BT_LF:
    958       case BT_GT: case BT_PERCNT: case BT_LSQB:
    959         return XML_TOK_LITERAL;
    960       default:
    961         return XML_TOK_INVALID;
    962       }
    963     default:
    964       ptr += MINBPC(enc);
    965       break;
    966     }
    967   }
    968   return XML_TOK_PARTIAL;
    969 }
    970 
/* Scans one token starting at ptr, looking no further than end, and
   returns its XML_TOK_* code.

   For the paths handled directly in this function:
   - XML_TOK_NONE is returned when ptr == end.
   - XML_TOK_PARTIAL is returned when more input is needed to decide.
   - XML_TOK_INVALID is returned with *nextTokPtr at the offending
     character.
   - A recognised token is returned with *nextTokPtr at the first
     character that is not part of it.
   - A negated token code is returned when the input ends right after
     something that could be a complete token (see the BT_CR case for one
     example); *nextTokPtr is not set on every such path.
   Quoted literals, "<!", "<?", BT_PERCNT and BT_NUM are delegated to the
   scanLit, scanDecl, scanPi, scanPercent and scanPoundName helpers, whose
   return value is passed through unchanged.
*/
static int PTRCALL
PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
                  const char **nextTokPtr)
{
  int tok;
  if (ptr == end)
    return XML_TOK_NONE;
  /* For encodings with more than one byte per character, ignore trailing
     bytes that do not make up a whole MINBPC unit; if nothing is left,
     more input is needed.  The mask arithmetic assumes MINBPC(enc) is a
     power of two. */
  if (MINBPC(enc) > 1) {
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_QUOT:
    return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_APOS:
    return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_LT:
    {
      /* The byte after BT_LT decides what kind of construct this is. */
      ptr += MINBPC(enc);
      if (ptr == end)
        return XML_TOK_PARTIAL;
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_EXCL:
        return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
      case BT_QUEST:
        return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
      case BT_NMSTRT:
      case BT_HEX:
      case BT_NONASCII:
      case BT_LEAD2:
      case BT_LEAD3:
      case BT_LEAD4:
        /* A possible name start follows: report XML_TOK_INSTANCE_START
           and point *nextTokPtr back at the BT_LT byte, so nothing is
           consumed. */
        *nextTokPtr = ptr - MINBPC(enc);
        return XML_TOK_INSTANCE_START;
      }
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  case BT_CR:
    if (ptr + MINBPC(enc) == end) {
      *nextTokPtr = end;
      /* indicate that this might be part of a CR/LF pair */
      return -XML_TOK_PROLOG_S;
    }
    /* fall through */
  case BT_S: case BT_LF:
    /* Consume a run of whitespace.  A CR that is the very last character
       of the input is left unconsumed so a CR/LF pair is never split. */
    for (;;) {
      ptr += MINBPC(enc);
      if (ptr == end)
        break;
      switch (BYTE_TYPE(enc, ptr)) {
      case BT_S: case BT_LF:
        break;
      case BT_CR:
        /* don't split CR/LF pair */
        if (ptr + MINBPC(enc) != end)
          break;
        /* fall through */
      default:
        *nextTokPtr = ptr;
        return XML_TOK_PROLOG_S;
      }
    }
    *nextTokPtr = ptr;
    return XML_TOK_PROLOG_S;
  case BT_PERCNT:
    return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_COMMA:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_COMMA;
  case BT_LSQB:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OPEN_BRACKET;
  case BT_RSQB:
    /* Either a lone close bracket, or the three-character sequence
       ASCII_RSQB ASCII_RSQB ASCII_GT, reported as
       XML_TOK_COND_SECT_CLOSE. */
    ptr += MINBPC(enc);
    if (ptr == end)
      return -XML_TOK_CLOSE_BRACKET;
    if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
      if (ptr + MINBPC(enc) == end)
        return XML_TOK_PARTIAL;
      if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
        *nextTokPtr = ptr + 2*MINBPC(enc);
        return XML_TOK_COND_SECT_CLOSE;
      }
    }
    *nextTokPtr = ptr;
    return XML_TOK_CLOSE_BRACKET;
  case BT_LPAR:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OPEN_PAREN;
  case BT_RPAR:
    /* A close parenthesis may be followed by BT_AST, BT_QUEST or BT_PLUS,
       which is consumed as part of the token.  Otherwise one of the
       listed delimiters must follow, and it is left unconsumed. */
    ptr += MINBPC(enc);
    if (ptr == end)
      return -XML_TOK_CLOSE_PAREN;
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_AST:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_ASTERISK;
    case BT_QUEST:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_QUESTION;
    case BT_PLUS:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_CLOSE_PAREN_PLUS;
    case BT_CR: case BT_LF: case BT_S:
    case BT_GT: case BT_COMMA: case BT_VERBAR:
    case BT_RPAR:
      *nextTokPtr = ptr;
      return XML_TOK_CLOSE_PAREN;
    }
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_VERBAR:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_OR;
  case BT_GT:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DECL_CLOSE;
  case BT_NUM:
    return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
/* Multi-byte lead bytes: all n bytes must be available.  The character
   then starts a name (XML_TOK_NAME), is only a name character
   (XML_TOK_NMTOKEN), or is invalid. */
#define LEAD_CASE(n) \
  case BT_LEAD ## n: \
    if (end - ptr < n) \
      return XML_TOK_PARTIAL_CHAR; \
    if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
      ptr += n; \
      tok = XML_TOK_NAME; \
      break; \
    } \
    if (IS_NAME_CHAR(enc, ptr, n)) { \
      ptr += n; \
      tok = XML_TOK_NMTOKEN; \
      break; \
    } \
    *nextTokPtr = ptr; \
    return XML_TOK_INVALID;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
  /* The remaining cases consume the first character of a name-like
     token, record its provisional kind in tok, and break out of the
     switch into the scanning loop below. */
  case BT_NMSTRT:
  case BT_HEX:
    tok = XML_TOK_NAME;
    ptr += MINBPC(enc);
    break;
  case BT_DIGIT:
  case BT_NAME:
  case BT_MINUS:
#ifdef XML_NS
  case BT_COLON:
#endif
    tok = XML_TOK_NMTOKEN;
    ptr += MINBPC(enc);
    break;
  case BT_NONASCII:
    if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC(enc);
      tok = XML_TOK_NAME;
      break;
    }
    if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
      ptr += MINBPC(enc);
      tok = XML_TOK_NMTOKEN;
      break;
    }
    /* fall through */
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  /* Scan the rest of the name or name token.  tok holds the kind
     determined so far and may still change while scanning. */
  while (ptr != end) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_GT: case BT_RPAR: case BT_COMMA:
    case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
    case BT_S: case BT_CR: case BT_LF:
      /* delimiter: the token ends here and the delimiter is not consumed */
      *nextTokPtr = ptr;
      return tok;
#ifdef XML_NS
    case BT_COLON:
      /* The first colon after a NAME turns it into a PREFIXED_NAME,
         provided a name character follows; if the next character is not
         a name character, or this is a second colon, the token becomes
         an NMTOKEN.  An NMTOKEN stays an NMTOKEN. */
      ptr += MINBPC(enc);
      switch (tok) {
      case XML_TOK_NAME:
        if (ptr == end)
          return XML_TOK_PARTIAL;
        tok = XML_TOK_PREFIXED_NAME;
        switch (BYTE_TYPE(enc, ptr)) {
        CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
        default:
          tok = XML_TOK_NMTOKEN;
          break;
        }
        break;
      case XML_TOK_PREFIXED_NAME:
        tok = XML_TOK_NMTOKEN;
        break;
      }
      break;
#endif
    /* A trailing BT_PLUS, BT_AST or BT_QUEST is consumed as part of the
       token, but is rejected after an NMTOKEN. */
    case BT_PLUS:
      if (tok == XML_TOK_NMTOKEN)  {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_PLUS;
    case BT_AST:
      if (tok == XML_TOK_NMTOKEN)  {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_ASTERISK;
    case BT_QUEST:
      if (tok == XML_TOK_NMTOKEN)  {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_NAME_QUESTION;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  /* Input ended while still inside the name: report the kind negated. */
  return -tok;
}
   1201 
   1202 static int PTRCALL
   1203 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
   1204                           const char *end, const char **nextTokPtr)
   1205 {
   1206   const char *start;
   1207   if (ptr == end)
   1208     return XML_TOK_NONE;
   1209   start = ptr;
   1210   while (ptr != end) {
   1211     switch (BYTE_TYPE(enc, ptr)) {
   1212 #define LEAD_CASE(n) \
   1213     case BT_LEAD ## n: ptr += n; break;
   1214     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
   1215 #undef LEAD_CASE
   1216     case BT_AMP:
   1217       if (ptr == start)
   1218         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
   1219       *nextTokPtr = ptr;
   1220       return XML_TOK_DATA_CHARS;
   1221     case BT_LT:
   1222       /* this is for inside entity references */
   1223       *nextTokPtr = ptr;
   1224       return XML_TOK_INVALID;
   1225     case BT_LF:
   1226       if (ptr == start) {
   1227         *nextTokPtr = ptr + MINBPC(enc);
   1228         return XML_TOK_DATA_NEWLINE;
   1229       }
   1230       *nextTokPtr = ptr;
   1231       return XML_TOK_DATA_CHARS;
   1232     case BT_CR:
   1233       if (ptr == start) {
   1234         ptr += MINBPC(enc);
   1235         if (ptr == end)
   1236           return XML_TOK_TRAILING_CR;
   1237         if (BYTE_TYPE(enc, ptr) == BT_LF)
   1238           ptr += MINBPC(enc);
   1239         *nextTokPtr = ptr;
   1240         return XML_TOK_DATA_NEWLINE;
   1241       }
   1242       *nextTokPtr = ptr;
   1243       return XML_TOK_DATA_CHARS;
   1244     case BT_S:
   1245       if (ptr == start) {
   1246         *nextTokPtr = ptr + MINBPC(enc);
   1247         return XML_TOK_ATTRIBUTE_VALUE_S;
   1248       }
   1249       *nextTokPtr = ptr;
   1250       return XML_TOK_DATA_CHARS;
   1251     default:
   1252       ptr += MINBPC(enc);
   1253       break;
   1254     }
   1255   }
   1256   *nextTokPtr = ptr;
   1257   return XML_TOK_DATA_CHARS;
   1258 }
   1259 
   1260 static int PTRCALL
   1261 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
   1262                        const char *end, const char **nextTokPtr)
   1263 {
   1264   const char *start;
   1265   if (ptr == end)
   1266     return XML_TOK_NONE;
   1267   start = ptr;
   1268   while (ptr != end) {
   1269     switch (BYTE_TYPE(enc, ptr)) {
   1270 #define LEAD_CASE(n) \
   1271     case BT_LEAD ## n: ptr += n; break;
   1272     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
   1273 #undef LEAD_CASE
   1274     case BT_AMP:
   1275       if (ptr == start)
   1276         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
   1277       *nextTokPtr = ptr;
   1278       return XML_TOK_DATA_CHARS;
   1279     case BT_PERCNT:
   1280       if (ptr == start) {
   1281         int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
   1282                                        end, nextTokPtr);
   1283         return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
   1284       }
   1285       *nextTokPtr = ptr;
   1286       return XML_TOK_DATA_CHARS;
   1287     case BT_LF:
   1288       if (ptr == start) {
   1289         *nextTokPtr = ptr + MINBPC(enc);
   1290         return XML_TOK_DATA_NEWLINE;
   1291       }
   1292       *nextTokPtr = ptr;
   1293       return XML_TOK_DATA_CHARS;
   1294     case BT_CR:
   1295       if (ptr == start) {
   1296         ptr += MINBPC(enc);
   1297         if (ptr == end)
   1298           return XML_TOK_TRAILING_CR;
   1299         if (BYTE_TYPE(enc, ptr) == BT_LF)
   1300           ptr += MINBPC(enc);
   1301         *nextTokPtr = ptr;
   1302         return XML_TOK_DATA_NEWLINE;
   1303       }
   1304       *nextTokPtr = ptr;
   1305       return XML_TOK_DATA_CHARS;
   1306     default:
   1307       ptr += MINBPC(enc);
   1308       break;
   1309     }
   1310   }
   1311   *nextTokPtr = ptr;
   1312   return XML_TOK_DATA_CHARS;
   1313 }
   1314 
   1315 #ifdef XML_DTD
   1316 
   1317 static int PTRCALL
   1318 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
   1319                          const char *end, const char **nextTokPtr)
   1320 {
   1321   int level = 0;
   1322   if (MINBPC(enc) > 1) {
   1323     size_t n = end - ptr;
   1324     if (n & (MINBPC(enc) - 1)) {
   1325       n &= ~(MINBPC(enc) - 1);
   1326       end = ptr + n;
   1327     }
   1328   }
   1329   while (ptr != end) {
   1330     switch (BYTE_TYPE(enc, ptr)) {
   1331     INVALID_CASES(ptr, nextTokPtr)
   1332     case BT_LT:
   1333       if ((ptr += MINBPC(enc)) == end)
   1334         return XML_TOK_PARTIAL;
   1335       if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
   1336         if ((ptr += MINBPC(enc)) == end)
   1337           return XML_TOK_PARTIAL;
   1338         if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
   1339           ++level;
   1340           ptr += MINBPC(enc);
   1341         }
   1342       }
   1343       break;
   1344     case BT_RSQB:
   1345       if ((ptr += MINBPC(enc)) == end)
   1346         return XML_TOK_PARTIAL;
   1347       if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
   1348         if ((ptr += MINBPC(enc)) == end)
   1349           return XML_TOK_PARTIAL;
   1350         if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
   1351           ptr += MINBPC(enc);
   1352           if (level == 0) {
   1353             *nextTokPtr = ptr;
   1354             return XML_TOK_IGNORE_SECT;
   1355           }
   1356           --level;
   1357         }
   1358       }
   1359       break;
   1360     default:
   1361       ptr += MINBPC(enc);
   1362       break;
   1363     }
   1364   }
   1365   return XML_TOK_PARTIAL;
   1366 }
   1367 
   1368 #endif /* XML_DTD */
   1369 
   1370 static int PTRCALL
   1371 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
   1372                    const char **badPtr)
   1373 {
   1374   ptr += MINBPC(enc);
   1375   end -= MINBPC(enc);
   1376   for (; ptr != end; ptr += MINBPC(enc)) {
   1377     switch (BYTE_TYPE(enc, ptr)) {
   1378     case BT_DIGIT:
   1379     case BT_HEX:
   1380     case BT_MINUS:
   1381     case BT_APOS:
   1382     case BT_LPAR:
   1383     case BT_RPAR:
   1384     case BT_PLUS:
   1385     case BT_COMMA:
   1386     case BT_SOL:
   1387     case BT_EQUALS:
   1388     case BT_QUEST:
   1389     case BT_CR:
   1390     case BT_LF:
   1391     case BT_SEMI:
   1392     case BT_EXCL:
   1393     case BT_AST:
   1394     case BT_PERCNT:
   1395     case BT_NUM:
   1396 #ifdef XML_NS
   1397     case BT_COLON:
   1398 #endif
   1399       break;
   1400     case BT_S:
   1401       if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
   1402         *badPtr = ptr;
   1403         return 0;
   1404       }
   1405       break;
   1406     case BT_NAME:
   1407     case BT_NMSTRT:
   1408       if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
   1409         break;
   1410     default:
   1411       switch (BYTE_TO_ASCII(enc, ptr)) {
   1412       case 0x24: /* $ */
   1413       case 0x40: /* @ */
   1414         break;
   1415       default:
   1416         *badPtr = ptr;
   1417         return 0;
   1418       }
   1419       break;
   1420     }
   1421   }
   1422   return 1;
   1423 }
   1424 
/* This must only be called for a well-formed start-tag or empty
   element tag.  Returns the number of attributes.  Pointers to the
   first attsMax attributes are stored in atts.

   The scan has no end pointer: it stops at the first BT_GT or BT_SOL
   byte found outside an attribute value, which is why the input must be
   well-formed.  The returned count includes attributes beyond attsMax,
   although nothing is stored for those.  For each stored attribute the
   fields name, valuePtr, valueEnd and normalized are filled in;
   normalized is left at 1 only if nothing in the value cleared it (see
   the BT_AMP, BT_S and BT_CR/BT_LF cases).
*/

static int PTRCALL
PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
                int attsMax, ATTRIBUTE *atts)
{
  /* Starts in inName so that the first name seen (presumably the
     element's own name -- confirm against callers) is not recorded as an
     attribute name. */
  enum { other, inName, inValue } state = inName;
  int nAtts = 0;
  int open = 0; /* defined when state == inValue;
                   initialization just to shut up compilers */

  for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
    switch (BYTE_TYPE(enc, ptr)) {
/* START_NAME: when a name-start character is seen outside a name or
   value, record it as the start of the next attribute name, if there is
   room, and mark that attribute as normalized until shown otherwise. */
#define START_NAME \
      if (state == other) { \
        if (nAtts < attsMax) { \
          atts[nAtts].name = ptr; \
          atts[nAtts].normalized = 1; \
        } \
        state = inName; \
      }
/* Multi-byte lead bytes: skip the extra bytes here; the loop increment
   adds the remaining MINBPC(enc). */
#define LEAD_CASE(n) \
    case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_NONASCII:
    case BT_NMSTRT:
    case BT_HEX:
      START_NAME
      break;
#undef START_NAME
    case BT_QUOT:
      /* Outside a value this opens one; inside a value it closes it only
         if the value was opened with the same kind of quote. */
      if (state != inValue) {
        if (nAtts < attsMax)
          atts[nAtts].valuePtr = ptr + MINBPC(enc);
        state = inValue;
        open = BT_QUOT;
      }
      else if (open == BT_QUOT) {
        state = other;
        if (nAtts < attsMax)
          atts[nAtts].valueEnd = ptr;
        nAtts++;
      }
      break;
    case BT_APOS:
      /* Same as BT_QUOT, for the other kind of quote. */
      if (state != inValue) {
        if (nAtts < attsMax)
          atts[nAtts].valuePtr = ptr + MINBPC(enc);
        state = inValue;
        open = BT_APOS;
      }
      else if (open == BT_APOS) {
        state = other;
        if (nAtts < attsMax)
          atts[nAtts].valueEnd = ptr;
        nAtts++;
      }
      break;
    case BT_AMP:
      /* a BT_AMP byte clears the normalized flag of the current slot */
      if (nAtts < attsMax)
        atts[nAtts].normalized = 0;
      break;
    case BT_S:
      /* Outside a value, whitespace ends a name.  Inside a value, the
         flag is cleared unless this is a single ASCII space that is not
         at the start of the value, not followed by another space, and
         not directly before the closing quote. */
      if (state == inName)
        state = other;
      else if (state == inValue
               && nAtts < attsMax
               && atts[nAtts].normalized
               && (ptr == atts[nAtts].valuePtr
                   || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
                   || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
                   || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
        atts[nAtts].normalized = 0;
      break;
    case BT_CR: case BT_LF:
      /* This case ensures that the first attribute name is counted
         Apart from that we could just change state on the quote. */
      if (state == inName)
        state = other;
      else if (state == inValue && nAtts < attsMax)
        atts[nAtts].normalized = 0;
      break;
    case BT_GT:
    case BT_SOL:
      /* end of the tag, unless these bytes are inside a value */
      if (state != inValue)
        return nAtts;
      break;
    default:
      break;
    }
  }
  /* not reached */
}
   1522 
   1523 static int PTRFASTCALL
   1524 PREFIX(charRefNumber)(const ENCODING *enc, const char *ptr)
   1525 {
   1526   int result = 0;
   1527   /* skip &# */
   1528   ptr += 2*MINBPC(enc);
   1529   if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
   1530     for (ptr += MINBPC(enc);
   1531          !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
   1532          ptr += MINBPC(enc)) {
   1533       int c = BYTE_TO_ASCII(enc, ptr);
   1534       switch (c) {
   1535       case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
   1536       case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
   1537         result <<= 4;
   1538         result |= (c - ASCII_0);
   1539         break;
   1540       case ASCII_A: case ASCII_B: case ASCII_C:
   1541       case ASCII_D: case ASCII_E: case ASCII_F:
   1542         result <<= 4;
   1543         result += 10 + (c - ASCII_A);
   1544         break;
   1545       case ASCII_a: case ASCII_b: case ASCII_c:
   1546       case ASCII_d: case ASCII_e: case ASCII_f:
   1547         result <<= 4;
   1548         result += 10 + (c - ASCII_a);
   1549         break;
   1550       }
   1551       if (result >= 0x110000)
   1552         return -1;
   1553     }
   1554   }
   1555   else {
   1556     for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
   1557       int c = BYTE_TO_ASCII(enc, ptr);
   1558       result *= 10;
   1559       result += (c - ASCII_0);
   1560       if (result >= 0x110000)
   1561         return -1;
   1562     }
   1563   }
   1564   return checkCharRefNumber(result);
   1565 }
   1566 
   1567 static int PTRCALL
   1568 PREFIX(predefinedEntityName)(const ENCODING *enc, const char *ptr,
   1569                              const char *end)
   1570 {
   1571   switch ((end - ptr)/MINBPC(enc)) {
   1572   case 2:
   1573     if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
   1574       switch (BYTE_TO_ASCII(enc, ptr)) {
   1575       case ASCII_l:
   1576         return ASCII_LT;
   1577       case ASCII_g:
   1578         return ASCII_GT;
   1579       }
   1580     }
   1581     break;
   1582   case 3:
   1583     if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
   1584       ptr += MINBPC(enc);
   1585       if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
   1586         ptr += MINBPC(enc);
   1587         if (CHAR_MATCHES(enc, ptr, ASCII_p))
   1588           return ASCII_AMP;
   1589       }
   1590     }
   1591     break;
   1592   case 4:
   1593     switch (BYTE_TO_ASCII(enc, ptr)) {
   1594     case ASCII_q:
   1595       ptr += MINBPC(enc);
   1596       if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
   1597         ptr += MINBPC(enc);
   1598         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
   1599           ptr += MINBPC(enc);
   1600           if (CHAR_MATCHES(enc, ptr, ASCII_t))
   1601             return ASCII_QUOT;
   1602         }
   1603       }
   1604       break;
   1605     case ASCII_a:
   1606       ptr += MINBPC(enc);
   1607       if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
   1608         ptr += MINBPC(enc);
   1609         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
   1610           ptr += MINBPC(enc);
   1611           if (CHAR_MATCHES(enc, ptr, ASCII_s))
   1612             return ASCII_APOS;
   1613         }
   1614       }
   1615       break;
   1616     }
   1617   }
   1618   return 0;
   1619 }
   1620 
   1621 static int PTRCALL
   1622 PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
   1623 {
   1624   for (;;) {
   1625     switch (BYTE_TYPE(enc, ptr1)) {
   1626 #define LEAD_CASE(n) \
   1627     case BT_LEAD ## n: \
   1628       if (*ptr1++ != *ptr2++) \
   1629         return 0;
   1630     LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
   1631 #undef LEAD_CASE
   1632       /* fall through */
   1633       if (*ptr1++ != *ptr2++)
   1634         return 0;
   1635       break;
   1636     case BT_NONASCII:
   1637     case BT_NMSTRT:
   1638 #ifdef XML_NS
   1639     case BT_COLON:
   1640 #endif
   1641     case BT_HEX:
   1642     case BT_DIGIT:
   1643     case BT_NAME:
   1644     case BT_MINUS:
   1645       if (*ptr2++ != *ptr1++)
   1646         return 0;
   1647       if (MINBPC(enc) > 1) {
   1648         if (*ptr2++ != *ptr1++)
   1649           return 0;
   1650         if (MINBPC(enc) > 2) {
   1651           if (*ptr2++ != *ptr1++)
   1652             return 0;
   1653           if (MINBPC(enc) > 3) {
   1654             if (*ptr2++ != *ptr1++)
   1655               return 0;
   1656           }
   1657         }
   1658       }
   1659       break;
   1660     default:
   1661       if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
   1662         return 1;
   1663       switch (BYTE_TYPE(enc, ptr2)) {
   1664       case BT_LEAD2:
   1665       case BT_LEAD3:
   1666       case BT_LEAD4:
   1667       case BT_NONASCII:
   1668       case BT_NMSTRT:
   1669 #ifdef XML_NS
   1670       case BT_COLON:
   1671 #endif
   1672       case BT_HEX:
   1673       case BT_DIGIT:
   1674       case BT_NAME:
   1675       case BT_MINUS:
   1676         return 0;
   1677       default:
   1678         return 1;
   1679       }
   1680     }
   1681   }
   1682   /* not reached */
   1683 }
   1684 
   1685 static int PTRCALL
   1686 PREFIX(nameMatchesAscii)(const ENCODING *enc, const char *ptr1,
   1687                          const char *end1, const char *ptr2)
   1688 {
   1689   for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
   1690     if (ptr1 == end1)
   1691       return 0;
   1692     if (!CHAR_MATCHES(enc, ptr1, *ptr2))
   1693       return 0;
   1694   }
   1695   return ptr1 == end1;
   1696 }
   1697 
   1698 static int PTRFASTCALL
   1699 PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
   1700 {
   1701   const char *start = ptr;
   1702   for (;;) {
   1703     switch (BYTE_TYPE(enc, ptr)) {
   1704 #define LEAD_CASE(n) \
   1705     case BT_LEAD ## n: ptr += n; break;
   1706     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
   1707 #undef LEAD_CASE
   1708     case BT_NONASCII:
   1709     case BT_NMSTRT:
   1710 #ifdef XML_NS
   1711     case BT_COLON:
   1712 #endif
   1713     case BT_HEX:
   1714     case BT_DIGIT:
   1715     case BT_NAME:
   1716     case BT_MINUS:
   1717       ptr += MINBPC(enc);
   1718       break;
   1719     default:
   1720       return (int)(ptr - start);
   1721     }
   1722   }
   1723 }
   1724 
   1725 static const char * PTRFASTCALL
   1726 PREFIX(skipS)(const ENCODING *enc, const char *ptr)
   1727 {
   1728   for (;;) {
   1729     switch (BYTE_TYPE(enc, ptr)) {
   1730     case BT_LF:
   1731     case BT_CR:
   1732     case BT_S:
   1733       ptr += MINBPC(enc);
   1734       break;
   1735     default:
   1736       return ptr;
   1737     }
   1738   }
   1739 }
   1740 
   1741 static void PTRCALL
   1742 PREFIX(updatePosition)(const ENCODING *enc,
   1743                        const char *ptr,
   1744                        const char *end,
   1745                        POSITION *pos)
   1746 {
   1747   while (ptr < end) {
   1748     switch (BYTE_TYPE(enc, ptr)) {
   1749 #define LEAD_CASE(n) \
   1750     case BT_LEAD ## n: \
   1751       ptr += n; \
   1752       break;
   1753     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
   1754 #undef LEAD_CASE
   1755     case BT_LF:
   1756       pos->columnNumber = (XML_Size)-1;
   1757       pos->lineNumber++;
   1758       ptr += MINBPC(enc);
   1759       break;
   1760     case BT_CR:
   1761       pos->lineNumber++;
   1762       ptr += MINBPC(enc);
   1763       if (ptr != end && BYTE_TYPE(enc, ptr) == BT_LF)
   1764         ptr += MINBPC(enc);
   1765       pos->columnNumber = (XML_Size)-1;
   1766       break;
   1767     default:
   1768       ptr += MINBPC(enc);
   1769       break;
   1770     }
   1771     pos->columnNumber++;
   1772   }
   1773 }
   1774 
   1775 #undef DO_LEAD_CASE
   1776 #undef MULTIBYTE_CASES
   1777 #undef INVALID_CASES
   1778 #undef CHECK_NAME_CASE
   1779 #undef CHECK_NAME_CASES
   1780 #undef CHECK_NMSTRT_CASE
   1781 #undef CHECK_NMSTRT_CASES
   1782 
   1783 #endif /* XML_TOK_IMPL_C */
   1784