Home | History | Annotate | Download | only in lib
      1 /* Copyright (c) 1998, 1999 Thai Open Source Software Center Ltd
      2    See the file COPYING for copying permission.
      3 */
      4 
      5 /* This file is included! */
      6 #ifdef XML_TOK_IMPL_C
      7 
/* Fallback used when the including file has not supplied its own
   IS_INVALID_CHAR: no multi-byte sequence is ever reported as invalid. */
#ifndef IS_INVALID_CHAR
#define IS_INVALID_CHAR(enc, ptr, n) (0)
#endif
     11 
/* Expands to the `case BT_LEAD<n>:` arm of a byte-type switch.
   - Returns XML_TOK_PARTIAL_CHAR when fewer than n bytes remain before end.
   - Stores ptr in *nextTokPtr and returns XML_TOK_INVALID when
     IS_INVALID_CHAR rejects the n-byte sequence.
   - Otherwise advances ptr by n and leaves the switch.
   `enc` and `end` are not parameters; they must be in scope at the
   expansion site. */
#define INVALID_LEAD_CASE(n, ptr, nextTokPtr) \
    case BT_LEAD ## n: \
      if (end - ptr < n) \
        return XML_TOK_PARTIAL_CHAR; \
      if (IS_INVALID_CHAR(enc, ptr, n)) { \
        *(nextTokPtr) = (ptr); \
        return XML_TOK_INVALID; \
      } \
      ptr += n; \
      break;
     22 
/* Switch arms shared by scanners that accept arbitrary character data:
   the 2-, 3- and 4-byte lead cases (validated by INVALID_LEAD_CASE) plus
   BT_NONXML, BT_MALFORM and BT_TRAIL, which store ptr in *nextTokPtr and
   return XML_TOK_INVALID.  The expansion ends with a return, so a
   following case label is never reached by fall-through. */
#define INVALID_CASES(ptr, nextTokPtr) \
  INVALID_LEAD_CASE(2, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(3, ptr, nextTokPtr) \
  INVALID_LEAD_CASE(4, ptr, nextTokPtr) \
  case BT_NONXML: \
  case BT_MALFORM: \
  case BT_TRAIL: \
    *(nextTokPtr) = (ptr); \
    return XML_TOK_INVALID;
     32 
     33 #define CHECK_NAME_CASE(n, enc, ptr, end, nextTokPtr) \
     34    case BT_LEAD ## n: \
     35      if (end - ptr < n) \
     36        return XML_TOK_PARTIAL_CHAR; \
     37      if (!IS_NAME_CHAR(enc, ptr, n)) { \
     38        *nextTokPtr = ptr; \
     39        return XML_TOK_INVALID; \
     40      } \
     41      ptr += n; \
     42      break;
     43 
/* Switch arms that accept one name character and advance ptr past it.
   BT_NONASCII is first checked with IS_NAME_CHAR_MINBPC and, when
   accepted, deliberately falls through into the single-unit arm shared
   with BT_NMSTRT, BT_HEX, BT_DIGIT, BT_NAME and BT_MINUS, which advances
   by MINBPC(enc).  Multi-byte lead bytes are delegated to
   CHECK_NAME_CASE.  A rejected character stores ptr in *nextTokPtr and
   returns XML_TOK_INVALID. */
#define CHECK_NAME_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NAME_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
  case BT_NMSTRT: \
  case BT_HEX: \
  case BT_DIGIT: \
  case BT_NAME: \
  case BT_MINUS: \
    ptr += MINBPC(enc); \
    break; \
  CHECK_NAME_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NAME_CASE(4, enc, ptr, end, nextTokPtr)
     60 
     61 #define CHECK_NMSTRT_CASE(n, enc, ptr, end, nextTokPtr) \
     62    case BT_LEAD ## n: \
     63      if (end - ptr < n) \
     64        return XML_TOK_PARTIAL_CHAR; \
     65      if (!IS_NMSTRT_CHAR(enc, ptr, n)) { \
     66        *nextTokPtr = ptr; \
     67        return XML_TOK_INVALID; \
     68      } \
     69      ptr += n; \
     70      break;
     71 
/* Switch arms that accept one name-start character and advance ptr past
   it.  BT_NONASCII is first checked with IS_NMSTRT_CHAR_MINBPC and, when
   accepted, deliberately falls through into the arm shared with
   BT_NMSTRT and BT_HEX, which advances by MINBPC(enc).  Multi-byte lead
   bytes are delegated to CHECK_NMSTRT_CASE.  A rejected character stores
   ptr in *nextTokPtr and returns XML_TOK_INVALID. */
#define CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr) \
  case BT_NONASCII: \
    if (!IS_NMSTRT_CHAR_MINBPC(enc, ptr)) { \
      *nextTokPtr = ptr; \
      return XML_TOK_INVALID; \
    } \
  case BT_NMSTRT: \
  case BT_HEX: \
    ptr += MINBPC(enc); \
    break; \
  CHECK_NMSTRT_CASE(2, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(3, enc, ptr, end, nextTokPtr) \
  CHECK_NMSTRT_CASE(4, enc, ptr, end, nextTokPtr)
     85 
/* PREFIX wraps every function name defined below.  When the including
   file has not defined it, names are used unchanged. */
#ifndef PREFIX
#define PREFIX(ident) ident
#endif
     89 
     90 
     91 #define HAS_CHARS(enc, ptr, end, count) \
     92     (end - ptr >= count * MINBPC(enc))
     93 
     94 #define HAS_CHAR(enc, ptr, end) \
     95     HAS_CHARS(enc, ptr, end, 1)
     96 
     97 #define REQUIRE_CHARS(enc, ptr, end, count) \
     98     { \
     99       if (! HAS_CHARS(enc, ptr, end, count)) { \
    100         return XML_TOK_PARTIAL; \
    101       } \
    102     }
    103 
    104 #define REQUIRE_CHAR(enc, ptr, end) \
    105     REQUIRE_CHARS(enc, ptr, end, 1)
    106 
    107 
    108 /* ptr points to character following "<!-" */
    109 
    110 static int PTRCALL
    111 PREFIX(scanComment)(const ENCODING *enc, const char *ptr,
    112                     const char *end, const char **nextTokPtr)
    113 {
    114   if (HAS_CHAR(enc, ptr, end)) {
    115     if (!CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
    116       *nextTokPtr = ptr;
    117       return XML_TOK_INVALID;
    118     }
    119     ptr += MINBPC(enc);
    120     while (HAS_CHAR(enc, ptr, end)) {
    121       switch (BYTE_TYPE(enc, ptr)) {
    122       INVALID_CASES(ptr, nextTokPtr)
    123       case BT_MINUS:
    124         ptr += MINBPC(enc);
    125         REQUIRE_CHAR(enc, ptr, end);
    126         if (CHAR_MATCHES(enc, ptr, ASCII_MINUS)) {
    127           ptr += MINBPC(enc);
    128           REQUIRE_CHAR(enc, ptr, end);
    129           if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
    130             *nextTokPtr = ptr;
    131             return XML_TOK_INVALID;
    132           }
    133           *nextTokPtr = ptr + MINBPC(enc);
    134           return XML_TOK_COMMENT;
    135         }
    136         break;
    137       default:
    138         ptr += MINBPC(enc);
    139         break;
    140       }
    141     }
    142   }
    143   return XML_TOK_PARTIAL;
    144 }
    145 
    146 /* ptr points to character following "<!" */
    147 
    148 static int PTRCALL
    149 PREFIX(scanDecl)(const ENCODING *enc, const char *ptr,
    150                  const char *end, const char **nextTokPtr)
    151 {
    152   REQUIRE_CHAR(enc, ptr, end);
    153   switch (BYTE_TYPE(enc, ptr)) {
    154   case BT_MINUS:
    155     return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    156   case BT_LSQB:
    157     *nextTokPtr = ptr + MINBPC(enc);
    158     return XML_TOK_COND_SECT_OPEN;
    159   case BT_NMSTRT:
    160   case BT_HEX:
    161     ptr += MINBPC(enc);
    162     break;
    163   default:
    164     *nextTokPtr = ptr;
    165     return XML_TOK_INVALID;
    166   }
    167   while (HAS_CHAR(enc, ptr, end)) {
    168     switch (BYTE_TYPE(enc, ptr)) {
    169     case BT_PERCNT:
    170       REQUIRE_CHARS(enc, ptr, end, 2);
    171       /* don't allow <!ENTITY% foo "whatever"> */
    172       switch (BYTE_TYPE(enc, ptr + MINBPC(enc))) {
    173       case BT_S: case BT_CR: case BT_LF: case BT_PERCNT:
    174         *nextTokPtr = ptr;
    175         return XML_TOK_INVALID;
    176       }
    177       /* fall through */
    178     case BT_S: case BT_CR: case BT_LF:
    179       *nextTokPtr = ptr;
    180       return XML_TOK_DECL_OPEN;
    181     case BT_NMSTRT:
    182     case BT_HEX:
    183       ptr += MINBPC(enc);
    184       break;
    185     default:
    186       *nextTokPtr = ptr;
    187       return XML_TOK_INVALID;
    188     }
    189   }
    190   return XML_TOK_PARTIAL;
    191 }
    192 
    193 static int PTRCALL
    194 PREFIX(checkPiTarget)(const ENCODING *UNUSED_P(enc), const char *ptr,
    195                       const char *end, int *tokPtr)
    196 {
    197   int upper = 0;
    198   *tokPtr = XML_TOK_PI;
    199   if (end - ptr != MINBPC(enc)*3)
    200     return 1;
    201   switch (BYTE_TO_ASCII(enc, ptr)) {
    202   case ASCII_x:
    203     break;
    204   case ASCII_X:
    205     upper = 1;
    206     break;
    207   default:
    208     return 1;
    209   }
    210   ptr += MINBPC(enc);
    211   switch (BYTE_TO_ASCII(enc, ptr)) {
    212   case ASCII_m:
    213     break;
    214   case ASCII_M:
    215     upper = 1;
    216     break;
    217   default:
    218     return 1;
    219   }
    220   ptr += MINBPC(enc);
    221   switch (BYTE_TO_ASCII(enc, ptr)) {
    222   case ASCII_l:
    223     break;
    224   case ASCII_L:
    225     upper = 1;
    226     break;
    227   default:
    228     return 1;
    229   }
    230   if (upper)
    231     return 0;
    232   *tokPtr = XML_TOK_XML_DECL;
    233   return 1;
    234 }
    235 
/* ptr points to character following "<?" */

/* Scans a processing instruction whose "<?" has already been consumed.
   The target name is validated with the name-start/name macros and then
   classified by checkPiTarget, which stores the token to return in tok.
   Returns tok with *nextTokPtr just past "?>", XML_TOK_INVALID with
   *nextTokPtr at the offending position, or XML_TOK_PARTIAL when the
   input ends first.  The case macros may also return
   XML_TOK_PARTIAL_CHAR. */
static int PTRCALL
PREFIX(scanPi)(const ENCODING *enc, const char *ptr,
               const char *end, const char **nextTokPtr)
{
  int tok;
  /* start of the target name; [target, ptr) is passed to checkPiTarget */
  const char *target = ptr;
  REQUIRE_CHAR(enc, ptr, end);
  /* first character of the target must be a name-start character */
  switch (BYTE_TYPE(enc, ptr)) {
  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_S: case BT_CR: case BT_LF:
      /* whitespace ends the target; checkPiTarget returning 0 means the
         target is rejected */
      if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      ptr += MINBPC(enc);
      /* skip the instruction's content until "?>" is found */
      while (HAS_CHAR(enc, ptr, end)) {
        switch (BYTE_TYPE(enc, ptr)) {
        INVALID_CASES(ptr, nextTokPtr)
        case BT_QUEST:
          ptr += MINBPC(enc);
          REQUIRE_CHAR(enc, ptr, end);
          if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
            *nextTokPtr = ptr + MINBPC(enc);
            return tok;
          }
          /* a '?' not followed by '>' is content; the character after it
             is examined again by the next iteration */
          break;
        default:
          ptr += MINBPC(enc);
          break;
        }
      }
      return XML_TOK_PARTIAL;
    case BT_QUEST:
      /* target directly followed by '?': only "?>" is acceptable */
      if (!PREFIX(checkPiTarget)(enc, target, ptr, &tok)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
        *nextTokPtr = ptr + MINBPC(enc);
        return tok;
      }
      /* fall through */
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
    296 
    297 static int PTRCALL
    298 PREFIX(scanCdataSection)(const ENCODING *UNUSED_P(enc), const char *ptr,
    299                          const char *end, const char **nextTokPtr)
    300 {
    301   static const char CDATA_LSQB[] = { ASCII_C, ASCII_D, ASCII_A,
    302                                      ASCII_T, ASCII_A, ASCII_LSQB };
    303   int i;
    304   /* CDATA[ */
    305   REQUIRE_CHARS(enc, ptr, end, 6);
    306   for (i = 0; i < 6; i++, ptr += MINBPC(enc)) {
    307     if (!CHAR_MATCHES(enc, ptr, CDATA_LSQB[i])) {
    308       *nextTokPtr = ptr;
    309       return XML_TOK_INVALID;
    310     }
    311   }
    312   *nextTokPtr = ptr;
    313   return XML_TOK_CDATA_SECT_OPEN;
    314 }
    315 
/* Returns the next token inside a CDATA section.
   - XML_TOK_NONE when [ptr, end) is empty.
   - XML_TOK_CDATA_SECT_CLOSE with *nextTokPtr just past "]]>".
   - XML_TOK_DATA_NEWLINE for CR, CR LF or LF.
   - XML_TOK_DATA_CHARS for a run of other characters, with *nextTokPtr
     at the first character that ends the run.
   - XML_TOK_INVALID / XML_TOK_PARTIAL_CHAR from INVALID_CASES when the
     first character is invalid or truncated.
   - XML_TOK_PARTIAL when more input is needed to decide. */
static int PTRCALL
PREFIX(cdataSectionTok)(const ENCODING *enc, const char *ptr,
                        const char *end, const char **nextTokPtr)
{
  if (ptr >= end)
    return XML_TOK_NONE;
  /* For multi-byte code units, drop a trailing incomplete unit by
     rounding the length down to a multiple of MINBPC(enc).  The mask
     arithmetic presumably relies on MINBPC(enc) being a power of two. */
  if (MINBPC(enc) > 1) {
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  /* The first character decides whether this is a delimiter, a newline,
     an error, or the start of a data run. */
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_RSQB:
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
      break;
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" without '>': step back to the second ']' so that only the
         first ']' is treated as data here */
      ptr -= MINBPC(enc);
      break;
    }
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_CDATA_SECT_CLOSE;
  case BT_CR:
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    /* CR LF counts as a single newline token */
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC(enc);
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DATA_NEWLINE;
  INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC(enc);
    break;
  }
  /* Extend the data run until a character that must start its own
     token, or the end of input. */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
/* Inside a run, a truncated or invalid multi-byte sequence ends the run
   (returning the data before it) instead of being reported here. */
#define LEAD_CASE(n) \
    case BT_LEAD ## n: \
      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
        *nextTokPtr = ptr; \
        return XML_TOK_DATA_CHARS; \
      } \
      ptr += n; \
      break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
    case BT_RSQB:
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
    388 
/* ptr points to character following "</" */

/* Scans an end tag: a name, optional whitespace, then '>'.
   Returns XML_TOK_END_TAG with *nextTokPtr just past the '>',
   XML_TOK_INVALID with *nextTokPtr at the offending character, or
   XML_TOK_PARTIAL when the input ends first.  The case macros may also
   return XML_TOK_PARTIAL_CHAR. */
static int PTRCALL
PREFIX(scanEndTag)(const ENCODING *enc, const char *ptr,
                   const char *end, const char **nextTokPtr)
{
  REQUIRE_CHAR(enc, ptr, end);
  /* first character must be a name-start character */
  switch (BYTE_TYPE(enc, ptr)) {
  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    case BT_S: case BT_CR: case BT_LF:
      /* after the name only whitespace and the closing '>' may follow */
      for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
        switch (BYTE_TYPE(enc, ptr)) {
        case BT_S: case BT_CR: case BT_LF:
          break;
        case BT_GT:
          *nextTokPtr = ptr + MINBPC(enc);
          return XML_TOK_END_TAG;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
      return XML_TOK_PARTIAL;
#ifdef XML_NS
    case BT_COLON:
      /* no need to check qname syntax here,
         since end-tag must match exactly */
      ptr += MINBPC(enc);
      break;
#endif
    case BT_GT:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_END_TAG;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
    436 
    437 /* ptr points to character following "&#X" */
    438 
    439 static int PTRCALL
    440 PREFIX(scanHexCharRef)(const ENCODING *enc, const char *ptr,
    441                        const char *end, const char **nextTokPtr)
    442 {
    443   if (HAS_CHAR(enc, ptr, end)) {
    444     switch (BYTE_TYPE(enc, ptr)) {
    445     case BT_DIGIT:
    446     case BT_HEX:
    447       break;
    448     default:
    449       *nextTokPtr = ptr;
    450       return XML_TOK_INVALID;
    451     }
    452     for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
    453       switch (BYTE_TYPE(enc, ptr)) {
    454       case BT_DIGIT:
    455       case BT_HEX:
    456         break;
    457       case BT_SEMI:
    458         *nextTokPtr = ptr + MINBPC(enc);
    459         return XML_TOK_CHAR_REF;
    460       default:
    461         *nextTokPtr = ptr;
    462         return XML_TOK_INVALID;
    463       }
    464     }
    465   }
    466   return XML_TOK_PARTIAL;
    467 }
    468 
    469 /* ptr points to character following "&#" */
    470 
    471 static int PTRCALL
    472 PREFIX(scanCharRef)(const ENCODING *enc, const char *ptr,
    473                     const char *end, const char **nextTokPtr)
    474 {
    475   if (HAS_CHAR(enc, ptr, end)) {
    476     if (CHAR_MATCHES(enc, ptr, ASCII_x))
    477       return PREFIX(scanHexCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    478     switch (BYTE_TYPE(enc, ptr)) {
    479     case BT_DIGIT:
    480       break;
    481     default:
    482       *nextTokPtr = ptr;
    483       return XML_TOK_INVALID;
    484     }
    485     for (ptr += MINBPC(enc); HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
    486       switch (BYTE_TYPE(enc, ptr)) {
    487       case BT_DIGIT:
    488         break;
    489       case BT_SEMI:
    490         *nextTokPtr = ptr + MINBPC(enc);
    491         return XML_TOK_CHAR_REF;
    492       default:
    493         *nextTokPtr = ptr;
    494         return XML_TOK_INVALID;
    495       }
    496     }
    497   }
    498   return XML_TOK_PARTIAL;
    499 }
    500 
    501 /* ptr points to character following "&" */
    502 
    503 static int PTRCALL
    504 PREFIX(scanRef)(const ENCODING *enc, const char *ptr, const char *end,
    505                 const char **nextTokPtr)
    506 {
    507   REQUIRE_CHAR(enc, ptr, end);
    508   switch (BYTE_TYPE(enc, ptr)) {
    509   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
    510   case BT_NUM:
    511     return PREFIX(scanCharRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    512   default:
    513     *nextTokPtr = ptr;
    514     return XML_TOK_INVALID;
    515   }
    516   while (HAS_CHAR(enc, ptr, end)) {
    517     switch (BYTE_TYPE(enc, ptr)) {
    518     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    519     case BT_SEMI:
    520       *nextTokPtr = ptr + MINBPC(enc);
    521       return XML_TOK_ENTITY_REF;
    522     default:
    523       *nextTokPtr = ptr;
    524       return XML_TOK_INVALID;
    525     }
    526   }
    527   return XML_TOK_PARTIAL;
    528 }
    529 
/* ptr points to character following first character of attribute name */

/* Scans the attributes of a start tag through to the end of the tag.
   Each pass of the outer loop consumes one more character of the current
   attribute name; once '=' is reached the value and whatever follows it
   are handled inside the BT_EQUALS arm.
   Returns XML_TOK_START_TAG_WITH_ATTS (just past '>') or
   XML_TOK_EMPTY_ELEMENT_WITH_ATTS (just past "/>") with *nextTokPtr set,
   XML_TOK_INVALID with *nextTokPtr at the offending character, or
   XML_TOK_PARTIAL when the input ends first.  The case macros may also
   return XML_TOK_PARTIAL_CHAR. */
static int PTRCALL
PREFIX(scanAtts)(const ENCODING *enc, const char *ptr, const char *end,
                 const char **nextTokPtr)
{
#ifdef XML_NS
  /* set once the current attribute name has contained a ':' */
  int hadColon = 0;
#endif
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#ifdef XML_NS
    case BT_COLON:
      /* at most one ':' per name, and it must be followed by a
         name-start character */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#endif
    case BT_S: case BT_CR: case BT_LF:
      /* whitespace after the name: skip it, then require '=' */
      for (;;) {
        int t;

        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        t = BYTE_TYPE(enc, ptr);
        if (t == BT_EQUALS)
          break;
        switch (t) {
        case BT_S:
        case BT_LF:
        case BT_CR:
          break;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
      }
    /* fall through */
    case BT_EQUALS:
      {
        /* byte type of the opening quote (BT_QUOT or BT_APOS) */
        int open;
#ifdef XML_NS
        hadColon = 0;
#endif
        /* skip whitespace after '=' up to the opening quote */
        for (;;) {
          ptr += MINBPC(enc);
          REQUIRE_CHAR(enc, ptr, end);
          open = BYTE_TYPE(enc, ptr);
          if (open == BT_QUOT || open == BT_APOS)
            break;
          switch (open) {
          case BT_S:
          case BT_LF:
          case BT_CR:
            break;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
        }
        ptr += MINBPC(enc);
        /* in attribute value */
        for (;;) {
          int t;
          REQUIRE_CHAR(enc, ptr, end);
          t = BYTE_TYPE(enc, ptr);
          /* only a quote of the same kind as the opening one ends the value */
          if (t == open)
            break;
          switch (t) {
          INVALID_CASES(ptr, nextTokPtr)
          case BT_AMP:
            {
              /* scanRef writes its end position straight into ptr; on a
                 non-positive result only XML_TOK_INVALID forwards that
                 position to the caller */
              int tok = PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, &ptr);
              if (tok <= 0) {
                if (tok == XML_TOK_INVALID)
                  *nextTokPtr = ptr;
                return tok;
              }
              break;
            }
          case BT_LT:
            /* '<' is rejected inside an attribute value */
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          default:
            ptr += MINBPC(enc);
            break;
          }
        }
        /* step past the closing quote and look at what follows it */
        ptr += MINBPC(enc);
        REQUIRE_CHAR(enc, ptr, end);
        switch (BYTE_TYPE(enc, ptr)) {
        case BT_S:
        case BT_CR:
        case BT_LF:
          break;
        case BT_SOL:
          goto sol;
        case BT_GT:
          goto gt;
        default:
          *nextTokPtr = ptr;
          return XML_TOK_INVALID;
        }
        /* ptr points to the whitespace character after the closing quote */
        for (;;) {
          ptr += MINBPC(enc);
          REQUIRE_CHAR(enc, ptr, end);
          switch (BYTE_TYPE(enc, ptr)) {
          /* a name-start character begins the next attribute: the macro
             consumes it and breaks out of this switch */
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
          case BT_S: case BT_CR: case BT_LF:
            continue;
          case BT_GT:
          gt:
            *nextTokPtr = ptr + MINBPC(enc);
            return XML_TOK_START_TAG_WITH_ATTS;
          case BT_SOL:
          sol:
            ptr += MINBPC(enc);
            REQUIRE_CHAR(enc, ptr, end);
            if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
              *nextTokPtr = ptr;
              return XML_TOK_INVALID;
            }
            *nextTokPtr = ptr + MINBPC(enc);
            return XML_TOK_EMPTY_ELEMENT_WITH_ATTS;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
          /* reached only after a name-start character: leave the
             whitespace loop ... */
          break;
        }
        /* ... and the outer switch, so the outer loop scans the rest of
           the new attribute name */
        break;
      }
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
    681 
/* ptr points to character following "<" */

/* Dispatches on the character after '<':
   - "<!-" goes to scanComment and "<![" to scanCdataSection; any other
     "<!" is XML_TOK_INVALID.
   - "<?" goes to scanPi and "</" to scanEndTag.
   - A name-start character begins a start tag, which is scanned here up
     to '>' or "/>" and handed to scanAtts once an attribute name starts.
   Returns XML_TOK_START_TAG_NO_ATTS or XML_TOK_EMPTY_ELEMENT_NO_ATTS
   with *nextTokPtr just past the tag, the result of the delegated
   scanner, XML_TOK_INVALID with *nextTokPtr at the offending character,
   or XML_TOK_PARTIAL when the input ends first. */
static int PTRCALL
PREFIX(scanLt)(const ENCODING *enc, const char *ptr, const char *end,
               const char **nextTokPtr)
{
#ifdef XML_NS
  /* set once the element name has contained a ':'; initialized after the
     dispatch switch below */
  int hadColon;
#endif
  REQUIRE_CHAR(enc, ptr, end);
  switch (BYTE_TYPE(enc, ptr)) {
  CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
  case BT_EXCL:
    ptr += MINBPC(enc);
    REQUIRE_CHAR(enc, ptr, end);
    switch (BYTE_TYPE(enc, ptr)) {
    case BT_MINUS:
      return PREFIX(scanComment)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    case BT_LSQB:
      return PREFIX(scanCdataSection)(enc, ptr + MINBPC(enc),
                                      end, nextTokPtr);
    }
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  case BT_QUEST:
    return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_SOL:
    return PREFIX(scanEndTag)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  default:
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  }
#ifdef XML_NS
  hadColon = 0;
#endif
  /* we have a start-tag */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
    CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
#ifdef XML_NS
    case BT_COLON:
      /* at most one ':' per name, and it must be followed by a
         name-start character */
      if (hadColon) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      hadColon = 1;
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      switch (BYTE_TYPE(enc, ptr)) {
      CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
      default:
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      break;
#endif
    case BT_S: case BT_CR: case BT_LF:
      {
        /* whitespace after the element name: skip it and see whether the
           tag ends or an attribute begins */
        ptr += MINBPC(enc);
        while (HAS_CHAR(enc, ptr, end)) {
          switch (BYTE_TYPE(enc, ptr)) {
          /* a name-start character is consumed by the macro, which then
             breaks out of the switch to the scanAtts call below */
          CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
          case BT_GT:
            goto gt;
          case BT_SOL:
            goto sol;
          case BT_S: case BT_CR: case BT_LF:
            ptr += MINBPC(enc);
            continue;
          default:
            *nextTokPtr = ptr;
            return XML_TOK_INVALID;
          }
          /* ptr is now just past the first character of the attribute
             name, which is where scanAtts expects to start */
          return PREFIX(scanAtts)(enc, ptr, end, nextTokPtr);
        }
        return XML_TOK_PARTIAL;
      }
    case BT_GT:
    gt:
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_START_TAG_NO_ATTS;
    case BT_SOL:
    sol:
      /* '/' must be immediately followed by '>' */
      ptr += MINBPC(enc);
      REQUIRE_CHAR(enc, ptr, end);
      if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
        *nextTokPtr = ptr;
        return XML_TOK_INVALID;
      }
      *nextTokPtr = ptr + MINBPC(enc);
      return XML_TOK_EMPTY_ELEMENT_NO_ATTS;
    default:
      *nextTokPtr = ptr;
      return XML_TOK_INVALID;
    }
  }
  return XML_TOK_PARTIAL;
}
    780 
/* Returns the next token of element content.
   - XML_TOK_NONE when [ptr, end) is empty.
   - '<' and '&' are delegated to scanLt and scanRef.
   - XML_TOK_DATA_NEWLINE for CR, CR LF or LF; XML_TOK_TRAILING_CR when a
     CR is the last available character.
   - XML_TOK_TRAILING_RSQB when the input ends after ']' or "]]".
   - XML_TOK_INVALID for "]]>" (with *nextTokPtr at the '>') and for the
     byte types rejected by INVALID_CASES.
   - XML_TOK_DATA_CHARS for a run of other characters, with *nextTokPtr
     at the first character that ends the run.
   - XML_TOK_PARTIAL when only an incomplete code unit is available. */
static int PTRCALL
PREFIX(contentTok)(const ENCODING *enc, const char *ptr, const char *end,
                   const char **nextTokPtr)
{
  if (ptr >= end)
    return XML_TOK_NONE;
  /* For multi-byte code units, drop a trailing incomplete unit by
     rounding the length down to a multiple of MINBPC(enc).  The mask
     arithmetic presumably relies on MINBPC(enc) being a power of two. */
  if (MINBPC(enc) > 1) {
    size_t n = end - ptr;
    if (n & (MINBPC(enc) - 1)) {
      n &= ~(MINBPC(enc) - 1);
      if (n == 0)
        return XML_TOK_PARTIAL;
      end = ptr + n;
    }
  }
  /* The first character decides whether this is markup, a newline, an
     error, or the start of a data run. */
  switch (BYTE_TYPE(enc, ptr)) {
  case BT_LT:
    return PREFIX(scanLt)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_AMP:
    return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
  case BT_CR:
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return XML_TOK_TRAILING_CR;
    /* CR LF counts as a single newline token */
    if (BYTE_TYPE(enc, ptr) == BT_LF)
      ptr += MINBPC(enc);
    *nextTokPtr = ptr;
    return XML_TOK_DATA_NEWLINE;
  case BT_LF:
    *nextTokPtr = ptr + MINBPC(enc);
    return XML_TOK_DATA_NEWLINE;
  case BT_RSQB:
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return XML_TOK_TRAILING_RSQB;
    if (!CHAR_MATCHES(enc, ptr, ASCII_RSQB))
      break;
    ptr += MINBPC(enc);
    if (! HAS_CHAR(enc, ptr, end))
      return XML_TOK_TRAILING_RSQB;
    if (!CHAR_MATCHES(enc, ptr, ASCII_GT)) {
      /* "]]" without '>': step back to the second ']' so the data loop
         below examines it again */
      ptr -= MINBPC(enc);
      break;
    }
    /* "]]>" is rejected in content */
    *nextTokPtr = ptr;
    return XML_TOK_INVALID;
  INVALID_CASES(ptr, nextTokPtr)
  default:
    ptr += MINBPC(enc);
    break;
  }
  /* Extend the data run until a character that must start its own
     token, or the end of input. */
  while (HAS_CHAR(enc, ptr, end)) {
    switch (BYTE_TYPE(enc, ptr)) {
/* Inside a run, a truncated or invalid multi-byte sequence ends the run
   (returning the data before it) instead of being reported here. */
#define LEAD_CASE(n) \
    case BT_LEAD ## n: \
      if (end - ptr < n || IS_INVALID_CHAR(enc, ptr, n)) { \
        *nextTokPtr = ptr; \
        return XML_TOK_DATA_CHARS; \
      } \
      ptr += n; \
      break;
    LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
#undef LEAD_CASE
    case BT_RSQB:
      /* look ahead without consuming: a ']' that cannot start "]]>" is
         plain data */
      if (HAS_CHARS(enc, ptr, end, 2)) {
         if (!CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_RSQB)) {
           ptr += MINBPC(enc);
           break;
         }
         if (HAS_CHARS(enc, ptr, end, 3)) {
           if (!CHAR_MATCHES(enc, ptr + 2*MINBPC(enc), ASCII_GT)) {
             ptr += MINBPC(enc);
             break;
           }
           *nextTokPtr = ptr + 2*MINBPC(enc);
           return XML_TOK_INVALID;
         }
      }
      /* not enough input to decide: end the run before the ']' */
      /* fall through */
    case BT_AMP:
    case BT_LT:
    case BT_NONXML:
    case BT_MALFORM:
    case BT_TRAIL:
    case BT_CR:
    case BT_LF:
      *nextTokPtr = ptr;
      return XML_TOK_DATA_CHARS;
    default:
      ptr += MINBPC(enc);
      break;
    }
  }
  *nextTokPtr = ptr;
  return XML_TOK_DATA_CHARS;
}
    877 
    878 /* ptr points to character following "%" */
    879 
    880 static int PTRCALL
    881 PREFIX(scanPercent)(const ENCODING *enc, const char *ptr, const char *end,
    882                     const char **nextTokPtr)
    883 {
    884   REQUIRE_CHAR(enc, ptr, end);
    885   switch (BYTE_TYPE(enc, ptr)) {
    886   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
    887   case BT_S: case BT_LF: case BT_CR: case BT_PERCNT:
    888     *nextTokPtr = ptr;
    889     return XML_TOK_PERCENT;
    890   default:
    891     *nextTokPtr = ptr;
    892     return XML_TOK_INVALID;
    893   }
    894   while (HAS_CHAR(enc, ptr, end)) {
    895     switch (BYTE_TYPE(enc, ptr)) {
    896     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    897     case BT_SEMI:
    898       *nextTokPtr = ptr + MINBPC(enc);
    899       return XML_TOK_PARAM_ENTITY_REF;
    900     default:
    901       *nextTokPtr = ptr;
    902       return XML_TOK_INVALID;
    903     }
    904   }
    905   return XML_TOK_PARTIAL;
    906 }
    907 
    908 static int PTRCALL
    909 PREFIX(scanPoundName)(const ENCODING *enc, const char *ptr, const char *end,
    910                       const char **nextTokPtr)
    911 {
    912   REQUIRE_CHAR(enc, ptr, end);
    913   switch (BYTE_TYPE(enc, ptr)) {
    914   CHECK_NMSTRT_CASES(enc, ptr, end, nextTokPtr)
    915   default:
    916     *nextTokPtr = ptr;
    917     return XML_TOK_INVALID;
    918   }
    919   while (HAS_CHAR(enc, ptr, end)) {
    920     switch (BYTE_TYPE(enc, ptr)) {
    921     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
    922     case BT_CR: case BT_LF: case BT_S:
    923     case BT_RPAR: case BT_GT: case BT_PERCNT: case BT_VERBAR:
    924       *nextTokPtr = ptr;
    925       return XML_TOK_POUND_NAME;
    926     default:
    927       *nextTokPtr = ptr;
    928       return XML_TOK_INVALID;
    929     }
    930   }
    931   return -XML_TOK_POUND_NAME;
    932 }
    933 
    934 static int PTRCALL
    935 PREFIX(scanLit)(int open, const ENCODING *enc,
    936                 const char *ptr, const char *end,
    937                 const char **nextTokPtr)
    938 {
    939   while (HAS_CHAR(enc, ptr, end)) {
    940     int t = BYTE_TYPE(enc, ptr);
    941     switch (t) {
    942     INVALID_CASES(ptr, nextTokPtr)
    943     case BT_QUOT:
    944     case BT_APOS:
    945       ptr += MINBPC(enc);
    946       if (t != open)
    947         break;
    948       if (! HAS_CHAR(enc, ptr, end))
    949         return -XML_TOK_LITERAL;
    950       *nextTokPtr = ptr;
    951       switch (BYTE_TYPE(enc, ptr)) {
    952       case BT_S: case BT_CR: case BT_LF:
    953       case BT_GT: case BT_PERCNT: case BT_LSQB:
    954         return XML_TOK_LITERAL;
    955       default:
    956         return XML_TOK_INVALID;
    957       }
    958     default:
    959       ptr += MINBPC(enc);
    960       break;
    961     }
    962   }
    963   return XML_TOK_PARTIAL;
    964 }
    965 
    966 static int PTRCALL
    967 PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end,
    968                   const char **nextTokPtr)
    969 {
    970   int tok;
    971   if (ptr >= end)
    972     return XML_TOK_NONE;
    973   if (MINBPC(enc) > 1) {
    974     size_t n = end - ptr;
    975     if (n & (MINBPC(enc) - 1)) {
    976       n &= ~(MINBPC(enc) - 1);
    977       if (n == 0)
    978         return XML_TOK_PARTIAL;
    979       end = ptr + n;
    980     }
    981   }
    982   switch (BYTE_TYPE(enc, ptr)) {
    983   case BT_QUOT:
    984     return PREFIX(scanLit)(BT_QUOT, enc, ptr + MINBPC(enc), end, nextTokPtr);
    985   case BT_APOS:
    986     return PREFIX(scanLit)(BT_APOS, enc, ptr + MINBPC(enc), end, nextTokPtr);
    987   case BT_LT:
    988     {
    989       ptr += MINBPC(enc);
    990       REQUIRE_CHAR(enc, ptr, end);
    991       switch (BYTE_TYPE(enc, ptr)) {
    992       case BT_EXCL:
    993         return PREFIX(scanDecl)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    994       case BT_QUEST:
    995         return PREFIX(scanPi)(enc, ptr + MINBPC(enc), end, nextTokPtr);
    996       case BT_NMSTRT:
    997       case BT_HEX:
    998       case BT_NONASCII:
    999       case BT_LEAD2:
   1000       case BT_LEAD3:
   1001       case BT_LEAD4:
   1002         *nextTokPtr = ptr - MINBPC(enc);
   1003         return XML_TOK_INSTANCE_START;
   1004       }
   1005       *nextTokPtr = ptr;
   1006       return XML_TOK_INVALID;
   1007     }
   1008   case BT_CR:
   1009     if (ptr + MINBPC(enc) == end) {
   1010       *nextTokPtr = end;
   1011       /* indicate that this might be part of a CR/LF pair */
   1012       return -XML_TOK_PROLOG_S;
   1013     }
   1014     /* fall through */
   1015   case BT_S: case BT_LF:
   1016     for (;;) {
   1017       ptr += MINBPC(enc);
   1018       if (! HAS_CHAR(enc, ptr, end))
   1019         break;
   1020       switch (BYTE_TYPE(enc, ptr)) {
   1021       case BT_S: case BT_LF:
   1022         break;
   1023       case BT_CR:
   1024         /* don't split CR/LF pair */
   1025         if (ptr + MINBPC(enc) != end)
   1026           break;
   1027         /* fall through */
   1028       default:
   1029         *nextTokPtr = ptr;
   1030         return XML_TOK_PROLOG_S;
   1031       }
   1032     }
   1033     *nextTokPtr = ptr;
   1034     return XML_TOK_PROLOG_S;
   1035   case BT_PERCNT:
   1036     return PREFIX(scanPercent)(enc, ptr + MINBPC(enc), end, nextTokPtr);
   1037   case BT_COMMA:
   1038     *nextTokPtr = ptr + MINBPC(enc);
   1039     return XML_TOK_COMMA;
   1040   case BT_LSQB:
   1041     *nextTokPtr = ptr + MINBPC(enc);
   1042     return XML_TOK_OPEN_BRACKET;
   1043   case BT_RSQB:
   1044     ptr += MINBPC(enc);
   1045     if (! HAS_CHAR(enc, ptr, end))
   1046       return -XML_TOK_CLOSE_BRACKET;
   1047     if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
   1048       REQUIRE_CHARS(enc, ptr, end, 2);
   1049       if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_GT)) {
   1050         *nextTokPtr = ptr + 2*MINBPC(enc);
   1051         return XML_TOK_COND_SECT_CLOSE;
   1052       }
   1053     }
   1054     *nextTokPtr = ptr;
   1055     return XML_TOK_CLOSE_BRACKET;
   1056   case BT_LPAR:
   1057     *nextTokPtr = ptr + MINBPC(enc);
   1058     return XML_TOK_OPEN_PAREN;
   1059   case BT_RPAR:
   1060     ptr += MINBPC(enc);
   1061     if (! HAS_CHAR(enc, ptr, end))
   1062       return -XML_TOK_CLOSE_PAREN;
   1063     switch (BYTE_TYPE(enc, ptr)) {
   1064     case BT_AST:
   1065       *nextTokPtr = ptr + MINBPC(enc);
   1066       return XML_TOK_CLOSE_PAREN_ASTERISK;
   1067     case BT_QUEST:
   1068       *nextTokPtr = ptr + MINBPC(enc);
   1069       return XML_TOK_CLOSE_PAREN_QUESTION;
   1070     case BT_PLUS:
   1071       *nextTokPtr = ptr + MINBPC(enc);
   1072       return XML_TOK_CLOSE_PAREN_PLUS;
   1073     case BT_CR: case BT_LF: case BT_S:
   1074     case BT_GT: case BT_COMMA: case BT_VERBAR:
   1075     case BT_RPAR:
   1076       *nextTokPtr = ptr;
   1077       return XML_TOK_CLOSE_PAREN;
   1078     }
   1079     *nextTokPtr = ptr;
   1080     return XML_TOK_INVALID;
   1081   case BT_VERBAR:
   1082     *nextTokPtr = ptr + MINBPC(enc);
   1083     return XML_TOK_OR;
   1084   case BT_GT:
   1085     *nextTokPtr = ptr + MINBPC(enc);
   1086     return XML_TOK_DECL_CLOSE;
   1087   case BT_NUM:
   1088     return PREFIX(scanPoundName)(enc, ptr + MINBPC(enc), end, nextTokPtr);
   1089 #define LEAD_CASE(n) \
   1090   case BT_LEAD ## n: \
   1091     if (end - ptr < n) \
   1092       return XML_TOK_PARTIAL_CHAR; \
   1093     if (IS_NMSTRT_CHAR(enc, ptr, n)) { \
   1094       ptr += n; \
   1095       tok = XML_TOK_NAME; \
   1096       break; \
   1097     } \
   1098     if (IS_NAME_CHAR(enc, ptr, n)) { \
   1099       ptr += n; \
   1100       tok = XML_TOK_NMTOKEN; \
   1101       break; \
   1102     } \
   1103     *nextTokPtr = ptr; \
   1104     return XML_TOK_INVALID;
   1105     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
   1106 #undef LEAD_CASE
   1107   case BT_NMSTRT:
   1108   case BT_HEX:
   1109     tok = XML_TOK_NAME;
   1110     ptr += MINBPC(enc);
   1111     break;
   1112   case BT_DIGIT:
   1113   case BT_NAME:
   1114   case BT_MINUS:
   1115 #ifdef XML_NS
   1116   case BT_COLON:
   1117 #endif
   1118     tok = XML_TOK_NMTOKEN;
   1119     ptr += MINBPC(enc);
   1120     break;
   1121   case BT_NONASCII:
   1122     if (IS_NMSTRT_CHAR_MINBPC(enc, ptr)) {
   1123       ptr += MINBPC(enc);
   1124       tok = XML_TOK_NAME;
   1125       break;
   1126     }
   1127     if (IS_NAME_CHAR_MINBPC(enc, ptr)) {
   1128       ptr += MINBPC(enc);
   1129       tok = XML_TOK_NMTOKEN;
   1130       break;
   1131     }
   1132     /* fall through */
   1133   default:
   1134     *nextTokPtr = ptr;
   1135     return XML_TOK_INVALID;
   1136   }
   1137   while (HAS_CHAR(enc, ptr, end)) {
   1138     switch (BYTE_TYPE(enc, ptr)) {
   1139     CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
   1140     case BT_GT: case BT_RPAR: case BT_COMMA:
   1141     case BT_VERBAR: case BT_LSQB: case BT_PERCNT:
   1142     case BT_S: case BT_CR: case BT_LF:
   1143       *nextTokPtr = ptr;
   1144       return tok;
   1145 #ifdef XML_NS
   1146     case BT_COLON:
   1147       ptr += MINBPC(enc);
   1148       switch (tok) {
   1149       case XML_TOK_NAME:
   1150         REQUIRE_CHAR(enc, ptr, end);
   1151         tok = XML_TOK_PREFIXED_NAME;
   1152         switch (BYTE_TYPE(enc, ptr)) {
   1153         CHECK_NAME_CASES(enc, ptr, end, nextTokPtr)
   1154         default:
   1155           tok = XML_TOK_NMTOKEN;
   1156           break;
   1157         }
   1158         break;
   1159       case XML_TOK_PREFIXED_NAME:
   1160         tok = XML_TOK_NMTOKEN;
   1161         break;
   1162       }
   1163       break;
   1164 #endif
   1165     case BT_PLUS:
   1166       if (tok == XML_TOK_NMTOKEN)  {
   1167         *nextTokPtr = ptr;
   1168         return XML_TOK_INVALID;
   1169       }
   1170       *nextTokPtr = ptr + MINBPC(enc);
   1171       return XML_TOK_NAME_PLUS;
   1172     case BT_AST:
   1173       if (tok == XML_TOK_NMTOKEN)  {
   1174         *nextTokPtr = ptr;
   1175         return XML_TOK_INVALID;
   1176       }
   1177       *nextTokPtr = ptr + MINBPC(enc);
   1178       return XML_TOK_NAME_ASTERISK;
   1179     case BT_QUEST:
   1180       if (tok == XML_TOK_NMTOKEN)  {
   1181         *nextTokPtr = ptr;
   1182         return XML_TOK_INVALID;
   1183       }
   1184       *nextTokPtr = ptr + MINBPC(enc);
   1185       return XML_TOK_NAME_QUESTION;
   1186     default:
   1187       *nextTokPtr = ptr;
   1188       return XML_TOK_INVALID;
   1189     }
   1190   }
   1191   return -tok;
   1192 }
   1193 
   1194 static int PTRCALL
   1195 PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr,
   1196                           const char *end, const char **nextTokPtr)
   1197 {
   1198   const char *start;
   1199   if (ptr >= end)
   1200     return XML_TOK_NONE;
   1201   else if (! HAS_CHAR(enc, ptr, end))
   1202     return XML_TOK_PARTIAL;
   1203   start = ptr;
   1204   while (HAS_CHAR(enc, ptr, end)) {
   1205     switch (BYTE_TYPE(enc, ptr)) {
   1206 #define LEAD_CASE(n) \
   1207     case BT_LEAD ## n: ptr += n; break;
   1208     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
   1209 #undef LEAD_CASE
   1210     case BT_AMP:
   1211       if (ptr == start)
   1212         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
   1213       *nextTokPtr = ptr;
   1214       return XML_TOK_DATA_CHARS;
   1215     case BT_LT:
   1216       /* this is for inside entity references */
   1217       *nextTokPtr = ptr;
   1218       return XML_TOK_INVALID;
   1219     case BT_LF:
   1220       if (ptr == start) {
   1221         *nextTokPtr = ptr + MINBPC(enc);
   1222         return XML_TOK_DATA_NEWLINE;
   1223       }
   1224       *nextTokPtr = ptr;
   1225       return XML_TOK_DATA_CHARS;
   1226     case BT_CR:
   1227       if (ptr == start) {
   1228         ptr += MINBPC(enc);
   1229         if (! HAS_CHAR(enc, ptr, end))
   1230           return XML_TOK_TRAILING_CR;
   1231         if (BYTE_TYPE(enc, ptr) == BT_LF)
   1232           ptr += MINBPC(enc);
   1233         *nextTokPtr = ptr;
   1234         return XML_TOK_DATA_NEWLINE;
   1235       }
   1236       *nextTokPtr = ptr;
   1237       return XML_TOK_DATA_CHARS;
   1238     case BT_S:
   1239       if (ptr == start) {
   1240         *nextTokPtr = ptr + MINBPC(enc);
   1241         return XML_TOK_ATTRIBUTE_VALUE_S;
   1242       }
   1243       *nextTokPtr = ptr;
   1244       return XML_TOK_DATA_CHARS;
   1245     default:
   1246       ptr += MINBPC(enc);
   1247       break;
   1248     }
   1249   }
   1250   *nextTokPtr = ptr;
   1251   return XML_TOK_DATA_CHARS;
   1252 }
   1253 
   1254 static int PTRCALL
   1255 PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr,
   1256                        const char *end, const char **nextTokPtr)
   1257 {
   1258   const char *start;
   1259   if (ptr >= end)
   1260     return XML_TOK_NONE;
   1261   else if (! HAS_CHAR(enc, ptr, end))
   1262     return XML_TOK_PARTIAL;
   1263   start = ptr;
   1264   while (HAS_CHAR(enc, ptr, end)) {
   1265     switch (BYTE_TYPE(enc, ptr)) {
   1266 #define LEAD_CASE(n) \
   1267     case BT_LEAD ## n: ptr += n; break;
   1268     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
   1269 #undef LEAD_CASE
   1270     case BT_AMP:
   1271       if (ptr == start)
   1272         return PREFIX(scanRef)(enc, ptr + MINBPC(enc), end, nextTokPtr);
   1273       *nextTokPtr = ptr;
   1274       return XML_TOK_DATA_CHARS;
   1275     case BT_PERCNT:
   1276       if (ptr == start) {
   1277         int tok =  PREFIX(scanPercent)(enc, ptr + MINBPC(enc),
   1278                                        end, nextTokPtr);
   1279         return (tok == XML_TOK_PERCENT) ? XML_TOK_INVALID : tok;
   1280       }
   1281       *nextTokPtr = ptr;
   1282       return XML_TOK_DATA_CHARS;
   1283     case BT_LF:
   1284       if (ptr == start) {
   1285         *nextTokPtr = ptr + MINBPC(enc);
   1286         return XML_TOK_DATA_NEWLINE;
   1287       }
   1288       *nextTokPtr = ptr;
   1289       return XML_TOK_DATA_CHARS;
   1290     case BT_CR:
   1291       if (ptr == start) {
   1292         ptr += MINBPC(enc);
   1293         if (! HAS_CHAR(enc, ptr, end))
   1294           return XML_TOK_TRAILING_CR;
   1295         if (BYTE_TYPE(enc, ptr) == BT_LF)
   1296           ptr += MINBPC(enc);
   1297         *nextTokPtr = ptr;
   1298         return XML_TOK_DATA_NEWLINE;
   1299       }
   1300       *nextTokPtr = ptr;
   1301       return XML_TOK_DATA_CHARS;
   1302     default:
   1303       ptr += MINBPC(enc);
   1304       break;
   1305     }
   1306   }
   1307   *nextTokPtr = ptr;
   1308   return XML_TOK_DATA_CHARS;
   1309 }
   1310 
   1311 #ifdef XML_DTD
   1312 
   1313 static int PTRCALL
   1314 PREFIX(ignoreSectionTok)(const ENCODING *enc, const char *ptr,
   1315                          const char *end, const char **nextTokPtr)
   1316 {
   1317   int level = 0;
   1318   if (MINBPC(enc) > 1) {
   1319     size_t n = end - ptr;
   1320     if (n & (MINBPC(enc) - 1)) {
   1321       n &= ~(MINBPC(enc) - 1);
   1322       end = ptr + n;
   1323     }
   1324   }
   1325   while (HAS_CHAR(enc, ptr, end)) {
   1326     switch (BYTE_TYPE(enc, ptr)) {
   1327     INVALID_CASES(ptr, nextTokPtr)
   1328     case BT_LT:
   1329       ptr += MINBPC(enc);
   1330       REQUIRE_CHAR(enc, ptr, end);
   1331       if (CHAR_MATCHES(enc, ptr, ASCII_EXCL)) {
   1332         ptr += MINBPC(enc);
   1333         REQUIRE_CHAR(enc, ptr, end);
   1334         if (CHAR_MATCHES(enc, ptr, ASCII_LSQB)) {
   1335           ++level;
   1336           ptr += MINBPC(enc);
   1337         }
   1338       }
   1339       break;
   1340     case BT_RSQB:
   1341       ptr += MINBPC(enc);
   1342       REQUIRE_CHAR(enc, ptr, end);
   1343       if (CHAR_MATCHES(enc, ptr, ASCII_RSQB)) {
   1344         ptr += MINBPC(enc);
   1345         REQUIRE_CHAR(enc, ptr, end);
   1346         if (CHAR_MATCHES(enc, ptr, ASCII_GT)) {
   1347           ptr += MINBPC(enc);
   1348           if (level == 0) {
   1349             *nextTokPtr = ptr;
   1350             return XML_TOK_IGNORE_SECT;
   1351           }
   1352           --level;
   1353         }
   1354       }
   1355       break;
   1356     default:
   1357       ptr += MINBPC(enc);
   1358       break;
   1359     }
   1360   }
   1361   return XML_TOK_PARTIAL;
   1362 }
   1363 
   1364 #endif /* XML_DTD */
   1365 
   1366 static int PTRCALL
   1367 PREFIX(isPublicId)(const ENCODING *enc, const char *ptr, const char *end,
   1368                    const char **badPtr)
   1369 {
   1370   ptr += MINBPC(enc);
   1371   end -= MINBPC(enc);
   1372   for (; HAS_CHAR(enc, ptr, end); ptr += MINBPC(enc)) {
   1373     switch (BYTE_TYPE(enc, ptr)) {
   1374     case BT_DIGIT:
   1375     case BT_HEX:
   1376     case BT_MINUS:
   1377     case BT_APOS:
   1378     case BT_LPAR:
   1379     case BT_RPAR:
   1380     case BT_PLUS:
   1381     case BT_COMMA:
   1382     case BT_SOL:
   1383     case BT_EQUALS:
   1384     case BT_QUEST:
   1385     case BT_CR:
   1386     case BT_LF:
   1387     case BT_SEMI:
   1388     case BT_EXCL:
   1389     case BT_AST:
   1390     case BT_PERCNT:
   1391     case BT_NUM:
   1392 #ifdef XML_NS
   1393     case BT_COLON:
   1394 #endif
   1395       break;
   1396     case BT_S:
   1397       if (CHAR_MATCHES(enc, ptr, ASCII_TAB)) {
   1398         *badPtr = ptr;
   1399         return 0;
   1400       }
   1401       break;
   1402     case BT_NAME:
   1403     case BT_NMSTRT:
   1404       if (!(BYTE_TO_ASCII(enc, ptr) & ~0x7f))
   1405         break;
   1406     default:
   1407       switch (BYTE_TO_ASCII(enc, ptr)) {
   1408       case 0x24: /* $ */
   1409       case 0x40: /* @ */
   1410         break;
   1411       default:
   1412         *badPtr = ptr;
   1413         return 0;
   1414       }
   1415       break;
   1416     }
   1417   }
   1418   return 1;
   1419 }
   1420 
   1421 /* This must only be called for a well-formed start-tag or empty
   1422    element tag.  Returns the number of attributes.  Pointers to the
   1423    first attsMax attributes are stored in atts.
   1424 */
   1425 
   1426 static int PTRCALL
   1427 PREFIX(getAtts)(const ENCODING *enc, const char *ptr,
   1428                 int attsMax, ATTRIBUTE *atts)
   1429 {
   1430   enum { other, inName, inValue } state = inName;
   1431   int nAtts = 0;
   1432   int open = 0; /* defined when state == inValue;
   1433                    initialization just to shut up compilers */
   1434 
   1435   for (ptr += MINBPC(enc);; ptr += MINBPC(enc)) {
   1436     switch (BYTE_TYPE(enc, ptr)) {
   1437 #define START_NAME \
   1438       if (state == other) { \
   1439         if (nAtts < attsMax) { \
   1440           atts[nAtts].name = ptr; \
   1441           atts[nAtts].normalized = 1; \
   1442         } \
   1443         state = inName; \
   1444       }
   1445 #define LEAD_CASE(n) \
   1446     case BT_LEAD ## n: START_NAME ptr += (n - MINBPC(enc)); break;
   1447     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
   1448 #undef LEAD_CASE
   1449     case BT_NONASCII:
   1450     case BT_NMSTRT:
   1451     case BT_HEX:
   1452       START_NAME
   1453       break;
   1454 #undef START_NAME
   1455     case BT_QUOT:
   1456       if (state != inValue) {
   1457         if (nAtts < attsMax)
   1458           atts[nAtts].valuePtr = ptr + MINBPC(enc);
   1459         state = inValue;
   1460         open = BT_QUOT;
   1461       }
   1462       else if (open == BT_QUOT) {
   1463         state = other;
   1464         if (nAtts < attsMax)
   1465           atts[nAtts].valueEnd = ptr;
   1466         nAtts++;
   1467       }
   1468       break;
   1469     case BT_APOS:
   1470       if (state != inValue) {
   1471         if (nAtts < attsMax)
   1472           atts[nAtts].valuePtr = ptr + MINBPC(enc);
   1473         state = inValue;
   1474         open = BT_APOS;
   1475       }
   1476       else if (open == BT_APOS) {
   1477         state = other;
   1478         if (nAtts < attsMax)
   1479           atts[nAtts].valueEnd = ptr;
   1480         nAtts++;
   1481       }
   1482       break;
   1483     case BT_AMP:
   1484       if (nAtts < attsMax)
   1485         atts[nAtts].normalized = 0;
   1486       break;
   1487     case BT_S:
   1488       if (state == inName)
   1489         state = other;
   1490       else if (state == inValue
   1491                && nAtts < attsMax
   1492                && atts[nAtts].normalized
   1493                && (ptr == atts[nAtts].valuePtr
   1494                    || BYTE_TO_ASCII(enc, ptr) != ASCII_SPACE
   1495                    || BYTE_TO_ASCII(enc, ptr + MINBPC(enc)) == ASCII_SPACE
   1496                    || BYTE_TYPE(enc, ptr + MINBPC(enc)) == open))
   1497         atts[nAtts].normalized = 0;
   1498       break;
   1499     case BT_CR: case BT_LF:
   1500       /* This case ensures that the first attribute name is counted
   1501          Apart from that we could just change state on the quote. */
   1502       if (state == inName)
   1503         state = other;
   1504       else if (state == inValue && nAtts < attsMax)
   1505         atts[nAtts].normalized = 0;
   1506       break;
   1507     case BT_GT:
   1508     case BT_SOL:
   1509       if (state != inValue)
   1510         return nAtts;
   1511       break;
   1512     default:
   1513       break;
   1514     }
   1515   }
   1516   /* not reached */
   1517 }
   1518 
   1519 static int PTRFASTCALL
   1520 PREFIX(charRefNumber)(const ENCODING *UNUSED_P(enc), const char *ptr)
   1521 {
   1522   int result = 0;
   1523   /* skip &# */
   1524   ptr += 2*MINBPC(enc);
   1525   if (CHAR_MATCHES(enc, ptr, ASCII_x)) {
   1526     for (ptr += MINBPC(enc);
   1527          !CHAR_MATCHES(enc, ptr, ASCII_SEMI);
   1528          ptr += MINBPC(enc)) {
   1529       int c = BYTE_TO_ASCII(enc, ptr);
   1530       switch (c) {
   1531       case ASCII_0: case ASCII_1: case ASCII_2: case ASCII_3: case ASCII_4:
   1532       case ASCII_5: case ASCII_6: case ASCII_7: case ASCII_8: case ASCII_9:
   1533         result <<= 4;
   1534         result |= (c - ASCII_0);
   1535         break;
   1536       case ASCII_A: case ASCII_B: case ASCII_C:
   1537       case ASCII_D: case ASCII_E: case ASCII_F:
   1538         result <<= 4;
   1539         result += 10 + (c - ASCII_A);
   1540         break;
   1541       case ASCII_a: case ASCII_b: case ASCII_c:
   1542       case ASCII_d: case ASCII_e: case ASCII_f:
   1543         result <<= 4;
   1544         result += 10 + (c - ASCII_a);
   1545         break;
   1546       }
   1547       if (result >= 0x110000)
   1548         return -1;
   1549     }
   1550   }
   1551   else {
   1552     for (; !CHAR_MATCHES(enc, ptr, ASCII_SEMI); ptr += MINBPC(enc)) {
   1553       int c = BYTE_TO_ASCII(enc, ptr);
   1554       result *= 10;
   1555       result += (c - ASCII_0);
   1556       if (result >= 0x110000)
   1557         return -1;
   1558     }
   1559   }
   1560   return checkCharRefNumber(result);
   1561 }
   1562 
   1563 static int PTRCALL
   1564 PREFIX(predefinedEntityName)(const ENCODING *UNUSED_P(enc), const char *ptr,
   1565                              const char *end)
   1566 {
   1567   switch ((end - ptr)/MINBPC(enc)) {
   1568   case 2:
   1569     if (CHAR_MATCHES(enc, ptr + MINBPC(enc), ASCII_t)) {
   1570       switch (BYTE_TO_ASCII(enc, ptr)) {
   1571       case ASCII_l:
   1572         return ASCII_LT;
   1573       case ASCII_g:
   1574         return ASCII_GT;
   1575       }
   1576     }
   1577     break;
   1578   case 3:
   1579     if (CHAR_MATCHES(enc, ptr, ASCII_a)) {
   1580       ptr += MINBPC(enc);
   1581       if (CHAR_MATCHES(enc, ptr, ASCII_m)) {
   1582         ptr += MINBPC(enc);
   1583         if (CHAR_MATCHES(enc, ptr, ASCII_p))
   1584           return ASCII_AMP;
   1585       }
   1586     }
   1587     break;
   1588   case 4:
   1589     switch (BYTE_TO_ASCII(enc, ptr)) {
   1590     case ASCII_q:
   1591       ptr += MINBPC(enc);
   1592       if (CHAR_MATCHES(enc, ptr, ASCII_u)) {
   1593         ptr += MINBPC(enc);
   1594         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
   1595           ptr += MINBPC(enc);
   1596           if (CHAR_MATCHES(enc, ptr, ASCII_t))
   1597             return ASCII_QUOT;
   1598         }
   1599       }
   1600       break;
   1601     case ASCII_a:
   1602       ptr += MINBPC(enc);
   1603       if (CHAR_MATCHES(enc, ptr, ASCII_p)) {
   1604         ptr += MINBPC(enc);
   1605         if (CHAR_MATCHES(enc, ptr, ASCII_o)) {
   1606           ptr += MINBPC(enc);
   1607           if (CHAR_MATCHES(enc, ptr, ASCII_s))
   1608             return ASCII_APOS;
   1609         }
   1610       }
   1611       break;
   1612     }
   1613   }
   1614   return 0;
   1615 }
   1616 
   1617 static int PTRCALL
   1618 PREFIX(sameName)(const ENCODING *enc, const char *ptr1, const char *ptr2)
   1619 {
   1620   for (;;) {
   1621     switch (BYTE_TYPE(enc, ptr1)) {
   1622 #define LEAD_CASE(n) \
   1623     case BT_LEAD ## n: \
   1624       if (*ptr1++ != *ptr2++) \
   1625         return 0;
   1626     LEAD_CASE(4) LEAD_CASE(3) LEAD_CASE(2)
   1627 #undef LEAD_CASE
   1628       /* fall through */
   1629       if (*ptr1++ != *ptr2++)
   1630         return 0;
   1631       break;
   1632     case BT_NONASCII:
   1633     case BT_NMSTRT:
   1634 #ifdef XML_NS
   1635     case BT_COLON:
   1636 #endif
   1637     case BT_HEX:
   1638     case BT_DIGIT:
   1639     case BT_NAME:
   1640     case BT_MINUS:
   1641       if (*ptr2++ != *ptr1++)
   1642         return 0;
   1643       if (MINBPC(enc) > 1) {
   1644         if (*ptr2++ != *ptr1++)
   1645           return 0;
   1646         if (MINBPC(enc) > 2) {
   1647           if (*ptr2++ != *ptr1++)
   1648             return 0;
   1649           if (MINBPC(enc) > 3) {
   1650             if (*ptr2++ != *ptr1++)
   1651               return 0;
   1652           }
   1653         }
   1654       }
   1655       break;
   1656     default:
   1657       if (MINBPC(enc) == 1 && *ptr1 == *ptr2)
   1658         return 1;
   1659       switch (BYTE_TYPE(enc, ptr2)) {
   1660       case BT_LEAD2:
   1661       case BT_LEAD3:
   1662       case BT_LEAD4:
   1663       case BT_NONASCII:
   1664       case BT_NMSTRT:
   1665 #ifdef XML_NS
   1666       case BT_COLON:
   1667 #endif
   1668       case BT_HEX:
   1669       case BT_DIGIT:
   1670       case BT_NAME:
   1671       case BT_MINUS:
   1672         return 0;
   1673       default:
   1674         return 1;
   1675       }
   1676     }
   1677   }
   1678   /* not reached */
   1679 }
   1680 
   1681 static int PTRCALL
   1682 PREFIX(nameMatchesAscii)(const ENCODING *UNUSED_P(enc), const char *ptr1,
   1683                          const char *end1, const char *ptr2)
   1684 {
   1685   for (; *ptr2; ptr1 += MINBPC(enc), ptr2++) {
   1686     if (end1 - ptr1 < MINBPC(enc))
   1687       return 0;
   1688     if (!CHAR_MATCHES(enc, ptr1, *ptr2))
   1689       return 0;
   1690   }
   1691   return ptr1 == end1;
   1692 }
   1693 
   1694 static int PTRFASTCALL
   1695 PREFIX(nameLength)(const ENCODING *enc, const char *ptr)
   1696 {
   1697   const char *start = ptr;
   1698   for (;;) {
   1699     switch (BYTE_TYPE(enc, ptr)) {
   1700 #define LEAD_CASE(n) \
   1701     case BT_LEAD ## n: ptr += n; break;
   1702     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
   1703 #undef LEAD_CASE
   1704     case BT_NONASCII:
   1705     case BT_NMSTRT:
   1706 #ifdef XML_NS
   1707     case BT_COLON:
   1708 #endif
   1709     case BT_HEX:
   1710     case BT_DIGIT:
   1711     case BT_NAME:
   1712     case BT_MINUS:
   1713       ptr += MINBPC(enc);
   1714       break;
   1715     default:
   1716       return (int)(ptr - start);
   1717     }
   1718   }
   1719 }
   1720 
   1721 static const char * PTRFASTCALL
   1722 PREFIX(skipS)(const ENCODING *enc, const char *ptr)
   1723 {
   1724   for (;;) {
   1725     switch (BYTE_TYPE(enc, ptr)) {
   1726     case BT_LF:
   1727     case BT_CR:
   1728     case BT_S:
   1729       ptr += MINBPC(enc);
   1730       break;
   1731     default:
   1732       return ptr;
   1733     }
   1734   }
   1735 }
   1736 
   1737 static void PTRCALL
   1738 PREFIX(updatePosition)(const ENCODING *enc,
   1739                        const char *ptr,
   1740                        const char *end,
   1741                        POSITION *pos)
   1742 {
   1743   while (HAS_CHAR(enc, ptr, end)) {
   1744     switch (BYTE_TYPE(enc, ptr)) {
   1745 #define LEAD_CASE(n) \
   1746     case BT_LEAD ## n: \
   1747       ptr += n; \
   1748       break;
   1749     LEAD_CASE(2) LEAD_CASE(3) LEAD_CASE(4)
   1750 #undef LEAD_CASE
   1751     case BT_LF:
   1752       pos->columnNumber = (XML_Size)-1;
   1753       pos->lineNumber++;
   1754       ptr += MINBPC(enc);
   1755       break;
   1756     case BT_CR:
   1757       pos->lineNumber++;
   1758       ptr += MINBPC(enc);
   1759       if (HAS_CHAR(enc, ptr, end) && BYTE_TYPE(enc, ptr) == BT_LF)
   1760         ptr += MINBPC(enc);
   1761       pos->columnNumber = (XML_Size)-1;
   1762       break;
   1763     default:
   1764       ptr += MINBPC(enc);
   1765       break;
   1766     }
   1767     pos->columnNumber++;
   1768   }
   1769 }
   1770 
   1771 #undef DO_LEAD_CASE
   1772 #undef MULTIBYTE_CASES
   1773 #undef INVALID_CASES
   1774 #undef CHECK_NAME_CASE
   1775 #undef CHECK_NAME_CASES
   1776 #undef CHECK_NMSTRT_CASE
   1777 #undef CHECK_NMSTRT_CASES
   1778 
   1779 #endif /* XML_TOK_IMPL_C */
   1780