Home | History | Annotate | Download | only in bldcsndfa
      1 /**
      2 ***     Build a deterministic finite automaton to associate CCSIDs with
      3 ***             character set names.
      4 ***
      5 ***     Compile on OS/400 with options SYSIFCOPT(*IFSIO).
      6 ***
      7 ***     See Copyright for the status of this software.
      8 ***
      9 ***     Author: Patrick Monnerat <pm (at) datasphere.ch>, DATASPHERE S.A.
     10 **/
     11 
     12 #include <stdio.h>
     13 #include <errno.h>
     14 #include <stdlib.h>
     15 #include <string.h>
     16 #include <fcntl.h>
     17 #include <ctype.h>
     18 
     19 #include <iconv.h>
     20 
     21 
     22 #ifdef OLDXML
     23 #include "xml.h"
     24 #else
     25 #include <libxml/hash.h>
     26 #include <libxml/parser.h>
     27 #include <libxml/xpath.h>
     28 #include <libxml/xpathInternals.h>
     29 #endif
     30 
     31 
     32 #ifdef __OS400__
     33 #define iconv_open_error(cd)            ((cd).return_value == -1)
     34 #define set_iconv_open_error(cd)        ((cd).return_value = -1)
     35 #else
     36 #define iconv_open_error(cd)            ((cd) == (iconv_t) -1)
     37 #define set_iconv_open_error(cd)        ((cd) = (iconv_t) -1)
     38 #endif
     39 
     40 
     41 #define C_SOURCE_CCSID          500
     42 #define C_UTF8_CCSID            1208
     43 
     44 
     45 #define UTF8_SPACE      0x20
     46 #define UTF8_HT         0x09
     47 #define UTF8_0          0x30
     48 #define UTF8_9          0x39
     49 #define UTF8_A          0x41
     50 #define UTF8_Z          0x5A
     51 #define UTF8_a          0x61
     52 #define UTF8_z          0x7A
     53 
     54 
     55 #define GRANULE         128             /* Memory allocation granule. */
     56 
     57 #define EPSILON         0x100           /* Token for empty transition. */
     58 
     59 
     60 #ifndef OFFSETOF
     61 #define OFFSETOF(t, f)  ((unsigned int) ((char *) &((t *) 0)->f - (char *) 0))
     62 #endif
     63 
     64 #ifndef OFFSETBY
     65 #define OFFSETBY(t, p, o)       ((t *) ((char *) (p) + (unsigned int) (o)))
     66 #endif
     67 
     68 
     69 typedef struct t_transition     t_transition;   /* NFA/DFA transition. */
     70 typedef struct t_state          t_state;        /* NFA/DFA state node. */
     71 typedef struct t_symlist        t_symlist;      /* Symbol (i.e.: name) list. */
     72 typedef struct t_chset          t_chset;        /* Character set. */
     73 typedef struct t_stategroup     t_stategroup;   /* Optimization group. */
     74 typedef unsigned char           utf8char;       /* UTF-8 character byte. */
     75 typedef unsigned char           byte;           /* Untyped data byte. */
     76 
     77 
     78 typedef struct {                        /* Set of pointers. */
     79         unsigned int    p_size;         /* Current allocated size. */
     80         unsigned int    p_card;         /* Current element count. */
     81         void *          p_set[1];       /* Element array. */
     82 }               t_powerset;
     83 
     84 
     85 struct t_transition {
     86         t_transition *  t_forwprev;     /* Head of forward transition list. */
     87         t_transition *  t_forwnext;     /* Tail of forward transition list. */
     88         t_transition *  t_backprev;     /* Head of backward transition list. */
     89         t_transition *  t_backnext;     /* Tail of backward transition list. */
     90         t_state *       t_from;         /* Incoming state. */
     91         t_state *       t_to;           /* Destination state. */
     92         unsigned short  t_token;        /* Transition token. */
     93         unsigned int    t_index;        /* Transition array index. */
     94 };
     95 
     96 
     97 struct t_state {
     98         t_state *       s_next;         /* Next state (for DFA construction). */
     99         t_state *       s_stack;        /* Unprocessed DFA states stack. */
    100         t_transition *  s_forward;      /* Forward transitions. */
    101         t_transition *  s_backward;     /* Backward transitions. */
    102         t_chset *       s_final;        /* Recognized character set. */
    103         t_powerset *    s_nfastates;    /* Corresponding NFA states. */
    104         unsigned int    s_index;        /* State index. */
    105 };
    106 
    107 
    108 struct t_symlist {
    109         t_symlist *     l_next;         /* Next name in list. */
    110         utf8char        l_symbol[1];    /* Name bytes. */
    111 };
    112 
    113 
    114 struct t_chset {
    115         t_chset *       c_next;         /* Next character set. */
    116         t_symlist *     c_names;        /* Character set name list. */
    117         iconv_t         c_fromUTF8;     /* Conversion from UTF-8. */
    118         unsigned int    c_ccsid;        /* IBM character set code. */
    119         unsigned int    c_mibenum;      /* IANA character code. */
    120 };
    121 
    122 
    123 struct t_stategroup {
    124         t_stategroup *  g_next;         /* Next group. */
    125         t_state *       g_member;       /* Group member (s_stack) list. */
    126         unsigned int    g_id;           /* Group ident. */
    127 };
    128 
    129 
    130 
    131 t_chset *       chset_list;             /* Character set list. */
    132 t_state *       initial_state;          /* Initial NFA state. */
    133 iconv_t         job2utf8;               /* Job CCSID to UTF-8 conversion. */
    134 iconv_t         utf82job;               /* UTF-8 to job CCSID conversion. */
    135 t_state *       dfa_states;             /* List of DFA states. */
    136 unsigned int    groupid;                /* Group ident counter. */
    137 
    138 
    139 /**
    140 ***     UTF-8 strings.
    141 **/
    142 
    143 #pragma convert(819)
    144 
    145 static const utf8char   utf8_MIBenum[] = "MIBenum";
    146 static const utf8char   utf8_mibenum[] = "mibenum";
    147 static const utf8char   utf8_ibm_[] = "ibm-";
    148 static const utf8char   utf8_IBMCCSID[] = "IBMCCSID";
    149 static const utf8char   utf8_iana_[] = "iana-";
    150 static const utf8char   utf8_Name[] = "Name";
    151 static const utf8char   utf8_Pref_MIME_Name[] = "Preferred MIME Name";
    152 static const utf8char   utf8_Aliases[] = "Aliases";
    153 static const utf8char   utf8_html[] = "html";
    154 static const utf8char   utf8_htmluri[] = "http://www.w3.org/1999/xhtml";
    155 static const utf8char   utf8_A[] = "A";
    156 static const utf8char   utf8_C[] = "C";
    157 static const utf8char   utf8_M[] = "M";
    158 static const utf8char   utf8_N[] = "N";
    159 static const utf8char   utf8_P[] = "P";
    160 static const utf8char   utf8_T[] = "T";
    161 static const utf8char   utf8_ccsid[] = "ccsid";
    162 static const utf8char   utf8_EBCDIC[] = "EBCDIC";
    163 static const utf8char   utf8_ASCII[] = "ASCII";
    164 static const utf8char   utf8_assocnodes[] = "/ccsid_mibenum/assoc[@ccsid]";
    165 static const utf8char   utf8_aliastext[] =
    166                                 "/ccsid_mibenum/assoc[@ccsid=$C]/alias/text()";
    167 #ifdef OLDXML
    168 static const utf8char   utf8_tablerows[] =
    169                         "//table[@id='table-character-sets-1']/*/tr";
    170 static const utf8char   utf8_headerpos[] =
    171                 "count(th[text()=$T]/preceding-sibling::th)+1";
    172 static const utf8char   utf8_getmibenum[] = "number(td[$M])";
    173 static const utf8char   utf8_getprefname[] = "string(td[$P])";
    174 static const utf8char   utf8_getname[] = "string(td[$N])";
    175 static const utf8char   utf8_getaliases[] = "td[$A]/text()";
    176 #else
    177 static const utf8char   utf8_tablerows[] =
    178                         "//html:table[@id='table-character-sets-1']/*/html:tr";
    179 static const utf8char   utf8_headerpos[] =
    180                 "count(html:th[text()=$T]/preceding-sibling::html:th)+1";
    181 static const utf8char   utf8_getmibenum[] = "number(html:td[$M])";
    182 static const utf8char   utf8_getprefname[] = "string(html:td[$P])";
    183 static const utf8char   utf8_getname[] = "string(html:td[$N])";
    184 static const utf8char   utf8_getaliases[] = "html:td[$A]/text()";
    185 #endif
    186 
    187 #pragma convert(0)
    188 
    189 
    190 /**
    191 ***     UTF-8 character length table.
    192 ***
    193 ***     Index is first character byte, value is the character byte count.
    194 **/
    195 
    196 static signed char      utf8_chlen[] = {
    197 /* 00-07 */     1,      1,      1,      1,      1,      1,      1,      1,
    198 /* 08-0F */     1,      1,      1,      1,      1,      1,      1,      1,
    199 /* 10-17 */     1,      1,      1,      1,      1,      1,      1,      1,
    200 /* 18-1F */     1,      1,      1,      1,      1,      1,      1,      1,
    201 /* 20-27 */     1,      1,      1,      1,      1,      1,      1,      1,
    202 /* 28-2F */     1,      1,      1,      1,      1,      1,      1,      1,
    203 /* 30-37 */     1,      1,      1,      1,      1,      1,      1,      1,
    204 /* 38-3F */     1,      1,      1,      1,      1,      1,      1,      1,
    205 /* 40-47 */     1,      1,      1,      1,      1,      1,      1,      1,
    206 /* 48-4F */     1,      1,      1,      1,      1,      1,      1,      1,
    207 /* 50-57 */     1,      1,      1,      1,      1,      1,      1,      1,
    208 /* 58-5F */     1,      1,      1,      1,      1,      1,      1,      1,
    209 /* 60-67 */     1,      1,      1,      1,      1,      1,      1,      1,
    210 /* 68-6F */     1,      1,      1,      1,      1,      1,      1,      1,
    211 /* 70-77 */     1,      1,      1,      1,      1,      1,      1,      1,
    212 /* 78-7F */     1,      1,      1,      1,      1,      1,      1,      1,
    213 /* 80-87 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
    214 /* 88-8F */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
    215 /* 90-97 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
    216 /* 98-9F */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
    217 /* A0-A7 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
    218 /* A8-AF */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
    219 /* B0-B7 */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
    220 /* B8-BF */     -1,     -1,     -1,     -1,     -1,     -1,     -1,     -1,
    221 /* C0-C7 */     2,      2,      2,      2,      2,      2,      2,      2,
    222 /* C8-CF */     2,      2,      2,      2,      2,      2,      2,      2,
    223 /* D0-D7 */     2,      2,      2,      2,      2,      2,      2,      2,
    224 /* D8-DF */     2,      2,      2,      2,      2,      2,      2,      2,
    225 /* E0-E7 */     3,      3,      3,      3,      3,      3,      3,      3,
    226 /* E8-EF */     3,      3,      3,      3,      3,      3,      3,      3,
    227 /* F0-F7 */     4,      4,      4,      4,      4,      4,      4,      4,
    228 /* F8-FF */     5,      5,      5,      5,      6,      6,      -1,     -1
    229 };
    230 
    231 
    232 
    233 void
    234 chknull(void * p)
    235 
    236 {
    237         if (p)
    238                 return;
    239 
    240         fprintf(stderr, "Not enough memory\n");
    241         exit(1);
    242 }
    243 
    244 
    245 void
    246 makecode(char * buf, unsigned int ccsid)
    247 
    248 {
    249         ccsid &= 0xFFFF;
    250         memset(buf, 0, 32);
    251         sprintf(buf, "IBMCCSID%05u0000000", ccsid);
    252 }
    253 
    254 
    255 iconv_t
    256 iconv_open_ccsid(unsigned int ccsidout,
    257                                 unsigned int ccsidin, unsigned int nullflag)
    258 
    259 {
    260         char fromcode[33];
    261         char tocode[33];
    262 
    263         makecode(fromcode, ccsidin);
    264         makecode(tocode, ccsidout);
    265         memset(tocode + 13, 0, sizeof tocode - 13);
    266 
    267         if (nullflag)
    268                 fromcode[18] = '1';
    269 
    270         return iconv_open(tocode, fromcode);
    271 }
    272 
    273 
    274 unsigned int
    275 getnum(char * * cpp)
    276 
    277 {
    278         unsigned int n;
    279         char * cp;
    280 
    281         cp = *cpp;
    282         n = 0;
    283 
    284         while (isdigit(*cp))
    285                 n = 10 * n + *cp++ - '0';
    286 
    287         *cpp = cp;
    288         return n;
    289 }
    290 
    291 
    292 const utf8char *
    293 hashBinaryKey(const byte * bytes, unsigned int len)
    294 
    295 {
    296         const byte * bp;
    297         utf8char * key;
    298         utf8char * cp;
    299         unsigned int n;
    300         unsigned int n4;
    301         unsigned int i;
    302 
    303         /**
    304         ***     Encode binary data in character form to be used as hash
    305         ***             table key.
    306         **/
    307 
    308         n = (4 * len + 2) / 3;
    309         key = (utf8char *) malloc(n + 1);
    310         chknull(key);
    311         bp = bytes;
    312         cp = key;
    313 
    314         for (n4 = n >> 2; n4; n4--) {
    315                 i = (bp[0] << 16) | (bp[1] << 8) | bp[2];
    316                 *cp++ = 0x21 + ((i >> 18) & 0x3F);
    317                 *cp++ = 0x21 + ((i >> 12) & 0x3F);
    318                 *cp++ = 0x21 + ((i >> 6) & 0x3F);
    319                 *cp++ = 0x21 + (i & 0x3F);
    320                 bp += 3;
    321                 }
    322 
    323         switch (n & 0x3) {
    324 
    325         case 2:
    326                 *cp++ = 0x21 + ((*bp >> 2) & 0x3F);
    327                 *cp++ = 0x21 + ((*bp << 4) & 0x3F);
    328                 break;
    329 
    330         case 3:
    331                 i = (bp[0] << 8) | bp[1];
    332                 *cp++ = 0x21 + ((i >> 10) & 0x3F);
    333                 *cp++ = 0x21 + ((i >> 4) & 0x3F);
    334                 *cp++ = 0x21 + ((i << 2) & 0x3F);
    335                 break;
    336                 }
    337 
    338         *cp = '\0';
    339         return key;
    340 }
    341 
    342 
    343 void *
    344 hash_get(xmlHashTablePtr h, const void * binkey, unsigned int len)
    345 
    346 {
    347         const utf8char * key;
    348         void * result;
    349 
    350         key = hashBinaryKey((const byte *) binkey, len);
    351         result = xmlHashLookup(h, key);
    352         free((char *) key);
    353         return result;
    354 }
    355 
    356 
    357 int
    358 hash_add(xmlHashTablePtr h, const void * binkey, unsigned int len, void * data)
    359 
    360 {
    361         const utf8char * key;
    362         int result;
    363 
    364         key = hashBinaryKey((const byte *) binkey, len);
    365         result = xmlHashAddEntry(h, key, data);
    366         free((char *) key);
    367         return result;
    368 }
    369 
    370 
    371 xmlDocPtr
    372 loadXMLFile(const char * filename)
    373 
    374 {
    375         struct stat sbuf;
    376         byte * databuf;
    377         int fd;
    378         int i;
    379         xmlDocPtr doc;
    380 
    381         if (stat(filename, &sbuf))
    382                 return (xmlDocPtr) NULL;
    383 
    384         databuf = malloc(sbuf.st_size + 4);
    385 
    386         if (!databuf)
    387                 return (xmlDocPtr) NULL;
    388 
    389         fd = open(filename, O_RDONLY
    390 #ifdef O_BINARY
    391                                          | O_BINARY
    392 #endif
    393                                                         );
    394 
    395         if (fd < 0) {
    396                 free((char *) databuf);
    397                 return (xmlDocPtr) NULL;
    398                 }
    399 
    400         i = read(fd, (char *) databuf, sbuf.st_size);
    401         close(fd);
    402 
    403         if (i != sbuf.st_size) {
    404                 free((char *) databuf);
    405                 return (xmlDocPtr) NULL;
    406                 }
    407 
    408         databuf[i] = databuf[i + 1] = databuf[i + 2] = databuf[i + 3] = 0;
    409         doc = xmlParseMemory((xmlChar *) databuf, i);
    410         free((char *) databuf);
    411         return doc;
    412 }
    413 
    414 
    415 int
    416 match(char * * cpp, char * s)
    417 
    418 {
    419         char * cp;
    420         int c1;
    421         int c2;
    422 
    423         cp = *cpp;
    424 
    425         for (cp = *cpp; c2 = *s++; cp++) {
    426                 c1 = *cp;
    427 
    428                 if (c1 != c2) {
    429                         if (isupper(c1))
    430                                 c1 = tolower(c1);
    431 
    432                         if (isupper(c2))
    433                                 c2 = tolower(c2);
    434                         }
    435 
    436                 if (c1 != c2)
    437                         return 0;
    438                 }
    439 
    440         c1 = *cp;
    441 
    442         while (c1 == ' ' || c1 == '\t')
    443                 c1 = *++cp;
    444 
    445         *cpp = cp;
    446         return 1;
    447 }
    448 
    449 
    450 t_state *
    451 newstate(void)
    452 
    453 {
    454         t_state * s;
    455 
    456         s = (t_state *) malloc(sizeof *s);
    457         chknull(s);
    458         memset((char *) s, 0, sizeof *s);
    459         return s;
    460 }
    461 
    462 
    463 void
    464 unlink_transition(t_transition * t)
    465 
    466 {
    467         if (t->t_backnext)
    468                 t->t_backnext->t_backprev = t->t_backprev;
    469 
    470         if (t->t_backprev)
    471                 t->t_backprev->t_backnext = t->t_backnext;
    472         else if (t->t_to)
    473                 t->t_to->s_backward = t->t_backnext;
    474 
    475         if (t->t_forwnext)
    476                 t->t_forwnext->t_forwprev = t->t_forwprev;
    477 
    478         if (t->t_forwprev)
    479                 t->t_forwprev->t_forwnext = t->t_forwnext;
    480         else if (t->t_from)
    481                 t->t_from->s_forward = t->t_forwnext;
    482 
    483         t->t_backprev = (t_transition *) NULL;
    484         t->t_backnext = (t_transition *) NULL;
    485         t->t_forwprev = (t_transition *) NULL;
    486         t->t_forwnext = (t_transition *) NULL;
    487         t->t_from = (t_state *) NULL;
    488         t->t_to = (t_state *) NULL;
    489 }
    490 
    491 
    492 void
    493 link_transition(t_transition * t, t_state * from, t_state * to)
    494 
    495 {
    496         if (!from)
    497                 from = t->t_from;
    498 
    499         if (!to)
    500                 to = t->t_to;
    501 
    502         unlink_transition(t);
    503 
    504         if ((t->t_from = from)) {
    505                 if ((t->t_forwnext = from->s_forward))
    506                         t->t_forwnext->t_forwprev = t;
    507 
    508                 from->s_forward = t;
    509                 }
    510 
    511         if ((t->t_to = to)) {
    512                 if ((t->t_backnext = to->s_backward))
    513                         t->t_backnext->t_backprev = t;
    514 
    515                 to->s_backward = t;
    516                 }
    517 }
    518 
    519 
    520 t_transition *
    521 newtransition(unsigned int token, t_state * from, t_state * to)
    522 
    523 {
    524         t_transition * t;
    525 
    526         t = (t_transition *) malloc(sizeof *t);
    527         chknull(t);
    528         memset((char *) t, 0, sizeof *t);
    529         t->t_token = token;
    530         link_transition(t, from, to);
    531         return t;
    532 }
    533 
    534 
    535 t_transition *
    536 uniquetransition(unsigned int token, t_state * from, t_state * to)
    537 
    538 {
    539         t_transition * t;
    540 
    541         for (t = from->s_forward; t; t = t->t_forwnext)
    542                 if (t->t_token == token && (t->t_to == to || !to))
    543                         return t;
    544 
    545         return to? newtransition(token, from, to): (t_transition *) NULL;
    546 }
    547 
    548 
    549 int
    550 set_position(t_powerset * s, void * e)
    551 
    552 {
    553         unsigned int l;
    554         unsigned int h;
    555         unsigned int m;
    556         int i;
    557 
    558         l = 0;
    559         h = s->p_card;
    560 
    561         while (l < h) {
    562                 m = (l + h) >> 1;
    563 
    564                 /**
    565                 ***     If both pointers belong to different allocation arenas,
    566                 ***             native comparison may find them neither
    567                 ***             equal, nor greater, nor smaller.
    568                 ***     We thus compare using memcmp() to get an orthogonal
    569                 ***             result.
    570                 **/
    571 
    572                 i = memcmp(&e, s->p_set + m, sizeof e);
    573 
    574                 if (i < 0)
    575                         h = m;
    576                 else if (!i)
    577                         return m;
    578                 else
    579                         l = m + 1;
    580                 }
    581 
    582         return l;
    583 }
    584 
    585 
    586 t_powerset *
    587 set_include(t_powerset * s, void * e)
    588 
    589 {
    590         unsigned int pos;
    591         unsigned int n;
    592 
    593         if (!s) {
    594                 s = (t_powerset *) malloc(sizeof *s +
    595                     GRANULE * sizeof s->p_set);
    596                 chknull(s);
    597                 s->p_size = GRANULE;
    598                 s->p_set[GRANULE] = (t_state *) NULL;
    599                 s->p_set[0] = e;
    600                 s->p_card = 1;
    601                 return s;
    602                 }
    603 
    604         pos = set_position(s, e);
    605 
    606         if (pos < s->p_card && s->p_set[pos] == e)
    607                 return s;
    608 
    609         if (s->p_card >= s->p_size) {
    610                 s->p_size += GRANULE;
    611                 s = (t_powerset *) realloc(s,
    612                     sizeof *s + s->p_size * sizeof s->p_set);
    613                 chknull(s);
    614                 s->p_set[s->p_size] = (t_state *) NULL;
    615                 }
    616 
    617         n = s->p_card - pos;
    618 
    619         if (n)
    620                 memmove((char *) (s->p_set + pos + 1),
    621                     (char *) (s->p_set + pos), n * sizeof s->p_set[0]);
    622 
    623         s->p_set[pos] = e;
    624         s->p_card++;
    625         return s;
    626 }
    627 
    628 
    629 t_state *
    630 nfatransition(t_state * to, byte token)
    631 
    632 {
    633         t_state * from;
    634 
    635         from = newstate();
    636         newtransition(token, from, to);
    637         return from;
    638 }
    639 
    640 
    641 static t_state *        nfadevelop(t_state * from, t_state * final, iconv_t icc,
    642                                 const utf8char * name, unsigned int len);
    643 
    644 
    645 void
    646 nfaslice(t_state * * from, t_state * * to, iconv_t icc,
    647                 const utf8char * chr, unsigned int chlen,
    648                 const utf8char * name, unsigned int len, t_state * final)
    649 
    650 {
    651         char * srcp;
    652         char * dstp;
    653         size_t srcc;
    654         size_t dstc;
    655         unsigned int cnt;
    656         t_state * f;
    657         t_state * t;
    658         t_transition * tp;
    659         byte bytebuf[8];
    660 
    661         srcp = (char *) chr;
    662         srcc = chlen;
    663         dstp = (char *) bytebuf;
    664         dstc = sizeof bytebuf;
    665         iconv(icc, &srcp, &srcc, &dstp, &dstc);
    666         dstp = (char *) bytebuf;
    667         cnt = sizeof bytebuf - dstc;
    668         t = *to;
    669         f = *from;
    670 
    671         /**
    672         ***     Check for end of string.
    673         **/
    674 
    675         if (!len)
    676                 if (t && t != final)
    677                         uniquetransition(EPSILON, t, final);
    678                 else
    679                         t = final;
    680 
    681         if (f)
    682                 while (cnt) {
    683                         tp = uniquetransition(*dstp, f, (t_state *) NULL);
    684 
    685                         if (!tp)
    686                                 break;
    687 
    688                         f = tp->t_to;
    689                         dstp++;
    690                         cnt--;
    691                         }
    692 
    693         if (!cnt) {
    694                 if (!t)
    695                         t = nfadevelop(f, final, icc, name, len);
    696 
    697                 *to = t;
    698                 return;
    699                 }
    700 
    701         if (!t) {
    702                 t = nfadevelop((t_state *) NULL, final, icc, name, len);
    703                 *to = t;
    704                 }
    705 
    706         if (!f)
    707                 *from = f = newstate();
    708 
    709         while (cnt > 1)
    710                 t = nfatransition(t, dstp[--cnt]);
    711 
    712         newtransition(*dstp, f, t);
    713 }
    714 
    715 
    716 t_state *
    717 nfadevelop(t_state * from, t_state * final, iconv_t icc,
    718                                         const utf8char * name, unsigned int len)
    719 
    720 {
    721         int chlen;
    722         int i;
    723         t_state * to;
    724         int uccnt;
    725         int lccnt;
    726         utf8char chr;
    727 
    728         chlen = utf8_chlen[*name];
    729 
    730         for (i = 1; i < chlen; i++)
    731                 if ((name[i] & 0xC0) != 0x80)
    732                         break;
    733 
    734         if (i != chlen) {
    735                 fprintf(stderr,
    736                     "Invalid UTF8 character in character set name\n");
    737                 return (t_state *) NULL;
    738                 }
    739 
    740         to = (t_state *) NULL;
    741         nfaslice(&from, &to,
    742             icc, name, chlen, name + chlen, len - chlen, final);
    743 
    744         if (*name >= UTF8_a && *name <= UTF8_z)
    745                 chr = *name - UTF8_a + UTF8_A;
    746         else if (*name >= UTF8_A && *name <= UTF8_Z)
    747                 chr = *name - UTF8_A + UTF8_a;
    748         else
    749                 return from;
    750 
    751         nfaslice(&from, &to, icc, &chr, 1, name + chlen, len - chlen, final);
    752         return from;
    753 }
    754 
    755 
    756 
    757 void
    758 nfaenter(const utf8char * name, int len, t_chset * charset)
    759 
    760 {
    761         t_chset * s;
    762         t_state * final;
    763         t_state * sp;
    764         t_symlist * lp;
    765 
    766         /**
    767         ***     Enter case-insensitive `name' in NFA in all known
    768         ***             character codes.
    769         ***     Redundant shift state changes as well as shift state
    770         ***             differences between uppercase and lowercase are
    771         ***             not handled.
    772         **/
    773 
    774         if (len < 0)
    775                 len = strlen(name) + 1;
    776 
    777         for (lp = charset->c_names; lp; lp = lp->l_next)
    778                 if (!memcmp(name, lp->l_symbol, len))
    779                         return;         /* Already entered. */
    780 
    781         lp = (t_symlist *) malloc(sizeof *lp + len);
    782         chknull(lp);
    783         memcpy(lp->l_symbol, name, len);
    784         lp->l_symbol[len] = '\0';
    785         lp->l_next = charset->c_names;
    786         charset->c_names = lp;
    787         final = newstate();
    788         final->s_final = charset;
    789 
    790         for (s = chset_list; s; s = s->c_next)
    791                 if (!iconv_open_error(s->c_fromUTF8))
    792                         sp = nfadevelop(initial_state, final,
    793                             s->c_fromUTF8, name, len);
    794 }
    795 
    796 
    797 unsigned int
    798 utf8_utostr(utf8char * s, unsigned int v)
    799 
    800 {
    801         unsigned int d;
    802         unsigned int i;
    803 
    804         d = v / 10;
    805         v -= d * 10;
    806         i = d? utf8_utostr(s, d): 0;
    807         s[i++] = v + UTF8_0;
    808         s[i] = '\0';
    809         return i;
    810 }
    811 
    812 
    813 unsigned int
    814 utf8_utostrpad(utf8char * s, unsigned int v, int digits)
    815 
    816 {
    817         unsigned int i = utf8_utostr(s, v);
    818         utf8char pad = UTF8_SPACE;
    819 
    820         if (digits < 0) {
    821                 pad = UTF8_0;
    822                 digits = -digits;
    823                 }
    824 
    825         if (i >= digits)
    826                 return i;
    827 
    828         memmove(s + digits - i, s, i + 1);
    829         memset(s, pad, digits - i);
    830         return digits;
    831 }
    832 
    833 
    834 unsigned int
    835 utf8_strtou(const utf8char * s)
    836 
    837 {
    838         unsigned int v;
    839 
    840         while (*s == UTF8_SPACE || *s == UTF8_HT)
    841                 s++;
    842 
    843         for (v = 0; *s >= UTF8_0 && *s <= UTF8_9;)
    844                 v = 10 * v + *s++ - UTF8_0;
    845 
    846         return v;
    847 }
    848 
    849 
    850 unsigned int
    851 getNumAttr(xmlNodePtr node, const xmlChar * name)
    852 
    853 {
    854         const xmlChar * s;
    855         unsigned int val;
    856 
    857         s = xmlGetProp(node, name);
    858 
    859         if (!s)
    860                 return 0;
    861 
    862         val = utf8_strtou(s);
    863         xmlFree((xmlChar *) s);
    864         return val;
    865 }
    866 
    867 
    868 void
    869 read_assocs(const char * filename)
    870 
    871 {
    872         xmlDocPtr doc;
    873         xmlXPathContextPtr ctxt;
    874         xmlXPathObjectPtr obj;
    875         xmlNodePtr node;
    876         t_chset * sp;
    877         int i;
    878         unsigned int ccsid;
    879         unsigned int mibenum;
    880         utf8char symbuf[32];
    881 
    882         doc = loadXMLFile(filename);
    883 
    884         if (!doc) {
    885                 fprintf(stderr, "Cannot load file %s\n", filename);
    886                 exit(1);
    887                 }
    888 
    889         ctxt = xmlXPathNewContext(doc);
    890         obj = xmlXPathEval(utf8_assocnodes, ctxt);
    891 
    892         if (!obj || obj->type != XPATH_NODESET || !obj->nodesetval ||
    893             !obj->nodesetval->nodeTab || !obj->nodesetval->nodeNr) {
    894                 fprintf(stderr, "No association found in %s\n", filename);
    895                 exit(1);
    896                 }
    897 
    898         for (i = 0; i < obj->nodesetval->nodeNr; i++) {
    899                 node = obj->nodesetval->nodeTab[i];
    900                 ccsid = getNumAttr(node, utf8_ccsid);
    901                 mibenum = getNumAttr(node, utf8_mibenum);
    902 
    903                 /**
    904                 ***     Check for duplicate.
    905                 **/
    906 
    907                 for (sp = chset_list; sp; sp = sp->c_next)
    908                         if (ccsid && ccsid == sp->c_ccsid ||
    909                             mibenum && mibenum == sp->c_mibenum) {
    910                                 fprintf(stderr, "Duplicate character set: ");
    911                                 fprintf(stderr, "CCSID = %u/%u, ",
    912                                     ccsid, sp->c_ccsid);
    913                                 fprintf(stderr, "MIBenum = %u/%u\n",
    914                                     mibenum, sp->c_mibenum);
    915                                 break;
    916                                 }
    917 
    918                 if (sp)
    919                         continue;
    920 
    921                 /**
    922                 ***     Allocate the new character set.
    923                 **/
    924 
    925                 sp = (t_chset *) malloc(sizeof *sp);
    926                 chknull(sp);
    927                 memset(sp, 0, sizeof *sp);
    928 
    929                 if (!ccsid)     /* Do not attempt with current job CCSID. */
    930                         set_iconv_open_error(sp->c_fromUTF8);
    931                 else {
    932                         sp->c_fromUTF8 =
    933                             iconv_open_ccsid(ccsid, C_UTF8_CCSID, 0);
    934 
    935                         if (iconv_open_error(sp->c_fromUTF8) == -1)
    936                                 fprintf(stderr,
    937                                     "Cannot convert into CCSID %u: ignored\n",
    938                                     ccsid);
    939                         }
    940 
    941                 sp->c_ccsid = ccsid;
    942                 sp->c_mibenum = mibenum;
    943                 sp->c_next = chset_list;
    944                 chset_list = sp;
    945                 }
    946 
    947         xmlXPathFreeObject(obj);
    948 
    949         /**
    950         ***     Enter aliases.
    951         **/
    952 
    953         for (sp = chset_list; sp; sp = sp->c_next) {
    954                 strcpy(symbuf, utf8_ibm_);
    955                 utf8_utostr(symbuf + 4, sp->c_ccsid);
    956                 nfaenter(symbuf, -1, sp);
    957                 strcpy(symbuf, utf8_IBMCCSID);
    958                 utf8_utostrpad(symbuf + 8, sp->c_ccsid, -5);
    959                 nfaenter(symbuf, 13, sp);       /* Not null-terminated. */
    960 
    961                 if (sp->c_mibenum) {
    962                         strcpy(symbuf, utf8_iana_);
    963                         utf8_utostr(symbuf + 5, sp->c_mibenum);
    964                         nfaenter(symbuf, -1, sp);
    965                         }
    966 
    967                 xmlXPathRegisterVariable(ctxt, utf8_C,
    968                     xmlXPathNewFloat((double) sp->c_ccsid));
    969                 obj = xmlXPathEval(utf8_aliastext, ctxt);
    970 
    971                 if (!obj || obj->type != XPATH_NODESET) {
    972                         fprintf(stderr, "getAlias failed in %s\n", filename);
    973                         exit(1);
    974                         }
    975 
    976                 if (obj->nodesetval &&
    977                     obj->nodesetval->nodeTab && obj->nodesetval->nodeNr) {
    978                         for (i = 0; i < obj->nodesetval->nodeNr; i++) {
    979                                 node = obj->nodesetval->nodeTab[i];
    980                                 nfaenter(node->content, -1, sp);
    981                                 }
    982                         }
    983 
    984                 xmlXPathFreeObject(obj);
    985                 }
    986 
    987         xmlXPathFreeContext(ctxt);
    988         xmlFreeDoc(doc);
    989 }
    990 
    991 
    992 unsigned int
    993 columnPosition(xmlXPathContextPtr ctxt, const xmlChar * header)
    994 
    995 {
    996         xmlXPathObjectPtr obj;
    997         unsigned int res = 0;
    998 
    999         xmlXPathRegisterVariable(ctxt, utf8_T, xmlXPathNewString(header));
   1000         obj = xmlXPathEval(utf8_headerpos, ctxt);
   1001 
   1002         if (obj) {
   1003                 if (obj->type == XPATH_NUMBER)
   1004                         res = (unsigned int) obj->floatval;
   1005 
   1006                 xmlXPathFreeObject(obj);
   1007                 }
   1008 
   1009         return res;
   1010 }
   1011 
   1012 
   1013 void
   1014 read_iana(const char * filename)
   1015 
   1016 {
   1017         xmlDocPtr doc;
   1018         xmlXPathContextPtr ctxt;
   1019         xmlXPathObjectPtr obj1;
   1020         xmlXPathObjectPtr obj2;
   1021         xmlNodePtr node;
   1022         int prefnamecol;
   1023         int namecol;
   1024         int mibenumcol;
   1025         int aliascol;
   1026         int mibenum;
   1027         t_chset * sp;
   1028         int n;
   1029         int i;
   1030 
   1031         doc = loadXMLFile(filename);
   1032 
   1033         if (!doc) {
   1034                 fprintf(stderr, "Cannot load file %s\n", filename);
   1035                 exit(1);
   1036                 }
   1037 
   1038         ctxt = xmlXPathNewContext(doc);
   1039 
   1040 #ifndef OLDXML
   1041         xmlXPathRegisterNs(ctxt, utf8_html, utf8_htmluri);
   1042 #endif
   1043 
   1044         obj1 = xmlXPathEval(utf8_tablerows, ctxt);
   1045 
   1046         if (!obj1 || obj1->type != XPATH_NODESET || !obj1->nodesetval ||
   1047             !obj1->nodesetval->nodeTab || obj1->nodesetval->nodeNr <= 1) {
   1048                 fprintf(stderr, "No data in %s\n", filename);
   1049                 exit(1);
   1050                 }
   1051 
   1052         /**
   1053         ***     Identify columns.
   1054         **/
   1055 
   1056         xmlXPathSetContextNode(obj1->nodesetval->nodeTab[0], ctxt);
   1057         prefnamecol = columnPosition(ctxt, utf8_Pref_MIME_Name);
   1058         namecol = columnPosition(ctxt, utf8_Name);
   1059         mibenumcol = columnPosition(ctxt, utf8_MIBenum);
   1060         aliascol = columnPosition(ctxt, utf8_Aliases);
   1061 
   1062         if (!prefnamecol || !namecol || !mibenumcol || !aliascol) {
   1063                 fprintf(stderr, "Key column(s) missing in %s\n", filename);
   1064                 exit(1);
   1065                 }
   1066 
   1067         xmlXPathRegisterVariable(ctxt, utf8_P,
   1068             xmlXPathNewFloat((double) prefnamecol));
   1069         xmlXPathRegisterVariable(ctxt, utf8_N,
   1070             xmlXPathNewFloat((double) namecol));
   1071         xmlXPathRegisterVariable(ctxt, utf8_M,
   1072             xmlXPathNewFloat((double) mibenumcol));
   1073         xmlXPathRegisterVariable(ctxt, utf8_A,
   1074             xmlXPathNewFloat((double) aliascol));
   1075 
   1076         /**
   1077         ***     Process each row.
   1078         **/
   1079 
   1080         for (n = 1; n < obj1->nodesetval->nodeNr; n++) {
   1081                 xmlXPathSetContextNode(obj1->nodesetval->nodeTab[n], ctxt);
   1082 
   1083                 /**
   1084                 ***     Get the MIBenum from current row.
   1085                 */
   1086 
   1087                 obj2 = xmlXPathEval(utf8_getmibenum, ctxt);
   1088 
   1089                 if (!obj2 || obj2->type != XPATH_NUMBER) {
   1090                         fprintf(stderr, "get MIBenum failed at row %u\n", n);
   1091                         exit(1);
   1092                         }
   1093 
   1094                 if (xmlXPathIsNaN(obj2->floatval) ||
   1095                     obj2->floatval < 1.0 || obj2->floatval > 65535.0 ||
   1096                     ((unsigned int) obj2->floatval) != obj2->floatval) {
   1097                         fprintf(stderr, "invalid MIBenum at row %u\n", n);
   1098                         xmlXPathFreeObject(obj2);
   1099                         continue;
   1100                         }
   1101 
   1102                 mibenum = obj2->floatval;
   1103                 xmlXPathFreeObject(obj2);
   1104 
   1105                 /**
   1106                 ***     Search the associations for a corresponding CCSID.
   1107                 **/
   1108 
   1109                 for (sp = chset_list; sp; sp = sp->c_next)
   1110                         if (sp->c_mibenum == mibenum)
   1111                                 break;
   1112 
   1113                 if (!sp)
   1114                         continue;       /* No CCSID for this MIBenum. */
   1115 
   1116                 /**
   1117                 ***     Process preferred MIME name.
   1118                 **/
   1119 
   1120                 obj2 = xmlXPathEval(utf8_getprefname, ctxt);
   1121 
   1122                 if (!obj2 || obj2->type != XPATH_STRING) {
   1123                         fprintf(stderr,
   1124                             "get Preferred_MIME_Name failed at row %u\n", n);
   1125                         exit(1);
   1126                         }
   1127 
   1128                 if (obj2->stringval && obj2->stringval[0])
   1129                         nfaenter(obj2->stringval, -1, sp);
   1130 
   1131                 xmlXPathFreeObject(obj2);
   1132 
   1133                 /**
   1134                 ***     Process name.
   1135                 **/
   1136 
   1137                 obj2 = xmlXPathEval(utf8_getname, ctxt);
   1138 
   1139                 if (!obj2 || obj2->type != XPATH_STRING) {
   1140                         fprintf(stderr, "get name failed at row %u\n", n);
   1141                         exit(1);
   1142                         }
   1143 
   1144                 if (obj2->stringval && obj2->stringval[0])
   1145                         nfaenter(obj2->stringval, -1, sp);
   1146 
   1147                 xmlXPathFreeObject(obj2);
   1148 
   1149                 /**
   1150                 ***     Process aliases.
   1151                 **/
   1152 
   1153                 obj2 = xmlXPathEval(utf8_getaliases, ctxt);
   1154 
   1155                 if (!obj2 || obj2->type != XPATH_NODESET) {
   1156                         fprintf(stderr, "get aliases failed at row %u\n", n);
   1157                         exit(1);
   1158                         }
   1159 
   1160                 if (obj2->nodesetval && obj2->nodesetval->nodeTab)
   1161                         for (i = 0; i < obj2->nodesetval->nodeNr; i++) {
   1162                                 node = obj2->nodesetval->nodeTab[i];
   1163 
   1164                                 if (node && node->content && node->content[0])
   1165                                         nfaenter(node->content, -1, sp);
   1166                                 }
   1167 
   1168                 xmlXPathFreeObject(obj2);
   1169                 }
   1170 
   1171         xmlXPathFreeObject(obj1);
   1172         xmlXPathFreeContext(ctxt);
   1173         xmlFreeDoc(doc);
   1174 }
   1175 
   1176 
   1177 t_powerset *    closureset(t_powerset * dst, t_powerset * src);
   1178 
   1179 
   1180 t_powerset *
   1181 closure(t_powerset * dst, t_state * src)
   1182 
   1183 {
   1184         t_transition * t;
   1185         unsigned int oldcard;
   1186 
   1187         if (src->s_nfastates) {
   1188                 /**
   1189                 ***     Is a DFA state: return closure of set of equivalent
   1190                 ***             NFA states.
   1191                 **/
   1192 
   1193                 return closureset(dst, src->s_nfastates);
   1194                 }
   1195 
   1196         /**
   1197         ***     Compute closure of NFA state.
   1198         **/
   1199 
   1200         dst = set_include(dst, src);
   1201 
   1202         for (t = src->s_forward; t; t = t->t_forwnext)
   1203                 if (t->t_token == EPSILON) {
   1204                         oldcard = dst->p_card;
   1205                         dst = set_include(dst, t->t_to);
   1206 
   1207                         if (oldcard != dst->p_card)
   1208                                 dst = closure(dst, t->t_to);
   1209                         }
   1210 
   1211         return dst;
   1212 }
   1213 
   1214 
   1215 t_powerset *
   1216 closureset(t_powerset * dst, t_powerset * src)
   1217 
   1218 {
   1219         unsigned int i;
   1220 
   1221         for (i = 0; i < src->p_card; i++)
   1222                 dst = closure(dst, (t_state *) src->p_set[i]);
   1223 
   1224         return dst;
   1225 }
   1226 
   1227 
   1228 t_state *
   1229 get_dfa_state(t_state * * stack,
   1230                         t_powerset * nfastates, xmlHashTablePtr sethash)
   1231 
   1232 {
   1233         t_state * s;
   1234 
   1235         if (s = hash_get(sethash, nfastates->p_set,
   1236             nfastates->p_card * sizeof nfastates->p_set[0])) {
   1237                 /**
   1238                 ***     DFA state already present.
   1239                 ***     Release the NFA state set and return
   1240                 ***             the address of the old DFA state.
   1241                 **/
   1242 
   1243                 free((char *) nfastates);
   1244                 return s;
   1245                 }
   1246 
   1247         /**
   1248         ***     Build the new state.
   1249         **/
   1250 
   1251         s = newstate();
   1252         s->s_nfastates = nfastates;
   1253         s->s_next = dfa_states;
   1254         dfa_states = s;
   1255         s->s_stack = *stack;
   1256         *stack = s;
   1257 
   1258         /**
   1259         ***     Enter it in hash.
   1260         **/
   1261 
   1262         if (hash_add(sethash, nfastates->p_set,
   1263             nfastates->p_card * sizeof nfastates->p_set[0], s))
   1264                 chknull(NULL);          /* Memory allocation error. */
   1265 
   1266         return s;
   1267 }
   1268 
   1269 
   1270 int
   1271 transcmp(const void * p1, const void * p2)
   1272 
   1273 {
   1274         t_transition * t1;
   1275         t_transition * t2;
   1276 
   1277         t1 = *(t_transition * *) p1;
   1278         t2 = *(t_transition * *) p2;
   1279         return ((int) t1->t_token) - ((int) t2->t_token);
   1280 }
   1281 
   1282 
   1283 void
   1284 builddfa(void)
   1285 
   1286 {
   1287         t_powerset * transset;
   1288         t_powerset * stateset;
   1289         t_state * s;
   1290         t_state * s2;
   1291         unsigned int n;
   1292         unsigned int i;
   1293         unsigned int token;
   1294         t_transition * t;
   1295         t_state * stack;
   1296         xmlHashTablePtr sethash;
   1297         unsigned int nst;
   1298 
   1299         transset = set_include(NULL, NULL);
   1300         chknull(transset);
   1301         stateset = set_include(NULL, NULL);
   1302         chknull(stateset);
   1303         sethash = xmlHashCreate(1);
   1304         chknull(sethash);
   1305         dfa_states = (t_state *) NULL;
   1306         stack = (t_state *) NULL;
   1307         nst = 0;
   1308 
   1309         /**
   1310         ***     Build the DFA initial state.
   1311         **/
   1312 
   1313         get_dfa_state(&stack, closure(NULL, initial_state), sethash);
   1314 
   1315         /**
   1316         ***     Build the other DFA states by looking at each
   1317         ***             possible transition from stacked DFA states.
   1318         **/
   1319 
   1320         do {
   1321                 if (!(++nst % 100))
   1322                         fprintf(stderr, "%u DFA states\n", nst);
   1323 
   1324                 s = stack;
   1325                 stack = s->s_stack;
   1326                 s->s_stack = (t_state *) NULL;
   1327 
   1328                 /**
   1329                 ***     Build a set of all non-epsilon transitions from this
   1330                 ***             state.
   1331                 **/
   1332 
   1333                 transset->p_card = 0;
   1334 
   1335                 for (n = 0; n < s->s_nfastates->p_card; n++) {
   1336                         s2 = s->s_nfastates->p_set[n];
   1337 
   1338                         for (t = s2->s_forward; t; t = t->t_forwnext)
   1339                                 if (t->t_token != EPSILON) {
   1340                                         transset = set_include(transset, t);
   1341                                         chknull(transset);
   1342                                         }
   1343                         }
   1344 
   1345                 /**
   1346                 ***     Sort transitions by token.
   1347                 **/
   1348 
   1349                 qsort(transset->p_set, transset->p_card,
   1350                     sizeof transset->p_set[0], transcmp);
   1351 
   1352                 /**
   1353                 ***     Process all transitions, grouping them by token.
   1354                 **/
   1355 
   1356                 stateset->p_card = 0;
   1357                 token = EPSILON;
   1358 
   1359                 for (i = 0; i < transset->p_card; i++) {
   1360                         t = transset->p_set[i];
   1361 
   1362                         if (token != t->t_token) {
   1363                                 if (stateset->p_card) {
   1364                                         /**
   1365                                         ***     Get the equivalent DFA state
   1366                                         ***             and create transition.
   1367                                         **/
   1368 
   1369                                         newtransition(token, s,
   1370                                             get_dfa_state(&stack,
   1371                                             closureset(NULL, stateset),
   1372                                             sethash));
   1373                                         stateset->p_card = 0;
   1374                                         }
   1375 
   1376                                 token = t->t_token;
   1377                                 }
   1378 
   1379                         stateset = set_include(stateset, t->t_to);
   1380                         }
   1381 
   1382                 if (stateset->p_card)
   1383                         newtransition(token, s, get_dfa_state(&stack,
   1384                             closureset(NULL, stateset), sethash));
   1385         } while (stack);
   1386 
   1387         free((char *) transset);
   1388         free((char *) stateset);
   1389         xmlHashFree(sethash, NULL);
   1390 
   1391         /**
   1392         ***     Reverse the state list to get the initial state first,
   1393         ***             check for ambiguous prefixes, determine final states,
   1394         ***             destroy NFA state sets.
   1395         **/
   1396 
   1397         while (s = dfa_states) {
   1398                 dfa_states = s->s_next;
   1399                 s->s_next = stack;
   1400                 stack = s;
   1401                 stateset = s->s_nfastates;
   1402                 s->s_nfastates = (t_powerset *) NULL;
   1403 
   1404                 for (n = 0; n < stateset->p_card; n++) {
   1405                         s2 = (t_state *) stateset->p_set[n];
   1406 
   1407                         if (s2->s_final) {
   1408                                 if (s->s_final && s->s_final != s2->s_final)
   1409                                         fprintf(stderr,
   1410                                             "Ambiguous name for CCSIDs %u/%u\n",
   1411                                             s->s_final->c_ccsid,
   1412                                             s2->s_final->c_ccsid);
   1413 
   1414                                 s->s_final = s2->s_final;
   1415                                 }
   1416                         }
   1417 
   1418                 free((char *) stateset);
   1419                 }
   1420 
   1421         dfa_states = stack;
   1422 }
   1423 
   1424 
   1425 void
   1426 deletenfa(void)
   1427 
   1428 {
   1429         t_transition * t;
   1430         t_state * s;
   1431         t_state * u;
   1432         t_state * stack;
   1433 
   1434         stack = initial_state;
   1435         stack->s_stack = (t_state *) NULL;
   1436 
   1437         while ((s = stack)) {
   1438                 stack = s->s_stack;
   1439 
   1440                 while ((t = s->s_forward)) {
   1441                         u = t->t_to;
   1442                         unlink_transition(t);
   1443                         free((char *) t);
   1444 
   1445                         if (!u->s_backward) {
   1446                                 u->s_stack = stack;
   1447                                 stack = u;
   1448                                 }
   1449                         }
   1450 
   1451                 free((char *) s);
   1452                 }
   1453 }
   1454 
   1455 
   1456 t_stategroup *
   1457 newgroup(void)
   1458 
   1459 {
   1460         t_stategroup * g;
   1461 
   1462         g = (t_stategroup *) malloc(sizeof *g);
   1463         chknull(g);
   1464         memset((char *) g, 0, sizeof *g);
   1465         g->g_id = groupid++;
   1466         return g;
   1467 }
   1468 
   1469 
   1470 void
   1471 optimizedfa(void)
   1472 
   1473 {
   1474         unsigned int i;
   1475         xmlHashTablePtr h;
   1476         t_state * s1;
   1477         t_state * s2;
   1478         t_state * finstates;
   1479         t_state * * sp;
   1480         t_stategroup * g1;
   1481         t_stategroup * g2;
   1482         t_stategroup * ghead;
   1483         t_transition * t1;
   1484         t_transition * t2;
   1485         unsigned int done;
   1486         unsigned int startgroup;
   1487         unsigned int gtrans[1 << (8 * sizeof(unsigned char))];
   1488 
   1489         /**
   1490         ***     Reduce DFA state count.
   1491         **/
   1492 
   1493         groupid = 0;
   1494         ghead = (t_stategroup *) NULL;
   1495 
   1496         /**
   1497         ***     First split: non-final and each distinct final states.
   1498         **/
   1499 
   1500         h = xmlHashCreate(4);
   1501         chknull(h);
   1502 
   1503         for (s1 = dfa_states; s1; s1 = s1->s_next) {
   1504                 if (!(g1 = hash_get(h, &s1->s_final, sizeof s1->s_final))) {
   1505                         g1 = newgroup();
   1506                         g1->g_next = ghead;
   1507                         ghead = g1;
   1508 
   1509                         if (hash_add(h, &s1->s_final, sizeof s1->s_final, g1))
   1510                                 chknull(NULL);  /* Memory allocation error. */
   1511                         }
   1512 
   1513                 s1->s_index = g1->g_id;
   1514                 s1->s_stack = g1->g_member;
   1515                 g1->g_member = s1;
   1516                 }
   1517 
   1518         xmlHashFree(h, NULL);
   1519 
   1520         /**
   1521         ***     Subsequent splits: states that have the same forward
   1522         ***             transition tokens to states in the same group.
   1523         **/
   1524 
   1525         do {
   1526                 done = 1;
   1527 
   1528                 for (g2 = ghead; g2; g2 = g2->g_next) {
   1529                         s1 = g2->g_member;
   1530 
   1531                         if (!s1->s_stack)
   1532                                 continue;
   1533 
   1534                         h = xmlHashCreate(1);
   1535                         chknull(h);
   1536 
   1537                         /**
   1538                         ***     Build the group transition map.
   1539                         **/
   1540 
   1541                         memset((char *) gtrans, ~0, sizeof gtrans);
   1542 
   1543                         for (t1 = s1->s_forward; t1; t1 = t1->t_forwnext)
   1544                                 gtrans[t1->t_token] = t1->t_to->s_index;
   1545 
   1546                         if (hash_add(h, gtrans, sizeof gtrans, g2))
   1547                                 chknull(NULL);
   1548 
   1549                         /**
   1550                         ***     Process other states in group.
   1551                         **/
   1552 
   1553                         sp = &s1->s_stack;
   1554                         s1 = *sp;
   1555 
   1556                         do {
   1557                                 *sp = s1->s_stack;
   1558 
   1559                                 /**
   1560                                 ***     Build the transition map.
   1561                                 **/
   1562 
   1563                                 memset((char *) gtrans, ~0, sizeof gtrans);
   1564 
   1565                                 for (t1 = s1->s_forward;
   1566                                     t1; t1 = t1->t_forwnext)
   1567                                         gtrans[t1->t_token] = t1->t_to->s_index;
   1568 
   1569                                 g1 = hash_get(h, gtrans, sizeof gtrans);
   1570 
   1571                                 if (g1 == g2) {
   1572                                         *sp = s1;
   1573                                         sp = &s1->s_stack;
   1574                                         }
   1575                                 else {
   1576                                         if (!g1) {
   1577                                                 g1 = newgroup();
   1578                                                 g1->g_next = ghead;
   1579                                                 ghead = g1;
   1580 
   1581                                                 if (hash_add(h, gtrans,
   1582                                                     sizeof gtrans, g1))
   1583                                                         chknull(NULL);
   1584                                                 }
   1585 
   1586                                         s1->s_index = g1->g_id;
   1587                                         s1->s_stack = g1->g_member;
   1588                                         g1->g_member = s1;
   1589                                         done = 0;
   1590                                         }
   1591                         } while (s1 = *sp);
   1592 
   1593                         xmlHashFree(h, NULL);
   1594                         }
   1595         } while (!done);
   1596 
   1597         /**
   1598         ***     Establish group leaders and remap transitions.
   1599         **/
   1600 
   1601         startgroup = dfa_states->s_index;
   1602 
   1603         for (g1 = ghead; g1; g1 = g1->g_next)
   1604                 for (s1 = g1->g_member->s_stack; s1; s1 = s1->s_stack)
   1605                         for (t1 = s1->s_backward; t1; t1 = t2) {
   1606                                 t2 = t1->t_backnext;
   1607                                 link_transition(t1, NULL, g1->g_member);
   1608                                 }
   1609 
   1610         /**
   1611         ***     Remove redundant states and transitions.
   1612         **/
   1613 
   1614         for (g1 = ghead; g1; g1 = g1->g_next) {
   1615                 g1->g_member->s_next = (t_state *) NULL;
   1616 
   1617                 while ((s1 = g1->g_member->s_stack)) {
   1618                         g1->g_member->s_stack = s1->s_stack;
   1619 
   1620                         for (t1 = s1->s_forward; t1; t1 = t2) {
   1621                                 t2 = t1->t_forwnext;
   1622                                 unlink_transition(t1);
   1623                                 free((char *) t1);
   1624                                 }
   1625 
   1626                         free((char *) s1);
   1627                         }
   1628                 }
   1629 
   1630         /**
   1631         ***     Remove group support and relink DFA states.
   1632         **/
   1633 
   1634         dfa_states = (t_state *) NULL;
   1635         s2 = (t_state *) NULL;
   1636         finstates = (t_state *) NULL;
   1637 
   1638         while (g1 = ghead) {
   1639                 ghead = g1->g_next;
   1640                 s1 = g1->g_member;
   1641 
   1642                 if (g1->g_id == startgroup)
   1643                         dfa_states = s1;        /* Keep start state first. */
   1644                 else if (s1->s_final) {         /* Then final states. */
   1645                         s1->s_next = finstates;
   1646                         finstates = s1;
   1647                         }
   1648                 else {                  /* Finish with non-final states. */
   1649                         s1->s_next = s2;
   1650                         s2 = s1;
   1651                         }
   1652 
   1653                 free((char *) g1);
   1654                 }
   1655 
   1656         for (dfa_states->s_next = finstates; finstates->s_next;)
   1657                 finstates = finstates->s_next;
   1658 
   1659         finstates->s_next = s2;
   1660 }
   1661 
   1662 
   1663 const char *
   1664 inttype(unsigned long max)
   1665 
   1666 {
   1667         int i;
   1668 
   1669         for (i = 0; max; i++)
   1670                 max >>= 1;
   1671 
   1672         if (i > 8 * sizeof(unsigned int))
   1673                 return "unsigned long";
   1674 
   1675         if (i > 8 * sizeof(unsigned short))
   1676                 return "unsigned int";
   1677 
   1678         if (i > 8 * sizeof(unsigned char))
   1679                 return "unsigned short";
   1680 
   1681         return "unsigned char";
   1682 }
   1683 
   1684 
   1685 listids(FILE * fp)
   1686 
   1687 {
   1688         unsigned int pos;
   1689         t_chset * cp;
   1690         t_symlist * lp;
   1691         char * srcp;
   1692         char * dstp;
   1693         size_t srcc;
   1694         size_t dstc;
   1695         char buf[80];
   1696 
   1697         fprintf(fp, "/**\n***     CCSID   For arg   Recognized name.\n");
   1698         pos = 0;
   1699 
   1700         for (cp = chset_list; cp; cp = cp->c_next) {
   1701                 if (pos) {
   1702                         fprintf(fp, "\n");
   1703                         pos = 0;
   1704                         }
   1705 
   1706                 if (!cp->c_names)
   1707                         continue;
   1708 
   1709                 pos = fprintf(fp, "***     %5u      %c     ", cp->c_ccsid,
   1710                     iconv_open_error(cp->c_fromUTF8)? ' ': 'X');
   1711 
   1712                 for (lp = cp->c_names; lp; lp = lp->l_next) {
   1713                         srcp = (char *) lp->l_symbol;
   1714                         srcc = strlen(srcp);
   1715                         dstp = buf;
   1716                         dstc = sizeof buf;
   1717                         iconv(utf82job, &srcp, &srcc, &dstp, &dstc);
   1718                         srcc = dstp - buf;
   1719 
   1720                         if (pos + srcc > 79) {
   1721                                 fprintf(fp, "\n***%22c", ' ');
   1722                                 pos = 25;
   1723                                 }
   1724 
   1725                         pos += fprintf(fp, " %.*s", srcc, buf);
   1726                         }
   1727                 }
   1728 
   1729         if (pos)
   1730                 fprintf(fp, "\n");
   1731 
   1732         fprintf(fp, "**/\n\n");
   1733 }
   1734 
   1735 
   1736 void
   1737 generate(FILE * fp)
   1738 
   1739 {
   1740         unsigned int nstates;
   1741         unsigned int ntrans;
   1742         unsigned int maxfinal;
   1743         t_state * s;
   1744         t_transition * t;
   1745         unsigned int i;
   1746         unsigned int pos;
   1747         char * ns;
   1748 
   1749         /**
   1750         ***     Assign indexes to states and transitions.
   1751         **/
   1752 
   1753         nstates = 0;
   1754         ntrans = 0;
   1755         maxfinal = 0;
   1756 
   1757         for (s = dfa_states; s; s = s->s_next) {
   1758                 s->s_index = nstates++;
   1759 
   1760                 if (s->s_final)
   1761                         maxfinal = nstates;
   1762 
   1763                 for (t = s->s_forward; t; t = t->t_forwnext)
   1764                         t->t_index = ntrans++;
   1765                 }
   1766 
   1767         fprintf(fp,
   1768             "/**\n***     %u states, %u finals, %u transitions.\n**/\n\n",
   1769             nstates, maxfinal, ntrans);
   1770         fprintf(stderr, "%u states, %u finals, %u transitions.\n",
   1771             nstates, maxfinal, ntrans);
   1772 
   1773         /**
   1774         ***     Generate types.
   1775         **/
   1776 
   1777         fprintf(fp, "typedef unsigned short          t_ccsid;\n");
   1778         fprintf(fp, "typedef %-23s t_staterange;\n", inttype(nstates));
   1779         fprintf(fp, "typedef %-23s t_transrange;\n\n", inttype(ntrans));
   1780 
   1781         /**
   1782         ***     Generate first transition index for each state.
   1783         **/
   1784 
   1785         fprintf(fp, "static t_transrange     trans_array[] = {\n");
   1786         pos = 0;
   1787         ntrans = 0;
   1788 
   1789         for (s = dfa_states; s; s = s->s_next) {
   1790                 pos += fprintf(fp, " %u,", ntrans);
   1791 
   1792                 if (pos > 72) {
   1793                         fprintf(fp, "\n");
   1794                         pos = 0;
   1795                         }
   1796 
   1797                 for (t = s->s_forward; t; t = t->t_forwnext)
   1798                         ntrans++;
   1799                 }
   1800 
   1801         fprintf(fp, " %u\n};\n\n", ntrans);
   1802 
   1803         /**
   1804         ***     Generate final state info.
   1805         **/
   1806 
   1807         fprintf(fp, "static t_ccsid          final_array[] = {\n");
   1808         pos = 0;
   1809         ns ="";
   1810         i = 0;
   1811 
   1812         for (s = dfa_states; s && i++ < maxfinal; s = s->s_next) {
   1813                 pos += fprintf(fp, "%s", ns);
   1814                 ns = ",";
   1815 
   1816                 if (pos > 72) {
   1817                         fprintf(fp, "\n");
   1818                         pos = 0;
   1819                         }
   1820 
   1821                 pos += fprintf(fp, " %u",
   1822                     s->s_final? s->s_final->c_ccsid + 1: 0);
   1823                 }
   1824 
   1825         fprintf(fp, "\n};\n\n");
   1826 
   1827         /**
   1828         ***     Generate goto table.
   1829         **/
   1830 
   1831         fprintf(fp, "static t_staterange     goto_array[] = {\n");
   1832         pos = 0;
   1833 
   1834         for (s = dfa_states; s; s = s->s_next)
   1835                 for (t = s->s_forward; t; t = t->t_forwnext) {
   1836                         pos += fprintf(fp, " %u,", t->t_to->s_index);
   1837 
   1838                         if (pos > 72) {
   1839                                 fprintf(fp, "\n");
   1840                                 pos = 0;
   1841                                 }
   1842                         }
   1843 
   1844         fprintf(fp, " %u\n};\n\n", nstates);
   1845 
   1846         /**
   1847         ***     Generate transition label table.
   1848         **/
   1849 
   1850         fprintf(fp, "static unsigned char    label_array[] = {\n");
   1851         pos = 0;
   1852         ns ="";
   1853 
   1854         for (s = dfa_states; s; s = s->s_next)
   1855                 for (t = s->s_forward; t; t = t->t_forwnext) {
   1856                         pos += fprintf(fp, "%s", ns);
   1857                         ns = ",";
   1858 
   1859                         if (pos > 72) {
   1860                                 fprintf(fp, "\n");
   1861                                 pos = 0;
   1862                                 }
   1863 
   1864                         pos += fprintf(fp, " 0x%02X", t->t_token);
   1865                         }
   1866 
   1867         fprintf(fp, "\n};\n", nstates);
   1868 }
   1869 
   1870 
   1871 main(argc, argv)
   1872 int argc;
   1873 char * * argv;
   1874 
   1875 {
   1876         FILE * fp;
   1877         t_chset * csp;
   1878         char symbuf[20];
   1879 
   1880         chset_list = (t_chset *) NULL;
   1881         initial_state = newstate();
   1882         job2utf8 = iconv_open_ccsid(C_UTF8_CCSID, C_SOURCE_CCSID, 0);
   1883         utf82job = iconv_open_ccsid(C_SOURCE_CCSID, C_UTF8_CCSID, 0);
   1884 
   1885         if (argc != 4) {
   1886                 fprintf(stderr, "Usage: %s <ccsid-mibenum file> ", *argv);
   1887                 fprintf(stderr, "<iana-character-set file> <output file>\n");
   1888                 exit(1);
   1889                 }
   1890 
   1891         /**
   1892         ***     Read CCSID/MIBenum associations. Define special names.
   1893         **/
   1894 
   1895         read_assocs(argv[1]);
   1896 
   1897         /**
   1898         ***     Read character set names and establish the case-independent
   1899         ***             name DFA in all possible CCSIDs.
   1900         **/
   1901 
   1902         read_iana(argv[2]);
   1903 
   1904         /**
   1905         ***     Build DFA from NFA.
   1906         **/
   1907 
   1908         builddfa();
   1909 
   1910         /**
   1911         ***     Delete NFA.
   1912         **/
   1913 
   1914         deletenfa();
   1915 
   1916         /**
   1917         ***     Minimize the DFA state count.
   1918         **/
   1919 
   1920         optimizedfa();
   1921 
   1922         /**
   1923         ***     Generate the table.
   1924         **/
   1925 
   1926         fp = fopen(argv[3], "w+");
   1927 
   1928         if (!fp) {
   1929                 perror(argv[3]);
   1930                 exit(1);
   1931                 }
   1932 
   1933         fprintf(fp, "/**\n");
   1934         fprintf(fp, "***     Character set names table.\n");
   1935         fprintf(fp, "***     Generated by program BLDCSNDFA from");
   1936         fprintf(fp, " IANA character set assignment file\n");
   1937         fprintf(fp, "***          and CCSID/MIBenum equivalence file.\n");
   1938         fprintf(fp, "***     *** Do not edit by hand ***\n");
   1939         fprintf(fp, "**/\n\n");
   1940         listids(fp);
   1941         generate(fp);
   1942 
   1943         if (ferror(fp)) {
   1944                 perror(argv[3]);
   1945                 fclose(fp);
   1946                 exit(1);
   1947                 }
   1948 
   1949         fclose(fp);
   1950         iconv_close(job2utf8);
   1951         iconv_close(utf82job);
   1952         exit(0);
   1953 }
   1954