1 /** 2 *** Build a deterministic finite automaton to associate CCSIDs with 3 *** character set names. 4 *** 5 *** Compile on OS/400 with options SYSIFCOPT(*IFSIO). 6 *** 7 *** See Copyright for the status of this software. 8 *** 9 *** Author: Patrick Monnerat <pm (at) datasphere.ch>, DATASPHERE S.A. 10 **/ 11 12 #include <stdio.h> 13 #include <errno.h> 14 #include <stdlib.h> 15 #include <string.h> 16 #include <fcntl.h> 17 #include <ctype.h> 18 19 #include <iconv.h> 20 21 22 #ifdef OLDXML 23 #include "xml.h" 24 #else 25 #include <libxml/hash.h> 26 #include <libxml/parser.h> 27 #include <libxml/xpath.h> 28 #include <libxml/xpathInternals.h> 29 #endif 30 31 32 #ifdef __OS400__ 33 #define iconv_open_error(cd) ((cd).return_value == -1) 34 #define set_iconv_open_error(cd) ((cd).return_value = -1) 35 #else 36 #define iconv_open_error(cd) ((cd) == (iconv_t) -1) 37 #define set_iconv_open_error(cd) ((cd) = (iconv_t) -1) 38 #endif 39 40 41 #define C_SOURCE_CCSID 500 42 #define C_UTF8_CCSID 1208 43 44 45 #define UTF8_SPACE 0x20 46 #define UTF8_HT 0x09 47 #define UTF8_0 0x30 48 #define UTF8_9 0x39 49 #define UTF8_A 0x41 50 #define UTF8_Z 0x5A 51 #define UTF8_a 0x61 52 #define UTF8_z 0x7A 53 54 55 #define GRANULE 128 /* Memory allocation granule. */ 56 57 #define EPSILON 0x100 /* Token for empty transition. */ 58 59 60 #ifndef OFFSETOF 61 #define OFFSETOF(t, f) ((unsigned int) ((char *) &((t *) 0)->f - (char *) 0)) 62 #endif 63 64 #ifndef OFFSETBY 65 #define OFFSETBY(t, p, o) ((t *) ((char *) (p) + (unsigned int) (o))) 66 #endif 67 68 69 typedef struct t_transition t_transition; /* NFA/DFA transition. */ 70 typedef struct t_state t_state; /* NFA/DFA state node. */ 71 typedef struct t_symlist t_symlist; /* Symbol (i.e.: name) list. */ 72 typedef struct t_chset t_chset; /* Character set. */ 73 typedef struct t_stategroup t_stategroup; /* Optimization group. */ 74 typedef unsigned char utf8char; /* UTF-8 character byte. */ 75 typedef unsigned char byte; /* Untyped data byte. */ 76 77 78 typedef struct { /* Set of pointers. */ 79 unsigned int p_size; /* Current allocated size. */ 80 unsigned int p_card; /* Current element count. */ 81 void * p_set[1]; /* Element array. */ 82 } t_powerset; 83 84 85 struct t_transition { 86 t_transition * t_forwprev; /* Head of forward transition list. */ 87 t_transition * t_forwnext; /* Tail of forward transition list. */ 88 t_transition * t_backprev; /* Head of backward transition list. */ 89 t_transition * t_backnext; /* Tail of backward transition list. */ 90 t_state * t_from; /* Incoming state. */ 91 t_state * t_to; /* Destination state. */ 92 unsigned short t_token; /* Transition token. */ 93 unsigned int t_index; /* Transition array index. */ 94 }; 95 96 97 struct t_state { 98 t_state * s_next; /* Next state (for DFA construction). */ 99 t_state * s_stack; /* Unprocessed DFA states stack. */ 100 t_transition * s_forward; /* Forward transitions. */ 101 t_transition * s_backward; /* Backward transitions. */ 102 t_chset * s_final; /* Recognized character set. */ 103 t_powerset * s_nfastates; /* Corresponding NFA states. */ 104 unsigned int s_index; /* State index. */ 105 }; 106 107 108 struct t_symlist { 109 t_symlist * l_next; /* Next name in list. */ 110 utf8char l_symbol[1]; /* Name bytes. */ 111 }; 112 113 114 struct t_chset { 115 t_chset * c_next; /* Next character set. */ 116 t_symlist * c_names; /* Character set name list. */ 117 iconv_t c_fromUTF8; /* Conversion from UTF-8. */ 118 unsigned int c_ccsid; /* IBM character set code. */ 119 unsigned int c_mibenum; /* IANA character code. */ 120 }; 121 122 123 struct t_stategroup { 124 t_stategroup * g_next; /* Next group. */ 125 t_state * g_member; /* Group member (s_stack) list. */ 126 unsigned int g_id; /* Group ident. */ 127 }; 128 129 130 131 t_chset * chset_list; /* Character set list. */ 132 t_state * initial_state; /* Initial NFA state. */ 133 iconv_t job2utf8; /* Job CCSID to UTF-8 conversion. */ 134 iconv_t utf82job; /* UTF-8 to job CCSID conversion. */ 135 t_state * dfa_states; /* List of DFA states. */ 136 unsigned int groupid; /* Group ident counter. */ 137 138 139 /** 140 *** UTF-8 strings. 141 **/ 142 143 #pragma convert(819) 144 145 static const utf8char utf8_MIBenum[] = "MIBenum"; 146 static const utf8char utf8_mibenum[] = "mibenum"; 147 static const utf8char utf8_ibm_[] = "ibm-"; 148 static const utf8char utf8_IBMCCSID[] = "IBMCCSID"; 149 static const utf8char utf8_iana_[] = "iana-"; 150 static const utf8char utf8_Name[] = "Name"; 151 static const utf8char utf8_Pref_MIME_Name[] = "Preferred MIME Name"; 152 static const utf8char utf8_Aliases[] = "Aliases"; 153 static const utf8char utf8_html[] = "html"; 154 static const utf8char utf8_htmluri[] = "http://www.w3.org/1999/xhtml"; 155 static const utf8char utf8_A[] = "A"; 156 static const utf8char utf8_C[] = "C"; 157 static const utf8char utf8_M[] = "M"; 158 static const utf8char utf8_N[] = "N"; 159 static const utf8char utf8_P[] = "P"; 160 static const utf8char utf8_T[] = "T"; 161 static const utf8char utf8_ccsid[] = "ccsid"; 162 static const utf8char utf8_EBCDIC[] = "EBCDIC"; 163 static const utf8char utf8_ASCII[] = "ASCII"; 164 static const utf8char utf8_assocnodes[] = "/ccsid_mibenum/assoc[@ccsid]"; 165 static const utf8char utf8_aliastext[] = 166 "/ccsid_mibenum/assoc[@ccsid=$C]/alias/text()"; 167 #ifdef OLDXML 168 static const utf8char utf8_tablerows[] = 169 "//table[@id='table-character-sets-1']/*/tr"; 170 static const utf8char utf8_headerpos[] = 171 "count(th[text()=$T]/preceding-sibling::th)+1"; 172 static const utf8char utf8_getmibenum[] = "number(td[$M])"; 173 static const utf8char utf8_getprefname[] = "string(td[$P])"; 174 static const utf8char utf8_getname[] = "string(td[$N])"; 175 static const utf8char utf8_getaliases[] = "td[$A]/text()"; 176 #else 177 static const utf8char utf8_tablerows[] = 178 "//html:table[@id='table-character-sets-1']/*/html:tr"; 179 static const utf8char utf8_headerpos[] = 180 "count(html:th[text()=$T]/preceding-sibling::html:th)+1"; 181 static const utf8char utf8_getmibenum[] = "number(html:td[$M])"; 182 static const utf8char utf8_getprefname[] = "string(html:td[$P])"; 183 static const utf8char utf8_getname[] = "string(html:td[$N])"; 184 static const utf8char utf8_getaliases[] = "html:td[$A]/text()"; 185 #endif 186 187 #pragma convert(0) 188 189 190 /** 191 *** UTF-8 character length table. 192 *** 193 *** Index is first character byte, value is the character byte count. 194 **/ 195 196 static signed char utf8_chlen[] = { 197 /* 00-07 */ 1, 1, 1, 1, 1, 1, 1, 1, 198 /* 08-0F */ 1, 1, 1, 1, 1, 1, 1, 1, 199 /* 10-17 */ 1, 1, 1, 1, 1, 1, 1, 1, 200 /* 18-1F */ 1, 1, 1, 1, 1, 1, 1, 1, 201 /* 20-27 */ 1, 1, 1, 1, 1, 1, 1, 1, 202 /* 28-2F */ 1, 1, 1, 1, 1, 1, 1, 1, 203 /* 30-37 */ 1, 1, 1, 1, 1, 1, 1, 1, 204 /* 38-3F */ 1, 1, 1, 1, 1, 1, 1, 1, 205 /* 40-47 */ 1, 1, 1, 1, 1, 1, 1, 1, 206 /* 48-4F */ 1, 1, 1, 1, 1, 1, 1, 1, 207 /* 50-57 */ 1, 1, 1, 1, 1, 1, 1, 1, 208 /* 58-5F */ 1, 1, 1, 1, 1, 1, 1, 1, 209 /* 60-67 */ 1, 1, 1, 1, 1, 1, 1, 1, 210 /* 68-6F */ 1, 1, 1, 1, 1, 1, 1, 1, 211 /* 70-77 */ 1, 1, 1, 1, 1, 1, 1, 1, 212 /* 78-7F */ 1, 1, 1, 1, 1, 1, 1, 1, 213 /* 80-87 */ -1, -1, -1, -1, -1, -1, -1, -1, 214 /* 88-8F */ -1, -1, -1, -1, -1, -1, -1, -1, 215 /* 90-97 */ -1, -1, -1, -1, -1, -1, -1, -1, 216 /* 98-9F */ -1, -1, -1, -1, -1, -1, -1, -1, 217 /* A0-A7 */ -1, -1, -1, -1, -1, -1, -1, -1, 218 /* A8-AF */ -1, -1, -1, -1, -1, -1, -1, -1, 219 /* B0-B7 */ -1, -1, -1, -1, -1, -1, -1, -1, 220 /* B8-BF */ -1, -1, -1, -1, -1, -1, -1, -1, 221 /* C0-C7 */ 2, 2, 2, 2, 2, 2, 2, 2, 222 /* C8-CF */ 2, 2, 2, 2, 2, 2, 2, 2, 223 /* D0-D7 */ 2, 2, 2, 2, 2, 2, 2, 2, 224 /* D8-DF */ 2, 2, 2, 2, 2, 2, 2, 2, 225 /* E0-E7 */ 3, 3, 3, 3, 3, 3, 3, 3, 226 /* E8-EF */ 3, 3, 3, 3, 3, 3, 3, 3, 227 /* F0-F7 */ 4, 4, 4, 4, 4, 4, 4, 4, 228 /* F8-FF */ 5, 5, 5, 5, 6, 6, -1, -1 229 }; 230 231 232 233 void 234 chknull(void * p) 235 236 { 237 if (p) 238 return; 239 240 fprintf(stderr, "Not enough memory\n"); 241 exit(1); 242 } 243 244 245 void 246 makecode(char * buf, unsigned int ccsid) 247 248 { 249 ccsid &= 0xFFFF; 250 memset(buf, 0, 32); 251 sprintf(buf, "IBMCCSID%05u0000000", ccsid); 252 } 253 254 255 iconv_t 256 iconv_open_ccsid(unsigned int ccsidout, 257 unsigned int ccsidin, unsigned int nullflag) 258 259 { 260 char fromcode[33]; 261 char tocode[33]; 262 263 makecode(fromcode, ccsidin); 264 makecode(tocode, ccsidout); 265 memset(tocode + 13, 0, sizeof tocode - 13); 266 267 if (nullflag) 268 fromcode[18] = '1'; 269 270 return iconv_open(tocode, fromcode); 271 } 272 273 274 unsigned int 275 getnum(char * * cpp) 276 277 { 278 unsigned int n; 279 char * cp; 280 281 cp = *cpp; 282 n = 0; 283 284 while (isdigit(*cp)) 285 n = 10 * n + *cp++ - '0'; 286 287 *cpp = cp; 288 return n; 289 } 290 291 292 const utf8char * 293 hashBinaryKey(const byte * bytes, unsigned int len) 294 295 { 296 const byte * bp; 297 utf8char * key; 298 utf8char * cp; 299 unsigned int n; 300 unsigned int n4; 301 unsigned int i; 302 303 /** 304 *** Encode binary data in character form to be used as hash 305 *** table key. 306 **/ 307 308 n = (4 * len + 2) / 3; 309 key = (utf8char *) malloc(n + 1); 310 chknull(key); 311 bp = bytes; 312 cp = key; 313 314 for (n4 = n >> 2; n4; n4--) { 315 i = (bp[0] << 16) | (bp[1] << 8) | bp[2]; 316 *cp++ = 0x21 + ((i >> 18) & 0x3F); 317 *cp++ = 0x21 + ((i >> 12) & 0x3F); 318 *cp++ = 0x21 + ((i >> 6) & 0x3F); 319 *cp++ = 0x21 + (i & 0x3F); 320 bp += 3; 321 } 322 323 switch (n & 0x3) { 324 325 case 2: 326 *cp++ = 0x21 + ((*bp >> 2) & 0x3F); 327 *cp++ = 0x21 + ((*bp << 4) & 0x3F); 328 break; 329 330 case 3: 331 i = (bp[0] << 8) | bp[1]; 332 *cp++ = 0x21 + ((i >> 10) & 0x3F); 333 *cp++ = 0x21 + ((i >> 4) & 0x3F); 334 *cp++ = 0x21 + ((i << 2) & 0x3F); 335 break; 336 } 337 338 *cp = '\0'; 339 return key; 340 } 341 342 343 void * 344 hash_get(xmlHashTablePtr h, const void * binkey, unsigned int len) 345 346 { 347 const utf8char * key; 348 void * result; 349 350 key = hashBinaryKey((const byte *) binkey, len); 351 result = xmlHashLookup(h, key); 352 free((char *) key); 353 return result; 354 } 355 356 357 int 358 hash_add(xmlHashTablePtr h, const void * binkey, unsigned int len, void * data) 359 360 { 361 const utf8char * key; 362 int result; 363 364 key = hashBinaryKey((const byte *) binkey, len); 365 result = xmlHashAddEntry(h, key, data); 366 free((char *) key); 367 return result; 368 } 369 370 371 xmlDocPtr 372 loadXMLFile(const char * filename) 373 374 { 375 struct stat sbuf; 376 byte * databuf; 377 int fd; 378 int i; 379 xmlDocPtr doc; 380 381 if (stat(filename, &sbuf)) 382 return (xmlDocPtr) NULL; 383 384 databuf = malloc(sbuf.st_size + 4); 385 386 if (!databuf) 387 return (xmlDocPtr) NULL; 388 389 fd = open(filename, O_RDONLY 390 #ifdef O_BINARY 391 | O_BINARY 392 #endif 393 ); 394 395 if (fd < 0) { 396 free((char *) databuf); 397 return (xmlDocPtr) NULL; 398 } 399 400 i = read(fd, (char *) databuf, sbuf.st_size); 401 close(fd); 402 403 if (i != sbuf.st_size) { 404 free((char *) databuf); 405 return (xmlDocPtr) NULL; 406 } 407 408 databuf[i] = databuf[i + 1] = databuf[i + 2] = databuf[i + 3] = 0; 409 doc = xmlParseMemory((xmlChar *) databuf, i); 410 free((char *) databuf); 411 return doc; 412 } 413 414 415 int 416 match(char * * cpp, char * s) 417 418 { 419 char * cp; 420 int c1; 421 int c2; 422 423 cp = *cpp; 424 425 for (cp = *cpp; c2 = *s++; cp++) { 426 c1 = *cp; 427 428 if (c1 != c2) { 429 if (isupper(c1)) 430 c1 = tolower(c1); 431 432 if (isupper(c2)) 433 c2 = tolower(c2); 434 } 435 436 if (c1 != c2) 437 return 0; 438 } 439 440 c1 = *cp; 441 442 while (c1 == ' ' || c1 == '\t') 443 c1 = *++cp; 444 445 *cpp = cp; 446 return 1; 447 } 448 449 450 t_state * 451 newstate(void) 452 453 { 454 t_state * s; 455 456 s = (t_state *) malloc(sizeof *s); 457 chknull(s); 458 memset((char *) s, 0, sizeof *s); 459 return s; 460 } 461 462 463 void 464 unlink_transition(t_transition * t) 465 466 { 467 if (t->t_backnext) 468 t->t_backnext->t_backprev = t->t_backprev; 469 470 if (t->t_backprev) 471 t->t_backprev->t_backnext = t->t_backnext; 472 else if (t->t_to) 473 t->t_to->s_backward = t->t_backnext; 474 475 if (t->t_forwnext) 476 t->t_forwnext->t_forwprev = t->t_forwprev; 477 478 if (t->t_forwprev) 479 t->t_forwprev->t_forwnext = t->t_forwnext; 480 else if (t->t_from) 481 t->t_from->s_forward = t->t_forwnext; 482 483 t->t_backprev = (t_transition *) NULL; 484 t->t_backnext = (t_transition *) NULL; 485 t->t_forwprev = (t_transition *) NULL; 486 t->t_forwnext = (t_transition *) NULL; 487 t->t_from = (t_state *) NULL; 488 t->t_to = (t_state *) NULL; 489 } 490 491 492 void 493 link_transition(t_transition * t, t_state * from, t_state * to) 494 495 { 496 if (!from) 497 from = t->t_from; 498 499 if (!to) 500 to = t->t_to; 501 502 unlink_transition(t); 503 504 if ((t->t_from = from)) { 505 if ((t->t_forwnext = from->s_forward)) 506 t->t_forwnext->t_forwprev = t; 507 508 from->s_forward = t; 509 } 510 511 if ((t->t_to = to)) { 512 if ((t->t_backnext = to->s_backward)) 513 t->t_backnext->t_backprev = t; 514 515 to->s_backward = t; 516 } 517 } 518 519 520 t_transition * 521 newtransition(unsigned int token, t_state * from, t_state * to) 522 523 { 524 t_transition * t; 525 526 t = (t_transition *) malloc(sizeof *t); 527 chknull(t); 528 memset((char *) t, 0, sizeof *t); 529 t->t_token = token; 530 link_transition(t, from, to); 531 return t; 532 } 533 534 535 t_transition * 536 uniquetransition(unsigned int token, t_state * from, t_state * to) 537 538 { 539 t_transition * t; 540 541 for (t = from->s_forward; t; t = t->t_forwnext) 542 if (t->t_token == token && (t->t_to == to || !to)) 543 return t; 544 545 return to? newtransition(token, from, to): (t_transition *) NULL; 546 } 547 548 549 int 550 set_position(t_powerset * s, void * e) 551 552 { 553 unsigned int l; 554 unsigned int h; 555 unsigned int m; 556 int i; 557 558 l = 0; 559 h = s->p_card; 560 561 while (l < h) { 562 m = (l + h) >> 1; 563 564 /** 565 *** If both pointers belong to different allocation arenas, 566 *** native comparison may find them neither 567 *** equal, nor greater, nor smaller. 568 *** We thus compare using memcmp() to get an orthogonal 569 *** result. 570 **/ 571 572 i = memcmp(&e, s->p_set + m, sizeof e); 573 574 if (i < 0) 575 h = m; 576 else if (!i) 577 return m; 578 else 579 l = m + 1; 580 } 581 582 return l; 583 } 584 585 586 t_powerset * 587 set_include(t_powerset * s, void * e) 588 589 { 590 unsigned int pos; 591 unsigned int n; 592 593 if (!s) { 594 s = (t_powerset *) malloc(sizeof *s + 595 GRANULE * sizeof s->p_set); 596 chknull(s); 597 s->p_size = GRANULE; 598 s->p_set[GRANULE] = (t_state *) NULL; 599 s->p_set[0] = e; 600 s->p_card = 1; 601 return s; 602 } 603 604 pos = set_position(s, e); 605 606 if (pos < s->p_card && s->p_set[pos] == e) 607 return s; 608 609 if (s->p_card >= s->p_size) { 610 s->p_size += GRANULE; 611 s = (t_powerset *) realloc(s, 612 sizeof *s + s->p_size * sizeof s->p_set); 613 chknull(s); 614 s->p_set[s->p_size] = (t_state *) NULL; 615 } 616 617 n = s->p_card - pos; 618 619 if (n) 620 memmove((char *) (s->p_set + pos + 1), 621 (char *) (s->p_set + pos), n * sizeof s->p_set[0]); 622 623 s->p_set[pos] = e; 624 s->p_card++; 625 return s; 626 } 627 628 629 t_state * 630 nfatransition(t_state * to, byte token) 631 632 { 633 t_state * from; 634 635 from = newstate(); 636 newtransition(token, from, to); 637 return from; 638 } 639 640 641 static t_state * nfadevelop(t_state * from, t_state * final, iconv_t icc, 642 const utf8char * name, unsigned int len); 643 644 645 void 646 nfaslice(t_state * * from, t_state * * to, iconv_t icc, 647 const utf8char * chr, unsigned int chlen, 648 const utf8char * name, unsigned int len, t_state * final) 649 650 { 651 char * srcp; 652 char * dstp; 653 size_t srcc; 654 size_t dstc; 655 unsigned int cnt; 656 t_state * f; 657 t_state * t; 658 t_transition * tp; 659 byte bytebuf[8]; 660 661 srcp = (char *) chr; 662 srcc = chlen; 663 dstp = (char *) bytebuf; 664 dstc = sizeof bytebuf; 665 iconv(icc, &srcp, &srcc, &dstp, &dstc); 666 dstp = (char *) bytebuf; 667 cnt = sizeof bytebuf - dstc; 668 t = *to; 669 f = *from; 670 671 /** 672 *** Check for end of string. 673 **/ 674 675 if (!len) 676 if (t && t != final) 677 uniquetransition(EPSILON, t, final); 678 else 679 t = final; 680 681 if (f) 682 while (cnt) { 683 tp = uniquetransition(*dstp, f, (t_state *) NULL); 684 685 if (!tp) 686 break; 687 688 f = tp->t_to; 689 dstp++; 690 cnt--; 691 } 692 693 if (!cnt) { 694 if (!t) 695 t = nfadevelop(f, final, icc, name, len); 696 697 *to = t; 698 return; 699 } 700 701 if (!t) { 702 t = nfadevelop((t_state *) NULL, final, icc, name, len); 703 *to = t; 704 } 705 706 if (!f) 707 *from = f = newstate(); 708 709 while (cnt > 1) 710 t = nfatransition(t, dstp[--cnt]); 711 712 newtransition(*dstp, f, t); 713 } 714 715 716 t_state * 717 nfadevelop(t_state * from, t_state * final, iconv_t icc, 718 const utf8char * name, unsigned int len) 719 720 { 721 int chlen; 722 int i; 723 t_state * to; 724 int uccnt; 725 int lccnt; 726 utf8char chr; 727 728 chlen = utf8_chlen[*name]; 729 730 for (i = 1; i < chlen; i++) 731 if ((name[i] & 0xC0) != 0x80) 732 break; 733 734 if (i != chlen) { 735 fprintf(stderr, 736 "Invalid UTF8 character in character set name\n"); 737 return (t_state *) NULL; 738 } 739 740 to = (t_state *) NULL; 741 nfaslice(&from, &to, 742 icc, name, chlen, name + chlen, len - chlen, final); 743 744 if (*name >= UTF8_a && *name <= UTF8_z) 745 chr = *name - UTF8_a + UTF8_A; 746 else if (*name >= UTF8_A && *name <= UTF8_Z) 747 chr = *name - UTF8_A + UTF8_a; 748 else 749 return from; 750 751 nfaslice(&from, &to, icc, &chr, 1, name + chlen, len - chlen, final); 752 return from; 753 } 754 755 756 757 void 758 nfaenter(const utf8char * name, int len, t_chset * charset) 759 760 { 761 t_chset * s; 762 t_state * final; 763 t_state * sp; 764 t_symlist * lp; 765 766 /** 767 *** Enter case-insensitive `name' in NFA in all known 768 *** character codes. 769 *** Redundant shift state changes as well as shift state 770 *** differences between uppercase and lowercase are 771 *** not handled. 772 **/ 773 774 if (len < 0) 775 len = strlen(name) + 1; 776 777 for (lp = charset->c_names; lp; lp = lp->l_next) 778 if (!memcmp(name, lp->l_symbol, len)) 779 return; /* Already entered. */ 780 781 lp = (t_symlist *) malloc(sizeof *lp + len); 782 chknull(lp); 783 memcpy(lp->l_symbol, name, len); 784 lp->l_symbol[len] = '\0'; 785 lp->l_next = charset->c_names; 786 charset->c_names = lp; 787 final = newstate(); 788 final->s_final = charset; 789 790 for (s = chset_list; s; s = s->c_next) 791 if (!iconv_open_error(s->c_fromUTF8)) 792 sp = nfadevelop(initial_state, final, 793 s->c_fromUTF8, name, len); 794 } 795 796 797 unsigned int 798 utf8_utostr(utf8char * s, unsigned int v) 799 800 { 801 unsigned int d; 802 unsigned int i; 803 804 d = v / 10; 805 v -= d * 10; 806 i = d? utf8_utostr(s, d): 0; 807 s[i++] = v + UTF8_0; 808 s[i] = '\0'; 809 return i; 810 } 811 812 813 unsigned int 814 utf8_utostrpad(utf8char * s, unsigned int v, int digits) 815 816 { 817 unsigned int i = utf8_utostr(s, v); 818 utf8char pad = UTF8_SPACE; 819 820 if (digits < 0) { 821 pad = UTF8_0; 822 digits = -digits; 823 } 824 825 if (i >= digits) 826 return i; 827 828 memmove(s + digits - i, s, i + 1); 829 memset(s, pad, digits - i); 830 return digits; 831 } 832 833 834 unsigned int 835 utf8_strtou(const utf8char * s) 836 837 { 838 unsigned int v; 839 840 while (*s == UTF8_SPACE || *s == UTF8_HT) 841 s++; 842 843 for (v = 0; *s >= UTF8_0 && *s <= UTF8_9;) 844 v = 10 * v + *s++ - UTF8_0; 845 846 return v; 847 } 848 849 850 unsigned int 851 getNumAttr(xmlNodePtr node, const xmlChar * name) 852 853 { 854 const xmlChar * s; 855 unsigned int val; 856 857 s = xmlGetProp(node, name); 858 859 if (!s) 860 return 0; 861 862 val = utf8_strtou(s); 863 xmlFree((xmlChar *) s); 864 return val; 865 } 866 867 868 void 869 read_assocs(const char * filename) 870 871 { 872 xmlDocPtr doc; 873 xmlXPathContextPtr ctxt; 874 xmlXPathObjectPtr obj; 875 xmlNodePtr node; 876 t_chset * sp; 877 int i; 878 unsigned int ccsid; 879 unsigned int mibenum; 880 utf8char symbuf[32]; 881 882 doc = loadXMLFile(filename); 883 884 if (!doc) { 885 fprintf(stderr, "Cannot load file %s\n", filename); 886 exit(1); 887 } 888 889 ctxt = xmlXPathNewContext(doc); 890 obj = xmlXPathEval(utf8_assocnodes, ctxt); 891 892 if (!obj || obj->type != XPATH_NODESET || !obj->nodesetval || 893 !obj->nodesetval->nodeTab || !obj->nodesetval->nodeNr) { 894 fprintf(stderr, "No association found in %s\n", filename); 895 exit(1); 896 } 897 898 for (i = 0; i < obj->nodesetval->nodeNr; i++) { 899 node = obj->nodesetval->nodeTab[i]; 900 ccsid = getNumAttr(node, utf8_ccsid); 901 mibenum = getNumAttr(node, utf8_mibenum); 902 903 /** 904 *** Check for duplicate. 905 **/ 906 907 for (sp = chset_list; sp; sp = sp->c_next) 908 if (ccsid && ccsid == sp->c_ccsid || 909 mibenum && mibenum == sp->c_mibenum) { 910 fprintf(stderr, "Duplicate character set: "); 911 fprintf(stderr, "CCSID = %u/%u, ", 912 ccsid, sp->c_ccsid); 913 fprintf(stderr, "MIBenum = %u/%u\n", 914 mibenum, sp->c_mibenum); 915 break; 916 } 917 918 if (sp) 919 continue; 920 921 /** 922 *** Allocate the new character set. 923 **/ 924 925 sp = (t_chset *) malloc(sizeof *sp); 926 chknull(sp); 927 memset(sp, 0, sizeof *sp); 928 929 if (!ccsid) /* Do not attempt with current job CCSID. */ 930 set_iconv_open_error(sp->c_fromUTF8); 931 else { 932 sp->c_fromUTF8 = 933 iconv_open_ccsid(ccsid, C_UTF8_CCSID, 0); 934 935 if (iconv_open_error(sp->c_fromUTF8) == -1) 936 fprintf(stderr, 937 "Cannot convert into CCSID %u: ignored\n", 938 ccsid); 939 } 940 941 sp->c_ccsid = ccsid; 942 sp->c_mibenum = mibenum; 943 sp->c_next = chset_list; 944 chset_list = sp; 945 } 946 947 xmlXPathFreeObject(obj); 948 949 /** 950 *** Enter aliases. 951 **/ 952 953 for (sp = chset_list; sp; sp = sp->c_next) { 954 strcpy(symbuf, utf8_ibm_); 955 utf8_utostr(symbuf + 4, sp->c_ccsid); 956 nfaenter(symbuf, -1, sp); 957 strcpy(symbuf, utf8_IBMCCSID); 958 utf8_utostrpad(symbuf + 8, sp->c_ccsid, -5); 959 nfaenter(symbuf, 13, sp); /* Not null-terminated. */ 960 961 if (sp->c_mibenum) { 962 strcpy(symbuf, utf8_iana_); 963 utf8_utostr(symbuf + 5, sp->c_mibenum); 964 nfaenter(symbuf, -1, sp); 965 } 966 967 xmlXPathRegisterVariable(ctxt, utf8_C, 968 xmlXPathNewFloat((double) sp->c_ccsid)); 969 obj = xmlXPathEval(utf8_aliastext, ctxt); 970 971 if (!obj || obj->type != XPATH_NODESET) { 972 fprintf(stderr, "getAlias failed in %s\n", filename); 973 exit(1); 974 } 975 976 if (obj->nodesetval && 977 obj->nodesetval->nodeTab && obj->nodesetval->nodeNr) { 978 for (i = 0; i < obj->nodesetval->nodeNr; i++) { 979 node = obj->nodesetval->nodeTab[i]; 980 nfaenter(node->content, -1, sp); 981 } 982 } 983 984 xmlXPathFreeObject(obj); 985 } 986 987 xmlXPathFreeContext(ctxt); 988 xmlFreeDoc(doc); 989 } 990 991 992 unsigned int 993 columnPosition(xmlXPathContextPtr ctxt, const xmlChar * header) 994 995 { 996 xmlXPathObjectPtr obj; 997 unsigned int res = 0; 998 999 xmlXPathRegisterVariable(ctxt, utf8_T, xmlXPathNewString(header)); 1000 obj = xmlXPathEval(utf8_headerpos, ctxt); 1001 1002 if (obj) { 1003 if (obj->type == XPATH_NUMBER) 1004 res = (unsigned int) obj->floatval; 1005 1006 xmlXPathFreeObject(obj); 1007 } 1008 1009 return res; 1010 } 1011 1012 1013 void 1014 read_iana(const char * filename) 1015 1016 { 1017 xmlDocPtr doc; 1018 xmlXPathContextPtr ctxt; 1019 xmlXPathObjectPtr obj1; 1020 xmlXPathObjectPtr obj2; 1021 xmlNodePtr node; 1022 int prefnamecol; 1023 int namecol; 1024 int mibenumcol; 1025 int aliascol; 1026 int mibenum; 1027 t_chset * sp; 1028 int n; 1029 int i; 1030 1031 doc = loadXMLFile(filename); 1032 1033 if (!doc) { 1034 fprintf(stderr, "Cannot load file %s\n", filename); 1035 exit(1); 1036 } 1037 1038 ctxt = xmlXPathNewContext(doc); 1039 1040 #ifndef OLDXML 1041 xmlXPathRegisterNs(ctxt, utf8_html, utf8_htmluri); 1042 #endif 1043 1044 obj1 = xmlXPathEval(utf8_tablerows, ctxt); 1045 1046 if (!obj1 || obj1->type != XPATH_NODESET || !obj1->nodesetval || 1047 !obj1->nodesetval->nodeTab || obj1->nodesetval->nodeNr <= 1) { 1048 fprintf(stderr, "No data in %s\n", filename); 1049 exit(1); 1050 } 1051 1052 /** 1053 *** Identify columns. 1054 **/ 1055 1056 xmlXPathSetContextNode(obj1->nodesetval->nodeTab[0], ctxt); 1057 prefnamecol = columnPosition(ctxt, utf8_Pref_MIME_Name); 1058 namecol = columnPosition(ctxt, utf8_Name); 1059 mibenumcol = columnPosition(ctxt, utf8_MIBenum); 1060 aliascol = columnPosition(ctxt, utf8_Aliases); 1061 1062 if (!prefnamecol || !namecol || !mibenumcol || !aliascol) { 1063 fprintf(stderr, "Key column(s) missing in %s\n", filename); 1064 exit(1); 1065 } 1066 1067 xmlXPathRegisterVariable(ctxt, utf8_P, 1068 xmlXPathNewFloat((double) prefnamecol)); 1069 xmlXPathRegisterVariable(ctxt, utf8_N, 1070 xmlXPathNewFloat((double) namecol)); 1071 xmlXPathRegisterVariable(ctxt, utf8_M, 1072 xmlXPathNewFloat((double) mibenumcol)); 1073 xmlXPathRegisterVariable(ctxt, utf8_A, 1074 xmlXPathNewFloat((double) aliascol)); 1075 1076 /** 1077 *** Process each row. 1078 **/ 1079 1080 for (n = 1; n < obj1->nodesetval->nodeNr; n++) { 1081 xmlXPathSetContextNode(obj1->nodesetval->nodeTab[n], ctxt); 1082 1083 /** 1084 *** Get the MIBenum from current row. 1085 */ 1086 1087 obj2 = xmlXPathEval(utf8_getmibenum, ctxt); 1088 1089 if (!obj2 || obj2->type != XPATH_NUMBER) { 1090 fprintf(stderr, "get MIBenum failed at row %u\n", n); 1091 exit(1); 1092 } 1093 1094 if (xmlXPathIsNaN(obj2->floatval) || 1095 obj2->floatval < 1.0 || obj2->floatval > 65535.0 || 1096 ((unsigned int) obj2->floatval) != obj2->floatval) { 1097 fprintf(stderr, "invalid MIBenum at row %u\n", n); 1098 xmlXPathFreeObject(obj2); 1099 continue; 1100 } 1101 1102 mibenum = obj2->floatval; 1103 xmlXPathFreeObject(obj2); 1104 1105 /** 1106 *** Search the associations for a corresponding CCSID. 1107 **/ 1108 1109 for (sp = chset_list; sp; sp = sp->c_next) 1110 if (sp->c_mibenum == mibenum) 1111 break; 1112 1113 if (!sp) 1114 continue; /* No CCSID for this MIBenum. */ 1115 1116 /** 1117 *** Process preferred MIME name. 1118 **/ 1119 1120 obj2 = xmlXPathEval(utf8_getprefname, ctxt); 1121 1122 if (!obj2 || obj2->type != XPATH_STRING) { 1123 fprintf(stderr, 1124 "get Preferred_MIME_Name failed at row %u\n", n); 1125 exit(1); 1126 } 1127 1128 if (obj2->stringval && obj2->stringval[0]) 1129 nfaenter(obj2->stringval, -1, sp); 1130 1131 xmlXPathFreeObject(obj2); 1132 1133 /** 1134 *** Process name. 1135 **/ 1136 1137 obj2 = xmlXPathEval(utf8_getname, ctxt); 1138 1139 if (!obj2 || obj2->type != XPATH_STRING) { 1140 fprintf(stderr, "get name failed at row %u\n", n); 1141 exit(1); 1142 } 1143 1144 if (obj2->stringval && obj2->stringval[0]) 1145 nfaenter(obj2->stringval, -1, sp); 1146 1147 xmlXPathFreeObject(obj2); 1148 1149 /** 1150 *** Process aliases. 1151 **/ 1152 1153 obj2 = xmlXPathEval(utf8_getaliases, ctxt); 1154 1155 if (!obj2 || obj2->type != XPATH_NODESET) { 1156 fprintf(stderr, "get aliases failed at row %u\n", n); 1157 exit(1); 1158 } 1159 1160 if (obj2->nodesetval && obj2->nodesetval->nodeTab) 1161 for (i = 0; i < obj2->nodesetval->nodeNr; i++) { 1162 node = obj2->nodesetval->nodeTab[i]; 1163 1164 if (node && node->content && node->content[0]) 1165 nfaenter(node->content, -1, sp); 1166 } 1167 1168 xmlXPathFreeObject(obj2); 1169 } 1170 1171 xmlXPathFreeObject(obj1); 1172 xmlXPathFreeContext(ctxt); 1173 xmlFreeDoc(doc); 1174 } 1175 1176 1177 t_powerset * closureset(t_powerset * dst, t_powerset * src); 1178 1179 1180 t_powerset * 1181 closure(t_powerset * dst, t_state * src) 1182 1183 { 1184 t_transition * t; 1185 unsigned int oldcard; 1186 1187 if (src->s_nfastates) { 1188 /** 1189 *** Is a DFA state: return closure of set of equivalent 1190 *** NFA states. 1191 **/ 1192 1193 return closureset(dst, src->s_nfastates); 1194 } 1195 1196 /** 1197 *** Compute closure of NFA state. 1198 **/ 1199 1200 dst = set_include(dst, src); 1201 1202 for (t = src->s_forward; t; t = t->t_forwnext) 1203 if (t->t_token == EPSILON) { 1204 oldcard = dst->p_card; 1205 dst = set_include(dst, t->t_to); 1206 1207 if (oldcard != dst->p_card) 1208 dst = closure(dst, t->t_to); 1209 } 1210 1211 return dst; 1212 } 1213 1214 1215 t_powerset * 1216 closureset(t_powerset * dst, t_powerset * src) 1217 1218 { 1219 unsigned int i; 1220 1221 for (i = 0; i < src->p_card; i++) 1222 dst = closure(dst, (t_state *) src->p_set[i]); 1223 1224 return dst; 1225 } 1226 1227 1228 t_state * 1229 get_dfa_state(t_state * * stack, 1230 t_powerset * nfastates, xmlHashTablePtr sethash) 1231 1232 { 1233 t_state * s; 1234 1235 if (s = hash_get(sethash, nfastates->p_set, 1236 nfastates->p_card * sizeof nfastates->p_set[0])) { 1237 /** 1238 *** DFA state already present. 1239 *** Release the NFA state set and return 1240 *** the address of the old DFA state. 1241 **/ 1242 1243 free((char *) nfastates); 1244 return s; 1245 } 1246 1247 /** 1248 *** Build the new state. 1249 **/ 1250 1251 s = newstate(); 1252 s->s_nfastates = nfastates; 1253 s->s_next = dfa_states; 1254 dfa_states = s; 1255 s->s_stack = *stack; 1256 *stack = s; 1257 1258 /** 1259 *** Enter it in hash. 1260 **/ 1261 1262 if (hash_add(sethash, nfastates->p_set, 1263 nfastates->p_card * sizeof nfastates->p_set[0], s)) 1264 chknull(NULL); /* Memory allocation error. */ 1265 1266 return s; 1267 } 1268 1269 1270 int 1271 transcmp(const void * p1, const void * p2) 1272 1273 { 1274 t_transition * t1; 1275 t_transition * t2; 1276 1277 t1 = *(t_transition * *) p1; 1278 t2 = *(t_transition * *) p2; 1279 return ((int) t1->t_token) - ((int) t2->t_token); 1280 } 1281 1282 1283 void 1284 builddfa(void) 1285 1286 { 1287 t_powerset * transset; 1288 t_powerset * stateset; 1289 t_state * s; 1290 t_state * s2; 1291 unsigned int n; 1292 unsigned int i; 1293 unsigned int token; 1294 t_transition * t; 1295 t_state * stack; 1296 xmlHashTablePtr sethash; 1297 unsigned int nst; 1298 1299 transset = set_include(NULL, NULL); 1300 chknull(transset); 1301 stateset = set_include(NULL, NULL); 1302 chknull(stateset); 1303 sethash = xmlHashCreate(1); 1304 chknull(sethash); 1305 dfa_states = (t_state *) NULL; 1306 stack = (t_state *) NULL; 1307 nst = 0; 1308 1309 /** 1310 *** Build the DFA initial state. 1311 **/ 1312 1313 get_dfa_state(&stack, closure(NULL, initial_state), sethash); 1314 1315 /** 1316 *** Build the other DFA states by looking at each 1317 *** possible transition from stacked DFA states. 1318 **/ 1319 1320 do { 1321 if (!(++nst % 100)) 1322 fprintf(stderr, "%u DFA states\n", nst); 1323 1324 s = stack; 1325 stack = s->s_stack; 1326 s->s_stack = (t_state *) NULL; 1327 1328 /** 1329 *** Build a set of all non-epsilon transitions from this 1330 *** state. 1331 **/ 1332 1333 transset->p_card = 0; 1334 1335 for (n = 0; n < s->s_nfastates->p_card; n++) { 1336 s2 = s->s_nfastates->p_set[n]; 1337 1338 for (t = s2->s_forward; t; t = t->t_forwnext) 1339 if (t->t_token != EPSILON) { 1340 transset = set_include(transset, t); 1341 chknull(transset); 1342 } 1343 } 1344 1345 /** 1346 *** Sort transitions by token. 1347 **/ 1348 1349 qsort(transset->p_set, transset->p_card, 1350 sizeof transset->p_set[0], transcmp); 1351 1352 /** 1353 *** Process all transitions, grouping them by token. 1354 **/ 1355 1356 stateset->p_card = 0; 1357 token = EPSILON; 1358 1359 for (i = 0; i < transset->p_card; i++) { 1360 t = transset->p_set[i]; 1361 1362 if (token != t->t_token) { 1363 if (stateset->p_card) { 1364 /** 1365 *** Get the equivalent DFA state 1366 *** and create transition. 1367 **/ 1368 1369 newtransition(token, s, 1370 get_dfa_state(&stack, 1371 closureset(NULL, stateset), 1372 sethash)); 1373 stateset->p_card = 0; 1374 } 1375 1376 token = t->t_token; 1377 } 1378 1379 stateset = set_include(stateset, t->t_to); 1380 } 1381 1382 if (stateset->p_card) 1383 newtransition(token, s, get_dfa_state(&stack, 1384 closureset(NULL, stateset), sethash)); 1385 } while (stack); 1386 1387 free((char *) transset); 1388 free((char *) stateset); 1389 xmlHashFree(sethash, NULL); 1390 1391 /** 1392 *** Reverse the state list to get the initial state first, 1393 *** check for ambiguous prefixes, determine final states, 1394 *** destroy NFA state sets. 1395 **/ 1396 1397 while (s = dfa_states) { 1398 dfa_states = s->s_next; 1399 s->s_next = stack; 1400 stack = s; 1401 stateset = s->s_nfastates; 1402 s->s_nfastates = (t_powerset *) NULL; 1403 1404 for (n = 0; n < stateset->p_card; n++) { 1405 s2 = (t_state *) stateset->p_set[n]; 1406 1407 if (s2->s_final) { 1408 if (s->s_final && s->s_final != s2->s_final) 1409 fprintf(stderr, 1410 "Ambiguous name for CCSIDs %u/%u\n", 1411 s->s_final->c_ccsid, 1412 s2->s_final->c_ccsid); 1413 1414 s->s_final = s2->s_final; 1415 } 1416 } 1417 1418 free((char *) stateset); 1419 } 1420 1421 dfa_states = stack; 1422 } 1423 1424 1425 void 1426 deletenfa(void) 1427 1428 { 1429 t_transition * t; 1430 t_state * s; 1431 t_state * u; 1432 t_state * stack; 1433 1434 stack = initial_state; 1435 stack->s_stack = (t_state *) NULL; 1436 1437 while ((s = stack)) { 1438 stack = s->s_stack; 1439 1440 while ((t = s->s_forward)) { 1441 u = t->t_to; 1442 unlink_transition(t); 1443 free((char *) t); 1444 1445 if (!u->s_backward) { 1446 u->s_stack = stack; 1447 stack = u; 1448 } 1449 } 1450 1451 free((char *) s); 1452 } 1453 } 1454 1455 1456 t_stategroup * 1457 newgroup(void) 1458 1459 { 1460 t_stategroup * g; 1461 1462 g = (t_stategroup *) malloc(sizeof *g); 1463 chknull(g); 1464 memset((char *) g, 0, sizeof *g); 1465 g->g_id = groupid++; 1466 return g; 1467 } 1468 1469 1470 void 1471 optimizedfa(void) 1472 1473 { 1474 unsigned int i; 1475 xmlHashTablePtr h; 1476 t_state * s1; 1477 t_state * s2; 1478 t_state * finstates; 1479 t_state * * sp; 1480 t_stategroup * g1; 1481 t_stategroup * g2; 1482 t_stategroup * ghead; 1483 t_transition * t1; 1484 t_transition * t2; 1485 unsigned int done; 1486 unsigned int startgroup; 1487 unsigned int gtrans[1 << (8 * sizeof(unsigned char))]; 1488 1489 /** 1490 *** Reduce DFA state count. 1491 **/ 1492 1493 groupid = 0; 1494 ghead = (t_stategroup *) NULL; 1495 1496 /** 1497 *** First split: non-final and each distinct final states. 1498 **/ 1499 1500 h = xmlHashCreate(4); 1501 chknull(h); 1502 1503 for (s1 = dfa_states; s1; s1 = s1->s_next) { 1504 if (!(g1 = hash_get(h, &s1->s_final, sizeof s1->s_final))) { 1505 g1 = newgroup(); 1506 g1->g_next = ghead; 1507 ghead = g1; 1508 1509 if (hash_add(h, &s1->s_final, sizeof s1->s_final, g1)) 1510 chknull(NULL); /* Memory allocation error. */ 1511 } 1512 1513 s1->s_index = g1->g_id; 1514 s1->s_stack = g1->g_member; 1515 g1->g_member = s1; 1516 } 1517 1518 xmlHashFree(h, NULL); 1519 1520 /** 1521 *** Subsequent splits: states that have the same forward 1522 *** transition tokens to states in the same group. 1523 **/ 1524 1525 do { 1526 done = 1; 1527 1528 for (g2 = ghead; g2; g2 = g2->g_next) { 1529 s1 = g2->g_member; 1530 1531 if (!s1->s_stack) 1532 continue; 1533 1534 h = xmlHashCreate(1); 1535 chknull(h); 1536 1537 /** 1538 *** Build the group transition map. 1539 **/ 1540 1541 memset((char *) gtrans, ~0, sizeof gtrans); 1542 1543 for (t1 = s1->s_forward; t1; t1 = t1->t_forwnext) 1544 gtrans[t1->t_token] = t1->t_to->s_index; 1545 1546 if (hash_add(h, gtrans, sizeof gtrans, g2)) 1547 chknull(NULL); 1548 1549 /** 1550 *** Process other states in group. 1551 **/ 1552 1553 sp = &s1->s_stack; 1554 s1 = *sp; 1555 1556 do { 1557 *sp = s1->s_stack; 1558 1559 /** 1560 *** Build the transition map. 1561 **/ 1562 1563 memset((char *) gtrans, ~0, sizeof gtrans); 1564 1565 for (t1 = s1->s_forward; 1566 t1; t1 = t1->t_forwnext) 1567 gtrans[t1->t_token] = t1->t_to->s_index; 1568 1569 g1 = hash_get(h, gtrans, sizeof gtrans); 1570 1571 if (g1 == g2) { 1572 *sp = s1; 1573 sp = &s1->s_stack; 1574 } 1575 else { 1576 if (!g1) { 1577 g1 = newgroup(); 1578 g1->g_next = ghead; 1579 ghead = g1; 1580 1581 if (hash_add(h, gtrans, 1582 sizeof gtrans, g1)) 1583 chknull(NULL); 1584 } 1585 1586 s1->s_index = g1->g_id; 1587 s1->s_stack = g1->g_member; 1588 g1->g_member = s1; 1589 done = 0; 1590 } 1591 } while (s1 = *sp); 1592 1593 xmlHashFree(h, NULL); 1594 } 1595 } while (!done); 1596 1597 /** 1598 *** Establish group leaders and remap transitions. 1599 **/ 1600 1601 startgroup = dfa_states->s_index; 1602 1603 for (g1 = ghead; g1; g1 = g1->g_next) 1604 for (s1 = g1->g_member->s_stack; s1; s1 = s1->s_stack) 1605 for (t1 = s1->s_backward; t1; t1 = t2) { 1606 t2 = t1->t_backnext; 1607 link_transition(t1, NULL, g1->g_member); 1608 } 1609 1610 /** 1611 *** Remove redundant states and transitions. 1612 **/ 1613 1614 for (g1 = ghead; g1; g1 = g1->g_next) { 1615 g1->g_member->s_next = (t_state *) NULL; 1616 1617 while ((s1 = g1->g_member->s_stack)) { 1618 g1->g_member->s_stack = s1->s_stack; 1619 1620 for (t1 = s1->s_forward; t1; t1 = t2) { 1621 t2 = t1->t_forwnext; 1622 unlink_transition(t1); 1623 free((char *) t1); 1624 } 1625 1626 free((char *) s1); 1627 } 1628 } 1629 1630 /** 1631 *** Remove group support and relink DFA states. 1632 **/ 1633 1634 dfa_states = (t_state *) NULL; 1635 s2 = (t_state *) NULL; 1636 finstates = (t_state *) NULL; 1637 1638 while (g1 = ghead) { 1639 ghead = g1->g_next; 1640 s1 = g1->g_member; 1641 1642 if (g1->g_id == startgroup) 1643 dfa_states = s1; /* Keep start state first. */ 1644 else if (s1->s_final) { /* Then final states. */ 1645 s1->s_next = finstates; 1646 finstates = s1; 1647 } 1648 else { /* Finish with non-final states. */ 1649 s1->s_next = s2; 1650 s2 = s1; 1651 } 1652 1653 free((char *) g1); 1654 } 1655 1656 for (dfa_states->s_next = finstates; finstates->s_next;) 1657 finstates = finstates->s_next; 1658 1659 finstates->s_next = s2; 1660 } 1661 1662 1663 const char * 1664 inttype(unsigned long max) 1665 1666 { 1667 int i; 1668 1669 for (i = 0; max; i++) 1670 max >>= 1; 1671 1672 if (i > 8 * sizeof(unsigned int)) 1673 return "unsigned long"; 1674 1675 if (i > 8 * sizeof(unsigned short)) 1676 return "unsigned int"; 1677 1678 if (i > 8 * sizeof(unsigned char)) 1679 return "unsigned short"; 1680 1681 return "unsigned char"; 1682 } 1683 1684 1685 listids(FILE * fp) 1686 1687 { 1688 unsigned int pos; 1689 t_chset * cp; 1690 t_symlist * lp; 1691 char * srcp; 1692 char * dstp; 1693 size_t srcc; 1694 size_t dstc; 1695 char buf[80]; 1696 1697 fprintf(fp, "/**\n*** CCSID For arg Recognized name.\n"); 1698 pos = 0; 1699 1700 for (cp = chset_list; cp; cp = cp->c_next) { 1701 if (pos) { 1702 fprintf(fp, "\n"); 1703 pos = 0; 1704 } 1705 1706 if (!cp->c_names) 1707 continue; 1708 1709 pos = fprintf(fp, "*** %5u %c ", cp->c_ccsid, 1710 iconv_open_error(cp->c_fromUTF8)? ' ': 'X'); 1711 1712 for (lp = cp->c_names; lp; lp = lp->l_next) { 1713 srcp = (char *) lp->l_symbol; 1714 srcc = strlen(srcp); 1715 dstp = buf; 1716 dstc = sizeof buf; 1717 iconv(utf82job, &srcp, &srcc, &dstp, &dstc); 1718 srcc = dstp - buf; 1719 1720 if (pos + srcc > 79) { 1721 fprintf(fp, "\n***%22c", ' '); 1722 pos = 25; 1723 } 1724 1725 pos += fprintf(fp, " %.*s", srcc, buf); 1726 } 1727 } 1728 1729 if (pos) 1730 fprintf(fp, "\n"); 1731 1732 fprintf(fp, "**/\n\n"); 1733 } 1734 1735 1736 void 1737 generate(FILE * fp) 1738 1739 { 1740 unsigned int nstates; 1741 unsigned int ntrans; 1742 unsigned int maxfinal; 1743 t_state * s; 1744 t_transition * t; 1745 unsigned int i; 1746 unsigned int pos; 1747 char * ns; 1748 1749 /** 1750 *** Assign indexes to states and transitions. 1751 **/ 1752 1753 nstates = 0; 1754 ntrans = 0; 1755 maxfinal = 0; 1756 1757 for (s = dfa_states; s; s = s->s_next) { 1758 s->s_index = nstates++; 1759 1760 if (s->s_final) 1761 maxfinal = nstates; 1762 1763 for (t = s->s_forward; t; t = t->t_forwnext) 1764 t->t_index = ntrans++; 1765 } 1766 1767 fprintf(fp, 1768 "/**\n*** %u states, %u finals, %u transitions.\n**/\n\n", 1769 nstates, maxfinal, ntrans); 1770 fprintf(stderr, "%u states, %u finals, %u transitions.\n", 1771 nstates, maxfinal, ntrans); 1772 1773 /** 1774 *** Generate types. 1775 **/ 1776 1777 fprintf(fp, "typedef unsigned short t_ccsid;\n"); 1778 fprintf(fp, "typedef %-23s t_staterange;\n", inttype(nstates)); 1779 fprintf(fp, "typedef %-23s t_transrange;\n\n", inttype(ntrans)); 1780 1781 /** 1782 *** Generate first transition index for each state. 1783 **/ 1784 1785 fprintf(fp, "static t_transrange trans_array[] = {\n"); 1786 pos = 0; 1787 ntrans = 0; 1788 1789 for (s = dfa_states; s; s = s->s_next) { 1790 pos += fprintf(fp, " %u,", ntrans); 1791 1792 if (pos > 72) { 1793 fprintf(fp, "\n"); 1794 pos = 0; 1795 } 1796 1797 for (t = s->s_forward; t; t = t->t_forwnext) 1798 ntrans++; 1799 } 1800 1801 fprintf(fp, " %u\n};\n\n", ntrans); 1802 1803 /** 1804 *** Generate final state info. 1805 **/ 1806 1807 fprintf(fp, "static t_ccsid final_array[] = {\n"); 1808 pos = 0; 1809 ns =""; 1810 i = 0; 1811 1812 for (s = dfa_states; s && i++ < maxfinal; s = s->s_next) { 1813 pos += fprintf(fp, "%s", ns); 1814 ns = ","; 1815 1816 if (pos > 72) { 1817 fprintf(fp, "\n"); 1818 pos = 0; 1819 } 1820 1821 pos += fprintf(fp, " %u", 1822 s->s_final? s->s_final->c_ccsid + 1: 0); 1823 } 1824 1825 fprintf(fp, "\n};\n\n"); 1826 1827 /** 1828 *** Generate goto table. 1829 **/ 1830 1831 fprintf(fp, "static t_staterange goto_array[] = {\n"); 1832 pos = 0; 1833 1834 for (s = dfa_states; s; s = s->s_next) 1835 for (t = s->s_forward; t; t = t->t_forwnext) { 1836 pos += fprintf(fp, " %u,", t->t_to->s_index); 1837 1838 if (pos > 72) { 1839 fprintf(fp, "\n"); 1840 pos = 0; 1841 } 1842 } 1843 1844 fprintf(fp, " %u\n};\n\n", nstates); 1845 1846 /** 1847 *** Generate transition label table. 1848 **/ 1849 1850 fprintf(fp, "static unsigned char label_array[] = {\n"); 1851 pos = 0; 1852 ns =""; 1853 1854 for (s = dfa_states; s; s = s->s_next) 1855 for (t = s->s_forward; t; t = t->t_forwnext) { 1856 pos += fprintf(fp, "%s", ns); 1857 ns = ","; 1858 1859 if (pos > 72) { 1860 fprintf(fp, "\n"); 1861 pos = 0; 1862 } 1863 1864 pos += fprintf(fp, " 0x%02X", t->t_token); 1865 } 1866 1867 fprintf(fp, "\n};\n", nstates); 1868 } 1869 1870 1871 main(argc, argv) 1872 int argc; 1873 char * * argv; 1874 1875 { 1876 FILE * fp; 1877 t_chset * csp; 1878 char symbuf[20]; 1879 1880 chset_list = (t_chset *) NULL; 1881 initial_state = newstate(); 1882 job2utf8 = iconv_open_ccsid(C_UTF8_CCSID, C_SOURCE_CCSID, 0); 1883 utf82job = iconv_open_ccsid(C_SOURCE_CCSID, C_UTF8_CCSID, 0); 1884 1885 if (argc != 4) { 1886 fprintf(stderr, "Usage: %s <ccsid-mibenum file> ", *argv); 1887 fprintf(stderr, "<iana-character-set file> <output file>\n"); 1888 exit(1); 1889 } 1890 1891 /** 1892 *** Read CCSID/MIBenum associations. Define special names. 1893 **/ 1894 1895 read_assocs(argv[1]); 1896 1897 /** 1898 *** Read character set names and establish the case-independent 1899 *** name DFA in all possible CCSIDs. 1900 **/ 1901 1902 read_iana(argv[2]); 1903 1904 /** 1905 *** Build DFA from NFA. 1906 **/ 1907 1908 builddfa(); 1909 1910 /** 1911 *** Delete NFA. 1912 **/ 1913 1914 deletenfa(); 1915 1916 /** 1917 *** Minimize the DFA state count. 1918 **/ 1919 1920 optimizedfa(); 1921 1922 /** 1923 *** Generate the table. 1924 **/ 1925 1926 fp = fopen(argv[3], "w+"); 1927 1928 if (!fp) { 1929 perror(argv[3]); 1930 exit(1); 1931 } 1932 1933 fprintf(fp, "/**\n"); 1934 fprintf(fp, "*** Character set names table.\n"); 1935 fprintf(fp, "*** Generated by program BLDCSNDFA from"); 1936 fprintf(fp, " IANA character set assignment file\n"); 1937 fprintf(fp, "*** and CCSID/MIBenum equivalence file.\n"); 1938 fprintf(fp, "*** *** Do not edit by hand ***\n"); 1939 fprintf(fp, "**/\n\n"); 1940 listids(fp); 1941 generate(fp); 1942 1943 if (ferror(fp)) { 1944 perror(argv[3]); 1945 fclose(fp); 1946 exit(1); 1947 } 1948 1949 fclose(fp); 1950 iconv_close(job2utf8); 1951 iconv_close(utf82job); 1952 exit(0); 1953 } 1954