1 // Copyright (C) 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * 6 * Copyright (C) 2003-2012, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************* 10 * file name: ucmstate.c 11 * encoding: US-ASCII 12 * tab size: 8 (not used) 13 * indentation:4 14 * 15 * created on: 2003oct09 16 * created by: Markus W. Scherer 17 * 18 * This file handles ICU .ucm file state information as part of the ucm module. 19 * Most of this code used to be in makeconv.c. 20 */ 21 22 #include "unicode/utypes.h" 23 #include "cstring.h" 24 #include "cmemory.h" 25 #include "uarrsort.h" 26 #include "ucnvmbcs.h" 27 #include "ucnv_ext.h" 28 #include "uparse.h" 29 #include "ucm.h" 30 #include <stdio.h> 31 32 #if !UCONFIG_NO_CONVERSION 33 34 /* MBCS state handling ------------------------------------------------------ */ 35 36 /* 37 * state table row grammar (ebnf-style): 38 * (whitespace is allowed between all tokens) 39 * 40 * row=[[firstentry ','] entry (',' entry)*] 41 * firstentry="initial" | "surrogates" 42 * (initial state (default for state 0), output is all surrogate pairs) 43 * entry=range [':' nextstate] ['.' action] 44 * range=number ['-' number] 45 * nextstate=number 46 * (0..7f) 47 * action='u' | 's' | 'p' | 'i' 48 * (unassigned, state change only, surrogate pair, illegal) 49 * number=(1- or 2-digit hexadecimal number) 50 */ 51 static const char * 52 parseState(const char *s, int32_t state[256], uint32_t *pFlags) { 53 const char *t; 54 uint32_t start, end, i; 55 int32_t entry; 56 57 /* initialize the state: all illegal with U+ffff */ 58 for(i=0; i<256; ++i) { 59 state[i]=MBCS_ENTRY_FINAL(0, MBCS_STATE_ILLEGAL, 0xffff); 60 } 61 62 /* skip leading white space */ 63 s=u_skipWhitespace(s); 64 65 /* is there an "initial" or "surrogates" directive? */ 66 if(uprv_strncmp("initial", s, 7)==0) { 67 *pFlags=MBCS_STATE_FLAG_DIRECT; 68 s=u_skipWhitespace(s+7); 69 if(*s++!=',') { 70 return s-1; 71 } 72 } else if(*pFlags==0 && uprv_strncmp("surrogates", s, 10)==0) { 73 *pFlags=MBCS_STATE_FLAG_SURROGATES; 74 s=u_skipWhitespace(s+10); 75 if(*s++!=',') { 76 return s-1; 77 } 78 } else if(*s==0) { 79 /* empty state row: all-illegal */ 80 return NULL; 81 } 82 83 for(;;) { 84 /* read an entry, the start of the range first */ 85 s=u_skipWhitespace(s); 86 start=uprv_strtoul(s, (char **)&t, 16); 87 if(s==t || 0xff<start) { 88 return s; 89 } 90 s=u_skipWhitespace(t); 91 92 /* read the end of the range if there is one */ 93 if(*s=='-') { 94 s=u_skipWhitespace(s+1); 95 end=uprv_strtoul(s, (char **)&t, 16); 96 if(s==t || end<start || 0xff<end) { 97 return s; 98 } 99 s=u_skipWhitespace(t); 100 } else { 101 end=start; 102 } 103 104 /* determine the state entrys for this range */ 105 if(*s!=':' && *s!='.') { 106 /* the default is: final state with valid entries */ 107 entry=MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_16, 0); 108 } else { 109 entry=MBCS_ENTRY_TRANSITION(0, 0); 110 if(*s==':') { 111 /* get the next state, default to 0 */ 112 s=u_skipWhitespace(s+1); 113 i=uprv_strtoul(s, (char **)&t, 16); 114 if(s!=t) { 115 if(0x7f<i) { 116 return s; 117 } 118 s=u_skipWhitespace(t); 119 entry=MBCS_ENTRY_SET_STATE(entry, i); 120 } 121 } 122 123 /* get the state action, default to valid */ 124 if(*s=='.') { 125 /* this is a final state */ 126 entry=MBCS_ENTRY_SET_FINAL(entry); 127 128 s=u_skipWhitespace(s+1); 129 if(*s=='u') { 130 /* unassigned set U+fffe */ 131 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe); 132 s=u_skipWhitespace(s+1); 133 } else if(*s=='p') { 134 if(*pFlags!=MBCS_STATE_FLAG_DIRECT) { 135 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16_PAIR); 136 } else { 137 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16); 138 } 139 s=u_skipWhitespace(s+1); 140 } else if(*s=='s') { 141 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_CHANGE_ONLY); 142 s=u_skipWhitespace(s+1); 143 } else if(*s=='i') { 144 /* illegal set U+ffff */ 145 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_ILLEGAL, 0xffff); 146 s=u_skipWhitespace(s+1); 147 } else { 148 /* default to valid */ 149 entry=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_VALID_16); 150 } 151 } else { 152 /* this is an intermediate state, nothing to do */ 153 } 154 } 155 156 /* adjust "final valid" states according to the state flags */ 157 if(MBCS_ENTRY_FINAL_ACTION(entry)==MBCS_STATE_VALID_16) { 158 switch(*pFlags) { 159 case 0: 160 /* no adjustment */ 161 break; 162 case MBCS_STATE_FLAG_DIRECT: 163 /* set the valid-direct code point to "unassigned"==0xfffe */ 164 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_DIRECT_16, 0xfffe); 165 break; 166 case MBCS_STATE_FLAG_SURROGATES: 167 entry=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_VALID_16_PAIR, 0); 168 break; 169 default: 170 break; 171 } 172 } 173 174 /* set this entry for the range */ 175 for(i=start; i<=end; ++i) { 176 state[i]=entry; 177 } 178 179 if(*s==',') { 180 ++s; 181 } else { 182 return *s==0 ? NULL : s; 183 } 184 } 185 } 186 187 U_CAPI void U_EXPORT2 188 ucm_addState(UCMStates *states, const char *s) { 189 const char *error; 190 191 if(states->countStates==MBCS_MAX_STATE_COUNT) { 192 fprintf(stderr, "ucm error: too many states (maximum %u)\n", MBCS_MAX_STATE_COUNT); 193 exit(U_INVALID_TABLE_FORMAT); 194 } 195 196 error=parseState(s, states->stateTable[states->countStates], 197 &states->stateFlags[states->countStates]); 198 if(error!=NULL) { 199 fprintf(stderr, "ucm error: parse error in state definition at '%s'\n", error); 200 exit(U_INVALID_TABLE_FORMAT); 201 } 202 203 ++states->countStates; 204 } 205 206 U_CAPI UBool U_EXPORT2 207 ucm_parseHeaderLine(UCMFile *ucm, 208 char *line, char **pKey, char **pValue) { 209 UCMStates *states; 210 char *s, *end; 211 char c; 212 213 states=&ucm->states; 214 215 /* remove comments and trailing CR and LF and remove whitespace from the end */ 216 for(end=line; (c=*end)!=0; ++end) { 217 if(c=='#' || c=='\r' || c=='\n') { 218 break; 219 } 220 } 221 while(end>line && (*(end-1)==' ' || *(end-1)=='\t')) { 222 --end; 223 } 224 *end=0; 225 226 /* skip leading white space and ignore empty lines */ 227 s=(char *)u_skipWhitespace(line); 228 if(*s==0) { 229 return TRUE; 230 } 231 232 /* stop at the beginning of the mapping section */ 233 if(uprv_memcmp(s, "CHARMAP", 7)==0) { 234 return FALSE; 235 } 236 237 /* get the key name, bracketed in <> */ 238 if(*s!='<') { 239 fprintf(stderr, "ucm error: no header field <key> in line \"%s\"\n", line); 240 exit(U_INVALID_TABLE_FORMAT); 241 } 242 *pKey=++s; 243 while(*s!='>') { 244 if(*s==0) { 245 fprintf(stderr, "ucm error: incomplete header field <key> in line \"%s\"\n", line); 246 exit(U_INVALID_TABLE_FORMAT); 247 } 248 ++s; 249 } 250 *s=0; 251 252 /* get the value string, possibly quoted */ 253 s=(char *)u_skipWhitespace(s+1); 254 if(*s!='"') { 255 *pValue=s; 256 } else { 257 /* remove the quotes */ 258 *pValue=s+1; 259 if(end>*pValue && *(end-1)=='"') { 260 *--end=0; 261 } 262 } 263 264 /* collect the information from the header field, ignore unknown keys */ 265 if(uprv_strcmp(*pKey, "uconv_class")==0) { 266 if(uprv_strcmp(*pValue, "DBCS")==0) { 267 states->conversionType=UCNV_DBCS; 268 } else if(uprv_strcmp(*pValue, "SBCS")==0) { 269 states->conversionType = UCNV_SBCS; 270 } else if(uprv_strcmp(*pValue, "MBCS")==0) { 271 states->conversionType = UCNV_MBCS; 272 } else if(uprv_strcmp(*pValue, "EBCDIC_STATEFUL")==0) { 273 states->conversionType = UCNV_EBCDIC_STATEFUL; 274 } else { 275 fprintf(stderr, "ucm error: unknown <uconv_class> %s\n", *pValue); 276 exit(U_INVALID_TABLE_FORMAT); 277 } 278 return TRUE; 279 } else if(uprv_strcmp(*pKey, "mb_cur_max")==0) { 280 c=**pValue; 281 if('1'<=c && c<='4' && (*pValue)[1]==0) { 282 states->maxCharLength=(int8_t)(c-'0'); 283 states->outputType=(int8_t)(states->maxCharLength-1); 284 } else { 285 fprintf(stderr, "ucm error: illegal <mb_cur_max> %s\n", *pValue); 286 exit(U_INVALID_TABLE_FORMAT); 287 } 288 return TRUE; 289 } else if(uprv_strcmp(*pKey, "mb_cur_min")==0) { 290 c=**pValue; 291 if('1'<=c && c<='4' && (*pValue)[1]==0) { 292 states->minCharLength=(int8_t)(c-'0'); 293 } else { 294 fprintf(stderr, "ucm error: illegal <mb_cur_min> %s\n", *pValue); 295 exit(U_INVALID_TABLE_FORMAT); 296 } 297 return TRUE; 298 } else if(uprv_strcmp(*pKey, "icu:state")==0) { 299 /* if an SBCS/DBCS/EBCDIC_STATEFUL converter has icu:state, then turn it into MBCS */ 300 switch(states->conversionType) { 301 case UCNV_SBCS: 302 case UCNV_DBCS: 303 case UCNV_EBCDIC_STATEFUL: 304 states->conversionType=UCNV_MBCS; 305 break; 306 case UCNV_MBCS: 307 break; 308 default: 309 fprintf(stderr, "ucm error: <icu:state> entry for non-MBCS table or before the <uconv_class> line\n"); 310 exit(U_INVALID_TABLE_FORMAT); 311 } 312 313 if(states->maxCharLength==0) { 314 fprintf(stderr, "ucm error: <icu:state> before the <mb_cur_max> line\n"); 315 exit(U_INVALID_TABLE_FORMAT); 316 } 317 ucm_addState(states, *pValue); 318 return TRUE; 319 } else if(uprv_strcmp(*pKey, "icu:base")==0) { 320 if(**pValue==0) { 321 fprintf(stderr, "ucm error: <icu:base> without a base table name\n"); 322 exit(U_INVALID_TABLE_FORMAT); 323 } 324 uprv_strcpy(ucm->baseName, *pValue); 325 return TRUE; 326 } 327 328 return FALSE; 329 } 330 331 /* post-processing ---------------------------------------------------------- */ 332 333 static int32_t 334 sumUpStates(UCMStates *states) { 335 int32_t entry, sum, state, cell, count; 336 UBool allStatesReady; 337 338 /* 339 * Sum up the offsets for all states. 340 * In each final state (where there are only final entries), 341 * the offsets add up directly. 342 * In all other state table rows, for each transition entry to another state, 343 * the offsets sum of that state needs to be added. 344 * This is achieved in at most countStates iterations. 345 */ 346 allStatesReady=FALSE; 347 for(count=states->countStates; !allStatesReady && count>=0; --count) { 348 allStatesReady=TRUE; 349 for(state=states->countStates-1; state>=0; --state) { 350 if(!(states->stateFlags[state]&MBCS_STATE_FLAG_READY)) { 351 allStatesReady=FALSE; 352 sum=0; 353 354 /* at first, add up only the final delta offsets to keep them <512 */ 355 for(cell=0; cell<256; ++cell) { 356 entry=states->stateTable[state][cell]; 357 if(MBCS_ENTRY_IS_FINAL(entry)) { 358 switch(MBCS_ENTRY_FINAL_ACTION(entry)) { 359 case MBCS_STATE_VALID_16: 360 states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum); 361 sum+=1; 362 break; 363 case MBCS_STATE_VALID_16_PAIR: 364 states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_VALUE(entry, sum); 365 sum+=2; 366 break; 367 default: 368 /* no addition */ 369 break; 370 } 371 } 372 } 373 374 /* now, add up the delta offsets for the transitional entries */ 375 for(cell=0; cell<256; ++cell) { 376 entry=states->stateTable[state][cell]; 377 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 378 if(states->stateFlags[MBCS_ENTRY_TRANSITION_STATE(entry)]&MBCS_STATE_FLAG_READY) { 379 states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_SET_OFFSET(entry, sum); 380 sum+=states->stateOffsetSum[MBCS_ENTRY_TRANSITION_STATE(entry)]; 381 } else { 382 /* that next state does not have a sum yet, we cannot finish the one for this state */ 383 sum=-1; 384 break; 385 } 386 } 387 } 388 389 if(sum!=-1) { 390 states->stateOffsetSum[state]=sum; 391 states->stateFlags[state]|=MBCS_STATE_FLAG_READY; 392 } 393 } 394 } 395 } 396 397 if(!allStatesReady) { 398 fprintf(stderr, "ucm error: the state table contains loops\n"); 399 exit(U_INVALID_TABLE_FORMAT); 400 } 401 402 /* 403 * For all "direct" (i.e., initial) states>0, 404 * the offsets need to be increased by the sum of 405 * the previous initial states. 406 */ 407 sum=states->stateOffsetSum[0]; 408 for(state=1; state<states->countStates; ++state) { 409 if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { 410 int32_t sum2=sum; 411 sum+=states->stateOffsetSum[state]; 412 for(cell=0; cell<256; ++cell) { 413 entry=states->stateTable[state][cell]; 414 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 415 states->stateTable[state][cell]=MBCS_ENTRY_TRANSITION_ADD_OFFSET(entry, sum2); 416 } 417 } 418 } 419 } 420 421 /* round up to the next even number to have the following data 32-bit-aligned */ 422 return states->countToUCodeUnits=(sum+1)&~1; 423 } 424 425 U_CAPI void U_EXPORT2 426 ucm_processStates(UCMStates *states, UBool ignoreSISOCheck) { 427 int32_t entry, state, cell, count; 428 429 if(states->conversionType==UCNV_UNSUPPORTED_CONVERTER) { 430 fprintf(stderr, "ucm error: missing conversion type (<uconv_class>)\n"); 431 exit(U_INVALID_TABLE_FORMAT); 432 } 433 434 if(states->countStates==0) { 435 switch(states->conversionType) { 436 case UCNV_SBCS: 437 /* SBCS: use MBCS data structure with a default state table */ 438 if(states->maxCharLength!=1) { 439 fprintf(stderr, "error: SBCS codepage with max B/char!=1\n"); 440 exit(U_INVALID_TABLE_FORMAT); 441 } 442 states->conversionType=UCNV_MBCS; 443 ucm_addState(states, "0-ff"); 444 break; 445 case UCNV_MBCS: 446 fprintf(stderr, "ucm error: missing state table information (<icu:state>) for MBCS\n"); 447 exit(U_INVALID_TABLE_FORMAT); 448 break; 449 case UCNV_EBCDIC_STATEFUL: 450 /* EBCDIC_STATEFUL: use MBCS data structure with a default state table */ 451 if(states->minCharLength!=1 || states->maxCharLength!=2) { 452 fprintf(stderr, "error: DBCS codepage with min B/char!=1 or max B/char!=2\n"); 453 exit(U_INVALID_TABLE_FORMAT); 454 } 455 states->conversionType=UCNV_MBCS; 456 ucm_addState(states, "0-ff, e:1.s, f:0.s"); 457 ucm_addState(states, "initial, 0-3f:4, e:1.s, f:0.s, 40:3, 41-fe:2, ff:4"); 458 ucm_addState(states, "0-40:1.i, 41-fe:1., ff:1.i"); 459 ucm_addState(states, "0-ff:1.i, 40:1."); 460 ucm_addState(states, "0-ff:1.i"); 461 break; 462 case UCNV_DBCS: 463 /* DBCS: use MBCS data structure with a default state table */ 464 if(states->minCharLength!=2 || states->maxCharLength!=2) { 465 fprintf(stderr, "error: DBCS codepage with min or max B/char!=2\n"); 466 exit(U_INVALID_TABLE_FORMAT); 467 } 468 states->conversionType = UCNV_MBCS; 469 ucm_addState(states, "0-3f:3, 40:2, 41-fe:1, ff:3"); 470 ucm_addState(states, "41-fe"); 471 ucm_addState(states, "40"); 472 ucm_addState(states, ""); 473 break; 474 default: 475 fprintf(stderr, "ucm error: unknown charset structure\n"); 476 exit(U_INVALID_TABLE_FORMAT); 477 break; 478 } 479 } 480 481 /* 482 * check that the min/max character lengths are reasonable; 483 * to do this right, all paths through the state table would have to be 484 * recursively walked while keeping track of the sequence lengths, 485 * but these simple checks cover most state tables in practice 486 */ 487 if(states->maxCharLength<states->minCharLength) { 488 fprintf(stderr, "ucm error: max B/char < min B/char\n"); 489 exit(U_INVALID_TABLE_FORMAT); 490 } 491 492 /* count non-direct states and compare with max B/char */ 493 count=0; 494 for(state=0; state<states->countStates; ++state) { 495 if((states->stateFlags[state]&0xf)!=MBCS_STATE_FLAG_DIRECT) { 496 ++count; 497 } 498 } 499 if(states->maxCharLength>count+1) { 500 fprintf(stderr, "ucm error: max B/char too large\n"); 501 exit(U_INVALID_TABLE_FORMAT); 502 } 503 504 if(states->minCharLength==1) { 505 int32_t action; 506 507 /* 508 * if there are single-byte characters, 509 * then the initial state must have direct result states 510 */ 511 for(cell=0; cell<256; ++cell) { 512 entry=states->stateTable[0][cell]; 513 if( MBCS_ENTRY_IS_FINAL(entry) && 514 ((action=MBCS_ENTRY_FINAL_ACTION(entry))==MBCS_STATE_VALID_DIRECT_16 || 515 action==MBCS_STATE_UNASSIGNED) 516 ) { 517 break; 518 } 519 } 520 521 if(cell==256) { 522 fprintf(stderr, "ucm warning: min B/char too small\n"); 523 } 524 } 525 526 /* 527 * make sure that all "next state" values are within limits 528 * and that all next states after final ones have the "direct" 529 * flag of initial states 530 */ 531 for(state=states->countStates-1; state>=0; --state) { 532 for(cell=0; cell<256; ++cell) { 533 entry=states->stateTable[state][cell]; 534 if((uint8_t)MBCS_ENTRY_STATE(entry)>=states->countStates) { 535 fprintf(stderr, "ucm error: state table entry [%x][%x] has a next state of %x that is too high\n", 536 (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry)); 537 exit(U_INVALID_TABLE_FORMAT); 538 } 539 if(MBCS_ENTRY_IS_FINAL(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)!=MBCS_STATE_FLAG_DIRECT) { 540 fprintf(stderr, "ucm error: state table entry [%x][%x] is final but has a non-initial next state of %x\n", 541 (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry)); 542 exit(U_INVALID_TABLE_FORMAT); 543 } else if(MBCS_ENTRY_IS_TRANSITION(entry) && (states->stateFlags[MBCS_ENTRY_STATE(entry)]&0xf)==MBCS_STATE_FLAG_DIRECT) { 544 fprintf(stderr, "ucm error: state table entry [%x][%x] is not final but has an initial next state of %x\n", 545 (int)state, (int)cell, (int)MBCS_ENTRY_STATE(entry)); 546 exit(U_INVALID_TABLE_FORMAT); 547 } 548 } 549 } 550 551 /* is this an SI/SO (like EBCDIC-stateful) state table? */ 552 if(states->countStates>=2 && (states->stateFlags[1]&0xf)==MBCS_STATE_FLAG_DIRECT) { 553 if(states->maxCharLength!=2) { 554 fprintf(stderr, "ucm error: SI/SO codepages must have max 2 bytes/char (not %x)\n", (int)states->maxCharLength); 555 exit(U_INVALID_TABLE_FORMAT); 556 } 557 if(states->countStates<3) { 558 fprintf(stderr, "ucm error: SI/SO codepages must have at least 3 states (not %x)\n", (int)states->countStates); 559 exit(U_INVALID_TABLE_FORMAT); 560 } 561 /* are the SI/SO all in the right places? */ 562 if( ignoreSISOCheck || 563 (states->stateTable[0][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) && 564 states->stateTable[0][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0) && 565 states->stateTable[1][0xe]==MBCS_ENTRY_FINAL(1, MBCS_STATE_CHANGE_ONLY, 0) && 566 states->stateTable[1][0xf]==MBCS_ENTRY_FINAL(0, MBCS_STATE_CHANGE_ONLY, 0)) 567 ) { 568 states->outputType=MBCS_OUTPUT_2_SISO; 569 } else { 570 fprintf(stderr, "ucm error: SI/SO codepages must have in states 0 and 1 transitions e:1.s, f:0.s\n"); 571 exit(U_INVALID_TABLE_FORMAT); 572 } 573 state=2; 574 } else { 575 state=1; 576 } 577 578 /* check that no unexpected state is a "direct" one */ 579 while(state<states->countStates) { 580 if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { 581 fprintf(stderr, "ucm error: state %d is 'initial' - not supported except for SI/SO codepages\n", (int)state); 582 exit(U_INVALID_TABLE_FORMAT); 583 } 584 ++state; 585 } 586 587 sumUpStates(states); 588 } 589 590 /* find a fallback for this offset; return the index or -1 if not found */ 591 U_CAPI int32_t U_EXPORT2 592 ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 593 uint32_t offset) { 594 int32_t i; 595 596 if(countToUFallbacks==0) { 597 /* shortcut: most codepages do not have fallbacks from codepage to Unicode */ 598 return -1; 599 } 600 601 /* do a linear search for the fallback mapping (the table is not yet sorted) */ 602 for(i=0; i<countToUFallbacks; ++i) { 603 if(offset==toUFallbacks[i].offset) { 604 return i; 605 } 606 } 607 return -1; 608 } 609 610 /* 611 * This function tries to compact toUnicode tables for 2-byte codepages 612 * by finding lead bytes with all-unassigned trail bytes and adding another state 613 * for them. 614 */ 615 static void 616 compactToUnicode2(UCMStates *states, 617 uint16_t **pUnicodeCodeUnits, 618 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 619 UBool verbose) { 620 int32_t (*oldStateTable)[256]; 621 uint16_t count[256]; 622 uint16_t *oldUnicodeCodeUnits; 623 int32_t entry, offset, oldOffset, trailOffset, oldTrailOffset, savings, sum; 624 int32_t i, j, leadState, trailState, newState, fallback; 625 uint16_t unit; 626 627 /* find the lead state */ 628 if(states->outputType==MBCS_OUTPUT_2_SISO) { 629 /* use the DBCS lead state for SI/SO codepages */ 630 leadState=1; 631 } else { 632 leadState=0; 633 } 634 635 /* find the main trail state: the most used target state */ 636 uprv_memset(count, 0, sizeof(count)); 637 for(i=0; i<256; ++i) { 638 entry=states->stateTable[leadState][i]; 639 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 640 ++count[MBCS_ENTRY_TRANSITION_STATE(entry)]; 641 } 642 } 643 trailState=0; 644 for(i=1; i<states->countStates; ++i) { 645 if(count[i]>count[trailState]) { 646 trailState=i; 647 } 648 } 649 650 /* count possible savings from lead bytes with all-unassigned results in all trail bytes */ 651 uprv_memset(count, 0, sizeof(count)); 652 savings=0; 653 /* for each lead byte */ 654 for(i=0; i<256; ++i) { 655 entry=states->stateTable[leadState][i]; 656 if(MBCS_ENTRY_IS_TRANSITION(entry) && (MBCS_ENTRY_TRANSITION_STATE(entry))==trailState) { 657 /* the offset is different for each lead byte */ 658 offset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 659 /* for each trail byte for this lead byte */ 660 for(j=0; j<256; ++j) { 661 entry=states->stateTable[trailState][j]; 662 switch(MBCS_ENTRY_FINAL_ACTION(entry)) { 663 case MBCS_STATE_VALID_16: 664 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 665 if((*pUnicodeCodeUnits)[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) { 666 ++count[i]; 667 } else { 668 j=999; /* do not count for this lead byte because there are assignments */ 669 } 670 break; 671 case MBCS_STATE_VALID_16_PAIR: 672 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 673 if((*pUnicodeCodeUnits)[entry]==0xfffe) { 674 count[i]+=2; 675 } else { 676 j=999; /* do not count for this lead byte because there are assignments */ 677 } 678 break; 679 default: 680 break; 681 } 682 } 683 if(j==256) { 684 /* all trail bytes for this lead byte are unassigned */ 685 savings+=count[i]; 686 } else { 687 count[i]=0; 688 } 689 } 690 } 691 /* subtract from the possible savings the cost of an additional state */ 692 savings=savings*2-1024; /* count bytes, not 16-bit words */ 693 if(savings<=0) { 694 return; 695 } 696 if(verbose) { 697 printf("compacting toUnicode data saves %ld bytes\n", (long)savings); 698 } 699 if(states->countStates>=MBCS_MAX_STATE_COUNT) { 700 fprintf(stderr, "cannot compact toUnicode because the maximum number of states is reached\n"); 701 return; 702 } 703 704 /* make a copy of the state table */ 705 oldStateTable=(int32_t (*)[256])uprv_malloc(states->countStates*1024); 706 if(oldStateTable==NULL) { 707 fprintf(stderr, "cannot compact toUnicode: out of memory\n"); 708 return; 709 } 710 uprv_memcpy(oldStateTable, states->stateTable, states->countStates*1024); 711 712 /* add the new state */ 713 /* 714 * this function does not catch the degenerate case where all lead bytes 715 * have all-unassigned trail bytes and the lead state could be removed 716 */ 717 newState=states->countStates++; 718 states->stateFlags[newState]=0; 719 /* copy the old trail state, turning all assigned states into unassigned ones */ 720 for(i=0; i<256; ++i) { 721 entry=states->stateTable[trailState][i]; 722 switch(MBCS_ENTRY_FINAL_ACTION(entry)) { 723 case MBCS_STATE_VALID_16: 724 case MBCS_STATE_VALID_16_PAIR: 725 states->stateTable[newState][i]=MBCS_ENTRY_FINAL_SET_ACTION_VALUE(entry, MBCS_STATE_UNASSIGNED, 0xfffe); 726 break; 727 default: 728 states->stateTable[newState][i]=entry; 729 break; 730 } 731 } 732 733 /* in the lead state, redirect all lead bytes with all-unassigned trail bytes to the new state */ 734 for(i=0; i<256; ++i) { 735 if(count[i]>0) { 736 states->stateTable[leadState][i]=MBCS_ENTRY_SET_STATE(states->stateTable[leadState][i], newState); 737 } 738 } 739 740 /* sum up the new state table */ 741 for(i=0; i<states->countStates; ++i) { 742 states->stateFlags[i]&=~MBCS_STATE_FLAG_READY; 743 } 744 sum=sumUpStates(states); 745 746 /* allocate a new, smaller code units array */ 747 oldUnicodeCodeUnits=*pUnicodeCodeUnits; 748 if(sum==0) { 749 *pUnicodeCodeUnits=NULL; 750 if(oldUnicodeCodeUnits!=NULL) { 751 uprv_free(oldUnicodeCodeUnits); 752 } 753 uprv_free(oldStateTable); 754 return; 755 } 756 *pUnicodeCodeUnits=(uint16_t *)uprv_malloc(sum*sizeof(uint16_t)); 757 if(*pUnicodeCodeUnits==NULL) { 758 fprintf(stderr, "cannot compact toUnicode: out of memory allocating %ld 16-bit code units\n", 759 (long)sum); 760 /* revert to the old state table */ 761 *pUnicodeCodeUnits=oldUnicodeCodeUnits; 762 --states->countStates; 763 uprv_memcpy(states->stateTable, oldStateTable, states->countStates*1024); 764 uprv_free(oldStateTable); 765 return; 766 } 767 for(i=0; i<sum; ++i) { 768 (*pUnicodeCodeUnits)[i]=0xfffe; 769 } 770 771 /* copy the code units for all assigned characters */ 772 /* 773 * The old state table has the same lead _and_ trail states for assigned characters! 774 * The differences are in the offsets, and in the trail states for some unassigned characters. 775 * For each character with an assigned state in the new table, it was assigned in the old one. 776 * Only still-assigned characters are copied. 777 * Note that fallback mappings need to get their offset values adjusted. 778 */ 779 780 /* for each initial state */ 781 for(leadState=0; leadState<states->countStates; ++leadState) { 782 if((states->stateFlags[leadState]&0xf)==MBCS_STATE_FLAG_DIRECT) { 783 /* for each lead byte from there */ 784 for(i=0; i<256; ++i) { 785 entry=states->stateTable[leadState][i]; 786 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 787 trailState=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 788 /* the new state does not have assigned states */ 789 if(trailState!=newState) { 790 trailOffset=MBCS_ENTRY_TRANSITION_OFFSET(entry); 791 oldTrailOffset=MBCS_ENTRY_TRANSITION_OFFSET(oldStateTable[leadState][i]); 792 /* for each trail byte */ 793 for(j=0; j<256; ++j) { 794 entry=states->stateTable[trailState][j]; 795 /* copy assigned-character code units and adjust fallback offsets */ 796 switch(MBCS_ENTRY_FINAL_ACTION(entry)) { 797 case MBCS_STATE_VALID_16: 798 offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry); 799 /* find the old offset according to the old state table */ 800 oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]); 801 unit=(*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset]; 802 if(unit==0xfffe && (fallback=ucm_findFallback(toUFallbacks, countToUFallbacks, oldOffset))>=0) { 803 toUFallbacks[fallback].offset=0x80000000|offset; 804 } 805 break; 806 case MBCS_STATE_VALID_16_PAIR: 807 offset=trailOffset+MBCS_ENTRY_FINAL_VALUE_16(entry); 808 /* find the old offset according to the old state table */ 809 oldOffset=oldTrailOffset+MBCS_ENTRY_FINAL_VALUE_16(oldStateTable[trailState][j]); 810 (*pUnicodeCodeUnits)[offset++]=oldUnicodeCodeUnits[oldOffset++]; 811 (*pUnicodeCodeUnits)[offset]=oldUnicodeCodeUnits[oldOffset]; 812 break; 813 default: 814 break; 815 } 816 } 817 } 818 } 819 } 820 } 821 } 822 823 /* remove temporary flags from fallback offsets that protected them from being modified twice */ 824 for(i=0; i<countToUFallbacks; ++i) { 825 toUFallbacks[i].offset&=0x7fffffff; 826 } 827 828 /* free temporary memory */ 829 uprv_free(oldUnicodeCodeUnits); 830 uprv_free(oldStateTable); 831 } 832 833 /* 834 * recursive sub-function of compactToUnicodeHelper() 835 * returns: 836 * >0 number of bytes that are used in unicodeCodeUnits[] that could be saved, 837 * if all sequences from this state are unassigned, returns the 838 * <0 there are assignments in unicodeCodeUnits[] 839 * 0 no use of unicodeCodeUnits[] 840 */ 841 static int32_t 842 findUnassigned(UCMStates *states, 843 uint16_t *unicodeCodeUnits, 844 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 845 int32_t state, int32_t offset, uint32_t b) { 846 int32_t i, entry, savings, localSavings, belowSavings; 847 UBool haveAssigned; 848 849 localSavings=belowSavings=0; 850 haveAssigned=FALSE; 851 for(i=0; i<256; ++i) { 852 entry=states->stateTable[state][i]; 853 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 854 savings=findUnassigned(states, 855 unicodeCodeUnits, 856 toUFallbacks, countToUFallbacks, 857 MBCS_ENTRY_TRANSITION_STATE(entry), 858 offset+MBCS_ENTRY_TRANSITION_OFFSET(entry), 859 (b<<8)|(uint32_t)i); 860 if(savings<0) { 861 haveAssigned=TRUE; 862 } else if(savings>0) { 863 printf(" all-unassigned sequences from prefix 0x%02lx state %ld use %ld bytes\n", 864 (unsigned long)((b<<8)|i), (long)state, (long)savings); 865 belowSavings+=savings; 866 } 867 } else if(!haveAssigned) { 868 switch(MBCS_ENTRY_FINAL_ACTION(entry)) { 869 case MBCS_STATE_VALID_16: 870 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 871 if(unicodeCodeUnits[entry]==0xfffe && ucm_findFallback(toUFallbacks, countToUFallbacks, entry)<0) { 872 localSavings+=2; 873 } else { 874 haveAssigned=TRUE; 875 } 876 break; 877 case MBCS_STATE_VALID_16_PAIR: 878 entry=offset+MBCS_ENTRY_FINAL_VALUE_16(entry); 879 if(unicodeCodeUnits[entry]==0xfffe) { 880 localSavings+=4; 881 } else { 882 haveAssigned=TRUE; 883 } 884 break; 885 default: 886 break; 887 } 888 } 889 } 890 if(haveAssigned) { 891 return -1; 892 } else { 893 return localSavings+belowSavings; 894 } 895 } 896 897 /* helper function for finding compaction opportunities */ 898 static void 899 compactToUnicodeHelper(UCMStates *states, 900 uint16_t *unicodeCodeUnits, 901 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks) { 902 int32_t state, savings; 903 904 /* for each initial state */ 905 for(state=0; state<states->countStates; ++state) { 906 if((states->stateFlags[state]&0xf)==MBCS_STATE_FLAG_DIRECT) { 907 savings=findUnassigned(states, 908 unicodeCodeUnits, 909 toUFallbacks, countToUFallbacks, 910 state, 0, 0); 911 if(savings>0) { 912 printf(" all-unassigned sequences from initial state %ld use %ld bytes\n", 913 (long)state, (long)savings); 914 } 915 } 916 } 917 } 918 919 static int32_t 920 compareFallbacks(const void *context, const void *fb1, const void *fb2) { 921 return ((const _MBCSToUFallback *)fb1)->offset-((const _MBCSToUFallback *)fb2)->offset; 922 } 923 924 U_CAPI void U_EXPORT2 925 ucm_optimizeStates(UCMStates *states, 926 uint16_t **pUnicodeCodeUnits, 927 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 928 UBool verbose) { 929 UErrorCode errorCode; 930 int32_t state, cell, entry; 931 932 /* test each state table entry */ 933 for(state=0; state<states->countStates; ++state) { 934 for(cell=0; cell<256; ++cell) { 935 entry=states->stateTable[state][cell]; 936 /* 937 * if the entry is a final one with an MBCS_STATE_VALID_DIRECT_16 action code 938 * and the code point is "unassigned" (0xfffe), then change it to 939 * the "unassigned" action code with bits 26..23 set to zero and U+fffe. 940 */ 941 if(MBCS_ENTRY_SET_STATE(entry, 0)==MBCS_ENTRY_FINAL(0, MBCS_STATE_VALID_DIRECT_16, 0xfffe)) { 942 states->stateTable[state][cell]=MBCS_ENTRY_FINAL_SET_ACTION(entry, MBCS_STATE_UNASSIGNED); 943 } 944 } 945 } 946 947 /* try to compact the toUnicode tables */ 948 if(states->maxCharLength==2) { 949 compactToUnicode2(states, pUnicodeCodeUnits, toUFallbacks, countToUFallbacks, verbose); 950 } else if(states->maxCharLength>2) { 951 if(verbose) { 952 compactToUnicodeHelper(states, *pUnicodeCodeUnits, toUFallbacks, countToUFallbacks); 953 } 954 } 955 956 /* sort toUFallbacks */ 957 /* 958 * It should be safe to sort them before compactToUnicode2() is called, 959 * because it should not change the relative order of the offset values 960 * that it adjusts, but they need to be sorted at some point, and 961 * it is safest here. 962 */ 963 if(countToUFallbacks>0) { 964 errorCode=U_ZERO_ERROR; /* nothing bad will happen... */ 965 uprv_sortArray(toUFallbacks, countToUFallbacks, 966 sizeof(_MBCSToUFallback), 967 compareFallbacks, NULL, FALSE, &errorCode); 968 } 969 } 970 971 /* use a complete state table ----------------------------------------------- */ 972 973 U_CAPI int32_t U_EXPORT2 974 ucm_countChars(UCMStates *states, 975 const uint8_t *bytes, int32_t length) { 976 uint32_t offset; 977 int32_t i, entry, count; 978 uint8_t state; 979 980 offset=0; 981 count=0; 982 state=0; 983 984 if(states->countStates==0) { 985 fprintf(stderr, "ucm error: there is no state information!\n"); 986 return -1; 987 } 988 989 /* for SI/SO (like EBCDIC-stateful), double-byte sequences start in state 1 */ 990 if(length==2 && states->outputType==MBCS_OUTPUT_2_SISO) { 991 state=1; 992 } 993 994 /* 995 * Walk down the state table like in conversion, 996 * much like getNextUChar(). 997 * We assume that c<=0x10ffff. 998 */ 999 for(i=0; i<length; ++i) { 1000 entry=states->stateTable[state][bytes[i]]; 1001 if(MBCS_ENTRY_IS_TRANSITION(entry)) { 1002 state=(uint8_t)MBCS_ENTRY_TRANSITION_STATE(entry); 1003 offset+=MBCS_ENTRY_TRANSITION_OFFSET(entry); 1004 } else { 1005 switch(MBCS_ENTRY_FINAL_ACTION(entry)) { 1006 case MBCS_STATE_ILLEGAL: 1007 fprintf(stderr, "ucm error: byte sequence ends in illegal state\n"); 1008 return -1; 1009 case MBCS_STATE_CHANGE_ONLY: 1010 fprintf(stderr, "ucm error: byte sequence ends in state-change-only\n"); 1011 return -1; 1012 case MBCS_STATE_UNASSIGNED: 1013 case MBCS_STATE_FALLBACK_DIRECT_16: 1014 case MBCS_STATE_VALID_DIRECT_16: 1015 case MBCS_STATE_FALLBACK_DIRECT_20: 1016 case MBCS_STATE_VALID_DIRECT_20: 1017 case MBCS_STATE_VALID_16: 1018 case MBCS_STATE_VALID_16_PAIR: 1019 /* count a complete character and prepare for a new one */ 1020 ++count; 1021 state=(uint8_t)MBCS_ENTRY_FINAL_STATE(entry); 1022 offset=0; 1023 break; 1024 default: 1025 /* reserved, must never occur */ 1026 fprintf(stderr, "ucm error: byte sequence reached reserved action code, entry: 0x%02lx\n", (unsigned long)entry); 1027 return -1; 1028 } 1029 } 1030 } 1031 1032 if(offset!=0) { 1033 fprintf(stderr, "ucm error: byte sequence too short, ends in non-final state %u\n", state); 1034 return -1; 1035 } 1036 1037 /* 1038 * for SI/SO (like EBCDIC-stateful), multiple-character results 1039 * must consist of only double-byte sequences 1040 */ 1041 if(count>1 && states->outputType==MBCS_OUTPUT_2_SISO && length!=2*count) { 1042 fprintf(stderr, "ucm error: SI/SO (like EBCDIC-stateful) result with %d characters does not contain all DBCS\n", (int)count); 1043 return -1; 1044 } 1045 1046 return count; 1047 } 1048 #endif 1049 1050