1 /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both 2 * licenses follows. 3 */ 4 5 /* LibHnj - a library for high quality hyphenation and justification 6 * Copyright (C) 1998 Raph Levien, 7 * (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org), 8 * (C) 2001 Peter Novodvorsky (nidd (at) cs.msu.su) 9 * (C) 2006, 2007, 2008 Lszl Nmeth (nemeth at OOo) 10 * 11 * This library is free software; you can redistribute it and/or 12 * modify it under the terms of the GNU Library General Public 13 * License as published by the Free Software Foundation; either 14 * version 2 of the License, or (at your option) any later version. 15 * 16 * This library is distributed in the hope that it will be useful, 17 * but WITHOUT ANY WARRANTY; without even the implied warranty of 18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 19 * Library General Public License for more details. 20 * 21 * You should have received a copy of the GNU Library General Public 22 * License along with this library; if not, write to the 23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 24 * Boston, MA 02111-1307 USA. 25 */ 26 27 /* 28 * The contents of this file are subject to the Mozilla Public License 29 * Version 1.0 (the "MPL"); you may not use this file except in 30 * compliance with the MPL. You may obtain a copy of the MPL at 31 * http://www.mozilla.org/MPL/ 32 * 33 * Software distributed under the MPL is distributed on an "AS IS" basis, 34 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL 35 * for the specific language governing rights and limitations under the 36 * MPL. 37 * 38 */ 39 #include <fcntl.h> 40 #include <sys/mman.h> 41 #include <sys/stat.h> 42 #include <stdlib.h> /* for NULL, malloc */ 43 #include <stdio.h> /* for fprintf */ 44 #include <string.h> /* for strdup */ 45 #include <unistd.h> /* for close */ 46 47 #define noVERBOSE 48 49 #include "hnjalloc.h" 50 #include "hyphen.h" 51 52 static char * 53 hnj_strdup (const char *s) 54 { 55 char *new; 56 int l; 57 58 l = strlen (s); 59 new = hnj_malloc (l + 1); 60 memcpy (new, s, l); 61 new[l] = 0; 62 return new; 63 } 64 65 /* remove cross-platform text line end characters */ 66 void hnj_strchomp(char * s) 67 { 68 int k = strlen(s); 69 if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0'; 70 if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0'; 71 } 72 73 /* a little bit of a hash table implementation. This simply maps strings 74 to state numbers */ 75 76 typedef struct _HashTab HashTab; 77 typedef struct _HashEntry HashEntry; 78 79 /* A cheap, but effective, hack. */ 80 #define HASH_SIZE 31627 81 82 struct _HashTab { 83 HashEntry *entries[HASH_SIZE]; 84 }; 85 86 struct _HashEntry { 87 HashEntry *next; 88 char *key; 89 int val; 90 }; 91 92 /* a char* hash function from ASU - adapted from Gtk+ */ 93 static unsigned int 94 hnj_string_hash (const char *s) 95 { 96 const char *p; 97 unsigned int h=0, g; 98 for(p = s; *p != '\0'; p += 1) { 99 h = ( h << 4 ) + *p; 100 if ( ( g = h & 0xf0000000 ) ) { 101 h = h ^ (g >> 24); 102 h = h ^ g; 103 } 104 } 105 return h /* % M */; 106 } 107 108 static HashTab * 109 hnj_hash_new (void) 110 { 111 HashTab *hashtab; 112 int i; 113 114 hashtab = hnj_malloc (sizeof(HashTab)); 115 for (i = 0; i < HASH_SIZE; i++) 116 hashtab->entries[i] = NULL; 117 118 return hashtab; 119 } 120 121 static void 122 hnj_hash_free (HashTab *hashtab) 123 { 124 int i; 125 HashEntry *e, *next; 126 127 for (i = 0; i < HASH_SIZE; i++) 128 for (e = hashtab->entries[i]; e; e = next) 129 { 130 next = e->next; 131 hnj_free (e->key); 132 hnj_free (e); 133 } 134 135 hnj_free (hashtab); 136 } 137 138 /* assumes that key is not already present! */ 139 static void 140 hnj_hash_insert (HashTab *hashtab, const char *key, int val) 141 { 142 int i; 143 HashEntry *e; 144 145 i = hnj_string_hash (key) % HASH_SIZE; 146 e = hnj_malloc (sizeof(HashEntry)); 147 e->next = hashtab->entries[i]; 148 e->key = hnj_strdup (key); 149 e->val = val; 150 hashtab->entries[i] = e; 151 } 152 153 /* return val if found, otherwise -1 */ 154 static int 155 hnj_hash_lookup (HashTab *hashtab, const char *key) 156 { 157 int i; 158 HashEntry *e; 159 i = hnj_string_hash (key) % HASH_SIZE; 160 for (e = hashtab->entries[i]; e; e = e->next) 161 if (!strcmp (key, e->key)) 162 return e->val; 163 return -1; 164 } 165 166 /* Get the state number, allocating a new state if necessary. */ 167 static int 168 hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string) 169 { 170 int state_num; 171 172 state_num = hnj_hash_lookup (hashtab, string); 173 174 if (state_num >= 0) 175 return state_num; 176 177 hnj_hash_insert (hashtab, string, dict->num_states); 178 /* predicate is true if dict->num_states is a power of two */ 179 if (!(dict->num_states & (dict->num_states - 1))) 180 { 181 dict->states = hnj_realloc (dict->states, 182 (dict->num_states << 1) * 183 sizeof(HyphenState)); 184 } 185 dict->states[dict->num_states].match = NULL; 186 dict->states[dict->num_states].repl = NULL; 187 dict->states[dict->num_states].fallback_state = -1; 188 dict->states[dict->num_states].num_trans = 0; 189 dict->states[dict->num_states].trans = NULL; 190 return dict->num_states++; 191 } 192 193 /* add a transition from state1 to state2 through ch - assumes that the 194 transition does not already exist */ 195 static void 196 hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch) 197 { 198 int num_trans; 199 200 num_trans = dict->states[state1].num_trans; 201 if (num_trans == 0) 202 { 203 dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans)); 204 } 205 else if (!(num_trans & (num_trans - 1))) 206 { 207 dict->states[state1].trans = hnj_realloc (dict->states[state1].trans, 208 (num_trans << 1) * 209 sizeof(HyphenTrans)); 210 } 211 dict->states[state1].trans[num_trans].ch = ch; 212 dict->states[state1].trans[num_trans].new_state = state2; 213 dict->states[state1].num_trans++; 214 } 215 216 #ifdef VERBOSE 217 HashTab *global; 218 219 static char * 220 get_state_str (int state) 221 { 222 int i; 223 HashEntry *e; 224 225 for (i = 0; i < HASH_SIZE; i++) 226 for (e = global->entries[i]; e; e = e->next) 227 if (e->val == state) 228 return e->key; 229 return NULL; 230 } 231 #endif 232 233 // Get a line from the dictionary contents. 234 static char * 235 get_line (char *s, int size, const char *dict_contents, int dict_length, 236 int *dict_ptr) 237 { 238 int len = 0; 239 while (len < (size - 1) && *dict_ptr < dict_length) { 240 s[len++] = *(dict_contents + *dict_ptr); 241 (*dict_ptr)++; 242 if (s[len - 1] == '\n') 243 break; 244 } 245 s[len] = '\0'; 246 if (len > 0) { 247 return s; 248 } else { 249 return NULL; 250 } 251 } 252 253 HyphenDict * 254 hnj_hyphen_load (const char *fn) 255 { 256 if (fn == NULL) 257 return NULL; 258 const int fd = open(fn, O_RDONLY); 259 if (fd == -1) 260 return NULL; 261 struct stat sb; 262 if (fstat(fd, &sb) == -1) { /* To obtain file size */ 263 close(fd); 264 return NULL; 265 } 266 267 const char *addr = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); 268 if (addr == MAP_FAILED) { 269 close(fd); 270 return NULL; 271 } 272 HyphenDict *dict = hnj_hyphen_load_from_buffer(addr, sb.st_size); 273 munmap((void *)addr, sb.st_size); 274 close(fd); 275 276 return dict; 277 } 278 279 HyphenDict * 280 hnj_hyphen_load_from_buffer (const char *dict_contents, int dict_length) 281 { 282 HyphenDict *dict[2]; 283 HashTab *hashtab; 284 char buf[MAX_CHARS]; 285 char word[MAX_CHARS]; 286 char pattern[MAX_CHARS]; 287 char * repl; 288 signed char replindex; 289 signed char replcut; 290 int state_num = 0, last_state; 291 int i, j, k; 292 char ch; 293 int found; 294 HashEntry *e; 295 int nextlevel = 0; 296 297 if (dict_contents == NULL) 298 return NULL; 299 300 int dict_ptr = 0; 301 // loading one or two dictionaries (separated by NEXTLEVEL keyword) 302 for (k = 0; k == 0 || (k == 1 && nextlevel); k++) { 303 hashtab = hnj_hash_new (); 304 #ifdef VERBOSE 305 global = hashtab; 306 #endif 307 hnj_hash_insert (hashtab, "", 0); 308 dict[k] = hnj_malloc (sizeof(HyphenDict)); 309 dict[k]->num_states = 1; 310 dict[k]->states = hnj_malloc (sizeof(HyphenState)); 311 dict[k]->states[0].match = NULL; 312 dict[k]->states[0].repl = NULL; 313 dict[k]->states[0].fallback_state = -1; 314 dict[k]->states[0].num_trans = 0; 315 dict[k]->states[0].trans = NULL; 316 dict[k]->nextlevel = NULL; 317 dict[k]->lhmin = 0; 318 dict[k]->rhmin = 0; 319 dict[k]->clhmin = 0; 320 dict[k]->crhmin = 0; 321 322 /* read in character set info */ 323 if (k == 0) { 324 for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0; 325 get_line(dict[k]->cset, sizeof(dict[k]->cset), dict_contents, 326 dict_length, &dict_ptr); 327 for (i=0;i<MAX_NAME;i++) 328 if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n')) 329 dict[k]->cset[i] = 0; 330 dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0); 331 } else { 332 strcpy(dict[k]->cset, dict[0]->cset); 333 dict[k]->utf8 = dict[0]->utf8; 334 } 335 336 while (get_line(buf, sizeof(buf), dict_contents, dict_length, 337 &dict_ptr) != NULL) 338 { 339 if (buf[0] != '%') 340 { 341 if (strncmp(buf, "NEXTLEVEL", 9) == 0) { 342 nextlevel = 1; 343 break; 344 } else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) { 345 dict[k]->lhmin = atoi(buf + 13); 346 continue; 347 } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) { 348 dict[k]->rhmin = atoi(buf + 14); 349 continue; 350 } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) { 351 dict[k]->clhmin = atoi(buf + 21); 352 continue; 353 } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) { 354 dict[k]->crhmin = atoi(buf + 22); 355 continue; 356 } 357 j = 0; 358 pattern[j] = '0'; 359 repl = strchr(buf, '/'); 360 replindex = 0; 361 replcut = 0; 362 if (repl) { 363 char * index = strchr(repl + 1, ','); 364 *repl = '\0'; 365 if (index) { 366 char * index2 = strchr(index + 1, ','); 367 *index = '\0'; 368 if (index2) { 369 *index2 = '\0'; 370 replindex = (signed char) atoi(index + 1) - 1; 371 replcut = (signed char) atoi(index2 + 1); 372 } 373 } else { 374 hnj_strchomp(repl + 1); 375 replindex = 0; 376 replcut = strlen(buf); 377 } 378 repl = hnj_strdup(repl + 1); 379 } 380 for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++) 381 { 382 if (buf[i] >= '0' && buf[i] <= '9') 383 pattern[j] = buf[i]; 384 else 385 { 386 word[j] = buf[i]; 387 pattern[++j] = '0'; 388 } 389 } 390 word[j] = '\0'; 391 pattern[j + 1] = '\0'; 392 393 i = 0; 394 if (!repl) { 395 /* Optimize away leading zeroes */ 396 for (; pattern[i] == '0'; i++); 397 } else { 398 if (*word == '.') i++; 399 /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */ 400 if (dict[k]->utf8) { 401 int pu = -1; /* unicode character position */ 402 int ps = -1; /* unicode start position (original replindex) */ 403 int pc = (*word == '.') ? 1: 0; /* 8-bit character position */ 404 for (; pc < (strlen(word) + 1); pc++) { 405 /* beginning of an UTF-8 character (not '10' start bits) */ 406 if ((((unsigned char) word[pc]) >> 6) != 2) pu++; 407 if ((ps < 0) && (replindex == pu)) { 408 ps = replindex; 409 replindex = pc; 410 } 411 if ((ps >= 0) && ((pu - ps) == replcut)) { 412 replcut = (pc - replindex); 413 break; 414 } 415 } 416 if (*word == '.') replindex--; 417 } 418 } 419 420 #ifdef VERBOSE 421 printf ("word %s pattern %s, j = %d repl: %s\n", word, pattern + i, j, repl); 422 #endif 423 found = hnj_hash_lookup (hashtab, word); 424 state_num = hnj_get_state (dict[k], hashtab, word); 425 dict[k]->states[state_num].match = hnj_strdup (pattern + i); 426 dict[k]->states[state_num].repl = repl; 427 dict[k]->states[state_num].replindex = replindex; 428 if (!replcut) { 429 dict[k]->states[state_num].replcut = strlen(word); 430 } else { 431 dict[k]->states[state_num].replcut = replcut; 432 } 433 434 /* now, put in the prefix transitions */ 435 for (; found < 0 ;j--) 436 { 437 last_state = state_num; 438 ch = word[j - 1]; 439 word[j - 1] = '\0'; 440 found = hnj_hash_lookup (hashtab, word); 441 state_num = hnj_get_state (dict[k], hashtab, word); 442 hnj_add_trans (dict[k], state_num, last_state, ch); 443 } 444 } 445 } 446 447 /* Could do unioning of matches here (instead of the preprocessor script). 448 If we did, the pseudocode would look something like this: 449 450 foreach state in the hash table 451 foreach i = [1..length(state) - 1] 452 state to check is substr (state, i) 453 look it up 454 if found, and if there is a match, union the match in. 455 456 It's also possible to avoid the quadratic blowup by doing the 457 search in order of increasing state string sizes - then you 458 can break the loop after finding the first match. 459 460 This step should be optional in any case - if there is a 461 preprocessed rule table, it's always faster to use that. 462 463 */ 464 465 /* put in the fallback states */ 466 for (i = 0; i < HASH_SIZE; i++) 467 for (e = hashtab->entries[i]; e; e = e->next) 468 { 469 if (*(e->key)) for (j = 1; 1; j++) 470 { 471 state_num = hnj_hash_lookup (hashtab, e->key + j); 472 if (state_num >= 0) 473 break; 474 } 475 /* KBH: FIXME state 0 fallback_state should always be -1? */ 476 if (e->val) 477 dict[k]->states[e->val].fallback_state = state_num; 478 } 479 #ifdef VERBOSE 480 for (i = 0; i < HASH_SIZE; i++) 481 for (e = hashtab->entries[i]; e; e = e->next) 482 { 483 printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val, 484 dict[k]->states[e->val].fallback_state); 485 for (j = 0; j < dict[k]->states[e->val].num_trans; j++) 486 printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch, 487 dict[k]->states[e->val].trans[j].new_state); 488 } 489 #endif 490 491 #ifndef VERBOSE 492 hnj_hash_free (hashtab); 493 #endif 494 state_num = 0; 495 } 496 if (k == 2) dict[0]->nextlevel = dict[1]; 497 return dict[0]; 498 } 499 500 void hnj_hyphen_free (HyphenDict *dict) 501 { 502 int state_num; 503 HyphenState *hstate; 504 505 for (state_num = 0; state_num < dict->num_states; state_num++) 506 { 507 hstate = &dict->states[state_num]; 508 if (hstate->match) 509 hnj_free (hstate->match); 510 if (hstate->repl) 511 hnj_free (hstate->repl); 512 if (hstate->trans) 513 hnj_free (hstate->trans); 514 } 515 if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel); 516 517 hnj_free (dict->states); 518 519 hnj_free (dict); 520 } 521 522 #define MAX_WORD 256 523 524 int hnj_hyphen_hyphenate (HyphenDict *dict, 525 const char *word, int word_size, 526 char *hyphens) 527 { 528 char prep_word_buf[MAX_WORD]; 529 char *prep_word; 530 int i, j, k; 531 int state; 532 char ch; 533 HyphenState *hstate; 534 char *match; 535 int offset; 536 537 if (word_size + 3 < MAX_WORD) 538 prep_word = prep_word_buf; 539 else 540 prep_word = hnj_malloc (word_size + 3); 541 542 j = 0; 543 prep_word[j++] = '.'; 544 545 for (i = 0; i < word_size; i++) 546 prep_word[j++] = word[i]; 547 548 prep_word[j++] = '.'; 549 prep_word[j] = '\0'; 550 551 for (i = 0; i < j; i++) 552 hyphens[i] = '0'; 553 554 #ifdef VERBOSE 555 printf ("prep_word = %s\n", prep_word); 556 #endif 557 558 /* now, run the finite state machine */ 559 state = 0; 560 for (i = 0; i < j; i++) 561 { 562 ch = prep_word[i]; 563 for (;;) 564 { 565 566 if (state == -1) { 567 /* return 1; */ 568 /* KBH: FIXME shouldn't this be as follows? */ 569 state = 0; 570 goto try_next_letter; 571 } 572 573 #ifdef VERBOSE 574 char *state_str; 575 state_str = get_state_str (state); 576 577 for (k = 0; k < i - strlen (state_str); k++) 578 putchar (' '); 579 printf ("%s", state_str); 580 #endif 581 582 hstate = &dict->states[state]; 583 for (k = 0; k < hstate->num_trans; k++) 584 if (hstate->trans[k].ch == ch) 585 { 586 state = hstate->trans[k].new_state; 587 goto found_state; 588 } 589 state = hstate->fallback_state; 590 #ifdef VERBOSE 591 printf (" falling back, fallback_state %d\n", state); 592 #endif 593 } 594 found_state: 595 #ifdef VERBOSE 596 printf ("found state %d\n",state); 597 #endif 598 /* Additional optimization is possible here - especially, 599 elimination of trailing zeroes from the match. Leading zeroes 600 have already been optimized. */ 601 match = dict->states[state].match; 602 /* replacing rules not handled by hyphen_hyphenate() */ 603 if (match && !dict->states[state].repl) 604 { 605 offset = i + 1 - strlen (match); 606 #ifdef VERBOSE 607 for (k = 0; k < offset; k++) 608 putchar (' '); 609 printf ("%s\n", match); 610 #endif 611 /* This is a linear search because I tried a binary search and 612 found it to be just a teeny bit slower. */ 613 for (k = 0; match[k]; k++) 614 if (hyphens[offset + k] < match[k]) 615 hyphens[offset + k] = match[k]; 616 } 617 618 /* KBH: we need this to make sure we keep looking in a word */ 619 /* for patterns even if the current character is not known in state 0 */ 620 /* since patterns for hyphenation may occur anywhere in the word */ 621 try_next_letter: ; 622 623 } 624 #ifdef VERBOSE 625 for (i = 0; i < j; i++) 626 putchar (hyphens[i]); 627 putchar ('\n'); 628 #endif 629 630 for (i = 0; i < j - 4; i++) 631 #if 0 632 if (hyphens[i + 1] & 1) 633 hyphens[i] = '-'; 634 #else 635 hyphens[i] = hyphens[i + 1]; 636 #endif 637 hyphens[0] = '0'; 638 for (; i < word_size; i++) 639 hyphens[i] = '0'; 640 hyphens[word_size] = '\0'; 641 642 if (prep_word != prep_word_buf) 643 hnj_free (prep_word); 644 645 return 0; 646 } 647 648 /* character length of the first n byte of the input word */ 649 int hnj_hyphen_strnlen(const char * word, int n, int utf8) 650 { 651 int i = 0; 652 int j = 0; 653 while (j < n && word[j] != '\0') { 654 i++; 655 for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++); 656 } 657 return i; 658 } 659 660 int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens, 661 char *** rep, int ** pos, int ** cut, int lhmin) 662 { 663 int i, j; 664 for (i = 1, j = 0; i < lhmin && word[j] != '\0'; i++) do { 665 // check length of the non-standard part 666 if (*rep && *pos && *cut && (*rep)[j]) { 667 char * rh = strchr((*rep)[j], '='); 668 if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) + 669 hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) { 670 free((*rep)[j]); 671 (*rep)[j] = NULL; 672 hyphens[j] = '0'; 673 } 674 } else { 675 hyphens[j] = '0'; 676 } 677 j++; 678 } while (utf8 && (word[j + 1] & 0xc0) == 0xc0); 679 return 0; 680 } 681 682 int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens, 683 char *** rep, int ** pos, int ** cut, int rhmin) 684 { 685 int i; 686 int j = word_size - 2; 687 for (i = 1; i < rhmin && j > 0; j--) { 688 // check length of the non-standard part 689 if (*rep && *pos && *cut && (*rep)[j]) { 690 char * rh = strchr((*rep)[j], '='); 691 if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) + 692 hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) { 693 free((*rep)[j]); 694 (*rep)[j] = NULL; 695 hyphens[j] = '0'; 696 } 697 } else { 698 hyphens[j] = '0'; 699 } 700 if (!utf8 || (word[j] & 0xc0) != 0xc0) i++; 701 } 702 return 0; 703 } 704 705 // recursive function for compound level hyphenation 706 int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size, 707 char * hyphens, char *** rep, int ** pos, int ** cut, 708 int clhmin, int crhmin, int lend, int rend) 709 { 710 char prep_word_buf[MAX_WORD]; 711 char *prep_word; 712 int i, j, k; 713 int state; 714 char ch; 715 HyphenState *hstate; 716 char *match; 717 char *repl; 718 signed char replindex; 719 signed char replcut; 720 int offset; 721 int matchlen_buf[MAX_CHARS]; 722 int matchindex_buf[MAX_CHARS]; 723 char * matchrepl_buf[MAX_CHARS]; 724 int * matchlen; 725 int * matchindex; 726 char ** matchrepl; 727 int isrepl = 0; 728 int nHyphCount; 729 730 if (word_size + 3 < MAX_CHARS) { 731 prep_word = prep_word_buf; 732 matchlen = matchlen_buf; 733 matchindex = matchindex_buf; 734 matchrepl = matchrepl_buf; 735 } else { 736 prep_word = hnj_malloc (word_size + 3); 737 matchlen = hnj_malloc ((word_size + 3) * sizeof(int)); 738 matchindex = hnj_malloc ((word_size + 3) * sizeof(int)); 739 matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *)); 740 } 741 742 j = 0; 743 prep_word[j++] = '.'; 744 745 for (i = 0; i < word_size; i++) 746 prep_word[j++] = word[i]; 747 748 prep_word[j++] = '.'; 749 prep_word[j] = '\0'; 750 751 for (i = 0; i < j; i++) 752 hyphens[i] = '0'; 753 754 #ifdef VERBOSE 755 printf ("prep_word = %s\n", prep_word); 756 #endif 757 758 /* now, run the finite state machine */ 759 state = 0; 760 for (i = 0; i < j; i++) 761 { 762 ch = prep_word[i]; 763 for (;;) 764 { 765 766 if (state == -1) { 767 /* return 1; */ 768 /* KBH: FIXME shouldn't this be as follows? */ 769 state = 0; 770 goto try_next_letter; 771 } 772 773 #ifdef VERBOSE 774 char *state_str; 775 state_str = get_state_str (state); 776 777 for (k = 0; k < i - strlen (state_str); k++) 778 putchar (' '); 779 printf ("%s", state_str); 780 #endif 781 782 hstate = &dict->states[state]; 783 for (k = 0; k < hstate->num_trans; k++) 784 if (hstate->trans[k].ch == ch) 785 { 786 state = hstate->trans[k].new_state; 787 goto found_state; 788 } 789 state = hstate->fallback_state; 790 #ifdef VERBOSE 791 printf (" falling back, fallback_state %d\n", state); 792 #endif 793 } 794 found_state: 795 #ifdef VERBOSE 796 printf ("found state %d\n",state); 797 #endif 798 /* Additional optimization is possible here - especially, 799 elimination of trailing zeroes from the match. Leading zeroes 800 have already been optimized. */ 801 match = dict->states[state].match; 802 repl = dict->states[state].repl; 803 replindex = dict->states[state].replindex; 804 replcut = dict->states[state].replcut; 805 /* replacing rules not handled by hyphen_hyphenate() */ 806 if (match) 807 { 808 offset = i + 1 - strlen (match); 809 #ifdef VERBOSE 810 for (k = 0; k < offset; k++) 811 putchar (' '); 812 printf ("%s (%s)\n", match, repl); 813 #endif 814 if (repl) { 815 if (!isrepl) for(; isrepl < word_size; isrepl++) { 816 matchrepl[isrepl] = NULL; 817 matchindex[isrepl] = -1; 818 } 819 matchlen[offset + replindex] = replcut; 820 } 821 /* This is a linear search because I tried a binary search and 822 found it to be just a teeny bit slower. */ 823 for (k = 0; match[k]; k++) { 824 if ((hyphens[offset + k] < match[k])) { 825 hyphens[offset + k] = match[k]; 826 if (match[k]&1) { 827 matchrepl[offset + k] = repl; 828 if (repl && (k >= replindex) && (k <= replindex + replcut)) { 829 matchindex[offset + replindex] = offset + k; 830 } 831 } 832 } 833 } 834 835 } 836 837 /* KBH: we need this to make sure we keep looking in a word */ 838 /* for patterns even if the current character is not known in state 0 */ 839 /* since patterns for hyphenation may occur anywhere in the word */ 840 try_next_letter: ; 841 842 } 843 #ifdef VERBOSE 844 for (i = 0; i < j; i++) 845 putchar (hyphens[i]); 846 putchar ('\n'); 847 #endif 848 849 for (i = 0; i < j - 3; i++) 850 #if 0 851 if (hyphens[i + 1] & 1) 852 hyphens[i] = '-'; 853 #else 854 hyphens[i] = hyphens[i + 1]; 855 #endif 856 for (; i < word_size; i++) 857 hyphens[i] = '0'; 858 hyphens[word_size] = '\0'; 859 860 /* now create a new char string showing hyphenation positions */ 861 /* count the hyphens and allocate space for the new hyphenated string */ 862 nHyphCount = 0; 863 for (i = 0; i < word_size; i++) 864 if (hyphens[i]&1) 865 nHyphCount++; 866 j = 0; 867 for (i = 0; i < word_size; i++) { 868 if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) { 869 if (rep && pos && cut) { 870 if (!*rep && !*pos && !*cut) { 871 int k; 872 *rep = (char **) malloc(sizeof(char *) * word_size); 873 *pos = (int *) malloc(sizeof(int) * word_size); 874 *cut = (int *) malloc(sizeof(int) * word_size); 875 for (k = 0; k < word_size; k++) { 876 (*rep)[k] = NULL; 877 (*pos)[k] = 0; 878 (*cut)[k] = 0; 879 } 880 } 881 (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]); 882 (*pos)[matchindex[i] - 1] = matchindex[i] - i; 883 (*cut)[matchindex[i] - 1] = matchlen[i]; 884 } 885 j += strlen(matchrepl[matchindex[i]]); 886 i += matchlen[i] - 1; 887 } 888 } 889 890 if (matchrepl != matchrepl_buf) { 891 hnj_free (matchrepl); 892 hnj_free (matchlen); 893 hnj_free (matchindex); 894 } 895 896 // recursive hyphenation of the first (compound) level segments 897 if (dict->nextlevel) { 898 char * rep2_buf[MAX_WORD]; 899 int pos2_buf[MAX_WORD]; 900 int cut2_buf[MAX_WORD]; 901 char hyphens2_buf[MAX_WORD]; 902 char ** rep2; 903 int * pos2; 904 int * cut2; 905 char * hyphens2; 906 int begin = 0; 907 if (word_size < MAX_CHARS) { 908 rep2 = rep2_buf; 909 pos2 = pos2_buf; 910 cut2 = cut2_buf; 911 hyphens2 = hyphens2_buf; 912 } else { 913 rep2 = hnj_malloc (word_size * sizeof(char *)); 914 pos2 = hnj_malloc (word_size * sizeof(int)); 915 cut2 = hnj_malloc (word_size * sizeof(int)); 916 hyphens2 = hnj_malloc (word_size); 917 } 918 for (i = 0; i < word_size; i++) rep2[i] = NULL; 919 for (i = 0; i < word_size; i++) 920 if (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) { 921 if (i - begin > 1) { 922 int hyph = 0; 923 prep_word[i + 2] = '\0'; 924 /* non-standard hyphenation at compound boundary (Schiffahrt) */ 925 if (*rep && *pos && *cut && (*rep)[i]) { 926 char * l = strchr((*rep)[i], '='); 927 strcpy(prep_word + 2 + i - (*pos)[i], (*rep)[i]); 928 if (l) { 929 hyph = (l - (*rep)[i]) - (*pos)[i]; 930 prep_word[2 + i + hyph] = '\0'; 931 } 932 } 933 hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph, 934 hyphens2, &rep2, &pos2, &cut2, clhmin, 935 crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend)); 936 for (j = 0; j < i - begin - 1; j++) { 937 hyphens[begin + j] = hyphens2[j]; 938 if (rep2[j] && rep && pos && cut) { 939 if (!*rep && !*pos && !*cut) { 940 int k; 941 *rep = (char **) malloc(sizeof(char *) * word_size); 942 *pos = (int *) malloc(sizeof(int) * word_size); 943 *cut = (int *) malloc(sizeof(int) * word_size); 944 for (k = 0; k < word_size; k++) { 945 (*rep)[k] = NULL; 946 (*pos)[k] = 0; 947 (*cut)[k] = 0; 948 } 949 } 950 (*rep)[begin + j] = rep2[j]; 951 (*pos)[begin + j] = pos2[j]; 952 (*cut)[begin + j] = cut2[j]; 953 } 954 } 955 prep_word[i + 2] = word[i + 1]; 956 if (*rep && *pos && *cut && (*rep)[i]) { 957 strcpy(prep_word + 1, word); 958 } 959 } 960 begin = i + 1; 961 for (j = 0; j < word_size; j++) rep2[j] = NULL; 962 } 963 964 // non-compound 965 if (begin == 0) { 966 hnj_hyphen_hyph_(dict->nextlevel, word, word_size, 967 hyphens, rep, pos, cut, clhmin, crhmin, lend, rend); 968 if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, 969 rep, pos, cut, clhmin); 970 if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, 971 rep, pos, cut, crhmin); 972 } 973 974 if (rep2 != rep2_buf) { 975 free(rep2); 976 free(cut2); 977 free(pos2); 978 free(hyphens2); 979 } 980 } 981 982 if (prep_word != prep_word_buf) hnj_free (prep_word); 983 return 0; 984 } 985 986 /* UTF-8 normalization of hyphen and non-standard positions */ 987 int hnj_hyphen_norm(const char *word, int word_size, char * hyphens, 988 char *** rep, int ** pos, int ** cut) 989 { 990 if ((((unsigned char) word[0]) >> 6) == 2) { 991 fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word); 992 return 1; 993 } 994 995 /* calculate UTF-8 character positions */ 996 int i, j, k; 997 for (i = 0, j = -1; i < word_size; i++) { 998 /* beginning of an UTF-8 character (not '10' start bits) */ 999 if ((((unsigned char) word[i]) >> 6) != 2) j++; 1000 hyphens[j] = hyphens[i]; 1001 if (rep && pos && cut && *rep && *pos && *cut) { 1002 int l = (*pos)[i]; 1003 (*pos)[j] = 0; 1004 for (k = 0; k < l; k++) { 1005 if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++; 1006 } 1007 k = i - l + 1; 1008 l = k + (*cut)[i]; 1009 (*cut)[j] = 0; 1010 for (; k < l; k++) { 1011 if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++; 1012 } 1013 (*rep)[j] = (*rep)[i]; 1014 if (j < i) { 1015 (*rep)[i] = NULL; 1016 (*pos)[i] = 0; 1017 (*cut)[i] = 0; 1018 } 1019 } 1020 } 1021 hyphens[j + 1] = '\0'; 1022 return 0; 1023 } 1024 1025 /* get the word with all possible hyphenations (output: hyphword) */ 1026 void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens, 1027 char * hyphword, char *** rep, int ** pos, int ** cut) 1028 { 1029 int i, j; 1030 for (i = 0, j = 0; i < l; i++, j++) { 1031 if (hyphens[i]&1) { 1032 hyphword[j] = word[i]; 1033 if (*rep && *pos && *cut && (*rep)[i]) { 1034 strcpy(hyphword + j - (*pos)[i] + 1, (*rep)[i]); 1035 j += strlen((*rep)[i]) - (*pos)[i]; 1036 i += (*cut)[i] - (*pos)[i]; 1037 } else hyphword[++j] = '='; 1038 } else hyphword[j] = word[i]; 1039 } 1040 hyphword[j] = '\0'; 1041 } 1042 1043 1044 /* main api function with default hyphenmin parameters */ 1045 int hnj_hyphen_hyphenate2 (HyphenDict *dict, 1046 const char *word, int word_size, char * hyphens, 1047 char *hyphword, char *** rep, int ** pos, int ** cut) 1048 { 1049 hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, 1050 dict->clhmin, dict->crhmin, 1, 1); 1051 hnj_hyphen_lhmin(dict->utf8, word, word_size, 1052 hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2)); 1053 hnj_hyphen_rhmin(dict->utf8, word, word_size, 1054 hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2)); 1055 if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut); 1056 if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut); 1057 return 0; 1058 } 1059 1060 /* previous main api function with hyphenmin parameters */ 1061 int hnj_hyphen_hyphenate3 (HyphenDict *dict, 1062 const char *word, int word_size, char * hyphens, 1063 char *hyphword, char *** rep, int ** pos, int ** cut, 1064 int lhmin, int rhmin, int clhmin, int crhmin) 1065 { 1066 lhmin = (lhmin > 0 ? lhmin : dict->lhmin); 1067 rhmin = (rhmin > 0 ? rhmin : dict->rhmin); 1068 hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut, 1069 clhmin, crhmin, 1, 1); 1070 hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens, 1071 rep, pos, cut, (lhmin > 0 ? lhmin : 2)); 1072 hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens, 1073 rep, pos, cut, (rhmin > 0 ? rhmin : 2)); 1074 if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut); 1075 if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut); 1076 return 0; 1077 } 1078