Home | History | Annotate | Download | only in hyphenation
      1 /* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
      2  * licenses follows.
      3  */
      4 
      5 /* LibHnj - a library for high quality hyphenation and justification
      6  * Copyright (C) 1998 Raph Levien,
      7  * 	     (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
      8  *           (C) 2001 Peter Novodvorsky (nidd (at) cs.msu.su)
      9  *           (C) 2006, 2007, 2008 Lszl Nmeth (nemeth at OOo)
     10  *
     11  * This library is free software; you can redistribute it and/or
     12  * modify it under the terms of the GNU Library General Public
     13  * License as published by the Free Software Foundation; either
     14  * version 2 of the License, or (at your option) any later version.
     15  *
     16  * This library is distributed in the hope that it will be useful,
     17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
     18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     19  * Library General Public License for more details.
     20  *
     21  * You should have received a copy of the GNU Library General Public
     22  * License along with this library; if not, write to the
     23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
     24  * Boston, MA  02111-1307  USA.
     25  */
     26 
     27 /*
     28  * The contents of this file are subject to the Mozilla Public License
     29  * Version 1.0 (the "MPL"); you may not use this file except in
     30  * compliance with the MPL.  You may obtain a copy of the MPL at
     31  * http://www.mozilla.org/MPL/
     32  *
     33  * Software distributed under the MPL is distributed on an "AS IS" basis,
     34  * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
     35  * for the specific language governing rights and limitations under the
     36  * MPL.
     37  *
     38  */
     39 #include <fcntl.h>
     40 #include <sys/mman.h>
     41 #include <sys/stat.h>
     42 #include <stdlib.h> /* for NULL, malloc */
     43 #include <stdio.h>  /* for fprintf */
     44 #include <string.h> /* for strdup */
     45 #include <unistd.h> /* for close */
     46 
     47 #define noVERBOSE
     48 
     49 #include "hnjalloc.h"
     50 #include "hyphen.h"
     51 
     52 static char *
     53 hnj_strdup (const char *s)
     54 {
     55     char *new;
     56     int l;
     57 
     58     l = strlen (s);
     59     new = hnj_malloc (l + 1);
     60     memcpy (new, s, l);
     61     new[l] = 0;
     62     return new;
     63 }
     64 
     65 /* remove cross-platform text line end characters */
     66 void hnj_strchomp(char * s)
     67 {
     68     int k = strlen(s);
     69     if ((k > 0) && ((*(s+k-1)=='\r') || (*(s+k-1)=='\n'))) *(s+k-1) = '\0';
     70     if ((k > 1) && (*(s+k-2) == '\r')) *(s+k-2) = '\0';
     71 }
     72 
     73 /* a little bit of a hash table implementation. This simply maps strings
     74    to state numbers */
     75 
     76 typedef struct _HashTab HashTab;
     77 typedef struct _HashEntry HashEntry;
     78 
     79 /* A cheap, but effective, hack. */
     80 #define HASH_SIZE 31627
     81 
     82 struct _HashTab {
     83     HashEntry *entries[HASH_SIZE];
     84 };
     85 
     86 struct _HashEntry {
     87     HashEntry *next;
     88     char *key;
     89     int val;
     90 };
     91 
     92 /* a char* hash function from ASU - adapted from Gtk+ */
     93 static unsigned int
     94 hnj_string_hash (const char *s)
     95 {
     96     const char *p;
     97     unsigned int h=0, g;
     98     for(p = s; *p != '\0'; p += 1) {
     99         h = ( h << 4 ) + *p;
    100         if ( ( g = h & 0xf0000000 ) ) {
    101             h = h ^ (g >> 24);
    102             h = h ^ g;
    103         }
    104     }
    105     return h /* % M */;
    106 }
    107 
    108 static HashTab *
    109 hnj_hash_new (void)
    110 {
    111     HashTab *hashtab;
    112     int i;
    113 
    114     hashtab = hnj_malloc (sizeof(HashTab));
    115     for (i = 0; i < HASH_SIZE; i++)
    116         hashtab->entries[i] = NULL;
    117 
    118     return hashtab;
    119 }
    120 
    121 static void
    122 hnj_hash_free (HashTab *hashtab)
    123 {
    124     int i;
    125     HashEntry *e, *next;
    126 
    127     for (i = 0; i < HASH_SIZE; i++)
    128         for (e = hashtab->entries[i]; e; e = next)
    129         {
    130             next = e->next;
    131             hnj_free (e->key);
    132             hnj_free (e);
    133         }
    134 
    135     hnj_free (hashtab);
    136 }
    137 
    138 /* assumes that key is not already present! */
    139 static void
    140 hnj_hash_insert (HashTab *hashtab, const char *key, int val)
    141 {
    142     int i;
    143     HashEntry *e;
    144 
    145     i = hnj_string_hash (key) % HASH_SIZE;
    146     e = hnj_malloc (sizeof(HashEntry));
    147     e->next = hashtab->entries[i];
    148     e->key = hnj_strdup (key);
    149     e->val = val;
    150     hashtab->entries[i] = e;
    151 }
    152 
    153 /* return val if found, otherwise -1 */
    154 static int
    155 hnj_hash_lookup (HashTab *hashtab, const char *key)
    156 {
    157     int i;
    158     HashEntry *e;
    159     i = hnj_string_hash (key) % HASH_SIZE;
    160     for (e = hashtab->entries[i]; e; e = e->next)
    161         if (!strcmp (key, e->key))
    162             return e->val;
    163     return -1;
    164 }
    165 
    166 /* Get the state number, allocating a new state if necessary. */
    167 static int
    168 hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
    169 {
    170     int state_num;
    171 
    172     state_num = hnj_hash_lookup (hashtab, string);
    173 
    174     if (state_num >= 0)
    175         return state_num;
    176 
    177     hnj_hash_insert (hashtab, string, dict->num_states);
    178     /* predicate is true if dict->num_states is a power of two */
    179     if (!(dict->num_states & (dict->num_states - 1)))
    180     {
    181         dict->states = hnj_realloc (dict->states,
    182             (dict->num_states << 1) *
    183             sizeof(HyphenState));
    184     }
    185     dict->states[dict->num_states].match = NULL;
    186     dict->states[dict->num_states].repl = NULL;
    187     dict->states[dict->num_states].fallback_state = -1;
    188     dict->states[dict->num_states].num_trans = 0;
    189     dict->states[dict->num_states].trans = NULL;
    190     return dict->num_states++;
    191 }
    192 
    193 /* add a transition from state1 to state2 through ch - assumes that the
    194    transition does not already exist */
    195 static void
    196 hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
    197 {
    198     int num_trans;
    199 
    200     num_trans = dict->states[state1].num_trans;
    201     if (num_trans == 0)
    202     {
    203         dict->states[state1].trans = hnj_malloc (sizeof(HyphenTrans));
    204     }
    205     else if (!(num_trans & (num_trans - 1)))
    206     {
    207         dict->states[state1].trans = hnj_realloc (dict->states[state1].trans,
    208             (num_trans << 1) *
    209             sizeof(HyphenTrans));
    210     }
    211     dict->states[state1].trans[num_trans].ch = ch;
    212     dict->states[state1].trans[num_trans].new_state = state2;
    213     dict->states[state1].num_trans++;
    214 }
    215 
    216 #ifdef VERBOSE
    217 HashTab *global;
    218 
    219 static char *
    220 get_state_str (int state)
    221 {
    222     int i;
    223     HashEntry *e;
    224 
    225     for (i = 0; i < HASH_SIZE; i++)
    226         for (e = global->entries[i]; e; e = e->next)
    227             if (e->val == state)
    228                 return e->key;
    229     return NULL;
    230 }
    231 #endif
    232 
    233 // Get a line from the dictionary contents.
    234 static char *
    235 get_line (char *s, int size, const char *dict_contents, int dict_length,
    236     int *dict_ptr)
    237 {
    238     int len = 0;
    239     while (len < (size - 1) && *dict_ptr < dict_length) {
    240         s[len++] = *(dict_contents + *dict_ptr);
    241         (*dict_ptr)++;
    242         if (s[len - 1] == '\n')
    243             break;
    244     }
    245     s[len] = '\0';
    246     if (len > 0) {
    247         return s;
    248     } else {
    249         return NULL;
    250     }
    251 }
    252 
    253 HyphenDict *
    254 hnj_hyphen_load (const char *fn)
    255 {
    256     if (fn == NULL)
    257         return NULL;
    258     const int fd = open(fn, O_RDONLY);
    259     if (fd == -1)
    260         return NULL;
    261     struct stat sb;
    262     if (fstat(fd, &sb) == -1)  {  /* To obtain file size */
    263         close(fd);
    264         return NULL;
    265     }
    266 
    267     const char *addr = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    268     if (addr == MAP_FAILED) {
    269         close(fd);
    270         return NULL;
    271     }
    272     HyphenDict *dict = hnj_hyphen_load_from_buffer(addr, sb.st_size);
    273     munmap((void *)addr, sb.st_size);
    274     close(fd);
    275 
    276     return dict;
    277 }
    278 
    279 HyphenDict *
    280 hnj_hyphen_load_from_buffer (const char *dict_contents, int dict_length)
    281 {
    282     HyphenDict *dict[2];
    283     HashTab *hashtab;
    284     char buf[MAX_CHARS];
    285     char word[MAX_CHARS];
    286     char pattern[MAX_CHARS];
    287     char * repl;
    288     signed char replindex;
    289     signed char replcut;
    290     int state_num = 0, last_state;
    291     int i, j, k;
    292     char ch;
    293     int found;
    294     HashEntry *e;
    295     int nextlevel = 0;
    296 
    297     if (dict_contents == NULL)
    298         return NULL;
    299 
    300     int dict_ptr = 0;
    301 // loading one or two dictionaries (separated by NEXTLEVEL keyword)
    302     for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {
    303         hashtab = hnj_hash_new ();
    304 #ifdef VERBOSE
    305         global = hashtab;
    306 #endif
    307         hnj_hash_insert (hashtab, "", 0);
    308         dict[k] = hnj_malloc (sizeof(HyphenDict));
    309         dict[k]->num_states = 1;
    310         dict[k]->states = hnj_malloc (sizeof(HyphenState));
    311         dict[k]->states[0].match = NULL;
    312         dict[k]->states[0].repl = NULL;
    313         dict[k]->states[0].fallback_state = -1;
    314         dict[k]->states[0].num_trans = 0;
    315         dict[k]->states[0].trans = NULL;
    316         dict[k]->nextlevel = NULL;
    317         dict[k]->lhmin = 0;
    318         dict[k]->rhmin = 0;
    319         dict[k]->clhmin = 0;
    320         dict[k]->crhmin = 0;
    321 
    322         /* read in character set info */
    323         if (k == 0) {
    324             for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
    325             get_line(dict[k]->cset, sizeof(dict[k]->cset), dict_contents,
    326                 dict_length, &dict_ptr);
    327             for (i=0;i<MAX_NAME;i++)
    328                 if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
    329                     dict[k]->cset[i] = 0;
    330             dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
    331         } else {
    332             strcpy(dict[k]->cset, dict[0]->cset);
    333             dict[k]->utf8 = dict[0]->utf8;
    334         }
    335 
    336         while (get_line(buf, sizeof(buf), dict_contents, dict_length,
    337                 &dict_ptr) != NULL)
    338         {
    339             if (buf[0] != '%')
    340             {
    341                 if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
    342                     nextlevel = 1;
    343                     break;
    344                 } else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
    345                     dict[k]->lhmin = atoi(buf + 13);
    346                     continue;
    347                 } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
    348                     dict[k]->rhmin = atoi(buf + 14);
    349                     continue;
    350                 } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
    351                     dict[k]->clhmin = atoi(buf + 21);
    352                     continue;
    353                 } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
    354                     dict[k]->crhmin = atoi(buf + 22);
    355                     continue;
    356                 }
    357                 j = 0;
    358                 pattern[j] = '0';
    359                 repl = strchr(buf, '/');
    360                 replindex = 0;
    361                 replcut = 0;
    362                 if (repl) {
    363                     char * index = strchr(repl + 1, ',');
    364                     *repl = '\0';
    365                     if (index) {
    366                         char * index2 = strchr(index + 1, ',');
    367                         *index = '\0';
    368                         if (index2) {
    369                             *index2 = '\0';
    370                             replindex = (signed char) atoi(index + 1) - 1;
    371                             replcut = (signed char) atoi(index2 + 1);
    372                         }
    373                     } else {
    374                         hnj_strchomp(repl + 1);
    375                         replindex = 0;
    376                         replcut = strlen(buf);
    377                     }
    378                     repl = hnj_strdup(repl + 1);
    379                 }
    380                 for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
    381                 {
    382                     if (buf[i] >= '0' && buf[i] <= '9')
    383                         pattern[j] = buf[i];
    384                     else
    385                     {
    386                         word[j] = buf[i];
    387                         pattern[++j] = '0';
    388                     }
    389                 }
    390                 word[j] = '\0';
    391                 pattern[j + 1] = '\0';
    392 
    393                 i = 0;
    394                 if (!repl) {
    395                     /* Optimize away leading zeroes */
    396                     for (; pattern[i] == '0'; i++);
    397                 } else {
    398                     if (*word == '.') i++;
    399                     /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
    400                     if (dict[k]->utf8) {
    401                         int pu = -1;        /* unicode character position */
    402                         int ps = -1;        /* unicode start position (original replindex) */
    403                         int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
    404                         for (; pc < (strlen(word) + 1); pc++) {
    405                             /* beginning of an UTF-8 character (not '10' start bits) */
    406                             if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
    407                             if ((ps < 0) && (replindex == pu)) {
    408                                 ps = replindex;
    409                                 replindex = pc;
    410                             }
    411                             if ((ps >= 0) && ((pu - ps) == replcut)) {
    412                                 replcut = (pc - replindex);
    413                                 break;
    414                             }
    415                         }
    416                         if (*word == '.') replindex--;
    417                     }
    418                 }
    419 
    420 #ifdef VERBOSE
    421                 printf ("word %s pattern %s, j = %d  repl: %s\n", word, pattern + i, j, repl);
    422 #endif
    423                 found = hnj_hash_lookup (hashtab, word);
    424                 state_num = hnj_get_state (dict[k], hashtab, word);
    425                 dict[k]->states[state_num].match = hnj_strdup (pattern + i);
    426                 dict[k]->states[state_num].repl = repl;
    427                 dict[k]->states[state_num].replindex = replindex;
    428                 if (!replcut) {
    429                     dict[k]->states[state_num].replcut = strlen(word);
    430                 } else {
    431                     dict[k]->states[state_num].replcut = replcut;
    432                 }
    433 
    434                 /* now, put in the prefix transitions */
    435                 for (; found < 0 ;j--)
    436                 {
    437                     last_state = state_num;
    438                     ch = word[j - 1];
    439                     word[j - 1] = '\0';
    440                     found = hnj_hash_lookup (hashtab, word);
    441                     state_num = hnj_get_state (dict[k], hashtab, word);
    442                     hnj_add_trans (dict[k], state_num, last_state, ch);
    443                 }
    444             }
    445         }
    446 
    447         /* Could do unioning of matches here (instead of the preprocessor script).
    448            If we did, the pseudocode would look something like this:
    449 
    450            foreach state in the hash table
    451            foreach i = [1..length(state) - 1]
    452            state to check is substr (state, i)
    453            look it up
    454            if found, and if there is a match, union the match in.
    455 
    456            It's also possible to avoid the quadratic blowup by doing the
    457            search in order of increasing state string sizes - then you
    458            can break the loop after finding the first match.
    459 
    460            This step should be optional in any case - if there is a
    461            preprocessed rule table, it's always faster to use that.
    462 
    463         */
    464 
    465         /* put in the fallback states */
    466         for (i = 0; i < HASH_SIZE; i++)
    467             for (e = hashtab->entries[i]; e; e = e->next)
    468             {
    469                 if (*(e->key)) for (j = 1; 1; j++)
    470                                {
    471                                    state_num = hnj_hash_lookup (hashtab, e->key + j);
    472                                    if (state_num >= 0)
    473                                        break;
    474                                }
    475                 /* KBH: FIXME state 0 fallback_state should always be -1? */
    476                 if (e->val)
    477                     dict[k]->states[e->val].fallback_state = state_num;
    478             }
    479 #ifdef VERBOSE
    480         for (i = 0; i < HASH_SIZE; i++)
    481             for (e = hashtab->entries[i]; e; e = e->next)
    482             {
    483                 printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
    484                     dict[k]->states[e->val].fallback_state);
    485                 for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
    486                     printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
    487                         dict[k]->states[e->val].trans[j].new_state);
    488             }
    489 #endif
    490 
    491 #ifndef VERBOSE
    492         hnj_hash_free (hashtab);
    493 #endif
    494         state_num = 0;
    495     }
    496     if (k == 2) dict[0]->nextlevel = dict[1];
    497     return dict[0];
    498 }
    499 
    500 void hnj_hyphen_free (HyphenDict *dict)
    501 {
    502     int state_num;
    503     HyphenState *hstate;
    504 
    505     for (state_num = 0; state_num < dict->num_states; state_num++)
    506     {
    507         hstate = &dict->states[state_num];
    508         if (hstate->match)
    509             hnj_free (hstate->match);
    510         if (hstate->repl)
    511             hnj_free (hstate->repl);
    512         if (hstate->trans)
    513             hnj_free (hstate->trans);
    514     }
    515     if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);
    516 
    517     hnj_free (dict->states);
    518 
    519     hnj_free (dict);
    520 }
    521 
    522 #define MAX_WORD 256
    523 
    524 int hnj_hyphen_hyphenate (HyphenDict *dict,
    525     const char *word, int word_size,
    526     char *hyphens)
    527 {
    528     char prep_word_buf[MAX_WORD];
    529     char *prep_word;
    530     int i, j, k;
    531     int state;
    532     char ch;
    533     HyphenState *hstate;
    534     char *match;
    535     int offset;
    536 
    537     if (word_size + 3 < MAX_WORD)
    538         prep_word = prep_word_buf;
    539     else
    540         prep_word = hnj_malloc (word_size + 3);
    541 
    542     j = 0;
    543     prep_word[j++] = '.';
    544 
    545     for (i = 0; i < word_size; i++)
    546         prep_word[j++] = word[i];
    547 
    548     prep_word[j++] = '.';
    549     prep_word[j] = '\0';
    550 
    551     for (i = 0; i < j; i++)
    552         hyphens[i] = '0';
    553 
    554 #ifdef VERBOSE
    555     printf ("prep_word = %s\n", prep_word);
    556 #endif
    557 
    558     /* now, run the finite state machine */
    559     state = 0;
    560     for (i = 0; i < j; i++)
    561     {
    562         ch = prep_word[i];
    563         for (;;)
    564         {
    565 
    566             if (state == -1) {
    567                 /* return 1; */
    568                 /*  KBH: FIXME shouldn't this be as follows? */
    569                 state = 0;
    570                 goto try_next_letter;
    571             }
    572 
    573 #ifdef VERBOSE
    574             char *state_str;
    575             state_str = get_state_str (state);
    576 
    577             for (k = 0; k < i - strlen (state_str); k++)
    578                 putchar (' ');
    579             printf ("%s", state_str);
    580 #endif
    581 
    582             hstate = &dict->states[state];
    583             for (k = 0; k < hstate->num_trans; k++)
    584                 if (hstate->trans[k].ch == ch)
    585                 {
    586                     state = hstate->trans[k].new_state;
    587                     goto found_state;
    588                 }
    589             state = hstate->fallback_state;
    590 #ifdef VERBOSE
    591             printf (" falling back, fallback_state %d\n", state);
    592 #endif
    593         }
    594       found_state:
    595 #ifdef VERBOSE
    596         printf ("found state %d\n",state);
    597 #endif
    598         /* Additional optimization is possible here - especially,
    599            elimination of trailing zeroes from the match. Leading zeroes
    600            have already been optimized. */
    601         match = dict->states[state].match;
    602         /* replacing rules not handled by hyphen_hyphenate() */
    603         if (match && !dict->states[state].repl)
    604         {
    605             offset = i + 1 - strlen (match);
    606 #ifdef VERBOSE
    607             for (k = 0; k < offset; k++)
    608                 putchar (' ');
    609             printf ("%s\n", match);
    610 #endif
    611             /* This is a linear search because I tried a binary search and
    612                found it to be just a teeny bit slower. */
    613             for (k = 0; match[k]; k++)
    614                 if (hyphens[offset + k] < match[k])
    615                     hyphens[offset + k] = match[k];
    616         }
    617 
    618         /* KBH: we need this to make sure we keep looking in a word */
    619         /* for patterns even if the current character is not known in state 0 */
    620         /* since patterns for hyphenation may occur anywhere in the word */
    621       try_next_letter: ;
    622 
    623     }
    624 #ifdef VERBOSE
    625     for (i = 0; i < j; i++)
    626         putchar (hyphens[i]);
    627     putchar ('\n');
    628 #endif
    629 
    630     for (i = 0; i < j - 4; i++)
    631 #if 0
    632         if (hyphens[i + 1] & 1)
    633             hyphens[i] = '-';
    634 #else
    635     hyphens[i] = hyphens[i + 1];
    636 #endif
    637     hyphens[0] = '0';
    638     for (; i < word_size; i++)
    639         hyphens[i] = '0';
    640     hyphens[word_size] = '\0';
    641 
    642     if (prep_word != prep_word_buf)
    643         hnj_free (prep_word);
    644 
    645     return 0;
    646 }
    647 
    648 /* character length of the first n byte of the input word */
    649 int hnj_hyphen_strnlen(const char * word, int n, int utf8)
    650 {
    651     int i = 0;
    652     int j = 0;
    653     while (j < n && word[j] != '\0') {
    654         i++;
    655         for (j++; utf8 && (word[j] & 0xc0) == 0x80; j++);
    656     }
    657     return i;
    658 }
    659 
    660 int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
    661 	char *** rep, int ** pos, int ** cut, int lhmin)
    662 {
    663     int i, j;
    664     for (i = 1, j = 0; i < lhmin && word[j] != '\0'; i++) do {
    665             // check length of the non-standard part
    666             if (*rep && *pos && *cut && (*rep)[j]) {
    667                 char * rh = strchr((*rep)[j], '=');
    668                 if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
    669                         hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
    670                     free((*rep)[j]);
    671                     (*rep)[j] = NULL;
    672                     hyphens[j] = '0';
    673                 }
    674             } else {
    675                 hyphens[j] = '0';
    676             }
    677             j++;
    678         } while (utf8 && (word[j + 1] & 0xc0) == 0xc0);
    679     return 0;
    680 }
    681 
    682 int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
    683 	char *** rep, int ** pos, int ** cut, int rhmin)
    684 {
    685     int i;
    686     int j = word_size - 2;
    687     for (i = 1; i < rhmin && j > 0; j--) {
    688         // check length of the non-standard part
    689         if (*rep && *pos && *cut && (*rep)[j]) {
    690             char * rh = strchr((*rep)[j], '=');
    691             if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
    692                     hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
    693                 free((*rep)[j]);
    694                 (*rep)[j] = NULL;
    695                 hyphens[j] = '0';
    696             }
    697         } else {
    698             hyphens[j] = '0';
    699         }
    700         if (!utf8 || (word[j] & 0xc0) != 0xc0) i++;
    701     }
    702     return 0;
    703 }
    704 
    705 // recursive function for compound level hyphenation
    706 int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
    707     char * hyphens, char *** rep, int ** pos, int ** cut,
    708     int clhmin, int crhmin, int lend, int rend)
    709 {
    710     char prep_word_buf[MAX_WORD];
    711     char *prep_word;
    712     int i, j, k;
    713     int state;
    714     char ch;
    715     HyphenState *hstate;
    716     char *match;
    717     char *repl;
    718     signed char replindex;
    719     signed char replcut;
    720     int offset;
    721     int matchlen_buf[MAX_CHARS];
    722     int matchindex_buf[MAX_CHARS];
    723     char * matchrepl_buf[MAX_CHARS];
    724     int * matchlen;
    725     int * matchindex;
    726     char ** matchrepl;
    727     int isrepl = 0;
    728     int nHyphCount;
    729 
    730     if (word_size + 3 < MAX_CHARS) {
    731         prep_word = prep_word_buf;
    732         matchlen = matchlen_buf;
    733         matchindex = matchindex_buf;
    734         matchrepl = matchrepl_buf;
    735     } else {
    736         prep_word = hnj_malloc (word_size + 3);
    737         matchlen = hnj_malloc ((word_size + 3) * sizeof(int));
    738         matchindex = hnj_malloc ((word_size + 3) * sizeof(int));
    739         matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));
    740     }
    741 
    742     j = 0;
    743     prep_word[j++] = '.';
    744 
    745     for (i = 0; i < word_size; i++)
    746         prep_word[j++] = word[i];
    747 
    748     prep_word[j++] = '.';
    749     prep_word[j] = '\0';
    750 
    751     for (i = 0; i < j; i++)
    752         hyphens[i] = '0';
    753 
    754 #ifdef VERBOSE
    755     printf ("prep_word = %s\n", prep_word);
    756 #endif
    757 
    758     /* now, run the finite state machine */
    759     state = 0;
    760     for (i = 0; i < j; i++)
    761     {
    762         ch = prep_word[i];
    763         for (;;)
    764         {
    765 
    766             if (state == -1) {
    767                 /* return 1; */
    768                 /*  KBH: FIXME shouldn't this be as follows? */
    769                 state = 0;
    770                 goto try_next_letter;
    771             }
    772 
    773 #ifdef VERBOSE
    774             char *state_str;
    775             state_str = get_state_str (state);
    776 
    777             for (k = 0; k < i - strlen (state_str); k++)
    778                 putchar (' ');
    779             printf ("%s", state_str);
    780 #endif
    781 
    782             hstate = &dict->states[state];
    783             for (k = 0; k < hstate->num_trans; k++)
    784                 if (hstate->trans[k].ch == ch)
    785                 {
    786                     state = hstate->trans[k].new_state;
    787                     goto found_state;
    788                 }
    789             state = hstate->fallback_state;
    790 #ifdef VERBOSE
    791             printf (" falling back, fallback_state %d\n", state);
    792 #endif
    793         }
    794       found_state:
    795 #ifdef VERBOSE
    796         printf ("found state %d\n",state);
    797 #endif
    798         /* Additional optimization is possible here - especially,
    799            elimination of trailing zeroes from the match. Leading zeroes
    800            have already been optimized. */
    801         match = dict->states[state].match;
    802         repl = dict->states[state].repl;
    803         replindex = dict->states[state].replindex;
    804         replcut = dict->states[state].replcut;
    805         /* replacing rules not handled by hyphen_hyphenate() */
    806         if (match)
    807         {
    808             offset = i + 1 - strlen (match);
    809 #ifdef VERBOSE
    810             for (k = 0; k < offset; k++)
    811                 putchar (' ');
    812             printf ("%s (%s)\n", match, repl);
    813 #endif
    814             if (repl) {
    815                 if (!isrepl) for(; isrepl < word_size; isrepl++) {
    816                         matchrepl[isrepl] = NULL;
    817                         matchindex[isrepl] = -1;
    818                     }
    819                 matchlen[offset + replindex] = replcut;
    820             }
    821             /* This is a linear search because I tried a binary search and
    822                found it to be just a teeny bit slower. */
    823             for (k = 0; match[k]; k++) {
    824                 if ((hyphens[offset + k] < match[k])) {
    825                     hyphens[offset + k] = match[k];
    826                     if (match[k]&1) {
    827                         matchrepl[offset + k] = repl;
    828                         if (repl && (k >= replindex) && (k <= replindex + replcut)) {
    829                             matchindex[offset + replindex] = offset + k;
    830                         }
    831                     }
    832                 }
    833             }
    834 
    835         }
    836 
    837         /* KBH: we need this to make sure we keep looking in a word */
    838         /* for patterns even if the current character is not known in state 0 */
    839         /* since patterns for hyphenation may occur anywhere in the word */
    840       try_next_letter: ;
    841 
    842     }
    843 #ifdef VERBOSE
    844     for (i = 0; i < j; i++)
    845         putchar (hyphens[i]);
    846     putchar ('\n');
    847 #endif
    848 
    849     for (i = 0; i < j - 3; i++)
    850 #if 0
    851         if (hyphens[i + 1] & 1)
    852             hyphens[i] = '-';
    853 #else
    854     hyphens[i] = hyphens[i + 1];
    855 #endif
    856     for (; i < word_size; i++)
    857         hyphens[i] = '0';
    858     hyphens[word_size] = '\0';
    859 
    860     /* now create a new char string showing hyphenation positions */
    861     /* count the hyphens and allocate space for the new hyphenated string */
    862     nHyphCount = 0;
    863     for (i = 0; i < word_size; i++)
    864         if (hyphens[i]&1)
    865             nHyphCount++;
    866     j = 0;
    867     for (i = 0; i < word_size; i++) {
    868         if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {
    869             if (rep && pos && cut) {
    870                 if (!*rep && !*pos && !*cut) {
    871                     int k;
    872                     *rep = (char **) malloc(sizeof(char *) * word_size);
    873                     *pos = (int *) malloc(sizeof(int) * word_size);
    874                     *cut = (int *) malloc(sizeof(int) * word_size);
    875                     for (k = 0; k < word_size; k++) {
    876                         (*rep)[k] = NULL;
    877                         (*pos)[k] = 0;
    878                         (*cut)[k] = 0;
    879                     }
    880                 }
    881                 (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
    882                 (*pos)[matchindex[i] - 1] = matchindex[i] - i;
    883                 (*cut)[matchindex[i] - 1] = matchlen[i];
    884             }
    885             j += strlen(matchrepl[matchindex[i]]);
    886             i += matchlen[i] - 1;
    887         }
    888     }
    889 
    890     if (matchrepl != matchrepl_buf) {
    891         hnj_free (matchrepl);
    892         hnj_free (matchlen);
    893         hnj_free (matchindex);
    894     }
    895 
    896     // recursive hyphenation of the first (compound) level segments
    897     if (dict->nextlevel) {
    898         char * rep2_buf[MAX_WORD];
    899         int pos2_buf[MAX_WORD];
    900         int cut2_buf[MAX_WORD];
    901         char hyphens2_buf[MAX_WORD];
    902         char ** rep2;
    903         int * pos2;
    904         int * cut2;
    905         char * hyphens2;
    906         int begin = 0;
    907         if (word_size < MAX_CHARS) {
    908             rep2 = rep2_buf;
    909             pos2 = pos2_buf;
    910             cut2 = cut2_buf;
    911             hyphens2 = hyphens2_buf;
    912         } else {
    913             rep2 = hnj_malloc (word_size * sizeof(char *));
    914             pos2 = hnj_malloc (word_size * sizeof(int));
    915             cut2 = hnj_malloc (word_size * sizeof(int));
    916             hyphens2 = hnj_malloc (word_size);
    917         }
    918         for (i = 0; i < word_size; i++) rep2[i] = NULL;
    919         for (i = 0; i < word_size; i++)
    920             if (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
    921                 if (i - begin > 1) {
    922                     int hyph = 0;
    923                     prep_word[i + 2] = '\0';
    924                     /* non-standard hyphenation at compound boundary (Schiffahrt) */
    925                     if (*rep && *pos && *cut && (*rep)[i]) {
    926                         char * l = strchr((*rep)[i], '=');
    927                         strcpy(prep_word + 2 + i - (*pos)[i], (*rep)[i]);
    928                         if (l) {
    929                             hyph = (l - (*rep)[i]) - (*pos)[i];
    930                             prep_word[2 + i + hyph] = '\0';
    931                         }
    932                     }
    933                     hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
    934                         hyphens2, &rep2, &pos2, &cut2, clhmin,
    935                         crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
    936                     for (j = 0; j < i - begin - 1; j++) {
    937                         hyphens[begin + j] = hyphens2[j];
    938                         if (rep2[j] && rep && pos && cut) {
    939                             if (!*rep && !*pos && !*cut) {
    940                                 int k;
    941                                 *rep = (char **) malloc(sizeof(char *) * word_size);
    942                                 *pos = (int *) malloc(sizeof(int) * word_size);
    943                                 *cut = (int *) malloc(sizeof(int) * word_size);
    944                                 for (k = 0; k < word_size; k++) {
    945                                     (*rep)[k] = NULL;
    946                                     (*pos)[k] = 0;
    947                                     (*cut)[k] = 0;
    948                                 }
    949                             }
    950                             (*rep)[begin + j] = rep2[j];
    951                             (*pos)[begin + j] = pos2[j];
    952                             (*cut)[begin + j] = cut2[j];
    953                         }
    954                     }
    955                     prep_word[i + 2] = word[i + 1];
    956                     if (*rep && *pos && *cut && (*rep)[i]) {
    957                         strcpy(prep_word + 1, word);
    958                     }
    959                 }
    960                 begin = i + 1;
    961                 for (j = 0; j < word_size; j++) rep2[j] = NULL;
    962             }
    963 
    964         // non-compound
    965         if (begin == 0) {
    966             hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
    967                 hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
    968             if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
    969                 rep, pos, cut, clhmin);
    970             if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
    971                 rep, pos, cut, crhmin);
    972         }
    973 
    974         if (rep2 != rep2_buf) {
    975             free(rep2);
    976             free(cut2);
    977             free(pos2);
    978             free(hyphens2);
    979         }
    980     }
    981 
    982     if (prep_word != prep_word_buf) hnj_free (prep_word);
    983     return 0;
    984 }
    985 
    986 /* UTF-8 normalization of hyphen and non-standard positions */
    987 int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
    988 	char *** rep, int ** pos, int ** cut)
    989 {
    990     if ((((unsigned char) word[0]) >> 6) == 2) {
    991         fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
    992         return 1;
    993     }
    994 
    995     /* calculate UTF-8 character positions */
    996     int i, j, k;
    997     for (i = 0, j = -1; i < word_size; i++) {
    998         /* beginning of an UTF-8 character (not '10' start bits) */
    999         if ((((unsigned char) word[i]) >> 6) != 2) j++;
   1000         hyphens[j] = hyphens[i];
   1001         if (rep && pos && cut && *rep && *pos && *cut) {
   1002             int l = (*pos)[i];
   1003             (*pos)[j] = 0;
   1004             for (k = 0; k < l; k++) {
   1005                 if ((((unsigned char) word[i - k]) >> 6) != 2) (*pos)[j]++;
   1006             }
   1007             k = i - l + 1;
   1008             l = k + (*cut)[i];
   1009             (*cut)[j] = 0;
   1010             for (; k < l; k++) {
   1011                 if ((((unsigned char) word[k]) >> 6) != 2) (*cut)[j]++;
   1012             }
   1013             (*rep)[j] = (*rep)[i];
   1014             if (j < i) {
   1015                 (*rep)[i] = NULL;
   1016                 (*pos)[i] = 0;
   1017                 (*cut)[i] = 0;
   1018             }
   1019         }
   1020     }
   1021     hyphens[j + 1] = '\0';
   1022     return 0;
   1023 }
   1024 
   1025 /* get the word with all possible hyphenations (output: hyphword) */
   1026 void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,
   1027     char * hyphword, char *** rep, int ** pos, int ** cut)
   1028 {
   1029     int i, j;
   1030     for (i = 0, j = 0; i < l; i++, j++) {
   1031         if (hyphens[i]&1) {
   1032             hyphword[j] = word[i];
   1033             if (*rep && *pos && *cut && (*rep)[i]) {
   1034                 strcpy(hyphword + j - (*pos)[i] + 1, (*rep)[i]);
   1035                 j += strlen((*rep)[i]) - (*pos)[i];
   1036                 i += (*cut)[i] - (*pos)[i];
   1037             } else hyphword[++j] = '=';
   1038         } else hyphword[j] = word[i];
   1039     }
   1040     hyphword[j] = '\0';
   1041 }
   1042 
   1043 
   1044 /* main api function with default hyphenmin parameters */
   1045 int hnj_hyphen_hyphenate2 (HyphenDict *dict,
   1046     const char *word, int word_size, char * hyphens,
   1047     char *hyphword, char *** rep, int ** pos, int ** cut)
   1048 {
   1049     hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
   1050         dict->clhmin, dict->crhmin, 1, 1);
   1051     hnj_hyphen_lhmin(dict->utf8, word, word_size,
   1052         hyphens, rep, pos, cut, (dict->lhmin > 0 ? dict->lhmin : 2));
   1053     hnj_hyphen_rhmin(dict->utf8, word, word_size,
   1054         hyphens, rep, pos, cut, (dict->rhmin > 0 ? dict->rhmin : 2));
   1055     if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
   1056     if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
   1057     return 0;
   1058 }
   1059 
   1060 /* previous main api function with hyphenmin parameters */
   1061 int hnj_hyphen_hyphenate3 (HyphenDict *dict,
   1062 	const char *word, int word_size, char * hyphens,
   1063 	char *hyphword, char *** rep, int ** pos, int ** cut,
   1064 	int lhmin, int rhmin, int clhmin, int crhmin)
   1065 {
   1066     lhmin = (lhmin > 0 ? lhmin : dict->lhmin);
   1067     rhmin = (rhmin > 0 ? rhmin : dict->rhmin);
   1068     hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
   1069         clhmin, crhmin, 1, 1);
   1070     hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
   1071         rep, pos, cut, (lhmin > 0 ? lhmin : 2));
   1072     hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
   1073         rep, pos, cut, (rhmin > 0 ? rhmin : 2));
   1074     if (hyphword) hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);
   1075     if (dict->utf8) return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
   1076     return 0;
   1077 }
   1078