external/hyphenation/hyphen.c

/* Libhnj is dual licensed under LGPL and MPL. Boilerplate for both
 * licenses follows.
 */

/* LibHnj - a library for high quality hyphenation and justification
 * Copyright (C) 1998 Raph Levien,
 * 	     (C) 2001 ALTLinux, Moscow (http://www.alt-linux.org),
 *           (C) 2001 Peter Novodvorsky (nidd (at) cs.msu.su)
 *           (C) 2006, 2007, 2008 László Németh (nemeth at OOo)
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA  02111-1307  USA.
 */

/*
 * The contents of this file are subject to the Mozilla Public License
 * Version 1.0 (the "MPL"); you may not use this file except in
 * compliance with the MPL.  You may obtain a copy of the MPL at
 * http://www.mozilla.org/MPL/
 *
 * Software distributed under the MPL is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the MPL
 * for the specific language governing rights and limitations under the
 * MPL.
 *
 */
#include <fcntl.h>
#include <limits.h> /* for INT_MAX */
#include <stdio.h>  /* for fprintf */
#include <stdlib.h> /* for NULL, malloc */
#include <string.h> /* for strdup */
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h> /* for close */

#define noVERBOSE

#include "hnjalloc.h"
#include "hyphen.h"

/* Return a copy of the NUL-terminated string s in storage obtained from
   hnj_malloc. */
static char *
hnj_strdup (const char *s)
{
    int len = strlen (s);
    char *copy = hnj_malloc (len + 1);

    memcpy (copy, s, len);
    copy[len] = 0;
    return copy;
}

/* Remove a trailing line terminator from s, in place.
 *
 * If the last byte is '\r' or '\n' it is cut off; if the byte before it is
 * '\r' (the "\r\n" case) that one is cut off as well.  A string that does
 * not end in a line terminator is left untouched -- in particular a '\r' in
 * the second-to-last position is only removed when the last byte was a
 * terminator too.
 */
void hnj_strchomp(char * s)
{
    size_t k = strlen(s);

    /* nothing to strip unless the string really ends in CR or LF */
    if (k == 0 || (s[k - 1] != '\r' && s[k - 1] != '\n'))
        return;
    s[k - 1] = '\0';

    /* second half of a CR LF pair */
    if (k > 1 && s[k - 2] == '\r')
        s[k - 2] = '\0';
}

/* A minimal chained hash table.  It maps strings to state numbers and is
   used while the dictionary automaton is being built. */

typedef struct _HashEntry HashEntry;
typedef struct _HashTab HashTab;

/* Fixed number of buckets; the bucket array is never resized. */
#define HASH_SIZE 31627

/* One key/value pair; entries that share a bucket form a singly linked
   list through next. */
struct _HashEntry {
    HashEntry *next;
    char *key;
    int val;
};

/* The table itself: one chain head per bucket. */
struct _HashTab {
    HashEntry *entries[HASH_SIZE];
};

/* String hash (from ASU, adapted from Gtk+).  Each byte is mixed in by
   shifting the running value left four bits and adding the byte; whenever
   the top nibble becomes non-zero it is folded back into the low bits and
   cleared.  The caller reduces the result modulo the table size. */
static unsigned int
hnj_string_hash (const char *s)
{
    unsigned int hash = 0;

    while (*s != '\0') {
        unsigned int top;

        hash = (hash << 4) + *s++;
        top = hash & 0xf0000000;
        if (top != 0) {
            hash ^= top >> 24;
            hash ^= top;
        }
    }
    return hash;
}

static HashTab *
hnj_hash_new (void)
{
    HashTab *hashtab;
    int i;

    hashtab = hnj_malloc (sizeof(HashTab));
    for (i = 0; i < HASH_SIZE; i++)
        hashtab->entries[i] = NULL;

    return hashtab;
}

/* Release every entry (and its key string) in every bucket, then the table
   itself. */
static void
hnj_hash_free (HashTab *hashtab)
{
    int bucket;

    for (bucket = 0; bucket < HASH_SIZE; bucket++) {
        HashEntry *entry = hashtab->entries[bucket];

        while (entry != NULL) {
            /* remember the successor before the entry goes away */
            HashEntry *following = entry->next;

            hnj_free (entry->key);
            hnj_free (entry);
            entry = following;
        }
    }

    hnj_free (hashtab);
}

/* Add key -> val at the head of the key's bucket chain.  The key string is
   copied.  No duplicate check is made: the caller must ensure the key is
   not already in the table. */
static void
hnj_hash_insert (HashTab *hashtab, const char *key, int val)
{
    int bucket = hnj_string_hash (key) % HASH_SIZE;
    HashEntry *entry = hnj_malloc (sizeof(HashEntry));

    entry->key = hnj_strdup (key);
    entry->val = val;
    entry->next = hashtab->entries[bucket];
    hashtab->entries[bucket] = entry;
}

/* Look key up in the table; returns the stored value, or -1 when the key
   is absent. */
static int
hnj_hash_lookup (HashTab *hashtab, const char *key)
{
    HashEntry *entry = hashtab->entries[hnj_string_hash (key) % HASH_SIZE];

    while (entry != NULL) {
        if (strcmp (key, entry->key) == 0)
            return entry->val;
        entry = entry->next;
    }
    return -1;
}

/* Return the state number registered for string.  If there is none yet, a
   new state is appended to dict->states, recorded in the hash table and
   initialised to "no match, no replacement, no fallback, no transitions". */
static int
hnj_get_state (HyphenDict *dict, HashTab *hashtab, const char *string)
{
    HyphenState *fresh;
    int index = hnj_hash_lookup (hashtab, string);

    if (index >= 0)
        return index;

    index = dict->num_states;
    hnj_hash_insert (hashtab, string, index);

    /* The state array is doubled each time the count reaches a power of
       two (count & (count - 1) is zero exactly then). */
    if ((index & (index - 1)) == 0)
        dict->states = hnj_realloc (dict->states,
            (index << 1) * sizeof(HyphenState));

    fresh = &dict->states[index];
    fresh->match = NULL;
    fresh->repl = NULL;
    fresh->fallback_state = -1;
    fresh->num_trans = 0;
    fresh->trans = NULL;

    dict->num_states++;
    return index;
}

/* Append the transition state1 --ch--> state2 to state1's transition list.
   No duplicate check is made: the caller must ensure the transition does
   not exist yet. */
static void
hnj_add_trans (HyphenDict *dict, int state1, int state2, char ch)
{
    HyphenState *from = &dict->states[state1];
    int count = from->num_trans;

    if (count == 0) {
        /* first transition: room for exactly one */
        from->trans = hnj_malloc (sizeof(HyphenTrans));
    } else if ((count & (count - 1)) == 0) {
        /* count is a power of two: double the array */
        from->trans = hnj_realloc (from->trans,
            (count << 1) * sizeof(HyphenTrans));
    }

    from->trans[count].ch = ch;
    from->trans[count].new_state = state2;
    from->num_trans++;
}

#ifdef VERBOSE
/* Debug builds only: the hash table of the dictionary being loaded, kept
   so that state numbers can be turned back into their strings. */
HashTab *global;

/* Return the key string whose value is state, or NULL if no entry of the
   global table carries that value.  Scans every bucket. */
static char *
get_state_str (int state)
{
    int bucket;

    for (bucket = 0; bucket < HASH_SIZE; bucket++) {
        HashEntry *entry;

        for (entry = global->entries[bucket]; entry != NULL; entry = entry->next) {
            if (entry->val == state)
                return entry->key;
        }
    }
    return NULL;
}
#endif

// Copy the next line of the in-memory dictionary into s.
//
// Reading starts at offset *dict_ptr and stops after a '\n' has been
// copied, after size - 1 bytes, or at dict_length, whichever comes first;
// *dict_ptr is advanced past the bytes consumed.  s is always
// NUL-terminated.  Returns s, or NULL when no byte could be read.
static char *
get_line (char *s, int size, const char *dict_contents, int dict_length,
    int *dict_ptr)
{
    int count = 0;

    while (count < size - 1 && *dict_ptr < dict_length) {
        char c = dict_contents[(*dict_ptr)++];

        s[count++] = c;
        if (c == '\n')
            break;
    }
    s[count] = '\0';

    return count > 0 ? s : NULL;
}

/* Load a hyphenation dictionary from the file named fn.
 *
 * The file is mapped read-only, parsed by hnj_hyphen_load_from_buffer() and
 * unmapped again before returning.
 *
 * Returns the parsed dictionary, or NULL when fn is NULL, the file cannot
 * be opened, stat'ed or mapped, or its size cannot be passed on: the parser
 * takes the length as an int, so empty files and files larger than INT_MAX
 * bytes are rejected here instead of being silently truncated.
 */
HyphenDict *
hnj_hyphen_load (const char *fn)
{
    HyphenDict *dict = NULL;
    struct stat sb;
    void *addr;
    int fd;

    if (fn == NULL)
        return NULL;

    fd = open(fn, O_RDONLY);
    if (fd == -1)
        return NULL;

    if (fstat(fd, &sb) == -1)  /* To obtain file size */
        goto out;

    /* the length is handed to the parser as an int */
    if (sb.st_size <= 0 || sb.st_size > INT_MAX)
        goto out;

    addr = mmap(NULL, (size_t) sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (addr == MAP_FAILED)
        goto out;

    dict = hnj_hyphen_load_from_buffer(addr, (int) sb.st_size);
    munmap(addr, (size_t) sb.st_size);

out:
    close(fd);
    return dict;
}

/* Parse a hyphenation dictionary held in memory and build its automaton.
 *
 * dict_contents / dict_length: the dictionary text and its length in bytes.
 *
 * Layout of the text, as read by this function:
 *   - the first line names the character set (copied into cset; utf8 is set
 *     when it equals "UTF-8");
 *   - lines starting with '%' are skipped;
 *   - LEFTHYPHENMIN, RIGHTHYPHENMIN, COMPOUNDLEFTHYPHENMIN and
 *     COMPOUNDRIGHTHYPHENMIN lines set the corresponding minima;
 *   - a NEXTLEVEL line ends the current dictionary; the lines after it are
 *     parsed into a second dictionary linked through nextlevel;
 *   - every other line is a pattern, optionally followed by
 *     "/replacement,index,cut" for non-standard hyphenation.
 *
 * Returns the first-level dictionary, or NULL when dict_contents is NULL.
 *
 * NOTE(review): word[] and pattern[] are filled from buf[] without their own
 * bound check; this relies on all three having the same size (MAX_CHARS,
 * defined outside this file) -- a line of MAX_CHARS - 1 non-digit bytes
 * looks like it could write pattern[MAX_CHARS]; confirm.
 */
HyphenDict *
hnj_hyphen_load_from_buffer (const char *dict_contents, int dict_length)
{
    HyphenDict *dict[2];
    HashTab *hashtab;
    char buf[MAX_CHARS];
    char word[MAX_CHARS];
    char pattern[MAX_CHARS];
    char * repl;
    signed char replindex;
    signed char replcut;
    int state_num = 0, last_state;
    int i, j, k;
    char ch;
    int found;
    HashEntry *e;
    int nextlevel = 0;

    if (dict_contents == NULL)
        return NULL;

    /* read offset into dict_contents, shared by both levels */
    int dict_ptr = 0;
// loading one or two dictionaries (separated by NEXTLEVEL keyword)
    for (k = 0; k == 0 || (k == 1 && nextlevel); k++) {
        hashtab = hnj_hash_new ();
#ifdef VERBOSE
        global = hashtab;
#endif
        /* state 0 is the empty string; every prefix/suffix search below
           terminates on it */
        hnj_hash_insert (hashtab, "", 0);
        dict[k] = hnj_malloc (sizeof(HyphenDict));
        dict[k]->num_states = 1;
        dict[k]->states = hnj_malloc (sizeof(HyphenState));
        dict[k]->states[0].match = NULL;
        dict[k]->states[0].repl = NULL;
        dict[k]->states[0].fallback_state = -1;
        dict[k]->states[0].num_trans = 0;
        dict[k]->states[0].trans = NULL;
        dict[k]->nextlevel = NULL;
        dict[k]->lhmin = 0;
        dict[k]->rhmin = 0;
        dict[k]->clhmin = 0;
        dict[k]->crhmin = 0;

        /* read in character set info */
        if (k == 0) {
            for (i=0;i<MAX_NAME;i++) dict[k]->cset[i]= 0;
            get_line(dict[k]->cset, sizeof(dict[k]->cset), dict_contents,
                dict_length, &dict_ptr);
            /* cut the name at the first CR or LF */
            for (i=0;i<MAX_NAME;i++)
                if ((dict[k]->cset[i] == '\r') || (dict[k]->cset[i] == '\n'))
                    dict[k]->cset[i] = 0;
            dict[k]->utf8 = (strcmp(dict[k]->cset, "UTF-8") == 0);
        } else {
            /* the second level inherits the character set of the first */
            strcpy(dict[k]->cset, dict[0]->cset);
            dict[k]->utf8 = dict[0]->utf8;
        }

        while (get_line(buf, sizeof(buf), dict_contents, dict_length,
                &dict_ptr) != NULL)
        {
            /* '%' starts a comment line */
            if (buf[0] != '%')
            {
                if (strncmp(buf, "NEXTLEVEL", 9) == 0) {
                    nextlevel = 1;
                    break;
                } else if (strncmp(buf, "LEFTHYPHENMIN", 13) == 0) {
                    dict[k]->lhmin = atoi(buf + 13);
                    continue;
                } else if (strncmp(buf, "RIGHTHYPHENMIN", 14) == 0) {
                    dict[k]->rhmin = atoi(buf + 14);
                    continue;
                } else if (strncmp(buf, "COMPOUNDLEFTHYPHENMIN", 21) == 0) {
                    dict[k]->clhmin = atoi(buf + 21);
                    continue;
                } else if (strncmp(buf, "COMPOUNDRIGHTHYPHENMIN", 22) == 0) {
                    dict[k]->crhmin = atoi(buf + 22);
                    continue;
                }
                j = 0;
                pattern[j] = '0';
                /* optional "/replacement[,index,cut]" tail; buf is cut at
                   the '/' so only the pattern part remains in it */
                repl = strchr(buf, '/');
                replindex = 0;
                replcut = 0;
                if (repl) {
                    char * index = strchr(repl + 1, ',');
                    *repl = '\0';
                    if (index) {
                        char * index2 = strchr(index + 1, ',');
                        *index = '\0';
                        /* with a single comma replindex and replcut keep
                           their 0 defaults */
                        if (index2) {
                            *index2 = '\0';
                            /* index is 1-based in the file, 0-based here */
                            replindex = (signed char) atoi(index + 1) - 1;
                            replcut = (signed char) atoi(index2 + 1);
                        }
                    } else {
                        /* no index/cut given: the replacement covers the
                           whole pattern text */
                        hnj_strchomp(repl + 1);
                        replindex = 0;
                        replcut = strlen(buf);
                    }
                    repl = hnj_strdup(repl + 1);
                }
                /* Split the pattern into its letters (word) and the digit
                   standing before each letter (pattern, '0' when absent).
                   The scan stops at the first byte in the range 0..' '. */
                for (i = 0; ((buf[i] > ' ') || (buf[i] < 0)); i++)
                {
                    if (buf[i] >= '0' && buf[i] <= '9')
                        pattern[j] = buf[i];
                    else
                    {
                        word[j] = buf[i];
                        pattern[++j] = '0';
                    }
                }
                word[j] = '\0';
                pattern[j + 1] = '\0';

                i = 0;
                if (!repl) {
                    /* Optimize away leading zeroes */
                    for (; pattern[i] == '0'; i++);
                } else {
                    if (*word == '.') i++;
                    /* convert UTF-8 char. positions of discretionary hyph. replacements to 8-bit */
                    if (dict[k]->utf8) {
                        int pu = -1;        /* unicode character position */
                        int ps = -1;        /* unicode start position (original replindex) */
                        int pc = (*word == '.') ? 1: 0; /* 8-bit character position */
                        for (; pc < (strlen(word) + 1); pc++) {
                            /* beginning of an UTF-8 character (not '10' start bits) */
                            if ((((unsigned char) word[pc]) >> 6) != 2) pu++;
                            if ((ps < 0) && (replindex == pu)) {
                                ps = replindex;
                                replindex = pc;
                            }
                            if ((ps >= 0) && ((pu - ps) == replcut)) {
                                replcut = (pc - replindex);
                                break;
                            }
                        }
                        if (*word == '.') replindex--;
                    }
                }

#ifdef VERBOSE
                printf ("word %s pattern %s, j = %d  repl: %s\n", word, pattern + i, j, repl);
#endif
                /* found must be taken before hnj_get_state(), which
                   creates the state when it is missing */
                found = hnj_hash_lookup (hashtab, word);
                state_num = hnj_get_state (dict[k], hashtab, word);
                dict[k]->states[state_num].match = hnj_strdup (pattern + i);
                dict[k]->states[state_num].repl = repl;
                dict[k]->states[state_num].replindex = replindex;
                if (!replcut) {
                    dict[k]->states[state_num].replcut = strlen(word);
                } else {
                    dict[k]->states[state_num].replcut = replcut;
                }

                /* now, put in the prefix transitions */
                /* Walk back through ever shorter prefixes of word, linking
                   each to the longer one, until a prefix that already
                   existed is reached ("" always exists). */
                for (; found < 0 ;j--)
                {
                    last_state = state_num;
                    ch = word[j - 1];
                    word[j - 1] = '\0';
                    found = hnj_hash_lookup (hashtab, word);
                    state_num = hnj_get_state (dict[k], hashtab, word);
                    hnj_add_trans (dict[k], state_num, last_state, ch);
                }
            }
        }

        /* Could do unioning of matches here (instead of the preprocessor script).
           If we did, the pseudocode would look something like this:

           foreach state in the hash table
           foreach i = [1..length(state) - 1]
           state to check is substr (state, i)
           look it up
           if found, and if there is a match, union the match in.

           It's also possible to avoid the quadratic blowup by doing the
           search in order of increasing state string sizes - then you
           can break the loop after finding the first match.

           This step should be optional in any case - if there is a
           preprocessed rule table, it's always faster to use that.

        */

        /* put in the fallback states */
        /* For every non-empty state string the fallback is the longest
           proper suffix that is itself a state; the search ends at the
           empty suffix (state 0) at the latest. */
        for (i = 0; i < HASH_SIZE; i++)
            for (e = hashtab->entries[i]; e; e = e->next)
            {
                if (*(e->key)) for (j = 1; 1; j++)
                               {
                                   state_num = hnj_hash_lookup (hashtab, e->key + j);
                                   if (state_num >= 0)
                                       break;
                               }
                /* KBH: FIXME state 0 fallback_state should always be -1? */
                if (e->val)
                    dict[k]->states[e->val].fallback_state = state_num;
            }
#ifdef VERBOSE
        for (i = 0; i < HASH_SIZE; i++)
            for (e = hashtab->entries[i]; e; e = e->next)
            {
                printf ("%d string %s state %d, fallback=%d\n", i, e->key, e->val,
                    dict[k]->states[e->val].fallback_state);
                for (j = 0; j < dict[k]->states[e->val].num_trans; j++)
                    printf (" %c->%d\n", dict[k]->states[e->val].trans[j].ch,
                        dict[k]->states[e->val].trans[j].new_state);
            }
#endif

        /* in VERBOSE builds the table is kept alive for get_state_str() */
#ifndef VERBOSE
        hnj_hash_free (hashtab);
#endif
        state_num = 0;
    }
    /* k == 2 means a second level was parsed: chain it to the first */
    if (k == 2) dict[0]->nextlevel = dict[1];
    return dict[0];
}

/* Release a dictionary and everything it owns: the match string,
 * replacement string and transition array of every state, the chained
 * next-level dictionary (recursively), the state array and the dictionary
 * structure itself.
 *
 * A NULL dict is accepted and ignored, so the result of a failed load can
 * be passed in without a separate check.
 */
void hnj_hyphen_free (HyphenDict *dict)
{
    int state_num;
    HyphenState *hstate;

    if (dict == NULL)
        return;

    for (state_num = 0; state_num < dict->num_states; state_num++)
    {
        hstate = &dict->states[state_num];
        if (hstate->match)
            hnj_free (hstate->match);
        if (hstate->repl)
            hnj_free (hstate->repl);
        if (hstate->trans)
            hnj_free (hstate->trans);
    }
    if (dict->nextlevel) hnj_hyphen_free(dict->nextlevel);

    hnj_free (dict->states);

    hnj_free (dict);
}

#define MAX_WORD 256

/* Compute hyphenation values for word using standard patterns only.
 *
 * dict:      dictionary whose automaton is run (only this level; nextlevel
 *            is not consulted here).
 * word:      the word, word_size bytes long.
 * hyphens:   output buffer.  While the automaton runs it receives
 *            word_size + 2 entries, so it must be at least that large; on
 *            return it holds a NUL-terminated string of word_size digit
 *            characters, one per byte of word.
 *
 * States that carry a replacement (non-standard hyphenation) are skipped.
 * Always returns 0.
 */
int hnj_hyphen_hyphenate (HyphenDict *dict,
    const char *word, int word_size,
    char *hyphens)
{
    char prep_word_buf[MAX_WORD];
    char *prep_word;
    int i, j, k;
    int state;
    char ch;
    HyphenState *hstate;
    char *match;
    int offset;

    /* use the stack buffer when ".word." plus NUL fits, else the heap */
    if (word_size + 3 < MAX_WORD)
        prep_word = prep_word_buf;
    else
        prep_word = hnj_malloc (word_size + 3);

    /* build ".word." -- the dots mark the word boundaries for the patterns */
    j = 0;
    prep_word[j++] = '.';

    for (i = 0; i < word_size; i++)
        prep_word[j++] = word[i];

    prep_word[j++] = '.';
    prep_word[j] = '\0';

    /* j == word_size + 2 from here on */
    for (i = 0; i < j; i++)
        hyphens[i] = '0';

#ifdef VERBOSE
    printf ("prep_word = %s\n", prep_word);
#endif

    /* now, run the finite state machine */
    state = 0;
    for (i = 0; i < j; i++)
    {
        ch = prep_word[i];
        /* follow fallback links until a state with a transition on ch is
           found, or the chain runs out (state == -1) */
        for (;;)
        {

            if (state == -1) {
                /* return 1; */
                /*  KBH: FIXME shouldn't this be as follows? */
                state = 0;
                goto try_next_letter;
            }

#ifdef VERBOSE
            char *state_str;
            state_str = get_state_str (state);

            for (k = 0; k < i - strlen (state_str); k++)
                putchar (' ');
            printf ("%s", state_str);
#endif

            hstate = &dict->states[state];
            for (k = 0; k < hstate->num_trans; k++)
                if (hstate->trans[k].ch == ch)
                {
                    state = hstate->trans[k].new_state;
                    goto found_state;
                }
            state = hstate->fallback_state;
#ifdef VERBOSE
            printf (" falling back, fallback_state %d\n", state);
#endif
        }
      found_state:
#ifdef VERBOSE
        printf ("found state %d\n",state);
#endif
        /* Additional optimization is possible here - especially,
           elimination of trailing zeroes from the match. Leading zeroes
           have already been optimized. */
        match = dict->states[state].match;
        /* replacing rules not handled by hyphen_hyphenate() */
        if (match && !dict->states[state].repl)
        {
            /* the match string is right-aligned on position i */
            offset = i + 1 - strlen (match);
#ifdef VERBOSE
            for (k = 0; k < offset; k++)
                putchar (' ');
            printf ("%s\n", match);
#endif
            /* This is a linear search because I tried a binary search and
               found it to be just a teeny bit slower. */
            /* keep the larger digit at every position */
            for (k = 0; match[k]; k++)
                if (hyphens[offset + k] < match[k])
                    hyphens[offset + k] = match[k];
        }

        /* KBH: we need this to make sure we keep looking in a word */
        /* for patterns even if the current character is not known in state 0 */
        /* since patterns for hyphenation may occur anywhere in the word */
      try_next_letter: ;

    }
#ifdef VERBOSE
    for (i = 0; i < j; i++)
        putchar (hyphens[i]);
    putchar ('\n');
#endif

    /* shift the values one place left (dropping the slot of the leading
       '.'), then force the first and the remaining trailing slots to '0' */
    for (i = 0; i < j - 4; i++)
#if 0
        if (hyphens[i + 1] & 1)
            hyphens[i] = '-';
#else
    hyphens[i] = hyphens[i + 1];
#endif
    hyphens[0] = '0';
    for (; i < word_size; i++)
        hyphens[i] = '0';
    hyphens[word_size] = '\0';

    if (prep_word != prep_word_buf)
        hnj_free (prep_word);

    return 0;
}

/* Number of characters in the first n bytes of word (stopping early at a
   NUL).  With utf8 set, UTF-8 continuation bytes (10xxxxxx) are attached to
   the character they follow; otherwise every byte counts as a character. */
int hnj_hyphen_strnlen(const char * word, int n, int utf8)
{
    int chars = 0;
    int byte = 0;

    while (byte < n && word[byte] != '\0') {
        chars++;
        byte++;
        if (utf8) {
            /* swallow the rest of a multi-byte sequence */
            while ((word[byte] & 0xc0) == 0x80)
                byte++;
        }
    }
    return chars;
}

/* Enforce the left hyphenation minimum: remove every hyphenation point that
 * would leave fewer than lhmin characters before the hyphen.
 *
 * utf8:        non-zero when word is UTF-8; a character then consists of a
 *              lead byte plus its continuation bytes (10xxxxxx), and the
 *              hyphen positions of all of its bytes are cleared together.
 * word:        NUL-terminated word (word_size is not used here).
 * hyphens:     per-byte hyphenation values, modified in place.
 * rep/pos/cut: non-standard hyphenation data; the three outer pointers must
 *              not be NULL, the arrays they point to may be.  A replacement
 *              is dropped (freed and set to NULL) only when the text it
 *              leaves before the '=' is shorter than lhmin characters.
 *
 * Always returns 0.
 */
int hnj_hyphen_lhmin(int utf8, const char *word, int word_size, char * hyphens,
	char *** rep, int ** pos, int ** cut, int lhmin)
{
    int i, j;

    /* each outer iteration accounts for one character of the word */
    for (i = 1, j = 0; i < lhmin && word[j] != '\0'; i++) {
        do {
            // check length of the non-standard part
            if (*rep && *pos && *cut && (*rep)[j]) {
                char * rh = strchr((*rep)[j], '=');
                if (rh && (hnj_hyphen_strnlen(word, j - (*pos)[j] + 1, utf8) +
                        hnj_hyphen_strnlen((*rep)[j], rh - (*rep)[j], utf8)) < lhmin) {
                    free((*rep)[j]);
                    (*rep)[j] = NULL;
                    hyphens[j] = '0';
                }
            } else {
                hyphens[j] = '0';
            }
            j++;
            /* Stay on the same character while the next byte is a UTF-8
               continuation byte.  Testing word[j] (never word[j + 1]) also
               keeps the read inside the string: the terminating NUL is not
               a continuation byte, so the loop stops on it. */
        } while (utf8 && (word[j] & 0xc0) == 0x80);
    }
    return 0;
}

/* Enforce the right hyphenation minimum: remove every hyphenation point
 * that would leave fewer than rhmin characters after the hyphen.
 *
 * The word is walked backwards byte by byte starting at its last byte.  The
 * hyphen value at each visited byte is cleared, and a character is counted
 * when its first byte is reached (in UTF-8 mode: any byte that is not a
 * 10xxxxxx continuation byte), so multi-byte characters of any length count
 * once.  The walk stops after rhmin characters or at index 0.
 *
 * utf8:        non-zero when word is UTF-8.
 * word:        the word, word_size bytes long.
 * hyphens:     per-byte hyphenation values, modified in place.
 * rep/pos/cut: non-standard hyphenation data; the three outer pointers must
 *              not be NULL, the arrays they point to may be.  A replacement
 *              is dropped (freed and set to NULL) only when the text it
 *              leaves after the '=' is shorter than rhmin characters.
 *
 * Always returns 0.
 */
int hnj_hyphen_rhmin(int utf8, const char *word, int word_size, char * hyphens,
	char *** rep, int ** pos, int ** cut, int rhmin)
{
    int i = 0;
    int j;

    for (j = word_size - 1; i < rhmin && j > 0; j--) {
        // check length of the non-standard part
        if (*rep && *pos && *cut && (*rep)[j]) {
            char * rh = strchr((*rep)[j], '=');
            if (rh && (hnj_hyphen_strnlen(word + j - (*pos)[j] + (*cut)[j] + 1, 100, utf8) +
                    hnj_hyphen_strnlen(rh + 1, strlen(rh + 1), utf8)) < rhmin) {
                free((*rep)[j]);
                (*rep)[j] = NULL;
                hyphens[j] = '0';
            }
        } else {
            hyphens[j] = '0';
        }
        /* a character is complete once its first byte has been passed */
        if (!utf8 || (word[j] & 0xc0) != 0x80) i++;
    }
    return 0;
}

// recursive function for compound level hyphenation
//
// Runs the automaton of dict over ".word." and stores one hyphenation value
// per byte of word in hyphens (NUL-terminated at word_size; while the
// automaton runs word_size + 2 entries are written, so the buffer must be
// at least that large).  States carrying a replacement are recorded and
// reported through rep/pos/cut, which are allocated with malloc() on first
// need when all three currently point to NULL.
//
// When dict has a nextlevel, every segment between two odd hyphen values
// is hyphenated by a recursive call; a word without such a boundary is
// handed to dict->nextlevel as a whole, after which clhmin/crhmin are
// applied unless lend/rend is set.
//
// Always returns 0.
//
// NOTE(review): the stack buffers are sized with MAX_WORD or MAX_CHARS but
// selected by comparing against MAX_CHARS only; this looks safe only if
// MAX_CHARS (defined outside this file) is <= MAX_WORD -- confirm.
int hnj_hyphen_hyph_(HyphenDict *dict, const char *word, int word_size,
    char * hyphens, char *** rep, int ** pos, int ** cut,
    int clhmin, int crhmin, int lend, int rend)
{
    char prep_word_buf[MAX_WORD];
    char *prep_word;
    int i, j, k;
    int state;
    char ch;
    HyphenState *hstate;
    char *match;
    char *repl;
    signed char replindex;
    signed char replcut;
    int offset;
    int matchlen_buf[MAX_CHARS];
    int matchindex_buf[MAX_CHARS];
    char * matchrepl_buf[MAX_CHARS];
    int * matchlen;
    int * matchindex;
    char ** matchrepl;
    /* 0 until the first replacement rule matches; afterwards word_size */
    int isrepl = 0;
    int nHyphCount;

    /* stack buffers for short words, heap for long ones */
    if (word_size + 3 < MAX_CHARS) {
        prep_word = prep_word_buf;
        matchlen = matchlen_buf;
        matchindex = matchindex_buf;
        matchrepl = matchrepl_buf;
    } else {
        prep_word = hnj_malloc (word_size + 3);
        matchlen = hnj_malloc ((word_size + 3) * sizeof(int));
        matchindex = hnj_malloc ((word_size + 3) * sizeof(int));
        matchrepl = hnj_malloc ((word_size + 3) * sizeof(char *));
    }

    /* build ".word." -- the dots mark the word boundaries for the patterns */
    j = 0;
    prep_word[j++] = '.';

    for (i = 0; i < word_size; i++)
        prep_word[j++] = word[i];

    prep_word[j++] = '.';
    prep_word[j] = '\0';

    /* j == word_size + 2 from here on */
    for (i = 0; i < j; i++)
        hyphens[i] = '0';

#ifdef VERBOSE
    printf ("prep_word = %s\n", prep_word);
#endif

    /* now, run the finite state machine */
    state = 0;
    for (i = 0; i < j; i++)
    {
        ch = prep_word[i];
        /* follow fallback links until a state with a transition on ch is
           found, or the chain runs out (state == -1) */
        for (;;)
        {

            if (state == -1) {
                /* return 1; */
                /*  KBH: FIXME shouldn't this be as follows? */
                state = 0;
                goto try_next_letter;
            }

#ifdef VERBOSE
            char *state_str;
            state_str = get_state_str (state);

            for (k = 0; k < i - strlen (state_str); k++)
                putchar (' ');
            printf ("%s", state_str);
#endif

            hstate = &dict->states[state];
            for (k = 0; k < hstate->num_trans; k++)
                if (hstate->trans[k].ch == ch)
                {
                    state = hstate->trans[k].new_state;
                    goto found_state;
                }
            state = hstate->fallback_state;
#ifdef VERBOSE
            printf (" falling back, fallback_state %d\n", state);
#endif
        }
      found_state:
#ifdef VERBOSE
        printf ("found state %d\n",state);
#endif
        /* Additional optimization is possible here - especially,
           elimination of trailing zeroes from the match. Leading zeroes
           have already been optimized. */
        match = dict->states[state].match;
        repl = dict->states[state].repl;
        replindex = dict->states[state].replindex;
        replcut = dict->states[state].replcut;
        /* replacing rules not handled by hyphen_hyphenate() */
        if (match)
        {
            /* the match string is right-aligned on position i */
            offset = i + 1 - strlen (match);
#ifdef VERBOSE
            for (k = 0; k < offset; k++)
                putchar (' ');
            printf ("%s (%s)\n", match, repl);
#endif
            if (repl) {
                /* first replacement seen: initialise the bookkeeping
                   arrays (only the first word_size entries) */
                if (!isrepl) for(; isrepl < word_size; isrepl++) {
                        matchrepl[isrepl] = NULL;
                        matchindex[isrepl] = -1;
                    }
                matchlen[offset + replindex] = replcut;
            }
            /* This is a linear search because I tried a binary search and
               found it to be just a teeny bit slower. */
            for (k = 0; match[k]; k++) {
                /* a larger digit wins; for odd digits also remember which
                   replacement (possibly NULL) belongs to the position */
                if ((hyphens[offset + k] < match[k])) {
                    hyphens[offset + k] = match[k];
                    if (match[k]&1) {
                        matchrepl[offset + k] = repl;
                        if (repl && (k >= replindex) && (k <= replindex + replcut)) {
                            matchindex[offset + replindex] = offset + k;
                        }
                    }
                }
            }

        }

        /* KBH: we need this to make sure we keep looking in a word */
        /* for patterns even if the current character is not known in state 0 */
        /* since patterns for hyphenation may occur anywhere in the word */
      try_next_letter: ;

    }
#ifdef VERBOSE
    for (i = 0; i < j; i++)
        putchar (hyphens[i]);
    putchar ('\n');
#endif

    /* shift the values one place left (dropping the slot of the leading
       '.'), pad with '0' and terminate at word_size */
    for (i = 0; i < j - 3; i++)
#if 0
        if (hyphens[i + 1] & 1)
            hyphens[i] = '-';
#else
    hyphens[i] = hyphens[i + 1];
#endif
    for (; i < word_size; i++)
        hyphens[i] = '0';
    hyphens[word_size] = '\0';

    /* now create a new char string showing hyphenation positions */
    /* count the hyphens and allocate space for the new hyphenated string */
    /* (nHyphCount and the j total accumulated below are not read again in
       this function) */
    nHyphCount = 0;
    for (i = 0; i < word_size; i++)
        if (hyphens[i]&1)
            nHyphCount++;
    j = 0;
    for (i = 0; i < word_size; i++) {
        if (isrepl && (matchindex[i] >= 0) && matchrepl[matchindex[i]]) {
            if (rep && pos && cut) {
                /* allocate the output arrays on first use */
                if (!*rep && !*pos && !*cut) {
                    int k;
                    *rep = (char **) malloc(sizeof(char *) * word_size);
                    *pos = (int *) malloc(sizeof(int) * word_size);
                    *cut = (int *) malloc(sizeof(int) * word_size);
                    for (k = 0; k < word_size; k++) {
                        (*rep)[k] = NULL;
                        (*pos)[k] = 0;
                        (*cut)[k] = 0;
                    }
                }
                /* matchindex values refer to the unshifted positions,
                   hence the - 1 */
                (*rep)[matchindex[i] - 1] = hnj_strdup(matchrepl[matchindex[i]]);
                (*pos)[matchindex[i] - 1] = matchindex[i] - i;
                (*cut)[matchindex[i] - 1] = matchlen[i];
            }
            j += strlen(matchrepl[matchindex[i]]);
            i += matchlen[i] - 1;
        }
    }

    if (matchrepl != matchrepl_buf) {
        hnj_free (matchrepl);
        hnj_free (matchlen);
        hnj_free (matchindex);
    }

    // recursive hyphenation of the first (compound) level segments
    if (dict->nextlevel) {
        char * rep2_buf[MAX_WORD];
        int pos2_buf[MAX_WORD];
        int cut2_buf[MAX_WORD];
        char hyphens2_buf[MAX_WORD];
        char ** rep2;
        int * pos2;
        int * cut2;
        char * hyphens2;
        /* start (index into word) of the segment currently collected */
        int begin = 0;
        if (word_size < MAX_CHARS) {
            rep2 = rep2_buf;
            pos2 = pos2_buf;
            cut2 = cut2_buf;
            hyphens2 = hyphens2_buf;
        } else {
            rep2 = hnj_malloc (word_size * sizeof(char *));
            pos2 = hnj_malloc (word_size * sizeof(int));
            cut2 = hnj_malloc (word_size * sizeof(int));
            hyphens2 = hnj_malloc (word_size);
        }
        for (i = 0; i < word_size; i++) rep2[i] = NULL;
        /* a segment ends at every odd hyphen value and, once a boundary
           has been seen, at the end of the word */
        for (i = 0; i < word_size; i++)
            if (hyphens[i]&1 || (begin > 0 && i + 1 == word_size)) {
                if (i - begin > 1) {
                    int hyph = 0;
                    /* temporarily terminate prep_word after the segment;
                       restored after the recursive call */
                    prep_word[i + 2] = '\0';
                    /* non-standard hyphenation at compound boundary (Schiffahrt) */
                    if (*rep && *pos && *cut && (*rep)[i]) {
                        char * l = strchr((*rep)[i], '=');
                        strcpy(prep_word + 2 + i - (*pos)[i], (*rep)[i]);
                        if (l) {
                            hyph = (l - (*rep)[i]) - (*pos)[i];
                            prep_word[2 + i + hyph] = '\0';
                        }
                    }
                    hnj_hyphen_hyph_(dict, prep_word + begin + 1, i - begin + 1 + hyph,
                        hyphens2, &rep2, &pos2, &cut2, clhmin,
                        crhmin, (begin > 0 ? 0 : lend), (hyphens[i]&1 ? 0 : rend));
                    /* copy the segment's result back; replacement strings
                       are moved into *rep, not duplicated */
                    for (j = 0; j < i - begin - 1; j++) {
                        hyphens[begin + j] = hyphens2[j];
                        if (rep2[j] && rep && pos && cut) {
                            if (!*rep && !*pos && !*cut) {
                                int k;
                                *rep = (char **) malloc(sizeof(char *) * word_size);
                                *pos = (int *) malloc(sizeof(int) * word_size);
                                *cut = (int *) malloc(sizeof(int) * word_size);
                                for (k = 0; k < word_size; k++) {
                                    (*rep)[k] = NULL;
                                    (*pos)[k] = 0;
                                    (*cut)[k] = 0;
                                }
                            }
                            (*rep)[begin + j] = rep2[j];
                            (*pos)[begin + j] = pos2[j];
                            (*cut)[begin + j] = cut2[j];
                        }
                    }
                    /* undo the temporary changes to prep_word */
                    prep_word[i + 2] = word[i + 1];
                    if (*rep && *pos && *cut && (*rep)[i]) {
                        strcpy(prep_word + 1, word);
                    }
                }
                begin = i + 1;
                for (j = 0; j < word_size; j++) rep2[j] = NULL;
            }

        // non-compound
        if (begin == 0) {
            hnj_hyphen_hyph_(dict->nextlevel, word, word_size,
                hyphens, rep, pos, cut, clhmin, crhmin, lend, rend);
            if (!lend) hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
                rep, pos, cut, clhmin);
            if (!rend) hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
                rep, pos, cut, crhmin);
        }

        /* NOTE(review): these buffers come from hnj_malloc() but are
           released with free() -- confirm the two allocators are
           interchangeable */
        if (rep2 != rep2_buf) {
            free(rep2);
            free(cut2);
            free(pos2);
            free(hyphens2);
        }
    }

    if (prep_word != prep_word_buf) hnj_free (prep_word);
    return 0;
}

/* Convert byte-indexed hyphenation results to character-indexed ones for a
   UTF-8 word.  hyphens is compacted in place to one entry per character
   and NUL-terminated; when rep/pos/cut data is present its entries are
   moved to the character index, with pos and cut recounted in characters.
   Returns 1 (after printing a message on stderr) if word starts with a
   UTF-8 continuation byte, 0 otherwise. */
int hnj_hyphen_norm(const char *word, int word_size, char * hyphens,
	char *** rep, int ** pos, int ** cut)
{
    int src;        /* byte index into word */
    int dst = -1;   /* character index, i.e. compacted output index */
    int k;

    if ((((unsigned char) word[0]) >> 6) == 2) {
        fprintf(stderr, "error - bad, non UTF-8 input: %s\n", word);
        return 1;
    }

    /* calculate UTF-8 character positions */
    for (src = 0; src < word_size; src++) {
        int back, first, last, chars;

        /* every byte that is not a '10' continuation byte starts a character */
        if ((((unsigned char) word[src]) & 0xc0) != 0x80)
            dst++;
        hyphens[dst] = hyphens[src];

        if (!(rep && pos && cut && *rep && *pos && *cut))
            continue;

        /* pos: characters among the `back` bytes ending at src */
        back = (*pos)[src];
        chars = 0;
        for (k = 0; k < back; k++) {
            if ((((unsigned char) word[src - k]) & 0xc0) != 0x80)
                chars++;
        }
        (*pos)[dst] = chars;

        /* cut: characters among the cut bytes starting where pos began */
        first = src - back + 1;
        last = first + (*cut)[src];
        chars = 0;
        for (k = first; k < last; k++) {
            if ((((unsigned char) word[k]) & 0xc0) != 0x80)
                chars++;
        }
        (*cut)[dst] = chars;

        (*rep)[dst] = (*rep)[src];
        if (dst < src) {
            /* the entry moved: clear its old slot */
            (*rep)[src] = NULL;
            (*pos)[src] = 0;
            (*cut)[src] = 0;
        }
    }
    hyphens[dst + 1] = '\0';
    return 0;
}

/* Write into hyphword the word with every possible hyphenation shown.
   After each byte whose hyphens value is odd, either '=' is inserted or,
   when a replacement exists for that position, the replacement text is
   written over the affected bytes.  rep, pos and cut must not be NULL
   (the arrays they point to may be).  hyphword must be large enough for
   the expanded result. */
void hnj_hyphen_hyphword(const char * word, int l, const char * hyphens,
    char * hyphword, char *** rep, int ** pos, int ** cut)
{
    int in = 0;     /* read index into word */
    int out = 0;    /* write index into hyphword */

    while (in < l) {
        hyphword[out] = word[in];
        if (hyphens[in] & 1) {
            if (*rep && *pos && *cut && (*rep)[in]) {
                const char *text = (*rep)[in];
                int back = (*pos)[in];

                /* the replacement starts back - 1 bytes before this one */
                strcpy(hyphword + out - back + 1, text);
                out += strlen(text) - back;
                in += (*cut)[in] - back;
            } else {
                hyphword[++out] = '=';
            }
        }
        in++;
        out++;
    }
    hyphword[out] = '\0';
}


/* Main API entry point, using the hyphenation minima stored in dict.
   Runs the pattern matcher, applies the left and right minima (2 each when
   the dictionary does not set a positive value), optionally builds the
   hyphenated word in hyphword, and for UTF-8 dictionaries converts the
   results to character positions.  Returns the result of that conversion
   for UTF-8 dictionaries, 0 otherwise. */
int hnj_hyphen_hyphenate2 (HyphenDict *dict,
    const char *word, int word_size, char * hyphens,
    char *hyphword, char *** rep, int ** pos, int ** cut)
{
    int left_min;
    int right_min;

    hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
        dict->clhmin, dict->crhmin, 1, 1);

    left_min = 2;
    if (dict->lhmin > 0)
        left_min = dict->lhmin;
    hnj_hyphen_lhmin(dict->utf8, word, word_size,
        hyphens, rep, pos, cut, left_min);

    right_min = 2;
    if (dict->rhmin > 0)
        right_min = dict->rhmin;
    hnj_hyphen_rhmin(dict->utf8, word, word_size,
        hyphens, rep, pos, cut, right_min);

    if (hyphword != NULL)
        hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);

    if (!dict->utf8)
        return 0;
    return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
}

/* Earlier main API entry point, taking the hyphenation minima as
   arguments.  A non-positive lhmin or rhmin is replaced by the
   dictionary's value, and by 2 if that is not positive either; clhmin and
   crhmin are passed to the matcher unchanged.  Otherwise this performs the
   same steps as hnj_hyphen_hyphenate2(). */
int hnj_hyphen_hyphenate3 (HyphenDict *dict,
	const char *word, int word_size, char * hyphens,
	char *hyphword, char *** rep, int ** pos, int ** cut,
	int lhmin, int rhmin, int clhmin, int crhmin)
{
    /* argument first, then dictionary setting */
    if (lhmin <= 0)
        lhmin = dict->lhmin;
    if (rhmin <= 0)
        rhmin = dict->rhmin;

    hnj_hyphen_hyph_(dict, word, word_size, hyphens, rep, pos, cut,
        clhmin, crhmin, 1, 1);

    /* final default when neither source gave a positive value */
    if (lhmin <= 0)
        lhmin = 2;
    hnj_hyphen_lhmin(dict->utf8, word, word_size, hyphens,
        rep, pos, cut, lhmin);

    if (rhmin <= 0)
        rhmin = 2;
    hnj_hyphen_rhmin(dict->utf8, word, word_size, hyphens,
        rep, pos, cut, rhmin);

    if (hyphword != NULL)
        hnj_hyphen_hyphword(word, word_size, hyphens, hyphword, rep, pos, cut);

    if (!dict->utf8)
        return 0;
    return hnj_hyphen_norm(word, word_size, hyphens, rep, pos, cut);
}