Home | History | Annotate | Download | only in lib
      1 /*
      2  * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 /**
     17  * @file picoklex.c
     18  *
     19  * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
     20  * All rights reserved.
     21  *
     22  * History:
     23  * - 2009-04-20 -- initial version
     24  *
     25  */
     26 #include "picoos.h"
     27 #include "picodbg.h"
     28 #include "picodata.h"
     29 #include "picoknow.h"
     30 #include "picoklex.h"
     31 
     32 #ifdef __cplusplus
     33 extern "C" {
     34 #endif
     35 #if 0
     36 }
     37 #endif
     38 
     39 /* ************************************************************/
     40 /* lexicon */
     41 /* ************************************************************/
     42 
     43 /**
     44  * @addtogroup picolex
     45  *
     46   overview:
     47   - lex consists of optional searchindex and a non-empty list of lexblocks
     48   - lexblocks are fixed size, at the start of a block there is also the
     49     start of an entry
     50   - using the searchindex a unambiguous lexblock can be determined which
     51     contains the entry (or there is no entry)
     52   - one lex entry has POS GRAPH PHON, all mandatory, but
     53     - PHON can be empty string -> no pronunciation in the resulting TTS output
     54     - PHON can be :G2P -> use G2P later to add pronunciation
     55   - (POS,GRAPH) is a uniq key (only one entry allowed)
     56   - (GRAPH) is almost a uniq key (2-4 entries with the same GRAPH, and
     57     differing POS and differing PHON possible)
     58     - for one graph we can have two or three solutions from the lex
     59        which all need to be passed on the the next PU
     60     - in this case GRAPH, POS, and PHON all must be available in lex
     61 
     62   sizing:
     63     - 3 bytes entry index -> 16MB addressable
     64     - 2 bytes searchindex nr -> 64K blocks possible
     65     - 5 bytes per searchindex entry
     66       - 3 bytes for graph-prefix
     67       - 2 bytes blockadr in searchindex -> 64K blocks possible
     68     - lexblock size 512B:
     69       - 32M possible
     70       - with ~20 bytes per entry
     71         -> max. average of ~26 entries to be searched per lookup
     72     - overhead of ~10 bytes per block to sync with
     73       block boundaries
     74     - examples:
     75       - 500KB lex -> 1000 blocks,
     76         1000 entries in searchindex, ~25.6K lex-entries,
     77         - ~5KB searchindex
     78            ~10KB overhead for block sync
     79       - 100KB lex -> 200 blocks,
     80         200 entries in searchindex, ~5.1K lex-entries,
     81         - ~1KB searchindex
     82            ~2KB overhead for block sync
     83 
     84   pil-file: lexicon knowledge base in binary form
     85 
     86     lex-kb = content
     87 
     88     content = searchindex {lexblock}1:NRBLOCKS2
     89 
     90     lexblock = {lexentry}1:        (lexblock size is fixed 512Bytes)
     91 
     92     searchindex = NRBLOCKS2 {GRAPH1 GRAPH1 GRAPH1 LEXBLOCKIND2}=NRBLOCKS2
     93 
     94     lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1
     95                LENPOSPHON1 POS1 {PHON1}=LENPOSPHON1-2
     96 
     97     - special cases:
     98       - PHON is empty string (no pronunciation in the resulting TTS output):
     99         lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1  2 POS1
    100       - PHON can be :G2P -> use G2P later to add pronunciation:
    101         lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1  3 POS1 <reserved-phon-val=5>
    102     - multi-byte values always little endian
    103 */
    104 
    105 
    106 /* ************************************************************/
    107 /* lexicon data defines */
    108 /* may not be changed with current implementation */
    109 /* ************************************************************/
    110 
    111 /* nr bytes of nrblocks info */
    112 #define PICOKLEX_LEX_NRBLOCKS_SIZE 2
    113 
    114 /* search index entry: - nr graphs
    115                        - nr bytes of block index
    116                        - nr bytes per entry, NRGRAPHS*INDSIZE */
    117 #define PICOKLEX_LEX_SIE_NRGRAPHS  3
    118 #define PICOKLEX_LEX_SIE_INDSIZE   2
    119 #define PICOKLEX_LEX_SIE_SIZE      5
    120 
    121 /* nr of bytes per lexblock */
    122 #define PICOKLEX_LEXBLOCK_SIZE   512
    123 
    124 
    125 /* reserved values in klex to indicate :G2P needed for a lexentry */
    126 #define PICOKLEX_NEEDS_G2P   5
    127 
    128 
    129 /* ************************************************************/
    130 /* lexicon type and loading */
    131 /* ************************************************************/
    132 
    133 /** object       : LexKnowledgeBase
    134  *  shortcut     : klex
    135  *  derived from : picoknow_KnowledgeBase
    136  */
    137 
    138 typedef struct klex_subobj *klex_SubObj;
    139 
    140 typedef struct klex_subobj
    141 {
    142     picoos_uint16 nrblocks; /* nr lexblocks = nr eles in searchind */
    143     picoos_uint8 *searchind;
    144     picoos_uint8 *lexblocks;
    145 } klex_subobj_t;
    146 
    147 
    148 static pico_status_t klexInitialize(register picoknow_KnowledgeBase this,
    149                                     picoos_Common common)
    150 {
    151     picoos_uint32 curpos = 0;
    152     klex_subobj_t *klex;
    153 
    154     PICODBG_DEBUG(("start"));
    155 
    156     /* check whether (this->size != 0) done before calling this function */
    157 
    158     if (NULL == this || NULL == this->subObj) {
    159         return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
    160                                        NULL, NULL);
    161     }
    162     klex = (klex_subobj_t *) this->subObj;
    163 
    164     if (PICO_OK == picoos_read_mem_pi_uint16(this->base, &curpos,
    165                                              &(klex->nrblocks))) {
    166         if (klex->nrblocks > 0) {
    167             PICODBG_DEBUG(("nr blocks: %i, curpos: %i", klex->nrblocks,curpos));
    168             klex->searchind = this->base + curpos;
    169         } else {
    170             klex->searchind = NULL;
    171         }
    172         klex->lexblocks = this->base + PICOKLEX_LEX_NRBLOCKS_SIZE +
    173                              (klex->nrblocks * (PICOKLEX_LEX_SIE_SIZE));
    174         return PICO_OK;
    175     } else {
    176         return picoos_emRaiseException(common->em, PICO_EXC_FILE_CORRUPT,
    177                                        NULL, NULL);
    178     }
    179 }
    180 
    181 
    182 static pico_status_t klexSubObjDeallocate(register picoknow_KnowledgeBase this,
    183                                           picoos_MemoryManager mm)
    184 {
    185     if (NULL != this) {
    186         picoos_deallocate(mm, (void *) &this->subObj);
    187     }
    188     return PICO_OK;
    189 }
    190 
    191 
    192 /* we don't offer a specialized constructor for a LexKnowledgeBase but
    193  * instead a "specializer" of an allready existing generic
    194  * picoknow_KnowledgeBase */
    195 
    196 pico_status_t picoklex_specializeLexKnowledgeBase(picoknow_KnowledgeBase this,
    197                                                   picoos_Common common)
    198 {
    199     if (NULL == this) {
    200         return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
    201                                        NULL, NULL);
    202     }
    203     if (this->size > 0) {
    204         this->subDeallocate = klexSubObjDeallocate;
    205         this->subObj = picoos_allocate(common->mm, sizeof(klex_subobj_t));
    206         if (NULL == this->subObj) {
    207             return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
    208                                            NULL, NULL);
    209         }
    210         return klexInitialize(this, common);
    211     } else {
    212         /* some dummy klex */
    213         return PICO_OK;
    214     }
    215 }
    216 
    217 /* for now we don't need to do anything special for the main lex */
    218 /*
    219 pico_status_t picoklex_specializeMainLexKnowledgeBase(
    220         picoknow_KnowledgeBase this,
    221         picoos_Common common)
    222 {
    223     return picoklex_specializeLexKnowledgeBase(this,common);
    224 }
    225 */
    226 
    227 
    228 /* ************************************************************/
    229 /* lexicon getLex */
    230 /* ************************************************************/
    231 
    232 picoklex_Lex picoklex_getLex(picoknow_KnowledgeBase this)
    233 {
    234     if (NULL == this) {
    235         return NULL;
    236     } else {
    237         return (picoklex_Lex) this->subObj;
    238     }
    239 }
    240 
    241 
    242 /* ************************************************************/
    243 /* functions on searchindex */
    244 /* ************************************************************/
    245 
    246 
    247 static picoos_uint32 klex_getSearchIndexVal(const klex_SubObj this,
    248                                             picoos_uint16 index)
    249 {
    250     picoos_uint32 pos, val;
    251     pos = index * PICOKLEX_LEX_SIE_SIZE;
    252     val = this->searchind[pos];
    253     val = (val << 8) + this->searchind[pos + 1];
    254     val = (val << 8) + this->searchind[pos + 2];
    255     return val;
    256 }
    257 
    258 
    259 /* Determine first lexblock containing entries for specified
    260    grapheme. */
    261 
    262 static picoos_uint16 klex_getLexblockNr(const klex_SubObj this,
    263                                         const picoos_uint8 *graphsi) {
    264     /* graphsi is of len PICOKLEX_LEX_SI_NGRAPHS */
    265     picoos_int32 low, mid, high;
    266     picoos_uint32 searchval, indval;
    267 
    268     /* PICOKLEX_LEX_SIE_NRGRAPHS */
    269 
    270     /* convert graph-prefix to number with 'lexicographic' ordering */
    271     searchval = graphsi[0];
    272     searchval = (searchval << 8) + graphsi[1];
    273     searchval = (searchval << 8) + graphsi[2];
    274 
    275     low = 0;
    276     high = this->nrblocks;
    277 
    278     /* do binary search */
    279     while (low < high) {
    280         mid = (low + high) / 2;
    281         indval = klex_getSearchIndexVal(this, mid);
    282         if (indval < searchval) {
    283             low = mid + 1;
    284         } else {
    285             high = mid;
    286         }
    287     }
    288     PICODBG_ASSERT(high == low);
    289     /* low points to the first entry greater than or equal to searchval */
    290 
    291     if (low < this->nrblocks) {
    292         indval = klex_getSearchIndexVal(this, low);
    293         if (indval > searchval) {
    294             low--;
    295             /* if there are identical elements in the search index we have
    296                to move to the first one */
    297             if (low > 0) {
    298                 indval = klex_getSearchIndexVal(this, low);
    299                 while (indval == klex_getSearchIndexVal(this, low-1)) {
    300                     low--;
    301                 }
    302             }
    303         }
    304     } else {
    305         low = this->nrblocks - 1;
    306     }
    307 
    308 #if defined(PICO_DEBUG)
    309     {
    310         picoos_uint32 pos = low * PICOKLEX_LEX_SIE_SIZE;
    311         PICODBG_DEBUG(("binary search result is %c%c%c (%d)",
    312                        this->searchind[pos], this->searchind[pos + 1],
    313                        this->searchind[pos + 2], low));
    314     }
    315 #endif
    316 
    317     return (picoos_uint16) low;
    318 }
    319 
    320 
    321 /* Determine number of adjacent lexblocks containing entries for
    322    the same grapheme search prefix (identified by search index). */
    323 
    324 static picoos_uint16 klex_getLexblockRange(const klex_SubObj this,
    325                                            picoos_uint16 index)
    326 {
    327     picoos_uint16 count;
    328     picoos_uint32 sval1, sval2;
    329 
    330     sval1 = klex_getSearchIndexVal(this, index);
    331 
    332 #if defined(PICO_DEBUG)
    333     /* 'index' must point to first lexblock of its kind */
    334     if (index > 0) {
    335         sval2 = klex_getSearchIndexVal(this, index - 1);
    336         PICODBG_ASSERT(sval1 != sval2);
    337     }
    338 #endif
    339 
    340     index++;
    341     sval2 = klex_getSearchIndexVal(this, index);
    342 
    343     count = 1;
    344     while (sval1 == sval2) {
    345         count++;
    346         index++;
    347         sval2 = klex_getSearchIndexVal(this, index);
    348     }
    349 
    350     return count;
    351 }
    352 
    353 
    354 /* ************************************************************/
    355 /* functions on single lexblock */
    356 /* ************************************************************/
    357 
    358 static picoos_int8 klex_lexMatch(picoos_uint8 *lexentry,
    359                                  const picoos_uint8 *graph,
    360                                  const picoos_uint16 graphlen) {
    361     picoos_uint8 i;
    362     picoos_uint8 lexlen;
    363     picoos_uint8 *lexgraph;
    364 
    365     lexlen = lexentry[0] - 1;
    366     lexgraph = &(lexentry[1]);
    367     for (i=0; (i<graphlen) && (i<lexlen); i++) {
    368         PICODBG_TRACE(("%d|%d  graph|lex: %c|%c", graphlen, lexlen,
    369                        graph[i], lexgraph[i]));
    370         if (lexgraph[i] < graph[i]) {
    371             return -1;
    372         } else if (lexgraph[i] > graph[i]) {
    373             return 1;
    374         }
    375     }
    376     if (graphlen == lexlen) {
    377         return 0;
    378     } else if (lexlen < graphlen) {
    379         return -1;
    380     } else {
    381         return 1;
    382     }
    383 }
    384 
    385 
    386 static void klex_setLexResult(const picoos_uint8 *lexentry,
    387                               const picoos_uint32 lexpos,
    388                               picoklex_lexl_result_t *lexres) {
    389     picoos_uint8 i;
    390 
    391     /* check if :G2P */
    392     if ((2 < (lexentry[lexentry[0]])) && ((lexentry[lexentry[0] + 2]) == PICOKLEX_NEEDS_G2P)) {
    393         /* set pos */
    394         lexres->posind[0] = lexentry[lexentry[0] + 1];
    395         /* set rest */
    396         lexres->phonfound = FALSE;
    397         lexres->posindlen = 1;
    398         lexres->nrres = 1;
    399         PICODBG_DEBUG(("result %d :G2P", lexres->nrres));
    400     } else {
    401         i = lexres->nrres * (PICOKLEX_POSIND_SIZE);
    402         lexres->posindlen += PICOKLEX_POSIND_SIZE;
    403         lexres->phonfound = TRUE;
    404         /* set pos */
    405         lexres->posind[i++] = lexentry[lexentry[0] + 1];
    406         /* set ind, PICOKLEX_IND_SIZE */
    407         lexres->posind[i++] = 0x000000ff & (lexpos);
    408         lexres->posind[i++] = 0x000000ff & (lexpos >>  8);
    409         lexres->posind[i]   = 0x000000ff & (lexpos >> 16);
    410         lexres->nrres++;
    411         PICODBG_DEBUG(("result %d", lexres->nrres));
    412     }
    413 }
    414 
    415 
    416 static void klex_lexblockLookup(klex_SubObj this,
    417                                 const picoos_uint32 lexposStart,
    418                                 const picoos_uint32 lexposEnd,
    419                                 const picoos_uint8 *graph,
    420                                 const picoos_uint16 graphlen,
    421                                 picoklex_lexl_result_t *lexres) {
    422     picoos_uint32 lexpos;
    423     picoos_int8 rv;
    424 
    425     lexres->nrres = 0;
    426 
    427     lexpos = lexposStart;
    428     rv = -1;
    429     while ((rv < 0) && (lexpos < lexposEnd)) {
    430 
    431         rv = klex_lexMatch(&(this->lexblocks[lexpos]), graph, graphlen);
    432 
    433         if (rv == 0) { /* found */
    434             klex_setLexResult(&(this->lexblocks[lexpos]), lexpos, lexres);
    435             if (lexres->phonfound) {
    436                 /* look for more results, up to MAX_NRRES, don't even
    437                    check if more results would be available */
    438                 while ((lexres->nrres < PICOKLEX_MAX_NRRES) &&
    439                        (lexpos < lexposEnd)) {
    440                     lexpos += this->lexblocks[lexpos];
    441                     lexpos += this->lexblocks[lexpos];
    442                     /* if there are no more entries in this block, advance
    443                        to next block by skipping all zeros */
    444                     while ((this->lexblocks[lexpos] == 0) &&
    445                            (lexpos < lexposEnd)) {
    446                         lexpos++;
    447                     }
    448                     if (lexpos < lexposEnd) {
    449                         if (klex_lexMatch(&(this->lexblocks[lexpos]), graph,
    450                                           graphlen) == 0) {
    451                             klex_setLexResult(&(this->lexblocks[lexpos]),
    452                                               lexpos, lexres);
    453                         } else {
    454                             /* no more results, quit loop */
    455                             lexpos = lexposEnd;
    456                         }
    457                     }
    458                 }
    459             } else {
    460                 /* :G2P mark */
    461             }
    462         } else if (rv < 0) {
    463             /* not found, goto next entry */
    464             lexpos += this->lexblocks[lexpos];
    465             lexpos += this->lexblocks[lexpos];
    466             /* if there are no more entries in this block, advance
    467                to next block by skipping all zeros */
    468             while ((this->lexblocks[lexpos] == 0) && (lexpos < lexposEnd)) {
    469                 lexpos++;
    470             }
    471         } else {
    472             /* rv > 0, not found, won't show up later in block */
    473         }
    474     }
    475 }
    476 
    477 
    478 /* ************************************************************/
    479 /* lexicon lookup functions */
    480 /* ************************************************************/
    481 
    482 picoos_uint8 picoklex_lexLookup(const picoklex_Lex this,
    483                                 const picoos_uint8 *graph,
    484                                 const picoos_uint16 graphlen,
    485                                 picoklex_lexl_result_t *lexres) {
    486     picoos_uint16 lbnr, lbc;
    487     picoos_uint32 lexposStart, lexposEnd;
    488     picoos_uint8 i;
    489     picoos_uint8 tgraph[PICOKLEX_LEX_SIE_NRGRAPHS];
    490     klex_SubObj klex = (klex_SubObj) this;
    491 
    492     if (NULL == klex) {
    493         PICODBG_ERROR(("no lexicon loaded"));
    494         /* no exception here needed, already checked at initialization */
    495         return FALSE;
    496     }
    497 
    498     lexres->nrres = 0;
    499     lexres->posindlen = 0;
    500     lexres->phonfound = FALSE;
    501 
    502     for (i = 0; i<PICOKLEX_LEX_SIE_NRGRAPHS; i++) {
    503         if (i < graphlen) {
    504             tgraph[i] = graph[i];
    505         } else {
    506             tgraph[i] = '\0';
    507         }
    508     }
    509     PICODBG_DEBUG(("tgraph: %c%c%c", tgraph[0],tgraph[1],tgraph[2]));
    510 
    511     if ((klex->nrblocks) == 0) {
    512         /* no searchindex, no lexblock */
    513         PICODBG_WARN(("no searchindex, no lexblock"));
    514         return FALSE;
    515     } else {
    516         lbnr = klex_getLexblockNr(klex, tgraph);
    517         PICODBG_ASSERT(lbnr < klex->nrblocks);
    518         lbc = klex_getLexblockRange(klex, lbnr);
    519         PICODBG_ASSERT((lbc >= 1) && (lbc <= klex->nrblocks));
    520     }
    521     PICODBG_DEBUG(("lexblock nr: %d (#%d)", lbnr, lbc));
    522 
    523     lexposStart = lbnr * PICOKLEX_LEXBLOCK_SIZE;
    524     lexposEnd = lexposStart + lbc * PICOKLEX_LEXBLOCK_SIZE;
    525 
    526     PICODBG_DEBUG(("lookup start, lexpos range %d..%d", lexposStart,lexposEnd));
    527     klex_lexblockLookup(klex, lexposStart, lexposEnd, graph, graphlen, lexres);
    528     PICODBG_DEBUG(("lookup done, %d found", lexres->nrres));
    529 
    530     return (lexres->nrres > 0);
    531 }
    532 
    533 
    534 picoos_uint8 picoklex_lexIndLookup(const picoklex_Lex this,
    535                                    const picoos_uint8 *ind,
    536                                    const picoos_uint8 indlen,
    537                                    picoos_uint8 *pos,
    538                                    picoos_uint8 **phon,
    539                                    picoos_uint8 *phonlen) {
    540     picoos_uint32 pentry;
    541     klex_SubObj klex = (klex_SubObj) this;
    542 
    543     /* check indlen */
    544     if (indlen != PICOKLEX_IND_SIZE) {
    545         return FALSE;
    546     }
    547 
    548     /* PICOKLEX_IND_SIZE */
    549     pentry = 0x000000ff & (ind[0]);
    550     pentry |= ((picoos_uint32)(ind[1]) <<  8);
    551     pentry |= ((picoos_uint32)(ind[2]) << 16);
    552 
    553     /* check ind if it is within lexblocks byte stream, if not, return FALSE */
    554     if (pentry >= ((picoos_uint32)klex->nrblocks * PICOKLEX_LEXBLOCK_SIZE)) {
    555         return FALSE;
    556     }
    557 
    558     pentry += (klex->lexblocks[pentry]);
    559     *phonlen = (klex->lexblocks[pentry++]) - 2;
    560     *pos = klex->lexblocks[pentry++];
    561     *phon = &(klex->lexblocks[pentry]);
    562 
    563     PICODBG_DEBUG(("pentry: %d, phonlen: %d", pentry, *phonlen));
    564     return TRUE;
    565 }
    566 
    567 #ifdef __cplusplus
    568 }
    569 #endif
    570 
    571 
    572 /* end */
    573