Home | History | Annotate | Download | only in lib
      1 /*
      2  * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 /**
     17  * @file picoktab.c
     18  *
     19  * symbol tables needed at runtime
     20  *
     21  * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
     22  * All rights reserved.
     23  *
     24  * History:
     25  * - 2009-04-20 -- initial version
     26  *
     27  */
     28 
     29 #include "picoos.h"
     30 #include "picodbg.h"
     31 #include "picoknow.h"
     32 #include "picobase.h"
     33 #include "picoktab.h"
     34 #include "picodata.h"
     35 
     36 #ifdef __cplusplus
     37 extern "C" {
     38 #endif
     39 #if 0
     40 }
     41 #endif
     42 
     43 
      44 /** @todo : the following would better be part of a knowledge base.
      45  * Make sure it stays consistent with the phoneme symbol table used in the lingware */
     46 
     47 /* PLANE_PHONEMES */
     48 
     49 /* PLANE_POS */
     50 
     51 /* PLANE_PB_STRENGTHS */
     52 
     53 /* PLANE_ACCENTS */
     54 
     55 /* PLANE_INTERN */
     56 #define PICOKTAB_TMPID_PHONSTART      '\x26'  /* 38  '&' */
     57 #define PICOKTAB_TMPID_PHONTERM       '\x23'  /* 35  '#' */
     58 
     59 
     60 /* ************************************************************/
     61 /* fixed ids */
     62 /* ************************************************************/
     63 
     64 
     65 static pico_status_t ktabIdsInitialize(register picoknow_KnowledgeBase this,
     66                                        picoos_Common common)
     67 {
     68     picoktab_FixedIds ids;
     69 
     70     PICODBG_DEBUG(("start"));
     71 
     72     if (NULL == this || NULL == this->subObj) {
     73         return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
     74                                        NULL, NULL);
     75     }
     76     ids = (picoktab_FixedIds) this->subObj;
     77 
     78     ids->phonStartId = PICOKTAB_TMPID_PHONSTART;
     79     ids->phonTermId = PICOKTAB_TMPID_PHONTERM;
     80     return PICO_OK;
     81 }
     82 
     83 
     84 static pico_status_t ktabIdsSubObjDeallocate(register picoknow_KnowledgeBase this,
     85                                              picoos_MemoryManager mm)
     86 {
     87     if (NULL != this) {
     88         picoos_deallocate(mm, (void *) &this->subObj);
     89     }
     90     return PICO_OK;
     91 }
     92 
     93 pico_status_t picoktab_specializeIdsKnowledgeBase(picoknow_KnowledgeBase this,
     94                                                   picoos_Common common)
     95 {
     96     if (NULL == this) {
     97         return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
     98                                        NULL, NULL);
     99     }
    100     this->subDeallocate = ktabIdsSubObjDeallocate;
    101     this->subObj = picoos_allocate(common->mm, sizeof(picoktab_fixed_ids_t));
    102     if (NULL == this->subObj) {
    103         return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
    104                                        NULL, NULL);
    105     }
    106     return ktabIdsInitialize(this, common);
    107 }
    108 
    109 picoktab_FixedIds picoktab_getFixedIds(picoknow_KnowledgeBase this)
    110 {
    111     return ((NULL == this) ? NULL : ((picoktab_FixedIds) this->subObj));
    112 }
    113 
    114 
    115 picoktab_FixedIds picoktab_newFixedIds(picoos_MemoryManager mm)
    116 {
    117     picoktab_FixedIds this = (picoktab_FixedIds) picoos_allocate(mm,sizeof(*this));
    118     if (NULL != this) {
    119         /* initialize */
    120     }
    121     return this;
    122 }
    123 
    124 
    125 void picoktab_disposeFixedIds(picoos_MemoryManager mm, picoktab_FixedIds * this)
    126 {
    127     if (NULL != (*this)) {
    128         /* terminate */
    129         picoos_deallocate(mm,(void *)this);
    130     }
    131 }
    132 
    133 
    134 
    135 /* ************************************************************/
    136 /* Graphs */
    137 /* ************************************************************/
    138 
    139 /* overview binary file format for graphs kb:
    140 
    141     graphs-kb = NROFSENTRIES SIZEOFSENTRY ofstable graphs
    142 
    143     NROFSENTRIES  : 2 bytes, number of entries in offset table
    144     SIZEOFSENTRY  : 1 byte,  size of one entry in offset table
    145 
    146     ofstable = {OFFSET}=NROFSENTRIES (contains NROFSENTRIES entries of OFFSET)
    147 
    148     OFFSET: SIZEOFSENTRY bytes, offset to baseaddress of graphs-kb to entry in graphs
    149 
    150     graphs = {graph}=NROFSENTRIES (contains NROFSENTRIES entries of graph)
    151 
     152     graph = PROPSET FROM [TO] [TOKENTYPE] [TOKENSUBTYPE] [VALUE] [LOWERCASE] [GRAPHSUBS1] [GRAPHSUBS2] [PUNC]
    153 
    154     FROM          : 1..4 unsigned bytes, UTF8 character without terminating 0
    155     TO            : 1..4 unsigned bytes, UTF8 character without terminating 0
    156     PROPSET       : 1 unsigned byte, least significant bit : has TO field
    157                                                              next bit : has TOKENTYPE
    158                                                              next bit : has TOKENSUBTYPE
    159                                                              next bit : has VALUE
    160                                                              next bit : has LOWERCASE
    161                                                              next bit : has GRAPHSUBS1
    162                                                              next bit : has GRAPHSUBS2
    163                                                              next bit : has PUNC
    164 
    165     TOKENTYPE    : 1 unsigned byte
    166     TOKENSUBTYPE : 1 unsigned byte
    167     VALUE        : 1 unsigned byte
    168     LOWERCASE    : 1..4 unsigned bytes, UTF8 character without terminating 0
    169     GRAPHSUBS1   : 1..4 unsigned bytes, UTF8 character without terminating 0
    170     GRAPHSUBS2   : 1..4 unsigned bytes, UTF8 character without terminating 0
    171     PUNC         : 1 unsigned byte
    172 */
    173 
    174 static picoos_uint32 ktab_propOffset (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 prop);
    175 
    176 #define KTAB_START_GRAPHS_NR_OFFSET     0
    177 #define KTAB_START_GRAPHS_SIZE_OFFSET   2
    178 #define KTAB_START_GRAPHS_OFFSET_TABLE  3
    179 #define KTAB_START_GRAPHS_GRAPH_TABLE   0
    180 
    181 /* bitmasks to extract the grapheme properties info from the property set */
    182 #define KTAB_GRAPH_PROPSET_TO            ((picoos_uint8)'\x01')
    183 #define KTAB_GRAPH_PROPSET_TOKENTYPE     ((picoos_uint8)'\x02')
    184 #define KTAB_GRAPH_PROPSET_TOKENSUBTYPE  ((picoos_uint8)'\x04')
    185 #define KTAB_GRAPH_PROPSET_VALUE         ((picoos_uint8)'\x08')
    186 #define KTAB_GRAPH_PROPSET_LOWERCASE     ((picoos_uint8)'\x010')
    187 #define KTAB_GRAPH_PROPSET_GRAPHSUBS1    ((picoos_uint8)'\x020')
    188 #define KTAB_GRAPH_PROPSET_GRAPHSUBS2    ((picoos_uint8)'\x040')
    189 #define KTAB_GRAPH_PROPSET_PUNCT         ((picoos_uint8)'\x080')
    190 
    191 
    192 typedef struct ktabgraphs_subobj *ktabgraphs_SubObj;
    193 
    194 typedef struct ktabgraphs_subobj {
    195     picoos_uint16 nrOffset;
    196     picoos_uint16 sizeOffset;
    197 
    198     picoos_uint8 * offsetTable;
    199     picoos_uint8 * graphTable;
    200 } ktabgraphs_subobj_t;
    201 
    202 
    203 
    204 static pico_status_t ktabGraphsInitialize(register picoknow_KnowledgeBase this,
    205                                           picoos_Common common) {
    206     ktabgraphs_subobj_t * ktabgraphs;
    207 
    208     PICODBG_DEBUG(("start"));
    209 
    210     if (NULL == this || NULL == this->subObj) {
    211         return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
    212                                        NULL, NULL);
    213     }
    214     ktabgraphs = (ktabgraphs_subobj_t *) this->subObj;
    215     ktabgraphs->nrOffset = ((int)(this->base[KTAB_START_GRAPHS_NR_OFFSET])) + 256*(int)(this->base[KTAB_START_GRAPHS_NR_OFFSET+1]);
    216     ktabgraphs->sizeOffset  = (int)(this->base[KTAB_START_GRAPHS_SIZE_OFFSET]);
    217     ktabgraphs->offsetTable = &(this->base[KTAB_START_GRAPHS_OFFSET_TABLE]);
    218     ktabgraphs->graphTable  = &(this->base[KTAB_START_GRAPHS_GRAPH_TABLE]);
    219     return PICO_OK;
    220 }
    221 
    222 static pico_status_t ktabGraphsSubObjDeallocate(register picoknow_KnowledgeBase this,
    223                                                 picoos_MemoryManager mm) {
    224     if (NULL != this) {
    225         picoos_deallocate(mm, (void *) &this->subObj);
    226     }
    227     return PICO_OK;
    228 }
    229 
    230 
    231 pico_status_t picoktab_specializeGraphsKnowledgeBase(picoknow_KnowledgeBase this,
    232                                                      picoos_Common common) {
    233     if (NULL == this) {
    234         return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
    235                                        NULL, NULL);
    236     }
    237     this->subDeallocate = ktabGraphsSubObjDeallocate;
    238     this->subObj = picoos_allocate(common->mm, sizeof(ktabgraphs_subobj_t));
    239     if (NULL == this->subObj) {
    240         return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
    241                                        NULL, NULL);
    242     }
    243     return ktabGraphsInitialize(this, common);
    244 }
    245 
    246 
    247 picoktab_Graphs picoktab_getGraphs(picoknow_KnowledgeBase this) {
    248     if (NULL == this) {
    249         return NULL;
    250     } else {
    251         return (picoktab_Graphs) this->subObj;
    252     }
    253 }
    254 
    255 
    256 /* Graphs methods */
    257 
    258 picoos_uint8 picoktab_hasVowellikeProp(const picoktab_Graphs this,
    259                                        const picoos_uint8 *graph,
    260                                        const picoos_uint8 graphlenmax) {
    261 
    262   picoos_uint8 ui8App;
    263   picoos_uint32 graphsOffset;
    264   ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
    265 
    266   ui8App = graphlenmax;        /* avoid warning "var not used in this function"*/
    267 
    268   graphsOffset = picoktab_graphOffset (this, (picoos_uchar *)graph);
    269   return g->graphTable[graphsOffset + ktab_propOffset (this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENTYPE)] == PICODATA_ITEMINFO1_TOKTYPE_LETTERV;
    270 }
    271 
    272 
    273 static void ktab_getStrProp (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 propOffset, picoos_uchar * str)
    274 {
    275   ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
    276   picoos_uint32 i, l;
    277 
    278   i = 0;
    279   l = picobase_det_utf8_length(g->graphTable[graphsOffset+propOffset]);
    280   while (i<l) {
    281     str[i] = g->graphTable[graphsOffset+propOffset+i];
    282     i++;
    283   }
    284   str[l] = 0;
    285 }
    286 
    287 
    288 static picoos_uint32 ktab_propOffset(const picoktab_Graphs this,
    289         picoos_uint32 graphsOffset, picoos_uint32 prop)
    290 /* Returns offset of property 'prop' inside the graph with offset 'graphsOffset' in graphs table;
    291  If the property is found, a value > 0 is returned otherwise 0 */
    292 {
    293     picoos_uint32 n = 0;
    294     ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this;
    295 
    296     if ((g->graphTable[graphsOffset] & prop) == prop) {
    297         n = n + 1; /* overread PROPSET field */
    298         n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread FROM field */
    299         if (prop > KTAB_GRAPH_PROPSET_TO) {
    300             if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TO)
    301                     == KTAB_GRAPH_PROPSET_TO) {
    302                 n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread TO field */
    303             }
    304         } else {
    305             return n;
    306         }
    307         if (prop > KTAB_GRAPH_PROPSET_TOKENTYPE) {
    308             if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TOKENTYPE)
    309                     == KTAB_GRAPH_PROPSET_TOKENTYPE) {
    310                 n = n + 1; /* overread TOKENTYPE field */
    311             }
    312         } else {
    313             return n;
    314         }
    315         if (prop > KTAB_GRAPH_PROPSET_TOKENSUBTYPE) {
    316             if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TOKENSUBTYPE)
    317                     == KTAB_GRAPH_PROPSET_TOKENSUBTYPE) {
    318                 n = n + 1; /* overread stokentype field */
    319             }
    320         } else {
    321             return n;
    322         }
    323         if (prop > KTAB_GRAPH_PROPSET_VALUE) {
    324             if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_VALUE)
    325                     == KTAB_GRAPH_PROPSET_VALUE) {
    326                 n = n + 1; /* overread value field */
    327             }
    328         } else {
    329             return n;
    330         }
    331         if (prop > KTAB_GRAPH_PROPSET_LOWERCASE) {
    332             if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_LOWERCASE)
    333                     == KTAB_GRAPH_PROPSET_LOWERCASE) {
    334                 n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread lowercase field */
    335             }
    336         } else {
    337             return n;
    338         }
    339         if (prop > KTAB_GRAPH_PROPSET_GRAPHSUBS1) {
    340             if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_GRAPHSUBS1)
    341                     == KTAB_GRAPH_PROPSET_GRAPHSUBS1) {
    342                 n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread graphsubs1 field */
    343             }
    344         } else {
    345             return n;
    346         }
    347         if (prop > KTAB_GRAPH_PROPSET_GRAPHSUBS2) {
    348             if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_GRAPHSUBS2)
    349                     == KTAB_GRAPH_PROPSET_GRAPHSUBS2) {
    350                 n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread graphsubs2 field */
    351             }
    352         } else {
    353             return n;
    354         }
    355         if (prop > KTAB_GRAPH_PROPSET_PUNCT) {
    356             if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_PUNCT)
    357                     == KTAB_GRAPH_PROPSET_PUNCT) {
    358                 n = n + 1; /* overread value field */
    359             }
    360         } else {
    361             return n;
    362         }
    363     }
    364 
    365     return n;
    366 }
    367 
    368 
    369 picoos_uint32 picoktab_graphOffset (const picoktab_Graphs this, picoos_uchar * utf8graph)
    370 {  ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
    371    picoos_int32 a, b, m;
    372    picoos_uint32 graphsOffset;
    373    picoos_uint32 propOffset;
    374    picobase_utf8char from;
    375    picobase_utf8char to;
    376    picoos_bool utfGEfrom;
    377    picoos_bool utfLEto;
    378 
    379    if (g->nrOffset > 0) {
    380      a = 0;
    381      b = g->nrOffset-1;
    382      do  {
    383        m = (a+b) / 2;
    384 
    385        /* get offset to graph[m] */
    386        if (g->sizeOffset == 1) {
    387          graphsOffset = g->offsetTable[g->sizeOffset*m];
    388        }
    389        else {
    390          graphsOffset =     g->offsetTable[g->sizeOffset*m    ] +
    391                         256*g->offsetTable[g->sizeOffset*m + 1];
    392          /* PICODBG_DEBUG(("picoktab_graphOffset: %i %i %i %i", m, g->offsetTable[g->sizeOffset*m], g->offsetTable[g->sizeOffset*m + 1], graphsOffset));
    393          */
    394        }
    395 
    396        /* get FROM and TO field of graph[m] */
    397        ktab_getStrProp(this, graphsOffset, 1, from);
    398        propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TO);
    399        if (propOffset > 0) {
    400          ktab_getStrProp(this, graphsOffset, propOffset, to);
    401        }
    402        else {
    403          picoos_strcpy((picoos_char *)to, (picoos_char *)from);
    404        }
    405 
    406        /* PICODBG_DEBUG(("picoktab_graphOffset: %i %i %i '%s' '%s' '%s'", a, m, b, from, utf8graph, to));
    407        */
    408        utfGEfrom = picoos_strcmp((picoos_char *)utf8graph, (picoos_char *)from) >= 0;
    409        utfLEto = picoos_strcmp((picoos_char *)utf8graph, (picoos_char *)to) <= 0;
    410 
    411        if (utfGEfrom && utfLEto) {
    412          /* PICODBG_DEBUG(("picoktab_graphOffset: utf char '%s' found", utf8graph));
    413           */
    414          return graphsOffset;
    415        }
    416        if (!utfGEfrom) {
    417          b = m-1;
    418        }
    419        else if (!utfLEto) {
    420          a = m+1;
    421        }
    422      } while (a<=b);
    423    }
    424    PICODBG_DEBUG(("picoktab_graphOffset: utf char '%s' not found", utf8graph));
    425    return 0;
    426 }
    427 
    428 
    429 
    430 
    431 picoos_bool  picoktab_getIntPropTokenType (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint8 * stokenType)
    432 {
    433   picoos_uint32 propOffset;
    434   ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
    435 
    436   propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENTYPE);
    437   if (propOffset > 0) {
    438     *stokenType = (picoos_uint8)(g->graphTable[graphsOffset+propOffset]);
    439     return TRUE;
    440   }
    441   else {
    442     return FALSE;
    443   }
    444 }
    445 
    446 
    447 picoos_bool  picoktab_getIntPropTokenSubType (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_int8 * stokenSubType)
    448 {
    449   picoos_uint32 propOffset;
    450   ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
    451 
    452   propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENSUBTYPE);
    453   if (propOffset > 0) {
    454     *stokenSubType = (picoos_int8)(g->graphTable[graphsOffset+propOffset]);
    455     return TRUE;
    456   }
    457   else {
    458     return FALSE;
    459   }
    460 }
    461 
    462 picoos_bool  picoktab_getIntPropValue (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 * value)
    463 {
    464   picoos_uint32 propOffset;
    465   ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
    466 
    467   propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_VALUE);
    468   if (propOffset > 0) {
    469     *value = (picoos_uint32)(g->graphTable[graphsOffset+propOffset]);
    470     return TRUE;
    471   }
    472   else {
    473     return FALSE;
    474   }
    475 }
    476 
    477 
    478 picoos_bool  picoktab_getIntPropPunct (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint8 * info1, picoos_uint8 * info2)
    479 {
    480   picoos_uint32 propOffset;
    481   ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this;
    482 
    483   propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_PUNCT);
    484   if (propOffset > 0) {
    485       if (g->graphTable[graphsOffset+propOffset] == 2) {
    486           *info1 = PICODATA_ITEMINFO1_PUNC_SENTEND;
    487       }
    488       else {
    489           *info1 = PICODATA_ITEMINFO1_PUNC_PHRASEEND;
    490       }
    491     if (g->graphTable[graphsOffset+1] == '.') {
    492         *info2 = PICODATA_ITEMINFO2_PUNC_SENT_T;
    493     }
    494     else if (g->graphTable[graphsOffset+1] == '?') {
    495         *info2 = PICODATA_ITEMINFO2_PUNC_SENT_Q;
    496     }
    497     else if (g->graphTable[graphsOffset+1] == '!') {
    498         *info2 = PICODATA_ITEMINFO2_PUNC_SENT_E;
    499     }
    500     else {
    501         *info2 = PICODATA_ITEMINFO2_PUNC_PHRASE;
    502     }
    503     return TRUE;
    504   }
    505   else {
    506     return FALSE;
    507   }
    508 }
    509 
    510 
    511 picoos_bool  picoktab_getStrPropLowercase (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * lowercase)
    512 {
    513   picoos_uint32 propOffset;
    514 
    515   propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_LOWERCASE);
    516   if (propOffset > 0) {
    517     ktab_getStrProp(this, graphsOffset, propOffset, lowercase);
    518     return TRUE;
    519   }
    520   else {
    521     return FALSE;
    522   }
    523 }
    524 
    525 
    526 picoos_bool  picoktab_getStrPropGraphsubs1 (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * graphsubs1)
    527 {
    528   picoos_uint32 propOffset;
    529 
    530   propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_GRAPHSUBS1);
    531   if (propOffset > 0) {
    532     ktab_getStrProp(this, graphsOffset, propOffset, graphsubs1);
    533     return TRUE;
    534   }
    535   else {
    536     return FALSE;
    537   }
    538 }
    539 
    540 
    541 picoos_bool  picoktab_getStrPropGraphsubs2 (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * graphsubs2)
    542 {
    543   picoos_uint32 propOffset;
    544 
    545   propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_GRAPHSUBS2);
    546   if (propOffset > 0) {
    547     ktab_getStrProp(this, graphsOffset, propOffset, graphsubs2);
    548     return TRUE;
    549   }
    550   else {
    551     return FALSE;
    552   }
    553 }
    554 /* *****************************************************************/
    555 /* used for tools */
    556 
    557 static void ktab_getUtf8 (picoos_uchar ** pos, picoos_uchar * to)
    558 {
    559   picoos_uint32 l;
    560   l = picobase_det_utf8_length(**pos);
    561   while (l>0) {
    562     *(to++) = *((*pos)++);
    563     l--;
    564   }
    565   *to = 0;
    566 }
    567 
    568 picoos_uint16 picoktab_graphsGetNumEntries(const picoktab_Graphs this)
    569 {
    570     ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this;
    571     return g->nrOffset;
    572 }
    573 
    574 void picoktab_graphsGetGraphInfo(const picoktab_Graphs this,
    575         picoos_uint16 graphIndex, picoos_uchar * from, picoos_uchar * to,
    576         picoos_uint8 * propset,
    577         picoos_uint8 * stokenType, picoos_uint8 * stokenSubType,
    578         picoos_uint8 * value, picoos_uchar * lowercase,
    579         picoos_uchar * graphsubs1, picoos_uchar * graphsubs2,
    580         picoos_uint8 * punct) {
    581     ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this;
    582     picoos_uint32 graphsOffset;
    583     picoos_uint8 * pos;
    584 
    585     /* calculate offset of graph[graphIndex] */
    586     if (g->sizeOffset == 1) {
    587         graphsOffset = g->offsetTable[graphIndex];
    588     } else {
    589         graphsOffset = g->offsetTable[2 * graphIndex]
    590                 + (g->offsetTable[2 * graphIndex + 1] << 8);
    591     }
    592     pos = &(g->graphTable[graphsOffset]);
    593     *propset = *pos;
    594 
    595     pos++; /* advance to FROM */
    596     ktab_getUtf8(&pos, from); /* get FROM and advance */
    597     if ((*propset) & KTAB_GRAPH_PROPSET_TO) {
    598         ktab_getUtf8(&pos, to); /* get TO and advance */
    599     } else {
    600         picoos_strcpy((picoos_char *)to, (picoos_char *)from);
    601     }
    602     if ((*propset) & KTAB_GRAPH_PROPSET_TOKENTYPE) {
    603         (*stokenType) = *(pos++); /* get TOKENTYPE and advance */
    604     } else {
    605         (*stokenType) = -1;
    606     }
    607     if ((*propset) & KTAB_GRAPH_PROPSET_TOKENSUBTYPE) {
    608         (*stokenSubType) = *(pos++); /* get TOKENSUBTYPE and advance */
    609     } else {
    610         (*stokenSubType) = -1;
    611     }
    612     if ((*propset) & KTAB_GRAPH_PROPSET_VALUE) {
    613         (*value) = *(pos++); /* get VALUE and advance */
    614     } else {
    615         (*value) = -1;
    616     }
    617     if ((*propset) & KTAB_GRAPH_PROPSET_LOWERCASE) {
    618         ktab_getUtf8(&pos, lowercase); /* get LOWERCASE and advance */
    619     } else {
    620         lowercase[0] = NULLC;
    621     }
    622     if ((*propset) & KTAB_GRAPH_PROPSET_GRAPHSUBS1) {
    623         ktab_getUtf8(&pos, graphsubs1); /* get GRAPHSUBS1 and advance */
    624     } else {
    625         graphsubs1[0] = NULLC;
    626     }
    627     if ((*propset) & KTAB_GRAPH_PROPSET_GRAPHSUBS2) {
    628         ktab_getUtf8(&pos, graphsubs2); /* get GRAPHSUBS2 and advance */
    629     } else {
    630         graphsubs2[0] = NULLC;
    631     }
    632     if ((*propset) & KTAB_GRAPH_PROPSET_PUNCT) {
    633         (*punct) = *(pos++); /* get PUNCT and advance */
    634     } else {
    635         (*punct) = -1;
    636     }
    637 }
    638 
    639 /* ************************************************************/
    640 /* Phones */
    641 /* ************************************************************/
    642 
    643 /* overview binary file format for phones kb:
    644 
    645     phones-kb = specids propertytable
    646 
    647     specids = PRIMSTRESSID1 SECSTRESSID1 SYLLBOUNDID1 PAUSEID1 WORDBOUNDID1
    648               RESERVE1 RESERVE1 RESERVE1
    649 
    650     propertytable = {PHONEPROP2}=256
    651 
    652     PRIMSTRESSID1: one byte, ID of primary stress
    653     SECSTRESSID1: one byte, ID of secondary stress
    654     SYLLBOUNDID1: one byte, ID of syllable boundary
    655     PAUSEID1: one byte, ID of pause
    656     RESERVE1: reserved for future use
    657 
    658     PHONEPROP2: one byte, max. of 256 phones directly access this table
    659                 to check a property for a phone; binary properties
    660                 encoded (1 bit per prop)
    661        least significant bit: vowel
    662                     next bit: diphth
    663                     next bit: glott
    664                     next bit: nonsyllvowel
    665                     next bit: syllcons
    666        3 bits spare
    667  */
    668 
    669 #define KTAB_START_SPECIDS   0
    670 #define KTAB_IND_PRIMSTRESS  0
    671 #define KTAB_IND_SECSTRESS   1
    672 #define KTAB_IND_SYLLBOUND   2
    673 #define KTAB_IND_PAUSE       3
    674 #define KTAB_IND_WORDBOUND   4
    675 
    676 #define KTAB_START_PROPS     8
    677 
    678 
    679 typedef struct ktabphones_subobj *ktabphones_SubObj;
    680 
    681 typedef struct ktabphones_subobj {
    682     picoos_uint8 *specids;
    683     picoos_uint8 *props;
    684 } ktabphones_subobj_t;
    685 
    686 
    687 /* bitmasks to extract the property info from props */
    688 #define KTAB_PPROP_VOWEL        '\x01'
    689 #define KTAB_PPROP_DIPHTH       '\x02'
    690 #define KTAB_PPROP_GLOTT        '\x04'
    691 #define KTAB_PPROP_NONSYLLVOWEL '\x08'
    692 #define KTAB_PPROP_SYLLCONS     '\x10'
    693 
    694 
    695 static pico_status_t ktabPhonesInitialize(register picoknow_KnowledgeBase this,
    696                                           picoos_Common common) {
    697     ktabphones_subobj_t * ktabphones;
    698 
    699     PICODBG_DEBUG(("start"));
    700 
    701     if (NULL == this || NULL == this->subObj) {
    702         return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
    703                                        NULL, NULL);
    704     }
    705     ktabphones = (ktabphones_subobj_t *) this->subObj;
    706     ktabphones->specids = &(this->base[KTAB_START_SPECIDS]);
    707     ktabphones->props   = &(this->base[KTAB_START_PROPS]);
    708     return PICO_OK;
    709 }
    710 
    711 static pico_status_t ktabPhonesSubObjDeallocate(register picoknow_KnowledgeBase this,
    712                                                 picoos_MemoryManager mm) {
    713     if (NULL != this) {
    714         picoos_deallocate(mm, (void *) &this->subObj);
    715     }
    716     return PICO_OK;
    717 }
    718 
    719 pico_status_t picoktab_specializePhonesKnowledgeBase(picoknow_KnowledgeBase this,
    720                                                      picoos_Common common) {
    721     if (NULL == this) {
    722         return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
    723                                        NULL, NULL);
    724     }
    725     this->subDeallocate = ktabPhonesSubObjDeallocate;
    726     this->subObj = picoos_allocate(common->mm, sizeof(ktabphones_subobj_t));
    727     if (NULL == this->subObj) {
    728         return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
    729                                        NULL, NULL);
    730     }
    731     return ktabPhonesInitialize(this, common);
    732 }
    733 
    734 picoktab_Phones picoktab_getPhones(picoknow_KnowledgeBase this) {
    735     if (NULL == this) {
    736         return NULL;
    737     } else {
    738         return (picoktab_Phones) this->subObj;
    739     }
    740 }
    741 
    742 
    743 /* Phones methods */
    744 
    745 picoos_uint8 picoktab_hasVowelProp(const picoktab_Phones this,
    746                                    const picoos_uint8 ch) {
    747     return (KTAB_PPROP_VOWEL & ((ktabphones_SubObj)this)->props[ch]);
    748 }
    749 picoos_uint8 picoktab_hasDiphthProp(const picoktab_Phones this,
    750                                     const picoos_uint8 ch) {
    751     return (KTAB_PPROP_DIPHTH & ((ktabphones_SubObj)this)->props[ch]);
    752 }
    753 picoos_uint8 picoktab_hasGlottProp(const picoktab_Phones this,
    754                                    const picoos_uint8 ch) {
    755     return (KTAB_PPROP_GLOTT & ((ktabphones_SubObj)this)->props[ch]);
    756 }
    757 picoos_uint8 picoktab_hasNonsyllvowelProp(const picoktab_Phones this,
    758                                           const picoos_uint8 ch) {
    759     return (KTAB_PPROP_NONSYLLVOWEL & ((ktabphones_SubObj)this)->props[ch]);
    760 }
    761 picoos_uint8 picoktab_hasSyllconsProp(const picoktab_Phones this,
    762                                       const picoos_uint8 ch) {
    763     return (KTAB_PPROP_SYLLCONS & ((ktabphones_SubObj)this)->props[ch]);
    764 }
    765 
    766 picoos_bool picoktab_isSyllCarrier(const picoktab_Phones this,
    767                                     const picoos_uint8 ch) {
    768     picoos_uint8 props;
    769     props = ((ktabphones_SubObj)this)->props[ch];
    770     return (((KTAB_PPROP_VOWEL & props) &&
    771              !(KTAB_PPROP_NONSYLLVOWEL & props))
    772             || (KTAB_PPROP_SYLLCONS & props));
    773 }
    774 
    775 picoos_bool picoktab_isPrimstress(const picoktab_Phones this,
    776                                    const picoos_uint8 ch) {
    777     return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_PRIMSTRESS]);
    778 }
    779 picoos_bool picoktab_isSecstress(const picoktab_Phones this,
    780                                   const picoos_uint8 ch) {
    781     return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_SECSTRESS]);
    782 }
    783 picoos_bool picoktab_isSyllbound(const picoktab_Phones this,
    784                                   const picoos_uint8 ch) {
    785     return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_SYLLBOUND]);
    786 }
    787 picoos_bool picoktab_isWordbound(const picoktab_Phones this,
    788                                   const picoos_uint8 ch) {
    789     return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_WORDBOUND]);
    790 }
    791 picoos_bool picoktab_isPause(const picoktab_Phones this,
    792                               const picoos_uint8 ch) {
    793     return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_PAUSE]);
    794 }
    795 
    796 picoos_uint8 picoktab_getPrimstressID(const picoktab_Phones this) {
    797     return ((ktabphones_SubObj)this)->specids[KTAB_IND_PRIMSTRESS];
    798 }
    799 picoos_uint8 picoktab_getSecstressID(const picoktab_Phones this) {
    800     return ((ktabphones_SubObj)this)->specids[KTAB_IND_SECSTRESS];
    801 }
    802 picoos_uint8 picoktab_getSyllboundID(const picoktab_Phones this) {
    803     return ((ktabphones_SubObj)this)->specids[KTAB_IND_SYLLBOUND];
    804 }
    805 picoos_uint8 picoktab_getWordboundID(const picoktab_Phones this) {
    806     return ((ktabphones_SubObj)this)->specids[KTAB_IND_WORDBOUND];
    807 }
    808 picoos_uint8 picoktab_getPauseID(const picoktab_Phones this) {
    809     return ((ktabphones_SubObj)this)->specids[KTAB_IND_PAUSE];
    810 }
    811 
    812 /* ************************************************************/
    813 /* Pos */
    814 /* ************************************************************/
    815 
    816 /* overview binary file format for pos kb:
    817 
    818     pos-kb = header posids
    819     header = {COUNT2 OFFS2}=8
    820     posids = {POSID1 {PARTID1}0:8}1:
    821 
    822     where POSID1 is the value of the (combined) part-of-speech symbol,
    823     and {PARTID1} are the symbol values of its components (empty if it
    824     is not a combined symbol). The {PARTID1} list is sorted.
    825     Part-of-speech symbols with equal number of components are grouped
    826     together.
    827 
    828     The header contains information about these groups:
    829 
    830     COUNT2 specifies the number of elements in the group, and OFFS2
    831     specifies the offset (relative to the beginning of the kb) where
    832     the group data starts, i.e.:
    833 
    834     25   32  -> 25 not-combined elements, starting at offset 32
    835     44   57  -> 44 elements composed of 2 symbols, starting at offset 57
    836     23  189  -> 23 elements composed of 3 symbols, starting at offset 189
    837     ...
    838 
    839     Currently, each symbol may be composed of up to 8 other symbols.
    840     Therefore, the header has 8 entries, too. The header starts with
    841     the unique POS list, and then in increasing order, 2 symbols, 3
    842     symbols,...
    843 
    844 For illustration, the printf-ed version of such a kb:
    845 
    846  25   32
    847  44   57
    848  23  189
    849  12  281
    850   4  341
    851   1  365
    852   0    0
    853   0    0
    854  33 |
    855  34 |
    856  35 |
    857  60 |
    858  etc.
    859  36 |  35  60
    860  50 |  35  95
    861  51 |  35  97
    862  58 |  35 120
    863  59 |  35 131
    864  61 |  60  75
    865  63 |  60  95
    866  64 |  60  97
    867  etc.
    868  42 |  35  60 117
    869  44 |  35  60 131
    870  45 |  35  73  97
    871  48 |  35  84  97
    872  54 |  35  97 131
    873  56 |  35 113 120
    874  57 |  35 117 120
    875  62 |  60  84 122
    876  etc.
    877  */
    878 
    879 typedef struct ktabpos_subobj *ktabpos_SubObj;
    880 
    881 typedef struct ktabpos_subobj {
    882     picoos_uint16 nrcomb[PICOKTAB_MAXNRPOS_IN_COMB];
    883     picoos_uint8 *nrcombstart[PICOKTAB_MAXNRPOS_IN_COMB];
    884 } ktabpos_subobj_t;
    885 
    886 
    887 static pico_status_t ktabPosInitialize(register picoknow_KnowledgeBase this,
    888                                        picoos_Common common) {
    889     ktabpos_subobj_t *ktabpos;
    890     picoos_uint16 osprev;
    891     picoos_uint16 os, pos;
    892     picoos_uint8 i;
    893 
    894     PICODBG_DEBUG(("start"));
    895 
    896     if (NULL == this || NULL == this->subObj) {
    897         return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
    898                                        NULL, NULL);
    899     }
    900     ktabpos = (ktabpos_subobj_t *)this->subObj;
    901 
    902     os = 0;
    903     for (i = 0, pos = 0; i < PICOKTAB_MAXNRPOS_IN_COMB; i++, pos += 4) {
    904         ktabpos->nrcomb[i] = ((picoos_uint16)(this->base[pos+1])) << 8 |
    905             this->base[pos];
    906         if (ktabpos->nrcomb[i] > 0) {
    907             osprev = os;
    908             os = ((picoos_uint16)(this->base[pos+3])) << 8 | this->base[pos+2];
    909             ktabpos->nrcombstart[i] = &(this->base[os]);
    910             PICODBG_TRACE(("i %d, pos %d, nr %d, osprev %d, os %d", i, pos,
    911                            ktabpos->nrcomb[i], osprev, os));
    912             if (osprev >= os) {
    913                 /* cannot be, in a valid kb */
    914                 return picoos_emRaiseException(common->em,
    915                                                PICO_EXC_FILE_CORRUPT,
    916                                                NULL, NULL);
    917             }
    918         } else {
    919             if (i == 0) {
    920                 /* cannot be, in a valid kb */
    921                 return picoos_emRaiseException(common->em,
    922                                                PICO_EXC_FILE_CORRUPT,
    923                                                NULL, NULL);
    924             }
    925             ktabpos->nrcombstart[i] = NULL;
    926         }
    927     }
    928     return PICO_OK;
    929 }
    930 
    931 static pico_status_t ktabPosSubObjDeallocate(register picoknow_KnowledgeBase this,
    932                                              picoos_MemoryManager mm) {
    933     if (NULL != this) {
    934         picoos_deallocate(mm, (void *) &this->subObj);
    935     }
    936     return PICO_OK;
    937 }
    938 
    939 pico_status_t picoktab_specializePosKnowledgeBase(picoknow_KnowledgeBase this,
    940                                                   picoos_Common common) {
    941     if (NULL == this) {
    942         return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING,
    943                                        NULL, NULL);
    944     }
    945     this->subDeallocate = ktabPosSubObjDeallocate;
    946     this->subObj = picoos_allocate(common->mm, sizeof(ktabpos_subobj_t));
    947     if (NULL == this->subObj) {
    948         return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM,
    949                                        NULL, NULL);
    950     }
    951     return ktabPosInitialize(this, common);
    952 }
    953 
    954 picoktab_Pos picoktab_getPos(picoknow_KnowledgeBase this) {
    955     if (NULL == this) {
    956         return NULL;
    957     } else {
    958         return (picoktab_Pos) this->subObj;
    959     }
    960 }
    961 
    962 
    963 /* Pos methods */
    964 
    965 static picoos_int16 ktab_isEqualPosGroup(const picoos_uint8 *grp1,
    966                                          const picoos_uint8 *grp2,
    967                                          picoos_uint8 len)
    968 {
    969     /* if both, grp1 and grp2 would be sorted in ascending order
    970        we could implement a function picoktab_comparePosGroup in
    971        a similar manner as strcmp */
    972 
    973     picoos_uint16 i, j, equal;
    974 
    975     equal = 1;
    976 
    977     i = 0;
    978     while (equal && (i < len)) {
    979         /* search grp1[i] in grp2 */
    980         j = 0;
    981         while ((j < len) && (grp1[i] != grp2[j])) {
    982             j++;
    983         }
    984         equal = (j < len);
    985         i++;
    986     }
    987 
    988     return equal;
    989 }
    990 
    991 
    992 picoos_bool picoktab_isUniquePos(const picoktab_Pos this,
    993                                   const picoos_uint8 pos) {
    994     ktabpos_subobj_t *ktabpos;
    995     picoos_uint16 i;
    996 
    997     /* speed-up possible with e.g. binary search */
    998 
    999     ktabpos = (ktabpos_subobj_t *)this;
   1000     PICODBG_TRACE(("pos %d, nrcombinations %d", pos, ktabpos->nrcomb[0]));
   1001     i = 0;
   1002     while ((i < ktabpos->nrcomb[0]) && (pos > ktabpos->nrcombstart[0][i])) {
   1003         PICODBG_TRACE(("compare with pos %d at position %d",
   1004                        ktabpos->nrcombstart[0][i], pos, i));
   1005         i++;
   1006     }
   1007     return ((i < ktabpos->nrcomb[0]) && (pos == ktabpos->nrcombstart[0][i]));
   1008 }
   1009 
   1010 
   1011 picoos_bool picoktab_isPartOfPosGroup(const picoktab_Pos this,
   1012                                        const picoos_uint8 pos,
   1013                                        const picoos_uint8 posgroup)
   1014 {
   1015     ktabpos_subobj_t *ktabpos;
   1016     picoos_uint8 *grp;
   1017     picoos_uint16 i, j, n, s, grplen;
   1018     picoos_uint8 *e;
   1019     picoos_uint8 found;
   1020 
   1021     ktabpos = (ktabpos_subobj_t *) this;
   1022 
   1023     grp = NULL;
   1024     found = FALSE;
   1025     grplen = 0;
   1026 
   1027     /* currently, a linear search is required to find 'posgroup'; the
   1028        knowledge base should be extended to allow for a faster search */
   1029 
   1030     /* treat case i==0, grplen==0, ie. pos == posgroup */
   1031     if (pos == posgroup) {
   1032         found = TRUE;
   1033     }
   1034 
   1035     i = 1;
   1036     while ((grp == NULL) && (i < PICOKTAB_MAXNRPOS_IN_COMB)) {
   1037         n = ktabpos->nrcomb[i];       /* number of entries */
   1038         e = ktabpos->nrcombstart[i];  /* ptr to first entry */
   1039         s = i + 2;                    /* size of an entry in bytes */
   1040         /* was with while starting at 0:
   1041         s = i > 0 ? i + 2 : 1;
   1042         */
   1043         j = 0;
   1044         while ((grp == NULL) && (j < n)) {
   1045             if (posgroup == e[0]) {
   1046                 grp = e + 1;
   1047                 grplen = s - 1;
   1048             }
   1049             e += s;
   1050             j++;
   1051         }
   1052         i++;
   1053     }
   1054 
   1055     /* test if 'pos' is contained in the components of 'posgroup' */
   1056     if (grp != NULL) {
   1057         for (i = 0; !found && (i < grplen); i++) {
   1058             if (pos == grp[i]) {
   1059                 found = TRUE;
   1060             }
   1061         }
   1062 
   1063         /* just a way to test picoktab_getPosGroup */
   1064         /*
   1065         PICODBG_ASSERT(picoktab_getPosGroup(this, grp, grplen) == posgroup);
   1066         */
   1067     }
   1068 
   1069     return found;
   1070 }
   1071 
   1072 
   1073 picoos_uint8 picoktab_getPosGroup(const picoktab_Pos this,
   1074                                   const picoos_uint8 *poslist,
   1075                                   const picoos_uint8 poslistlen)
   1076 {
   1077     picoos_uint8 poscomb;
   1078     ktabpos_subobj_t *ktabpos;
   1079     picoos_uint16 i, j, n, s;
   1080     picoos_uint8 *e;
   1081 
   1082     ktabpos = (ktabpos_subobj_t *) this;
   1083     poscomb = 0;
   1084 
   1085     if ((poslistlen > 0) && (poslistlen <= PICOKTAB_MAXNRPOS_IN_COMB)) {
   1086         i = poslistlen - 1;
   1087         if (i > 0) {
   1088             n = ktabpos->nrcomb[i];       /* number of entries */
   1089             e = ktabpos->nrcombstart[i];  /* ptr to first entry */
   1090             s = i + 2;                    /* size of an entry in bytes */
   1091             j = 0;
   1092             while (!poscomb && (j < n)) {
   1093                 if (ktab_isEqualPosGroup(poslist, e + 1, poslistlen)) {
   1094                     poscomb = *e;
   1095                 }
   1096                 e += s;
   1097                 j++;
   1098             }
   1099             if (!poscomb) {
   1100                 /* combination not found; shouldn't occur if lingware OK! */
   1101                 /* contingency solution: take first */
   1102                 PICODBG_WARN(("dynamically created POS combination not found in table; taking first (%i)",poslist[0]));
   1103                 poscomb = poslist[0];
   1104             }
   1105         } else {  /* not a composed POS */
   1106             poscomb = poslist[0];
   1107         }
   1108     }
   1109 
   1110     return poscomb;
   1111 }
   1112 
   1113 #ifdef __cplusplus
   1114 }
   1115 #endif
   1116 
   1117 
   1118 /* end */
   1119