Home | History | Annotate | Download | only in lib
      1 /*
      2  * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 /**
     17  * @file picokfst.c
     18  *
     19  * FST knowledge loading and access
     20  *
     21  * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
     22  * All rights reserved.
     23  *
     24  * History:
     25  * - 2009-04-20 -- initial version
     26  *
     27  */
     28 #include "picoos.h"
     29 #include "picodbg.h"
     30 #include "picoknow.h"
     31 #include "picokfst.h"
     32 
     33 #ifdef __cplusplus
     34 extern "C" {
     35 #endif
     36 #if 0
     37 }
     38 #endif
     39 
     40 
     41 #define FileHdrSize 4       /* size of FST file header */
     42 
     43 
     44 
     45 /* ************************************************************/
     46 /* function to create specialized kb, */
     47 /* to be used by picorsrc only */
     48 /* ************************************************************/
     49 
     50 /** object       : FSTKnowledgeBase
     51  *  shortcut     : kfst
     52  *  derived from : picoknow_KnowledgeBase
     53  */
     54 
     55 typedef struct kfst_subobj * kfst_SubObj;
     56 
     57 typedef struct kfst_subobj{
     58     picoos_uint8 * fstStream;         /* the byte stream base address */
     59     picoos_int32 hdrLen;              /* length of file header */
     60     picoos_int32 transductionMode;    /* transduction mode to be used for FST */
     61     picoos_int32 nrClasses;           /* nr of pair/transition classes in FST; class is in [1..nrClasses] */
     62     picoos_int32 nrStates;            /* nr of states in FST; state is in [1..nrState] */
     63     picoos_int32 termClass;           /* pair class of terminator symbol pair; probably obsolete */
     64     picoos_int32 alphaHashTabSize;    /* size of pair alphabet hash table */
     65     picoos_int32 alphaHashTabPos;     /* absolute address of the start of the pair alphabet */
     66     picoos_int32 transTabEntrySize;   /* size in bytes of each transition table entry */
     67     picoos_int32 transTabPos;         /* absolute address of the start of the transition table */
     68     picoos_int32 inEpsStateTabPos;    /* absolute address of the start of the input epsilon transition table */
     69     picoos_int32 accStateTabPos;      /* absolute address of the table of accepting states */
     70 } kfst_subobj_t;
     71 
     72 
     73 
     74 /* ************************************************************/
     75 /* primitives for reading from byte stream */
     76 /* ************************************************************/
     77 
     78 /* Converts 'nrBytes' bytes starting at position '*pos' in byte stream 'stream' into unsigned number 'num'.
     79    '*pos' is modified to the position right after the number */
     80 static void FixedBytesToUnsignedNum (picoos_uint8 * stream, picoos_uint8 nrBytes, picoos_uint32 * pos, picoos_uint32 * num)
     81 {
     82     picoos_int32 i;
     83 
     84     (*num) = 0;
     85     for (i = 0; i < nrBytes; i++) {
     86         (*num) = ((*num) << 8) + (picoos_uint32)stream[*pos];
     87         (*pos)++;
     88     }
     89 }
     90 
     91 
     92 /* Converts 'nrBytes' bytes starting at position '*pos' in byte stream 'stream' into signed number 'num'.
     93    '*pos' is modified to the position right after the number */
     94 static void FixedBytesToSignedNum (picoos_uint8 * stream, picoos_uint8 nrBytes, picoos_uint32 * pos, picoos_int32 * num)
     95 {
     96     picoos_int32 i;
     97     picoos_uint32 val;
     98 
     99     val = 0;
    100     for (i = 0; i < nrBytes; i++) {
    101         val = (val << 8) + (picoos_uint32)stream[*pos];
    102         (*pos)++;
    103     }
    104     if (val % 2 == 1) {
    105         /* negative number */
    106         (*num) = -((picoos_int32)((val - 1) / 2)) - 1;
    107     } else {
    108         /* positive number */
    109         (*num) = val / 2;
    110     }
    111 }
    112 
    113 
    114 /* Converts varying-sized sequence of bytes starting at position '*pos' in byte stream 'stream'
    115    into (signed) number 'num'. '*pos' is modified to the position right after the number. */
    116 static void BytesToNum (picoos_uint8 * stream, picoos_uint32 * pos, picoos_int32 * num)
    117 {
    118     picoos_uint32 val;
    119     picoos_uint32 b;
    120 
    121     val = 0;
    122     b = (picoos_uint32)stream[*pos];
    123     (*pos)++;
    124     while (b < 128) {
    125         val = (val << 7) + b;
    126         b = (picoos_uint32)stream[*pos];
    127         (*pos)++;
    128     }
    129     val = (val << 7) + (b - 128);
    130     if (val % 2 == 1) {
    131         /* negative number */
    132         (*num) = -((picoos_int32)((val - 1) / 2)) - 1;
    133     } else {
    134         /* positive number */
    135         (*num) = val / 2;
    136     }
    137 }
    138 
    139 
    140 /* ************************************************************/
    141 /* setting up FST from byte stream */
    142 /* ************************************************************/
    143 
    144 static pico_status_t kfstInitialize(register picoknow_KnowledgeBase this,
    145         picoos_Common common)
    146 {
    147     picoos_uint32 curpos;
    148     picoos_int32 offs;
    149     kfst_subobj_t * kfst;
    150 
    151     PICODBG_DEBUG(("kfstInitialize -- start\n"));
    152 
    153     if (NULL == this || NULL == this->subObj) {
    154         return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL,
    155                 NULL);
    156     }
    157     kfst = (kfst_subobj_t *) this->subObj;
    158 
    159     /* +CT+ */
    160     kfst->fstStream = this->base;
    161     PICODBG_TRACE(("base: %d\n",this->base));
    162     kfst->hdrLen = FileHdrSize;
    163     curpos = kfst->hdrLen;
    164     BytesToNum(kfst->fstStream,& curpos,& kfst->transductionMode);
    165     BytesToNum(kfst->fstStream,& curpos,& kfst->nrClasses);
    166     BytesToNum(kfst->fstStream,& curpos,& kfst->nrStates);
    167     BytesToNum(kfst->fstStream,& curpos,& kfst->termClass);
    168     BytesToNum(kfst->fstStream,& curpos,& kfst->alphaHashTabSize);
    169     BytesToNum(kfst->fstStream,& curpos,& offs);
    170     kfst->alphaHashTabPos = kfst->hdrLen + offs;
    171     BytesToNum(kfst->fstStream,& curpos,& kfst->transTabEntrySize);
    172     BytesToNum(kfst->fstStream,& curpos,& offs);
    173     kfst->transTabPos = kfst->hdrLen + offs;
    174     BytesToNum(kfst->fstStream,& curpos,& offs);
    175     kfst->inEpsStateTabPos = kfst->hdrLen + offs;
    176     BytesToNum(kfst->fstStream,& curpos,& offs);
    177     kfst->accStateTabPos = kfst->hdrLen + offs;
    178     /* -CT- */
    179 
    180     return PICO_OK;
    181 }
    182 
    183 
    184 static pico_status_t kfstSubObjDeallocate(register picoknow_KnowledgeBase this,
    185         picoos_MemoryManager mm)
    186 {
    187     if (NULL != this) {
    188         picoos_deallocate(mm, (void *) &this->subObj);
    189     }
    190     return PICO_OK;
    191 }
    192 
    193 
    194 /* calculates a small number of data (e.g. addresses) from kb for fast access.
    195  * This data is encapsulated in a picokfst_FST that can later be retrieved
    196  * with picokfst_getFST. */
    197 pico_status_t picokfst_specializeFSTKnowledgeBase(picoknow_KnowledgeBase this,
    198                                                   picoos_Common common)
    199 {
    200     pico_status_t status;
    201 
    202     if (NULL == this) {
    203         return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, NULL, NULL);
    204     }
    205     if (0 < this->size) {
    206         /* not a dummy kb */
    207         this->subDeallocate = kfstSubObjDeallocate;
    208 
    209         this->subObj = picoos_allocate(common->mm, sizeof(kfst_subobj_t));
    210 
    211         if (NULL == this->subObj) {
    212             return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, NULL, NULL);
    213         }
    214         status = kfstInitialize(this, common);
    215         if (PICO_OK != status) {
    216             picoos_deallocate(common->mm,(void **)&this->subObj);
    217         }
    218     }
    219     return PICO_OK;
    220 }
    221 
    222 
    223 /* ************************************************************/
    224 /* FST type and getFST function */
    225 /* ************************************************************/
    226 
    227 
    228 
    229 /* return kb FST for usage in PU */
    230 picokfst_FST picokfst_getFST(picoknow_KnowledgeBase this)
    231 {
    232     if (NULL == this) {
    233         return NULL;
    234     } else {
    235         return (picokfst_FST) this->subObj;
    236     }
    237 }
    238 
    239 
    240 
    241 /* ************************************************************/
    242 /* FST access methods */
    243 /* ************************************************************/
    244 
    245 
    246 /* see description in header file */
    247 extern picoos_uint8 picokfst_kfstGetTransductionMode(picokfst_FST this)
    248 {
    249     kfst_SubObj fst = (kfst_SubObj) this;
    250     if (fst != NULL) {
    251         return fst->transductionMode;
    252     } else {
    253         return 0;
    254     }
    255 }
    256 
    257 
    258 /* see description in header file */
    259 extern void picokfst_kfstGetFSTSizes (picokfst_FST this, picoos_int32 *nrStates, picoos_int32 *nrClasses)
    260 {
    261     kfst_SubObj fst = (kfst_SubObj) this;
    262     if (fst != NULL) {
    263         *nrStates = fst->nrStates;
    264         *nrClasses = fst->nrClasses;
    265     } else {
    266         *nrStates = 0;
    267         *nrClasses = 0;
    268     }
    269 }
    270 
    271 /* see description in header file */
    272 extern void picokfst_kfstStartPairSearch (picokfst_FST this, picokfst_symid_t inSym,
    273                                           picoos_bool * inSymFound, picoos_int32 * searchState)
    274 {
    275     picoos_uint32 pos;
    276     picoos_int32 offs;
    277     picoos_int32 h;
    278     picoos_int32 inSymCellPos;
    279     picoos_int32 inSymX;
    280     picoos_int32 nextSameHashInSymOffs;
    281 
    282     kfst_SubObj fst = (kfst_SubObj) this;
    283     (*searchState) =  -1;
    284     (*inSymFound) = 0;
    285     h = inSym % fst->alphaHashTabSize;
    286     pos = fst->alphaHashTabPos + (h * 4);
    287     FixedBytesToSignedNum(fst->fstStream,4,& pos,& offs);
    288     if (offs > 0) {
    289         inSymCellPos = fst->alphaHashTabPos + offs;
    290         pos = inSymCellPos;
    291         BytesToNum(fst->fstStream,& pos,& inSymX);
    292         BytesToNum(fst->fstStream,& pos,& nextSameHashInSymOffs);
    293         while ((inSymX != inSym) && (nextSameHashInSymOffs > 0)) {
    294             inSymCellPos = inSymCellPos + nextSameHashInSymOffs;
    295             pos = inSymCellPos;
    296             BytesToNum(fst->fstStream,& pos,& inSymX);
    297             BytesToNum(fst->fstStream,& pos,& nextSameHashInSymOffs);
    298         }
    299         if (inSymX == inSym) {
    300             /* input symbol found; state is set to position after symbol cell */
    301             (*searchState) = pos;
    302             (*inSymFound) = 1;
    303         }
    304     }
    305 }
    306 
    307 
    308 /* see description in header file */
    309 extern void picokfst_kfstGetNextPair (picokfst_FST this, picoos_int32 * searchState,
    310                                       picoos_bool * pairFound,
    311                                       picokfst_symid_t * outSym, picokfst_class_t * pairClass)
    312 {
    313     picoos_uint32 pos;
    314     picoos_int32 val;
    315 
    316     kfst_SubObj fst = (kfst_SubObj) this;
    317     if ((*searchState) < 0) {
    318         (*pairFound) = 0;
    319         (*outSym) = PICOKFST_SYMID_ILLEG;
    320         (*pairClass) =  -1;
    321     } else {
    322         pos = (*searchState);
    323         BytesToNum(fst->fstStream,& pos,& val);
    324         *outSym = (picokfst_symid_t)val;
    325         if ((*outSym) != PICOKFST_SYMID_ILLEG) {
    326             BytesToNum(fst->fstStream,& pos,& val);
    327             *pairClass = (picokfst_class_t)val;
    328             (*pairFound) = 1;
    329             (*searchState) = pos;
    330         } else {
    331             (*pairFound) = 0;
    332             (*outSym) = PICOKFST_SYMID_ILLEG;
    333             (*pairClass) =  -1;
    334             (*searchState) =  -1;
    335         }
    336     }
    337 }
    338 
    339 
    340 
    341 /* see description in header file */
    342 extern void picokfst_kfstGetTrans (picokfst_FST this, picokfst_state_t startState, picokfst_class_t transClass,
    343                                    picokfst_state_t * endState)
    344 {
    345 
    346     picoos_uint32 pos;
    347     picoos_int32 index;
    348     picoos_uint32 endStateX;
    349 
    350     kfst_SubObj fst = (kfst_SubObj) this;
    351     if ((startState < 1) || (startState > fst->nrStates) || (transClass < 1) || (transClass > fst->nrClasses)) {
    352         (*endState) = 0;
    353     } else {
    354         index = (startState - 1) * fst->nrClasses + transClass - 1;
    355         pos = fst->transTabPos + (index * fst->transTabEntrySize);
    356         FixedBytesToUnsignedNum(fst->fstStream,fst->transTabEntrySize,& pos,& endStateX);
    357         (*endState) = endStateX;
    358     }
    359 }
    360 
    361 
    362 /* see description in header file */
    363 extern void picokfst_kfstStartInEpsTransSearch (picokfst_FST this, picokfst_state_t startState,
    364                                                 picoos_bool * inEpsTransFound, picoos_int32 * searchState)
    365 {
    366 
    367     picoos_int32 offs;
    368     picoos_uint32 pos;
    369 
    370     kfst_SubObj fst = (kfst_SubObj) this;
    371     (*searchState) =  -1;
    372     (*inEpsTransFound) = 0;
    373     if ((startState > 0) && (startState <= fst->nrStates)) {
    374         pos = fst->inEpsStateTabPos + (startState - 1) * 4;
    375         FixedBytesToSignedNum(fst->fstStream,4,& pos,& offs);
    376         if (offs > 0) {
    377             (*searchState) = fst->inEpsStateTabPos + offs;
    378             (*inEpsTransFound) = 1;
    379         }
    380     }
    381 }
    382 
    383 
    384 
    385 /* see description in header file */
    386 extern void picokfst_kfstGetNextInEpsTrans (picokfst_FST this, picoos_int32 * searchState,
    387                                             picoos_bool * inEpsTransFound,
    388                                             picokfst_symid_t * outSym, picokfst_state_t * endState)
    389 {
    390     picoos_uint32 pos;
    391     picoos_int32 val;
    392 
    393     kfst_SubObj fst = (kfst_SubObj) this;
    394     if ((*searchState) < 0) {
    395         (*inEpsTransFound) = 0;
    396         (*outSym) = PICOKFST_SYMID_ILLEG;
    397         (*endState) = 0;
    398     } else {
    399         pos = (*searchState);
    400         BytesToNum(fst->fstStream,& pos,& val);
    401         *outSym = (picokfst_symid_t)val;
    402         if ((*outSym) != PICOKFST_SYMID_ILLEG) {
    403             BytesToNum(fst->fstStream,& pos,& val);
    404             *endState = (picokfst_state_t)val;
    405             (*inEpsTransFound) = 1;
    406             (*searchState) = pos;
    407         } else {
    408             (*inEpsTransFound) = 0;
    409             (*outSym) = PICOKFST_SYMID_ILLEG;
    410             (*endState) = 0;
    411             (*searchState) =  -1;
    412         }
    413     }
    414 }
    415 
    416 
    417 /* see description in header file */
    418 extern picoos_bool picokfst_kfstIsAcceptingState (picokfst_FST this, picokfst_state_t state)
    419 {
    420 
    421     picoos_uint32 pos;
    422     picoos_uint32 val;
    423 
    424     kfst_SubObj fst = (kfst_SubObj) this;
    425     if ((state > 0) && (state <= fst->nrStates)) {
    426         pos = fst->accStateTabPos + (state - 1);
    427         FixedBytesToUnsignedNum(fst->fstStream,1,& pos,& val);
    428         return (val == 1);
    429     } else {
    430         return 0;
    431     }
    432 }
    433 
    434 #ifdef __cplusplus
    435 }
    436 #endif
    437 
    438 /* End picofst.c */
    439