1 /* 2 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 /** 17 * @file picoktab.c 18 * 19 * symbol tables needed at runtime 20 * 21 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland 22 * All rights reserved. 23 * 24 * History: 25 * - 2009-04-20 -- initial version 26 * 27 */ 28 29 #include "picoos.h" 30 #include "picodbg.h" 31 #include "picoknow.h" 32 #include "picobase.h" 33 #include "picoktab.h" 34 #include "picodata.h" 35 36 #ifdef __cplusplus 37 extern "C" { 38 #endif 39 #if 0 40 } 41 #endif 42 43 44 /** @todo : the following would be better part of a knowledge base. 
45 * Make sure it is consistent with the phoneme symbol table used in the lingware */ 46 47 /* PLANE_PHONEMES */ 48 49 /* PLANE_POS */ 50 51 /* PLANE_PB_STRENGTHS */ 52 53 /* PLANE_ACCENTS */ 54 55 /* PLANE_INTERN */ 56 #define PICOKTAB_TMPID_PHONSTART '\x26' /* 38 '&' */ 57 #define PICOKTAB_TMPID_PHONTERM '\x23' /* 35 '#' */ 58 59 60 /* ************************************************************/ 61 /* fixed ids */ 62 /* ************************************************************/ 63 64 65 static pico_status_t ktabIdsInitialize(register picoknow_KnowledgeBase this, 66 picoos_Common common) 67 { 68 picoktab_FixedIds ids; 69 70 PICODBG_DEBUG(("start")); 71 72 if (NULL == this || NULL == this->subObj) { 73 return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, 74 NULL, NULL); 75 } 76 ids = (picoktab_FixedIds) this->subObj; 77 78 ids->phonStartId = PICOKTAB_TMPID_PHONSTART; 79 ids->phonTermId = PICOKTAB_TMPID_PHONTERM; 80 return PICO_OK; 81 } 82 83 84 static pico_status_t ktabIdsSubObjDeallocate(register picoknow_KnowledgeBase this, 85 picoos_MemoryManager mm) 86 { 87 if (NULL != this) { 88 picoos_deallocate(mm, (void *) &this->subObj); 89 } 90 return PICO_OK; 91 } 92 93 pico_status_t picoktab_specializeIdsKnowledgeBase(picoknow_KnowledgeBase this, 94 picoos_Common common) 95 { 96 if (NULL == this) { 97 return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, 98 NULL, NULL); 99 } 100 this->subDeallocate = ktabIdsSubObjDeallocate; 101 this->subObj = picoos_allocate(common->mm, sizeof(picoktab_fixed_ids_t)); 102 if (NULL == this->subObj) { 103 return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, 104 NULL, NULL); 105 } 106 return ktabIdsInitialize(this, common); 107 } 108 109 picoktab_FixedIds picoktab_getFixedIds(picoknow_KnowledgeBase this) 110 { 111 return ((NULL == this) ? 
NULL : ((picoktab_FixedIds) this->subObj)); 112 } 113 114 115 picoktab_FixedIds picoktab_newFixedIds(picoos_MemoryManager mm) 116 { 117 picoktab_FixedIds this = (picoktab_FixedIds) picoos_allocate(mm,sizeof(*this)); 118 if (NULL != this) { 119 /* initialize */ 120 } 121 return this; 122 } 123 124 125 void picoktab_disposeFixedIds(picoos_MemoryManager mm, picoktab_FixedIds * this) 126 { 127 if (NULL != (*this)) { 128 /* terminate */ 129 picoos_deallocate(mm,(void *)this); 130 } 131 } 132 133 134 135 /* ************************************************************/ 136 /* Graphs */ 137 /* ************************************************************/ 138 139 /* overview binary file format for graphs kb: 140 141 graphs-kb = NROFSENTRIES SIZEOFSENTRY ofstable graphs 142 143 NROFSENTRIES : 2 bytes, number of entries in offset table 144 SIZEOFSENTRY : 1 byte, size of one entry in offset table 145 146 ofstable = {OFFSET}=NROFSENTRIES (contains NROFSENTRIES entries of OFFSET) 147 148 OFFSET: SIZEOFSENTRY bytes, offset to baseaddress of graphs-kb to entry in graphs 149 150 graphs = {graph}=NROFSENTRIES (contains NROFSENTRIES entries of graph) 151 152 graph = PROPSET FROM TO [TOKENTYPE] [TOKENSUBTYPE] [VALUE] [LOWERCASE] [GRAPHSUBS1] [GRAPHSUBS2] 153 154 FROM : 1..4 unsigned bytes, UTF8 character without terminating 0 155 TO : 1..4 unsigned bytes, UTF8 character without terminating 0 156 PROPSET : 1 unsigned byte, least significant bit : has TO field 157 next bit : has TOKENTYPE 158 next bit : has TOKENSUBTYPE 159 next bit : has VALUE 160 next bit : has LOWERCASE 161 next bit : has GRAPHSUBS1 162 next bit : has GRAPHSUBS2 163 next bit : has PUNC 164 165 TOKENTYPE : 1 unsigned byte 166 TOKENSUBTYPE : 1 unsigned byte 167 VALUE : 1 unsigned byte 168 LOWERCASE : 1..4 unsigned bytes, UTF8 character without terminating 0 169 GRAPHSUBS1 : 1..4 unsigned bytes, UTF8 character without terminating 0 170 GRAPHSUBS2 : 1..4 unsigned bytes, UTF8 character without terminating 0 171 PUNC : 1 
unsigned byte 172 */ 173 174 static picoos_uint32 ktab_propOffset (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 prop); 175 176 #define KTAB_START_GRAPHS_NR_OFFSET 0 177 #define KTAB_START_GRAPHS_SIZE_OFFSET 2 178 #define KTAB_START_GRAPHS_OFFSET_TABLE 3 179 #define KTAB_START_GRAPHS_GRAPH_TABLE 0 180 181 /* bitmasks to extract the grapheme properties info from the property set */ 182 #define KTAB_GRAPH_PROPSET_TO ((picoos_uint8)'\x01') 183 #define KTAB_GRAPH_PROPSET_TOKENTYPE ((picoos_uint8)'\x02') 184 #define KTAB_GRAPH_PROPSET_TOKENSUBTYPE ((picoos_uint8)'\x04') 185 #define KTAB_GRAPH_PROPSET_VALUE ((picoos_uint8)'\x08') 186 #define KTAB_GRAPH_PROPSET_LOWERCASE ((picoos_uint8)'\x010') 187 #define KTAB_GRAPH_PROPSET_GRAPHSUBS1 ((picoos_uint8)'\x020') 188 #define KTAB_GRAPH_PROPSET_GRAPHSUBS2 ((picoos_uint8)'\x040') 189 #define KTAB_GRAPH_PROPSET_PUNCT ((picoos_uint8)'\x080') 190 191 192 typedef struct ktabgraphs_subobj *ktabgraphs_SubObj; 193 194 typedef struct ktabgraphs_subobj { 195 picoos_uint16 nrOffset; 196 picoos_uint16 sizeOffset; 197 198 picoos_uint8 * offsetTable; 199 picoos_uint8 * graphTable; 200 } ktabgraphs_subobj_t; 201 202 203 204 static pico_status_t ktabGraphsInitialize(register picoknow_KnowledgeBase this, 205 picoos_Common common) { 206 ktabgraphs_subobj_t * ktabgraphs; 207 208 PICODBG_DEBUG(("start")); 209 210 if (NULL == this || NULL == this->subObj) { 211 return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, 212 NULL, NULL); 213 } 214 ktabgraphs = (ktabgraphs_subobj_t *) this->subObj; 215 ktabgraphs->nrOffset = ((int)(this->base[KTAB_START_GRAPHS_NR_OFFSET])) + 256*(int)(this->base[KTAB_START_GRAPHS_NR_OFFSET+1]); 216 ktabgraphs->sizeOffset = (int)(this->base[KTAB_START_GRAPHS_SIZE_OFFSET]); 217 ktabgraphs->offsetTable = &(this->base[KTAB_START_GRAPHS_OFFSET_TABLE]); 218 ktabgraphs->graphTable = &(this->base[KTAB_START_GRAPHS_GRAPH_TABLE]); 219 return PICO_OK; 220 } 221 222 static pico_status_t 
ktabGraphsSubObjDeallocate(register picoknow_KnowledgeBase this, 223 picoos_MemoryManager mm) { 224 if (NULL != this) { 225 picoos_deallocate(mm, (void *) &this->subObj); 226 } 227 return PICO_OK; 228 } 229 230 231 pico_status_t picoktab_specializeGraphsKnowledgeBase(picoknow_KnowledgeBase this, 232 picoos_Common common) { 233 if (NULL == this) { 234 return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, 235 NULL, NULL); 236 } 237 this->subDeallocate = ktabGraphsSubObjDeallocate; 238 this->subObj = picoos_allocate(common->mm, sizeof(ktabgraphs_subobj_t)); 239 if (NULL == this->subObj) { 240 return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, 241 NULL, NULL); 242 } 243 return ktabGraphsInitialize(this, common); 244 } 245 246 247 picoktab_Graphs picoktab_getGraphs(picoknow_KnowledgeBase this) { 248 if (NULL == this) { 249 return NULL; 250 } else { 251 return (picoktab_Graphs) this->subObj; 252 } 253 } 254 255 256 /* Graphs methods */ 257 258 picoos_uint8 picoktab_hasVowellikeProp(const picoktab_Graphs this, 259 const picoos_uint8 *graph, 260 const picoos_uint8 graphlenmax) { 261 262 picoos_uint8 ui8App; 263 picoos_uint32 graphsOffset; 264 ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; 265 266 ui8App = graphlenmax; /* avoid warning "var not used in this function"*/ 267 268 graphsOffset = picoktab_graphOffset (this, (picoos_uchar *)graph); 269 return g->graphTable[graphsOffset + ktab_propOffset (this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENTYPE)] == PICODATA_ITEMINFO1_TOKTYPE_LETTERV; 270 } 271 272 273 static void ktab_getStrProp (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 propOffset, picoos_uchar * str) 274 { 275 ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; 276 picoos_uint32 i, l; 277 278 i = 0; 279 l = picobase_det_utf8_length(g->graphTable[graphsOffset+propOffset]); 280 while (i<l) { 281 str[i] = g->graphTable[graphsOffset+propOffset+i]; 282 i++; 283 } 284 str[l] = 0; 285 } 286 287 288 static picoos_uint32 
ktab_propOffset(const picoktab_Graphs this, 289 picoos_uint32 graphsOffset, picoos_uint32 prop) 290 /* Returns offset of property 'prop' inside the graph with offset 'graphsOffset' in graphs table; 291 If the property is found, a value > 0 is returned otherwise 0 */ 292 { 293 picoos_uint32 n = 0; 294 ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this; 295 296 if ((g->graphTable[graphsOffset] & prop) == prop) { 297 n = n + 1; /* overread PROPSET field */ 298 n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread FROM field */ 299 if (prop > KTAB_GRAPH_PROPSET_TO) { 300 if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TO) 301 == KTAB_GRAPH_PROPSET_TO) { 302 n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread TO field */ 303 } 304 } else { 305 return n; 306 } 307 if (prop > KTAB_GRAPH_PROPSET_TOKENTYPE) { 308 if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TOKENTYPE) 309 == KTAB_GRAPH_PROPSET_TOKENTYPE) { 310 n = n + 1; /* overread TOKENTYPE field */ 311 } 312 } else { 313 return n; 314 } 315 if (prop > KTAB_GRAPH_PROPSET_TOKENSUBTYPE) { 316 if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_TOKENSUBTYPE) 317 == KTAB_GRAPH_PROPSET_TOKENSUBTYPE) { 318 n = n + 1; /* overread stokentype field */ 319 } 320 } else { 321 return n; 322 } 323 if (prop > KTAB_GRAPH_PROPSET_VALUE) { 324 if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_VALUE) 325 == KTAB_GRAPH_PROPSET_VALUE) { 326 n = n + 1; /* overread value field */ 327 } 328 } else { 329 return n; 330 } 331 if (prop > KTAB_GRAPH_PROPSET_LOWERCASE) { 332 if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_LOWERCASE) 333 == KTAB_GRAPH_PROPSET_LOWERCASE) { 334 n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread lowercase field */ 335 } 336 } else { 337 return n; 338 } 339 if (prop > KTAB_GRAPH_PROPSET_GRAPHSUBS1) { 340 if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_GRAPHSUBS1) 341 == KTAB_GRAPH_PROPSET_GRAPHSUBS1) { 342 n = n + 
picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread graphsubs1 field */ 343 } 344 } else { 345 return n; 346 } 347 if (prop > KTAB_GRAPH_PROPSET_GRAPHSUBS2) { 348 if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_GRAPHSUBS2) 349 == KTAB_GRAPH_PROPSET_GRAPHSUBS2) { 350 n = n + picobase_det_utf8_length(g->graphTable[graphsOffset+n]); /* overread graphsubs2 field */ 351 } 352 } else { 353 return n; 354 } 355 if (prop > KTAB_GRAPH_PROPSET_PUNCT) { 356 if ((g->graphTable[graphsOffset] & KTAB_GRAPH_PROPSET_PUNCT) 357 == KTAB_GRAPH_PROPSET_PUNCT) { 358 n = n + 1; /* overread value field */ 359 } 360 } else { 361 return n; 362 } 363 } 364 365 return n; 366 } 367 368 369 picoos_uint32 picoktab_graphOffset (const picoktab_Graphs this, picoos_uchar * utf8graph) 370 { ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; 371 picoos_int32 a, b, m; 372 picoos_uint32 graphsOffset; 373 picoos_uint32 propOffset; 374 picobase_utf8char from; 375 picobase_utf8char to; 376 picoos_bool utfGEfrom; 377 picoos_bool utfLEto; 378 379 if (g->nrOffset > 0) { 380 a = 0; 381 b = g->nrOffset-1; 382 do { 383 m = (a+b) / 2; 384 385 /* get offset to graph[m] */ 386 if (g->sizeOffset == 1) { 387 graphsOffset = g->offsetTable[g->sizeOffset*m]; 388 } 389 else { 390 graphsOffset = g->offsetTable[g->sizeOffset*m ] + 391 256*g->offsetTable[g->sizeOffset*m + 1]; 392 /* PICODBG_DEBUG(("picoktab_graphOffset: %i %i %i %i", m, g->offsetTable[g->sizeOffset*m], g->offsetTable[g->sizeOffset*m + 1], graphsOffset)); 393 */ 394 } 395 396 /* get FROM and TO field of graph[m] */ 397 ktab_getStrProp(this, graphsOffset, 1, from); 398 propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TO); 399 if (propOffset > 0) { 400 ktab_getStrProp(this, graphsOffset, propOffset, to); 401 } 402 else { 403 picoos_strcpy((picoos_char *)to, (picoos_char *)from); 404 } 405 406 /* PICODBG_DEBUG(("picoktab_graphOffset: %i %i %i '%s' '%s' '%s'", a, m, b, from, utf8graph, to)); 407 */ 408 utfGEfrom = 
picoos_strcmp((picoos_char *)utf8graph, (picoos_char *)from) >= 0; 409 utfLEto = picoos_strcmp((picoos_char *)utf8graph, (picoos_char *)to) <= 0; 410 411 if (utfGEfrom && utfLEto) { 412 /* PICODBG_DEBUG(("picoktab_graphOffset: utf char '%s' found", utf8graph)); 413 */ 414 return graphsOffset; 415 } 416 if (!utfGEfrom) { 417 b = m-1; 418 } 419 else if (!utfLEto) { 420 a = m+1; 421 } 422 } while (a<=b); 423 } 424 PICODBG_DEBUG(("picoktab_graphOffset: utf char '%s' not found", utf8graph)); 425 return 0; 426 } 427 428 429 430 431 picoos_bool picoktab_getIntPropTokenType (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint8 * stokenType) 432 { 433 picoos_uint32 propOffset; 434 ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; 435 436 propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENTYPE); 437 if (propOffset > 0) { 438 *stokenType = (picoos_uint8)(g->graphTable[graphsOffset+propOffset]); 439 return TRUE; 440 } 441 else { 442 return FALSE; 443 } 444 } 445 446 447 picoos_bool picoktab_getIntPropTokenSubType (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_int8 * stokenSubType) 448 { 449 picoos_uint32 propOffset; 450 ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; 451 452 propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_TOKENSUBTYPE); 453 if (propOffset > 0) { 454 *stokenSubType = (picoos_int8)(g->graphTable[graphsOffset+propOffset]); 455 return TRUE; 456 } 457 else { 458 return FALSE; 459 } 460 } 461 462 picoos_bool picoktab_getIntPropValue (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint32 * value) 463 { 464 picoos_uint32 propOffset; 465 ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; 466 467 propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_VALUE); 468 if (propOffset > 0) { 469 *value = (picoos_uint32)(g->graphTable[graphsOffset+propOffset]); 470 return TRUE; 471 } 472 else { 473 return FALSE; 474 } 475 } 476 477 478 picoos_bool picoktab_getIntPropPunct 
(const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uint8 * info1, picoos_uint8 * info2) 479 { 480 picoos_uint32 propOffset; 481 ktabgraphs_subobj_t * g = (ktabgraphs_SubObj)this; 482 483 propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_PUNCT); 484 if (propOffset > 0) { 485 if (g->graphTable[graphsOffset+propOffset] == 2) { 486 *info1 = PICODATA_ITEMINFO1_PUNC_SENTEND; 487 } 488 else { 489 *info1 = PICODATA_ITEMINFO1_PUNC_PHRASEEND; 490 } 491 if (g->graphTable[graphsOffset+1] == '.') { 492 *info2 = PICODATA_ITEMINFO2_PUNC_SENT_T; 493 } 494 else if (g->graphTable[graphsOffset+1] == '?') { 495 *info2 = PICODATA_ITEMINFO2_PUNC_SENT_Q; 496 } 497 else if (g->graphTable[graphsOffset+1] == '!') { 498 *info2 = PICODATA_ITEMINFO2_PUNC_SENT_E; 499 } 500 else { 501 *info2 = PICODATA_ITEMINFO2_PUNC_PHRASE; 502 } 503 return TRUE; 504 } 505 else { 506 return FALSE; 507 } 508 } 509 510 511 picoos_bool picoktab_getStrPropLowercase (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * lowercase) 512 { 513 picoos_uint32 propOffset; 514 515 propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_LOWERCASE); 516 if (propOffset > 0) { 517 ktab_getStrProp(this, graphsOffset, propOffset, lowercase); 518 return TRUE; 519 } 520 else { 521 return FALSE; 522 } 523 } 524 525 526 picoos_bool picoktab_getStrPropGraphsubs1 (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * graphsubs1) 527 { 528 picoos_uint32 propOffset; 529 530 propOffset = ktab_propOffset(this, graphsOffset, KTAB_GRAPH_PROPSET_GRAPHSUBS1); 531 if (propOffset > 0) { 532 ktab_getStrProp(this, graphsOffset, propOffset, graphsubs1); 533 return TRUE; 534 } 535 else { 536 return FALSE; 537 } 538 } 539 540 541 picoos_bool picoktab_getStrPropGraphsubs2 (const picoktab_Graphs this, picoos_uint32 graphsOffset, picoos_uchar * graphsubs2) 542 { 543 picoos_uint32 propOffset; 544 545 propOffset = ktab_propOffset(this, graphsOffset, 
KTAB_GRAPH_PROPSET_GRAPHSUBS2); 546 if (propOffset > 0) { 547 ktab_getStrProp(this, graphsOffset, propOffset, graphsubs2); 548 return TRUE; 549 } 550 else { 551 return FALSE; 552 } 553 } 554 /* *****************************************************************/ 555 /* used for tools */ 556 557 static void ktab_getUtf8 (picoos_uchar ** pos, picoos_uchar * to) 558 { 559 picoos_uint32 l; 560 l = picobase_det_utf8_length(**pos); 561 while (l>0) { 562 *(to++) = *((*pos)++); 563 l--; 564 } 565 *to = 0; 566 } 567 568 picoos_uint16 picoktab_graphsGetNumEntries(const picoktab_Graphs this) 569 { 570 ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this; 571 return g->nrOffset; 572 } 573 574 void picoktab_graphsGetGraphInfo(const picoktab_Graphs this, 575 picoos_uint16 graphIndex, picoos_uchar * from, picoos_uchar * to, 576 picoos_uint8 * propset, 577 picoos_uint8 * stokenType, picoos_uint8 * stokenSubType, 578 picoos_uint8 * value, picoos_uchar * lowercase, 579 picoos_uchar * graphsubs1, picoos_uchar * graphsubs2, 580 picoos_uint8 * punct) { 581 ktabgraphs_subobj_t * g = (ktabgraphs_SubObj) this; 582 picoos_uint32 graphsOffset; 583 picoos_uint8 * pos; 584 585 /* calculate offset of graph[graphIndex] */ 586 if (g->sizeOffset == 1) { 587 graphsOffset = g->offsetTable[graphIndex]; 588 } else { 589 graphsOffset = g->offsetTable[2 * graphIndex] 590 + (g->offsetTable[2 * graphIndex + 1] << 8); 591 } 592 pos = &(g->graphTable[graphsOffset]); 593 *propset = *pos; 594 595 pos++; /* advance to FROM */ 596 ktab_getUtf8(&pos, from); /* get FROM and advance */ 597 if ((*propset) & KTAB_GRAPH_PROPSET_TO) { 598 ktab_getUtf8(&pos, to); /* get TO and advance */ 599 } else { 600 picoos_strcpy((picoos_char *)to, (picoos_char *)from); 601 } 602 if ((*propset) & KTAB_GRAPH_PROPSET_TOKENTYPE) { 603 (*stokenType) = *(pos++); /* get TOKENTYPE and advance */ 604 } else { 605 (*stokenType) = -1; 606 } 607 if ((*propset) & KTAB_GRAPH_PROPSET_TOKENSUBTYPE) { 608 (*stokenSubType) = *(pos++); /* get 
TOKENSUBTYPE and advance */ 609 } else { 610 (*stokenSubType) = -1; 611 } 612 if ((*propset) & KTAB_GRAPH_PROPSET_VALUE) { 613 (*value) = *(pos++); /* get VALUE and advance */ 614 } else { 615 (*value) = -1; 616 } 617 if ((*propset) & KTAB_GRAPH_PROPSET_LOWERCASE) { 618 ktab_getUtf8(&pos, lowercase); /* get LOWERCASE and advance */ 619 } else { 620 lowercase[0] = NULLC; 621 } 622 if ((*propset) & KTAB_GRAPH_PROPSET_GRAPHSUBS1) { 623 ktab_getUtf8(&pos, graphsubs1); /* get GRAPHSUBS1 and advance */ 624 } else { 625 graphsubs1[0] = NULLC; 626 } 627 if ((*propset) & KTAB_GRAPH_PROPSET_GRAPHSUBS2) { 628 ktab_getUtf8(&pos, graphsubs2); /* get GRAPHSUBS2 and advance */ 629 } else { 630 graphsubs2[0] = NULLC; 631 } 632 if ((*propset) & KTAB_GRAPH_PROPSET_PUNCT) { 633 (*punct) = *(pos++); /* get PUNCT and advance */ 634 } else { 635 (*punct) = -1; 636 } 637 } 638 639 /* ************************************************************/ 640 /* Phones */ 641 /* ************************************************************/ 642 643 /* overview binary file format for phones kb: 644 645 phones-kb = specids propertytable 646 647 specids = PRIMSTRESSID1 SECSTRESSID1 SYLLBOUNDID1 PAUSEID1 WORDBOUNDID1 648 RESERVE1 RESERVE1 RESERVE1 649 650 propertytable = {PHONEPROP2}=256 651 652 PRIMSTRESSID1: one byte, ID of primary stress 653 SECSTRESSID1: one byte, ID of secondary stress 654 SYLLBOUNDID1: one byte, ID of syllable boundary 655 PAUSEID1: one byte, ID of pause 656 RESERVE1: reserved for future use 657 658 PHONEPROP2: one byte, max. 
of 256 phones directly access this table 659 to check a property for a phone; binary properties 660 encoded (1 bit per prop) 661 least significant bit: vowel 662 next bit: diphth 663 next bit: glott 664 next bit: nonsyllvowel 665 next bit: syllcons 666 3 bits spare 667 */ 668 669 #define KTAB_START_SPECIDS 0 670 #define KTAB_IND_PRIMSTRESS 0 671 #define KTAB_IND_SECSTRESS 1 672 #define KTAB_IND_SYLLBOUND 2 673 #define KTAB_IND_PAUSE 3 674 #define KTAB_IND_WORDBOUND 4 675 676 #define KTAB_START_PROPS 8 677 678 679 typedef struct ktabphones_subobj *ktabphones_SubObj; 680 681 typedef struct ktabphones_subobj { 682 picoos_uint8 *specids; 683 picoos_uint8 *props; 684 } ktabphones_subobj_t; 685 686 687 /* bitmasks to extract the property info from props */ 688 #define KTAB_PPROP_VOWEL '\x01' 689 #define KTAB_PPROP_DIPHTH '\x02' 690 #define KTAB_PPROP_GLOTT '\x04' 691 #define KTAB_PPROP_NONSYLLVOWEL '\x08' 692 #define KTAB_PPROP_SYLLCONS '\x10' 693 694 695 static pico_status_t ktabPhonesInitialize(register picoknow_KnowledgeBase this, 696 picoos_Common common) { 697 ktabphones_subobj_t * ktabphones; 698 699 PICODBG_DEBUG(("start")); 700 701 if (NULL == this || NULL == this->subObj) { 702 return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, 703 NULL, NULL); 704 } 705 ktabphones = (ktabphones_subobj_t *) this->subObj; 706 ktabphones->specids = &(this->base[KTAB_START_SPECIDS]); 707 ktabphones->props = &(this->base[KTAB_START_PROPS]); 708 return PICO_OK; 709 } 710 711 static pico_status_t ktabPhonesSubObjDeallocate(register picoknow_KnowledgeBase this, 712 picoos_MemoryManager mm) { 713 if (NULL != this) { 714 picoos_deallocate(mm, (void *) &this->subObj); 715 } 716 return PICO_OK; 717 } 718 719 pico_status_t picoktab_specializePhonesKnowledgeBase(picoknow_KnowledgeBase this, 720 picoos_Common common) { 721 if (NULL == this) { 722 return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, 723 NULL, NULL); 724 } 725 this->subDeallocate = 
ktabPhonesSubObjDeallocate; 726 this->subObj = picoos_allocate(common->mm, sizeof(ktabphones_subobj_t)); 727 if (NULL == this->subObj) { 728 return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, 729 NULL, NULL); 730 } 731 return ktabPhonesInitialize(this, common); 732 } 733 734 picoktab_Phones picoktab_getPhones(picoknow_KnowledgeBase this) { 735 if (NULL == this) { 736 return NULL; 737 } else { 738 return (picoktab_Phones) this->subObj; 739 } 740 } 741 742 743 /* Phones methods */ 744 745 picoos_uint8 picoktab_hasVowelProp(const picoktab_Phones this, 746 const picoos_uint8 ch) { 747 return (KTAB_PPROP_VOWEL & ((ktabphones_SubObj)this)->props[ch]); 748 } 749 picoos_uint8 picoktab_hasDiphthProp(const picoktab_Phones this, 750 const picoos_uint8 ch) { 751 return (KTAB_PPROP_DIPHTH & ((ktabphones_SubObj)this)->props[ch]); 752 } 753 picoos_uint8 picoktab_hasGlottProp(const picoktab_Phones this, 754 const picoos_uint8 ch) { 755 return (KTAB_PPROP_GLOTT & ((ktabphones_SubObj)this)->props[ch]); 756 } 757 picoos_uint8 picoktab_hasNonsyllvowelProp(const picoktab_Phones this, 758 const picoos_uint8 ch) { 759 return (KTAB_PPROP_NONSYLLVOWEL & ((ktabphones_SubObj)this)->props[ch]); 760 } 761 picoos_uint8 picoktab_hasSyllconsProp(const picoktab_Phones this, 762 const picoos_uint8 ch) { 763 return (KTAB_PPROP_SYLLCONS & ((ktabphones_SubObj)this)->props[ch]); 764 } 765 766 picoos_bool picoktab_isSyllCarrier(const picoktab_Phones this, 767 const picoos_uint8 ch) { 768 picoos_uint8 props; 769 props = ((ktabphones_SubObj)this)->props[ch]; 770 return (((KTAB_PPROP_VOWEL & props) && 771 !(KTAB_PPROP_NONSYLLVOWEL & props)) 772 || (KTAB_PPROP_SYLLCONS & props)); 773 } 774 775 picoos_bool picoktab_isPrimstress(const picoktab_Phones this, 776 const picoos_uint8 ch) { 777 return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_PRIMSTRESS]); 778 } 779 picoos_bool picoktab_isSecstress(const picoktab_Phones this, 780 const picoos_uint8 ch) { 781 return (ch == 
((ktabphones_SubObj)this)->specids[KTAB_IND_SECSTRESS]); 782 } 783 picoos_bool picoktab_isSyllbound(const picoktab_Phones this, 784 const picoos_uint8 ch) { 785 return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_SYLLBOUND]); 786 } 787 picoos_bool picoktab_isWordbound(const picoktab_Phones this, 788 const picoos_uint8 ch) { 789 return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_WORDBOUND]); 790 } 791 picoos_bool picoktab_isPause(const picoktab_Phones this, 792 const picoos_uint8 ch) { 793 return (ch == ((ktabphones_SubObj)this)->specids[KTAB_IND_PAUSE]); 794 } 795 796 picoos_uint8 picoktab_getPrimstressID(const picoktab_Phones this) { 797 return ((ktabphones_SubObj)this)->specids[KTAB_IND_PRIMSTRESS]; 798 } 799 picoos_uint8 picoktab_getSecstressID(const picoktab_Phones this) { 800 return ((ktabphones_SubObj)this)->specids[KTAB_IND_SECSTRESS]; 801 } 802 picoos_uint8 picoktab_getSyllboundID(const picoktab_Phones this) { 803 return ((ktabphones_SubObj)this)->specids[KTAB_IND_SYLLBOUND]; 804 } 805 picoos_uint8 picoktab_getWordboundID(const picoktab_Phones this) { 806 return ((ktabphones_SubObj)this)->specids[KTAB_IND_WORDBOUND]; 807 } 808 picoos_uint8 picoktab_getPauseID(const picoktab_Phones this) { 809 return ((ktabphones_SubObj)this)->specids[KTAB_IND_PAUSE]; 810 } 811 812 /* ************************************************************/ 813 /* Pos */ 814 /* ************************************************************/ 815 816 /* overview binary file format for pos kb: 817 818 pos-kb = header posids 819 header = {COUNT2 OFFS2}=8 820 posids = {POSID1 {PARTID1}0:8}1: 821 822 where POSID1 is the value of the (combined) part-of-speech symbol, 823 and {PARTID1} are the symbol values of its components (empty if it 824 is not a combined symbol). The {PARTID1} list is sorted. 825 Part-of-speech symbols with equal number of components are grouped 826 together. 

    The header contains information about these groups:

    COUNT2 specifies the number of elements in the group, and OFFS2
    specifies the offset (relative to the beginning of the kb) where
    the group data starts, i.e.:

    25   32  -> 25 not-combined elements, starting at offset 32
    44   57  -> 44 elements composed of 2 symbols, starting at offset 57
    23  189  -> 23 elements composed of 3 symbols, starting at offset 189
    ...

    Currently, each symbol may be composed of up to 8 other symbols.
    Therefore, the header has 8 entries, too. The header starts with
    the unique POS list, and then in increasing order, 2 symbols, 3
    symbols,...

    For illustration, a printf dump of such a kb:

     25   32
     44   57
     23  189
     12  281
      4  341
      1  365
      0    0
      0    0
     33 |
     34 |
     35 |
     60 |
     etc.
     36 |  35  60
     50 |  35  95
     51 |  35  97
     58 |  35 120
     59 |  35 131
     61 |  60  75
     63 |  60  95
     64 |  60  97
     etc.
     42 |  35  60 117
     44 |  35  60 131
     45 |  35  73  97
     48 |  35  84  97
     54 |  35  97 131
     56 |  35 113 120
     57 |  35 117 120
     62 |  60  84 122
     etc.
877 */ 878 879 typedef struct ktabpos_subobj *ktabpos_SubObj; 880 881 typedef struct ktabpos_subobj { 882 picoos_uint16 nrcomb[PICOKTAB_MAXNRPOS_IN_COMB]; 883 picoos_uint8 *nrcombstart[PICOKTAB_MAXNRPOS_IN_COMB]; 884 } ktabpos_subobj_t; 885 886 887 static pico_status_t ktabPosInitialize(register picoknow_KnowledgeBase this, 888 picoos_Common common) { 889 ktabpos_subobj_t *ktabpos; 890 picoos_uint16 osprev; 891 picoos_uint16 os, pos; 892 picoos_uint8 i; 893 894 PICODBG_DEBUG(("start")); 895 896 if (NULL == this || NULL == this->subObj) { 897 return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, 898 NULL, NULL); 899 } 900 ktabpos = (ktabpos_subobj_t *)this->subObj; 901 902 os = 0; 903 for (i = 0, pos = 0; i < PICOKTAB_MAXNRPOS_IN_COMB; i++, pos += 4) { 904 ktabpos->nrcomb[i] = ((picoos_uint16)(this->base[pos+1])) << 8 | 905 this->base[pos]; 906 if (ktabpos->nrcomb[i] > 0) { 907 osprev = os; 908 os = ((picoos_uint16)(this->base[pos+3])) << 8 | this->base[pos+2]; 909 ktabpos->nrcombstart[i] = &(this->base[os]); 910 PICODBG_TRACE(("i %d, pos %d, nr %d, osprev %d, os %d", i, pos, 911 ktabpos->nrcomb[i], osprev, os)); 912 if (osprev >= os) { 913 /* cannot be, in a valid kb */ 914 return picoos_emRaiseException(common->em, 915 PICO_EXC_FILE_CORRUPT, 916 NULL, NULL); 917 } 918 } else { 919 if (i == 0) { 920 /* cannot be, in a valid kb */ 921 return picoos_emRaiseException(common->em, 922 PICO_EXC_FILE_CORRUPT, 923 NULL, NULL); 924 } 925 ktabpos->nrcombstart[i] = NULL; 926 } 927 } 928 return PICO_OK; 929 } 930 931 static pico_status_t ktabPosSubObjDeallocate(register picoknow_KnowledgeBase this, 932 picoos_MemoryManager mm) { 933 if (NULL != this) { 934 picoos_deallocate(mm, (void *) &this->subObj); 935 } 936 return PICO_OK; 937 } 938 939 pico_status_t picoktab_specializePosKnowledgeBase(picoknow_KnowledgeBase this, 940 picoos_Common common) { 941 if (NULL == this) { 942 return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, 943 NULL, NULL); 944 } 945 
this->subDeallocate = ktabPosSubObjDeallocate; 946 this->subObj = picoos_allocate(common->mm, sizeof(ktabpos_subobj_t)); 947 if (NULL == this->subObj) { 948 return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, 949 NULL, NULL); 950 } 951 return ktabPosInitialize(this, common); 952 } 953 954 picoktab_Pos picoktab_getPos(picoknow_KnowledgeBase this) { 955 if (NULL == this) { 956 return NULL; 957 } else { 958 return (picoktab_Pos) this->subObj; 959 } 960 } 961 962 963 /* Pos methods */ 964 965 static picoos_int16 ktab_isEqualPosGroup(const picoos_uint8 *grp1, 966 const picoos_uint8 *grp2, 967 picoos_uint8 len) 968 { 969 /* if both, grp1 and grp2 would be sorted in ascending order 970 we could implement a function picoktab_comparePosGroup in 971 a similar manner as strcmp */ 972 973 picoos_uint16 i, j, equal; 974 975 equal = 1; 976 977 i = 0; 978 while (equal && (i < len)) { 979 /* search grp1[i] in grp2 */ 980 j = 0; 981 while ((j < len) && (grp1[i] != grp2[j])) { 982 j++; 983 } 984 equal = (j < len); 985 i++; 986 } 987 988 return equal; 989 } 990 991 992 picoos_bool picoktab_isUniquePos(const picoktab_Pos this, 993 const picoos_uint8 pos) { 994 ktabpos_subobj_t *ktabpos; 995 picoos_uint16 i; 996 997 /* speed-up possible with e.g. 
binary search */ 998 999 ktabpos = (ktabpos_subobj_t *)this; 1000 PICODBG_TRACE(("pos %d, nrcombinations %d", pos, ktabpos->nrcomb[0])); 1001 i = 0; 1002 while ((i < ktabpos->nrcomb[0]) && (pos > ktabpos->nrcombstart[0][i])) { 1003 PICODBG_TRACE(("compare with pos %d at position %d", 1004 ktabpos->nrcombstart[0][i], pos, i)); 1005 i++; 1006 } 1007 return ((i < ktabpos->nrcomb[0]) && (pos == ktabpos->nrcombstart[0][i])); 1008 } 1009 1010 1011 picoos_bool picoktab_isPartOfPosGroup(const picoktab_Pos this, 1012 const picoos_uint8 pos, 1013 const picoos_uint8 posgroup) 1014 { 1015 ktabpos_subobj_t *ktabpos; 1016 picoos_uint8 *grp; 1017 picoos_uint16 i, j, n, s, grplen; 1018 picoos_uint8 *e; 1019 picoos_uint8 found; 1020 1021 ktabpos = (ktabpos_subobj_t *) this; 1022 1023 grp = NULL; 1024 found = FALSE; 1025 grplen = 0; 1026 1027 /* currently, a linear search is required to find 'posgroup'; the 1028 knowledge base should be extended to allow for a faster search */ 1029 1030 /* treat case i==0, grplen==0, ie. pos == posgroup */ 1031 if (pos == posgroup) { 1032 found = TRUE; 1033 } 1034 1035 i = 1; 1036 while ((grp == NULL) && (i < PICOKTAB_MAXNRPOS_IN_COMB)) { 1037 n = ktabpos->nrcomb[i]; /* number of entries */ 1038 e = ktabpos->nrcombstart[i]; /* ptr to first entry */ 1039 s = i + 2; /* size of an entry in bytes */ 1040 /* was with while starting at 0: 1041 s = i > 0 ? 
i + 2 : 1; 1042 */ 1043 j = 0; 1044 while ((grp == NULL) && (j < n)) { 1045 if (posgroup == e[0]) { 1046 grp = e + 1; 1047 grplen = s - 1; 1048 } 1049 e += s; 1050 j++; 1051 } 1052 i++; 1053 } 1054 1055 /* test if 'pos' is contained in the components of 'posgroup' */ 1056 if (grp != NULL) { 1057 for (i = 0; !found && (i < grplen); i++) { 1058 if (pos == grp[i]) { 1059 found = TRUE; 1060 } 1061 } 1062 1063 /* just a way to test picoktab_getPosGroup */ 1064 /* 1065 PICODBG_ASSERT(picoktab_getPosGroup(this, grp, grplen) == posgroup); 1066 */ 1067 } 1068 1069 return found; 1070 } 1071 1072 1073 picoos_uint8 picoktab_getPosGroup(const picoktab_Pos this, 1074 const picoos_uint8 *poslist, 1075 const picoos_uint8 poslistlen) 1076 { 1077 picoos_uint8 poscomb; 1078 ktabpos_subobj_t *ktabpos; 1079 picoos_uint16 i, j, n, s; 1080 picoos_uint8 *e; 1081 1082 ktabpos = (ktabpos_subobj_t *) this; 1083 poscomb = 0; 1084 1085 if ((poslistlen > 0) && (poslistlen <= PICOKTAB_MAXNRPOS_IN_COMB)) { 1086 i = poslistlen - 1; 1087 if (i > 0) { 1088 n = ktabpos->nrcomb[i]; /* number of entries */ 1089 e = ktabpos->nrcombstart[i]; /* ptr to first entry */ 1090 s = i + 2; /* size of an entry in bytes */ 1091 j = 0; 1092 while (!poscomb && (j < n)) { 1093 if (ktab_isEqualPosGroup(poslist, e + 1, poslistlen)) { 1094 poscomb = *e; 1095 } 1096 e += s; 1097 j++; 1098 } 1099 if (!poscomb) { 1100 /* combination not found; shouldn't occur if lingware OK! */ 1101 /* contingency solution: take first */ 1102 PICODBG_WARN(("dynamically created POS combination not found in table; taking first (%i)",poslist[0])); 1103 poscomb = poslist[0]; 1104 } 1105 } else { /* not a composed POS */ 1106 poscomb = poslist[0]; 1107 } 1108 } 1109 1110 return poscomb; 1111 } 1112 1113 #ifdef __cplusplus 1114 } 1115 #endif 1116 1117 1118 /* end */ 1119