1 /* 2 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 /** 17 * @file picoklex.c 18 * 19 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland 20 * All rights reserved. 21 * 22 * History: 23 * - 2009-04-20 -- initial version 24 * 25 */ 26 #include "picoos.h" 27 #include "picodbg.h" 28 #include "picodata.h" 29 #include "picoknow.h" 30 #include "picoklex.h" 31 32 #ifdef __cplusplus 33 extern "C" { 34 #endif 35 #if 0 36 } 37 #endif 38 39 /* ************************************************************/ 40 /* lexicon */ 41 /* ************************************************************/ 42 43 /** 44 * @addtogroup picolex 45 * 46 overview: 47 - lex consists of optional searchindex and a non-empty list of lexblocks 48 - lexblocks are fixed size, at the start of a block there is also the 49 start of an entry 50 - using the searchindex a unambiguous lexblock can be determined which 51 contains the entry (or there is no entry) 52 - one lex entry has POS GRAPH PHON, all mandatory, but 53 - PHON can be empty string -> no pronunciation in the resulting TTS output 54 - PHON can be :G2P -> use G2P later to add pronunciation 55 - (POS,GRAPH) is a uniq key (only one entry allowed) 56 - (GRAPH) is almost a uniq key (2-4 entries with the same GRAPH, and 57 differing POS and differing PHON possible) 58 - for one graph we can have two or three solutions from the lex 59 which all need to be passed on the the next PU 60 - in this case GRAPH, POS, and PHON all must be available in lex 61 62 sizing: 63 - 3 bytes entry index -> 16MB addressable 64 - 2 bytes searchindex nr -> 64K blocks possible 65 - 5 bytes per searchindex entry 66 - 3 bytes for graph-prefix 67 - 2 bytes blockadr in searchindex -> 64K blocks possible 68 - lexblock size 512B: 69 - 32M possible 70 - with ~20 bytes per entry 71 -> max. average of ~26 entries to be searched per lookup 72 - overhead of ~10 bytes per block to sync with 73 block boundaries 74 - examples: 75 - 500KB lex -> 1000 blocks, 76 1000 entries in searchindex, ~25.6K lex-entries, 77 - ~5KB searchindex 78 ~10KB overhead for block sync 79 - 100KB lex -> 200 blocks, 80 200 entries in searchindex, ~5.1K lex-entries, 81 - ~1KB searchindex 82 ~2KB overhead for block sync 83 84 pil-file: lexicon knowledge base in binary form 85 86 lex-kb = content 87 88 content = searchindex {lexblock}1:NRBLOCKS2 89 90 lexblock = {lexentry}1: (lexblock size is fixed 512Bytes) 91 92 searchindex = NRBLOCKS2 {GRAPH1 GRAPH1 GRAPH1 LEXBLOCKIND2}=NRBLOCKS2 93 94 lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1 95 LENPOSPHON1 POS1 {PHON1}=LENPOSPHON1-2 96 97 - special cases: 98 - PHON is empty string (no pronunciation in the resulting TTS output): 99 lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1 2 POS1 100 - PHON can be :G2P -> use G2P later to add pronunciation: 101 lexentry = LENGRAPH1 {GRAPH1}=LENGRAPH1-1 3 POS1 <reserved-phon-val=5> 102 - multi-byte values always little endian 103 */ 104 105 106 /* ************************************************************/ 107 /* lexicon data defines */ 108 /* may not be changed with current implementation */ 109 /* ************************************************************/ 110 111 /* nr bytes of nrblocks info */ 112 #define PICOKLEX_LEX_NRBLOCKS_SIZE 2 113 114 /* search index entry: - nr graphs 115 - nr bytes of block index 116 - nr bytes per entry, NRGRAPHS*INDSIZE */ 117 #define PICOKLEX_LEX_SIE_NRGRAPHS 3 118 #define PICOKLEX_LEX_SIE_INDSIZE 2 119 #define PICOKLEX_LEX_SIE_SIZE 5 120 121 /* nr of bytes per lexblock */ 122 #define PICOKLEX_LEXBLOCK_SIZE 512 123 124 125 /* reserved values in klex to indicate :G2P needed for a lexentry */ 126 #define PICOKLEX_NEEDS_G2P 5 127 128 129 /* ************************************************************/ 130 /* lexicon type and loading */ 131 /* ************************************************************/ 132 133 /** object : LexKnowledgeBase 134 * shortcut : klex 135 * derived from : picoknow_KnowledgeBase 136 */ 137 138 typedef struct klex_subobj *klex_SubObj; 139 140 typedef struct klex_subobj 141 { 142 picoos_uint16 nrblocks; /* nr lexblocks = nr eles in searchind */ 143 picoos_uint8 *searchind; 144 picoos_uint8 *lexblocks; 145 } klex_subobj_t; 146 147 148 static pico_status_t klexInitialize(register picoknow_KnowledgeBase this, 149 picoos_Common common) 150 { 151 picoos_uint32 curpos = 0; 152 klex_subobj_t *klex; 153 154 PICODBG_DEBUG(("start")); 155 156 /* check whether (this->size != 0) done before calling this function */ 157 158 if (NULL == this || NULL == this->subObj) { 159 return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, 160 NULL, NULL); 161 } 162 klex = (klex_subobj_t *) this->subObj; 163 164 if (PICO_OK == picoos_read_mem_pi_uint16(this->base, &curpos, 165 &(klex->nrblocks))) { 166 if (klex->nrblocks > 0) { 167 PICODBG_DEBUG(("nr blocks: %i, curpos: %i", klex->nrblocks,curpos)); 168 klex->searchind = this->base + curpos; 169 } else { 170 klex->searchind = NULL; 171 } 172 klex->lexblocks = this->base + PICOKLEX_LEX_NRBLOCKS_SIZE + 173 (klex->nrblocks * (PICOKLEX_LEX_SIE_SIZE)); 174 return PICO_OK; 175 } else { 176 return picoos_emRaiseException(common->em, PICO_EXC_FILE_CORRUPT, 177 NULL, NULL); 178 } 179 } 180 181 182 static pico_status_t klexSubObjDeallocate(register picoknow_KnowledgeBase this, 183 picoos_MemoryManager mm) 184 { 185 if (NULL != this) { 186 picoos_deallocate(mm, (void *) &this->subObj); 187 } 188 return PICO_OK; 189 } 190 191 192 /* we don't offer a specialized constructor for a LexKnowledgeBase but 193 * instead a "specializer" of an allready existing generic 194 * picoknow_KnowledgeBase */ 195 196 pico_status_t picoklex_specializeLexKnowledgeBase(picoknow_KnowledgeBase this, 197 picoos_Common common) 198 { 199 if (NULL == this) { 200 return picoos_emRaiseException(common->em, PICO_EXC_KB_MISSING, 201 NULL, NULL); 202 } 203 if (this->size > 0) { 204 this->subDeallocate = klexSubObjDeallocate; 205 this->subObj = picoos_allocate(common->mm, sizeof(klex_subobj_t)); 206 if (NULL == this->subObj) { 207 return picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, 208 NULL, NULL); 209 } 210 return klexInitialize(this, common); 211 } else { 212 /* some dummy klex */ 213 return PICO_OK; 214 } 215 } 216 217 /* for now we don't need to do anything special for the main lex */ 218 /* 219 pico_status_t picoklex_specializeMainLexKnowledgeBase( 220 picoknow_KnowledgeBase this, 221 picoos_Common common) 222 { 223 return picoklex_specializeLexKnowledgeBase(this,common); 224 } 225 */ 226 227 228 /* ************************************************************/ 229 /* lexicon getLex */ 230 /* ************************************************************/ 231 232 picoklex_Lex picoklex_getLex(picoknow_KnowledgeBase this) 233 { 234 if (NULL == this) { 235 return NULL; 236 } else { 237 return (picoklex_Lex) this->subObj; 238 } 239 } 240 241 242 /* ************************************************************/ 243 /* functions on searchindex */ 244 /* ************************************************************/ 245 246 247 static picoos_uint32 klex_getSearchIndexVal(const klex_SubObj this, 248 picoos_uint16 index) 249 { 250 picoos_uint32 pos, val; 251 pos = index * PICOKLEX_LEX_SIE_SIZE; 252 val = this->searchind[pos]; 253 val = (val << 8) + this->searchind[pos + 1]; 254 val = (val << 8) + this->searchind[pos + 2]; 255 return val; 256 } 257 258 259 /* Determine first lexblock containing entries for specified 260 grapheme. */ 261 262 static picoos_uint16 klex_getLexblockNr(const klex_SubObj this, 263 const picoos_uint8 *graphsi) { 264 /* graphsi is of len PICOKLEX_LEX_SI_NGRAPHS */ 265 picoos_int32 low, mid, high; 266 picoos_uint32 searchval, indval; 267 268 /* PICOKLEX_LEX_SIE_NRGRAPHS */ 269 270 /* convert graph-prefix to number with 'lexicographic' ordering */ 271 searchval = graphsi[0]; 272 searchval = (searchval << 8) + graphsi[1]; 273 searchval = (searchval << 8) + graphsi[2]; 274 275 low = 0; 276 high = this->nrblocks; 277 278 /* do binary search */ 279 while (low < high) { 280 mid = (low + high) / 2; 281 indval = klex_getSearchIndexVal(this, mid); 282 if (indval < searchval) { 283 low = mid + 1; 284 } else { 285 high = mid; 286 } 287 } 288 PICODBG_ASSERT(high == low); 289 /* low points to the first entry greater than or equal to searchval */ 290 291 if (low < this->nrblocks) { 292 indval = klex_getSearchIndexVal(this, low); 293 if (indval > searchval) { 294 low--; 295 /* if there are identical elements in the search index we have 296 to move to the first one */ 297 if (low > 0) { 298 indval = klex_getSearchIndexVal(this, low); 299 while (indval == klex_getSearchIndexVal(this, low-1)) { 300 low--; 301 } 302 } 303 } 304 } else { 305 low = this->nrblocks - 1; 306 } 307 308 #if defined(PICO_DEBUG) 309 { 310 picoos_uint32 pos = low * PICOKLEX_LEX_SIE_SIZE; 311 PICODBG_DEBUG(("binary search result is %c%c%c (%d)", 312 this->searchind[pos], this->searchind[pos + 1], 313 this->searchind[pos + 2], low)); 314 } 315 #endif 316 317 return (picoos_uint16) low; 318 } 319 320 321 /* Determine number of adjacent lexblocks containing entries for 322 the same grapheme search prefix (identified by search index). */ 323 324 static picoos_uint16 klex_getLexblockRange(const klex_SubObj this, 325 picoos_uint16 index) 326 { 327 picoos_uint16 count; 328 picoos_uint32 sval1, sval2; 329 330 sval1 = klex_getSearchIndexVal(this, index); 331 332 #if defined(PICO_DEBUG) 333 /* 'index' must point to first lexblock of its kind */ 334 if (index > 0) { 335 sval2 = klex_getSearchIndexVal(this, index - 1); 336 PICODBG_ASSERT(sval1 != sval2); 337 } 338 #endif 339 340 index++; 341 sval2 = klex_getSearchIndexVal(this, index); 342 343 count = 1; 344 while (sval1 == sval2) { 345 count++; 346 index++; 347 sval2 = klex_getSearchIndexVal(this, index); 348 } 349 350 return count; 351 } 352 353 354 /* ************************************************************/ 355 /* functions on single lexblock */ 356 /* ************************************************************/ 357 358 static picoos_int8 klex_lexMatch(picoos_uint8 *lexentry, 359 const picoos_uint8 *graph, 360 const picoos_uint16 graphlen) { 361 picoos_uint8 i; 362 picoos_uint8 lexlen; 363 picoos_uint8 *lexgraph; 364 365 lexlen = lexentry[0] - 1; 366 lexgraph = &(lexentry[1]); 367 for (i=0; (i<graphlen) && (i<lexlen); i++) { 368 PICODBG_TRACE(("%d|%d graph|lex: %c|%c", graphlen, lexlen, 369 graph[i], lexgraph[i])); 370 if (lexgraph[i] < graph[i]) { 371 return -1; 372 } else if (lexgraph[i] > graph[i]) { 373 return 1; 374 } 375 } 376 if (graphlen == lexlen) { 377 return 0; 378 } else if (lexlen < graphlen) { 379 return -1; 380 } else { 381 return 1; 382 } 383 } 384 385 386 static void klex_setLexResult(const picoos_uint8 *lexentry, 387 const picoos_uint32 lexpos, 388 picoklex_lexl_result_t *lexres) { 389 picoos_uint8 i; 390 391 /* check if :G2P */ 392 if ((2 < (lexentry[lexentry[0]])) && ((lexentry[lexentry[0] + 2]) == PICOKLEX_NEEDS_G2P)) { 393 /* set pos */ 394 lexres->posind[0] = lexentry[lexentry[0] + 1]; 395 /* set rest */ 396 lexres->phonfound = FALSE; 397 lexres->posindlen = 1; 398 lexres->nrres = 1; 399 PICODBG_DEBUG(("result %d :G2P", lexres->nrres)); 400 } else { 401 i = lexres->nrres * (PICOKLEX_POSIND_SIZE); 402 lexres->posindlen += PICOKLEX_POSIND_SIZE; 403 lexres->phonfound = TRUE; 404 /* set pos */ 405 lexres->posind[i++] = lexentry[lexentry[0] + 1]; 406 /* set ind, PICOKLEX_IND_SIZE */ 407 lexres->posind[i++] = 0x000000ff & (lexpos); 408 lexres->posind[i++] = 0x000000ff & (lexpos >> 8); 409 lexres->posind[i] = 0x000000ff & (lexpos >> 16); 410 lexres->nrres++; 411 PICODBG_DEBUG(("result %d", lexres->nrres)); 412 } 413 } 414 415 416 static void klex_lexblockLookup(klex_SubObj this, 417 const picoos_uint32 lexposStart, 418 const picoos_uint32 lexposEnd, 419 const picoos_uint8 *graph, 420 const picoos_uint16 graphlen, 421 picoklex_lexl_result_t *lexres) { 422 picoos_uint32 lexpos; 423 picoos_int8 rv; 424 425 lexres->nrres = 0; 426 427 lexpos = lexposStart; 428 rv = -1; 429 while ((rv < 0) && (lexpos < lexposEnd)) { 430 431 rv = klex_lexMatch(&(this->lexblocks[lexpos]), graph, graphlen); 432 433 if (rv == 0) { /* found */ 434 klex_setLexResult(&(this->lexblocks[lexpos]), lexpos, lexres); 435 if (lexres->phonfound) { 436 /* look for more results, up to MAX_NRRES, don't even 437 check if more results would be available */ 438 while ((lexres->nrres < PICOKLEX_MAX_NRRES) && 439 (lexpos < lexposEnd)) { 440 lexpos += this->lexblocks[lexpos]; 441 lexpos += this->lexblocks[lexpos]; 442 /* if there are no more entries in this block, advance 443 to next block by skipping all zeros */ 444 while ((this->lexblocks[lexpos] == 0) && 445 (lexpos < lexposEnd)) { 446 lexpos++; 447 } 448 if (lexpos < lexposEnd) { 449 if (klex_lexMatch(&(this->lexblocks[lexpos]), graph, 450 graphlen) == 0) { 451 klex_setLexResult(&(this->lexblocks[lexpos]), 452 lexpos, lexres); 453 } else { 454 /* no more results, quit loop */ 455 lexpos = lexposEnd; 456 } 457 } 458 } 459 } else { 460 /* :G2P mark */ 461 } 462 } else if (rv < 0) { 463 /* not found, goto next entry */ 464 lexpos += this->lexblocks[lexpos]; 465 lexpos += this->lexblocks[lexpos]; 466 /* if there are no more entries in this block, advance 467 to next block by skipping all zeros */ 468 while ((this->lexblocks[lexpos] == 0) && (lexpos < lexposEnd)) { 469 lexpos++; 470 } 471 } else { 472 /* rv > 0, not found, won't show up later in block */ 473 } 474 } 475 } 476 477 478 /* ************************************************************/ 479 /* lexicon lookup functions */ 480 /* ************************************************************/ 481 482 picoos_uint8 picoklex_lexLookup(const picoklex_Lex this, 483 const picoos_uint8 *graph, 484 const picoos_uint16 graphlen, 485 picoklex_lexl_result_t *lexres) { 486 picoos_uint16 lbnr, lbc; 487 picoos_uint32 lexposStart, lexposEnd; 488 picoos_uint8 i; 489 picoos_uint8 tgraph[PICOKLEX_LEX_SIE_NRGRAPHS]; 490 klex_SubObj klex = (klex_SubObj) this; 491 492 if (NULL == klex) { 493 PICODBG_ERROR(("no lexicon loaded")); 494 /* no exception here needed, already checked at initialization */ 495 return FALSE; 496 } 497 498 lexres->nrres = 0; 499 lexres->posindlen = 0; 500 lexres->phonfound = FALSE; 501 502 for (i = 0; i<PICOKLEX_LEX_SIE_NRGRAPHS; i++) { 503 if (i < graphlen) { 504 tgraph[i] = graph[i]; 505 } else { 506 tgraph[i] = '\0'; 507 } 508 } 509 PICODBG_DEBUG(("tgraph: %c%c%c", tgraph[0],tgraph[1],tgraph[2])); 510 511 if ((klex->nrblocks) == 0) { 512 /* no searchindex, no lexblock */ 513 PICODBG_WARN(("no searchindex, no lexblock")); 514 return FALSE; 515 } else { 516 lbnr = klex_getLexblockNr(klex, tgraph); 517 PICODBG_ASSERT(lbnr < klex->nrblocks); 518 lbc = klex_getLexblockRange(klex, lbnr); 519 PICODBG_ASSERT((lbc >= 1) && (lbc <= klex->nrblocks)); 520 } 521 PICODBG_DEBUG(("lexblock nr: %d (#%d)", lbnr, lbc)); 522 523 lexposStart = lbnr * PICOKLEX_LEXBLOCK_SIZE; 524 lexposEnd = lexposStart + lbc * PICOKLEX_LEXBLOCK_SIZE; 525 526 PICODBG_DEBUG(("lookup start, lexpos range %d..%d", lexposStart,lexposEnd)); 527 klex_lexblockLookup(klex, lexposStart, lexposEnd, graph, graphlen, lexres); 528 PICODBG_DEBUG(("lookup done, %d found", lexres->nrres)); 529 530 return (lexres->nrres > 0); 531 } 532 533 534 picoos_uint8 picoklex_lexIndLookup(const picoklex_Lex this, 535 const picoos_uint8 *ind, 536 const picoos_uint8 indlen, 537 picoos_uint8 *pos, 538 picoos_uint8 **phon, 539 picoos_uint8 *phonlen) { 540 picoos_uint32 pentry; 541 klex_SubObj klex = (klex_SubObj) this; 542 543 /* check indlen */ 544 if (indlen != PICOKLEX_IND_SIZE) { 545 return FALSE; 546 } 547 548 /* PICOKLEX_IND_SIZE */ 549 pentry = 0x000000ff & (ind[0]); 550 pentry |= ((picoos_uint32)(ind[1]) << 8); 551 pentry |= ((picoos_uint32)(ind[2]) << 16); 552 553 /* check ind if it is within lexblocks byte stream, if not, return FALSE */ 554 if (pentry >= ((picoos_uint32)klex->nrblocks * PICOKLEX_LEXBLOCK_SIZE)) { 555 return FALSE; 556 } 557 558 pentry += (klex->lexblocks[pentry]); 559 *phonlen = (klex->lexblocks[pentry++]) - 2; 560 *pos = klex->lexblocks[pentry++]; 561 *phon = &(klex->lexblocks[pentry]); 562 563 PICODBG_DEBUG(("pentry: %d, phonlen: %d", pentry, *phonlen)); 564 return TRUE; 565 } 566 567 #ifdef __cplusplus 568 } 569 #endif 570 571 572 /* end */ 573