1 /* 2 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 /** 17 * @file picowa.c 18 * 19 * word analysis PU - lexicon lookup and POS prediction 20 * 21 * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland 22 * All rights reserved. 23 * 24 * History: 25 * - 2009-04-20 -- initial version 26 * 27 */ 28 29 #include "picoos.h" 30 #include "picodbg.h" 31 #include "picodata.h" 32 #include "picowa.h" 33 #include "picoklex.h" 34 #include "picokdt.h" 35 #include "picoktab.h" 36 37 #ifdef __cplusplus 38 extern "C" { 39 #endif 40 #if 0 41 } 42 #endif 43 44 /* PU waStep states */ 45 #define WA_STEPSTATE_COLLECT 0 46 #define WA_STEPSTATE_PROCESS 1 47 #define WA_STEPSTATE_FEED 2 48 49 50 /* subobject : WordAnaUnit 51 * shortcut : wa 52 * context size : one item 53 */ 54 typedef struct wa_subobj { 55 picoos_uint8 procState; /* for next processing step decision */ 56 57 /* one item only */ 58 picoos_uint8 inBuf[PICOWA_MAXITEMSIZE]; /* internal input buffer */ 59 picoos_uint16 inBufSize; /* actually allocated size */ 60 picoos_uint16 inLen; /* length of item in inBuf, 0 for empty buf */ 61 62 picoos_uint8 outBuf[PICOWA_MAXITEMSIZE]; /* internal output buffer */ 63 picoos_uint16 outBufSize; /* actually allocated size */ 64 picoos_uint16 outLen; /* length of item in outBuf, 0 for empty buf */ 65 66 /* lex knowledge base */ 67 picoklex_Lex lex; 68 69 /* ulex knowledge bases */ 70 picoos_uint8 numUlex; 71 picoklex_Lex ulex[PICOKNOW_MAX_NUM_ULEX]; 72 73 /* tab knowledge base */ 74 picoktab_Pos tabpos; 75 76 /* dtposp knowledge base */ 77 picokdt_DtPosP dtposp; 78 } wa_subobj_t; 79 80 81 static pico_status_t waInitialize(register picodata_ProcessingUnit this, picoos_int32 resetMode) { 82 picoos_uint8 i; 83 picoklex_Lex ulex; 84 wa_subobj_t * wa; 85 86 picoknow_kb_id_t ulexKbIds[PICOKNOW_MAX_NUM_ULEX] = PICOKNOW_KBID_ULEX_ARRAY; 87 88 PICODBG_DEBUG(("calling")); 89 90 if (NULL == this || NULL == this->subObj) { 91 return (picodata_step_result_t) picoos_emRaiseException(this->common->em, 92 PICO_ERR_NULLPTR_ACCESS, NULL, NULL); 93 } 94 wa = (wa_subobj_t *) this->subObj; 95 wa->procState = WA_STEPSTATE_COLLECT; 96 wa->inBufSize = PICOWA_MAXITEMSIZE; 97 wa->inLen = 0; 98 wa->outBufSize = PICOWA_MAXITEMSIZE; 99 wa->outLen = 0; 100 101 if (resetMode == PICO_RESET_SOFT) { 102 /*following initializations needed only at startup or after a full reset*/ 103 return PICO_OK; 104 } 105 /* kb lex */ 106 wa->lex = picoklex_getLex(this->voice->kbArray[PICOKNOW_KBID_LEX_MAIN]); 107 if (wa->lex == NULL) { 108 return picoos_emRaiseException(this->common->em, PICO_EXC_KB_MISSING, 109 NULL, NULL); 110 } 111 PICODBG_DEBUG(("got lex")); 112 113 /* kb ulex[] */ 114 wa->numUlex = 0; 115 for (i = 0; i<PICOKNOW_MAX_NUM_ULEX; i++) { 116 ulex = picoklex_getLex(this->voice->kbArray[ulexKbIds[i]]); 117 if (NULL != ulex) { 118 wa->ulex[wa->numUlex++] = ulex; 119 } 120 } 121 PICODBG_DEBUG(("got %i user lexica", wa->numUlex)); 122 123 /* kb tabpos */ 124 wa->tabpos = 125 picoktab_getPos(this->voice->kbArray[PICOKNOW_KBID_TAB_POS]); 126 if (wa->tabpos == NULL) { 127 return picoos_emRaiseException(this->common->em, PICO_EXC_KB_MISSING, 128 NULL, NULL); 129 } 130 PICODBG_DEBUG(("got tabpos")); 131 132 /* kb dtposp */ 133 wa->dtposp = picokdt_getDtPosP(this->voice->kbArray[PICOKNOW_KBID_DT_POSP]); 134 if (wa->dtposp == NULL) { 135 return picoos_emRaiseException(this->common->em, PICO_EXC_KB_MISSING, 136 NULL, NULL); 137 } 138 PICODBG_DEBUG(("got dtposp")); 139 return PICO_OK; 140 } 141 142 static picodata_step_result_t waStep(register picodata_ProcessingUnit this, 143 picoos_int16 mode, 144 picoos_uint16 *numBytesOutput); 145 146 static pico_status_t waTerminate(register picodata_ProcessingUnit this) { 147 return PICO_OK; 148 } 149 150 static pico_status_t waSubObjDeallocate(register picodata_ProcessingUnit this, 151 picoos_MemoryManager mm) { 152 if (NULL != this) { 153 picoos_deallocate(this->common->mm, (void *) &this->subObj); 154 } 155 mm = mm; /* avoid warning "var not used in this function"*/ 156 return PICO_OK; 157 } 158 159 160 picodata_ProcessingUnit picowa_newWordAnaUnit(picoos_MemoryManager mm, 161 picoos_Common common, 162 picodata_CharBuffer cbIn, 163 picodata_CharBuffer cbOut, 164 picorsrc_Voice voice) { 165 picodata_ProcessingUnit this; 166 167 this = picodata_newProcessingUnit(mm, common, cbIn, cbOut, voice); 168 if (this == NULL) { 169 return NULL; 170 } 171 172 this->initialize = waInitialize; 173 PICODBG_DEBUG(("set this->step to waStep")); 174 this->step = waStep; 175 this->terminate = waTerminate; 176 this->subDeallocate = waSubObjDeallocate; 177 this->subObj = picoos_allocate(mm, sizeof(wa_subobj_t)); 178 if (this->subObj == NULL) { 179 picoos_deallocate(mm, (void *)&this); 180 picoos_emRaiseException(common->em, PICO_EXC_OUT_OF_MEM, NULL, NULL); 181 return NULL; 182 } 183 184 waInitialize(this, PICO_RESET_FULL); 185 return this; 186 } 187 188 /* ***********************************************************************/ 189 /* WORDGRAPH proc functions */ 190 /* ***********************************************************************/ 191 192 static picoos_uint8 waClassifyPos(register picodata_ProcessingUnit this, 193 register wa_subobj_t *wa, 194 const picoos_uint8 *graph, 195 const picoos_uint16 graphlen) { 196 picokdt_classify_result_t dtres; 197 picoos_uint8 specchar; 198 picoos_uint16 i; 199 200 PICODBG_DEBUG(("graphlen %d", graphlen)); 201 202 /* check existence of special char (e.g. hyphen) in graph: 203 for now, check existence of hard-coded ascii hyphen, 204 ie. preproc needs to match all UTF8 hyphens to the ascii 205 hyphen. */ 206 /* @todo : consider specifying special char(s) in lingware. */ 207 specchar = FALSE; 208 i = 0; 209 while ((i < graphlen) && (!specchar)) { 210 if (graph[i++] == '-') { 211 specchar = TRUE; 212 } 213 } 214 215 /* construct input vector, which is set in dtposp */ 216 if (!picokdt_dtPosPconstructInVec(wa->dtposp, graph, graphlen, specchar)) { 217 /* error constructing invec */ 218 PICODBG_WARN(("problem with invec")); 219 picoos_emRaiseWarning(this->common->em, PICO_WARN_INVECTOR, NULL, NULL); 220 return PICODATA_ITEMINFO1_ERR; 221 } 222 223 /* classify */ 224 if (!picokdt_dtPosPclassify(wa->dtposp)) { 225 /* error doing classification */ 226 PICODBG_WARN(("problem classifying")); 227 picoos_emRaiseWarning(this->common->em, PICO_WARN_CLASSIFICATION, 228 NULL, NULL); 229 return PICODATA_ITEMINFO1_ERR; 230 } 231 232 /* decompose */ 233 if (!picokdt_dtPosPdecomposeOutClass(wa->dtposp, &dtres)) { 234 /* error decomposing */ 235 PICODBG_WARN(("problem decomposing")); 236 picoos_emRaiseWarning(this->common->em, PICO_WARN_OUTVECTOR, 237 NULL, NULL); 238 return PICODATA_ITEMINFO1_ERR; 239 } 240 241 if (dtres.set) { 242 PICODBG_DEBUG(("class %d", dtres.class)); 243 return (picoos_uint8)dtres.class; 244 } else { 245 PICODBG_WARN(("result not set")); 246 picoos_emRaiseWarning(this->common->em, PICO_WARN_CLASSIFICATION, 247 NULL, NULL); 248 return PICODATA_ITEMINFO1_ERR; 249 } 250 } 251 252 253 static pico_status_t waProcessWordgraph(register picodata_ProcessingUnit this, 254 register wa_subobj_t *wa /*inout*/, 255 picodata_itemhead_t *head /*inout*/, 256 const picoos_uint8 *content) { 257 pico_status_t status; 258 picoklex_lexl_result_t lexres; 259 picoos_uint8 posbuf[PICOKTAB_MAXNRPOS_IN_COMB]; 260 picoos_uint8 i; 261 picoos_uint8 foundIndex; 262 picoos_bool found; 263 264 265 PICODBG_DEBUG(("type %c, len %d", head->type, head->len)); 266 267 /* do lookup 268 if no entry found: 269 do POS prediction: -> WORDGRAPH(POSes,NA)graph 270 else: 271 if incl-phone: 272 N entries possible -> WORDINDEX(POSes,NA)POS1|ind1...POSN|indN 273 (N in {1,...,PICOKLEX_MAX_NRRES}, now up to 4) 274 else: 275 no phone, one entry -> WORDGRAPH(POS,NA)graph 276 */ 277 278 found = FALSE; 279 i = 0; 280 while (!found && (i < wa->numUlex)) { 281 found = picoklex_lexLookup(wa->ulex[i], content, head->len, &lexres); 282 i++; 283 } 284 /* note that if found, i will be incremented nevertheless, so i >= 1 */ 285 if (found) { 286 foundIndex = i; 287 } else { 288 foundIndex = 0; 289 } 290 if (!found && !picoklex_lexLookup(wa->lex, content, head->len, &lexres)) { 291 /* no lex entry found, WORDGRAPH(POS,NA)graph */ 292 if (PICO_OK == picodata_copy_item(wa->inBuf, wa->inLen, 293 wa->outBuf, wa->outBufSize, 294 &wa->outLen)) { 295 wa->inLen = 0; 296 /* predict and modify pos in info1 */ 297 if (PICO_OK != picodata_set_iteminfo1(wa->outBuf, wa->outLen, 298 waClassifyPos(this, wa, content, head->len))) { 299 return picoos_emRaiseException(this->common->em, 300 PICO_EXC_BUF_OVERFLOW,NULL,NULL); 301 } 302 } 303 304 } else { /* at least one entry found */ 305 PICODBG_DEBUG(("at least one entry found in lexicon %i",foundIndex)); 306 if (lexres.phonfound) { /* incl. ind-phone and possibly multi-ent. */ 307 if (lexres.nrres > PICOKLEX_MAX_NRRES) { 308 /* not possible with system lexicon, needs to be 309 ensured for user lex too */ 310 picoos_emRaiseWarning(this->common->em, PICO_WARN_FALLBACK,NULL, 311 (picoos_char *)"using %d lexicon lookup results", 312 PICOKLEX_MAX_NRRES); 313 lexres.nrres = PICOKLEX_MAX_NRRES; 314 } 315 head->type = PICODATA_ITEM_WORDINDEX; 316 if (lexres.nrres == 1) { 317 head->info1 = lexres.posind[0]; 318 } else { 319 /* more than one result, POSgroup info needs to be 320 determined for later POS disambiguation */ 321 for (i = 0; i < lexres.nrres; i++) { 322 posbuf[i] = lexres.posind[i * PICOKLEX_POSIND_SIZE]; 323 } 324 head->info1 = picoktab_getPosGroup(wa->tabpos, posbuf, 325 lexres.nrres); 326 } 327 head->info2 = foundIndex; 328 head->len = lexres.posindlen; 329 if ((status = picodata_put_itemparts(head, lexres.posind, 330 lexres.posindlen, 331 wa->outBuf, wa->outBufSize, 332 &wa->outLen)) == PICO_OK) { 333 wa->inLen = 0; 334 } else { 335 return picoos_emRaiseException(this->common->em, status, 336 NULL, NULL); 337 } 338 339 } else { /* no phone, :G2P, one entry: WORDGRAPH(POS,NA)graph */ 340 if (PICO_OK == picodata_copy_item(wa->inBuf, wa->inLen, 341 wa->outBuf, wa->outBufSize, 342 &wa->outLen)) { 343 wa->inLen = 0; 344 /* set lex pos in info1 */ 345 if (PICO_OK != picodata_set_iteminfo1(wa->outBuf, wa->outLen, 346 lexres.posind[0])) { 347 return picoos_emRaiseException(this->common->em, 348 PICO_EXC_BUF_OVERFLOW, 349 NULL, NULL); 350 } 351 } 352 } 353 } 354 return PICO_OK; 355 } 356 357 358 /* ***********************************************************************/ 359 /* waStep function */ 360 /* ***********************************************************************/ 361 362 /* 363 collect into internal buffer, process, and then feed to output buffer 364 365 init state: COLLECT ext ext 366 state transitions: in IN OUTout 367 COLLECT | getOneItem ->-1 +1 0 0 | (ATOMIC) -> PROCESS (got item) 368 COLLECT | getOneItem -> 0 0 0 0 | IDLE (got no item) 369 370 PROCESS | procOneItem -> 0 -1 +1 0 | (ATOMIC) -> FEED (proc'ed item) 371 PROCESS | procOneItem -> 0 -1 0 0 | BUSY -> COLLECT (item skipped) 372 373 FEED | putOneItem -> 0 0 -1 +1 | BUSY -> COLLECT (put item) 374 FEED | putOneItem -> 0 0 1 0 | OUT_FULL (put no item) 375 */ 376 377 static picodata_step_result_t waStep(register picodata_ProcessingUnit this, 378 picoos_int16 mode, 379 picoos_uint16 * numBytesOutput) { 380 register wa_subobj_t *wa; 381 pico_status_t rv = PICO_OK; 382 383 if (NULL == this || NULL == this->subObj) { 384 return PICODATA_PU_ERROR; 385 } 386 wa = (wa_subobj_t *) this->subObj; 387 mode = mode; /* avoid warning "var not used in this function"*/ 388 *numBytesOutput = 0; 389 while (1) { /* exit via return */ 390 PICODBG_DEBUG(("doing state %i, inLen: %d, outLen: %d", 391 wa->procState, wa->inLen, wa->outLen)); 392 393 switch (wa->procState) { 394 /* collect state: get item from charBuf and store in 395 * internal inBuf 396 */ 397 case WA_STEPSTATE_COLLECT: 398 if (wa->inLen == 0) { /* is input buffer empty? */ 399 picoos_uint16 blen; 400 /* try to get one item */ 401 rv = picodata_cbGetItem(this->cbIn, wa->inBuf, 402 wa->inBufSize, &blen); 403 PICODBG_DEBUG(("after getting item, status: %d", rv)); 404 if (PICO_OK == rv) { 405 /* we now have one item */ 406 wa->inLen = blen; 407 wa->procState = WA_STEPSTATE_PROCESS; 408 /* uncomment next line to split into two steps */ 409 /* return PICODATA_PU_ATOMIC; */ 410 } else if (PICO_EOF == rv) { 411 /* there was no item in the char buffer */ 412 return PICODATA_PU_IDLE; 413 } else if ((PICO_EXC_BUF_UNDERFLOW == rv) 414 || (PICO_EXC_BUF_OVERFLOW == rv)) { 415 PICODBG_ERROR(("problem getting item")); 416 picoos_emRaiseException(this->common->em, rv, 417 NULL, NULL); 418 return PICODATA_PU_ERROR; 419 } else { 420 PICODBG_ERROR(("problem getting item, unhandled")); 421 picoos_emRaiseException(this->common->em, rv, 422 NULL, NULL); 423 return PICODATA_PU_ERROR; 424 } 425 } else { /* there already is an item in the input buffer */ 426 PICODBG_WARN(("item already in input buffer")); 427 picoos_emRaiseWarning(this->common->em, 428 PICO_WARN_PU_IRREG_ITEM, NULL, NULL); 429 wa->procState = WA_STEPSTATE_PROCESS; 430 /* uncomment next to split into two steps */ 431 /* return PICODATA_PU_ATOMIC; */ 432 } 433 break; 434 435 436 /* process state: process item in internal inBuf and put 437 * result in internal outBuf 438 */ 439 case WA_STEPSTATE_PROCESS: 440 441 /* ensure there is an item in inBuf and it is valid */ 442 if ((wa->inLen > 0) && picodata_is_valid_item(wa->inBuf, 443 wa->inLen)) { 444 picodata_itemhead_t ihead; 445 picoos_uint8 *icontent; 446 pico_status_t rvP = PICO_OK; 447 448 rv = picodata_get_iteminfo(wa->inBuf, wa->inLen, &ihead, 449 &icontent); 450 if (PICO_OK == rv) { 451 452 switch (ihead.type) { 453 case PICODATA_ITEM_WORDGRAPH: 454 455 if (0 < ihead.len) { 456 rvP = waProcessWordgraph(this, wa, &ihead, 457 icontent); 458 } else { 459 /* else ignore empty WORDGRAPH */ 460 wa->inLen = 0; 461 wa->procState = WA_STEPSTATE_COLLECT; 462 return PICODATA_PU_BUSY; 463 } 464 break; 465 case PICODATA_ITEM_OTHER: 466 /* skip item */ 467 rvP = PICO_WARN_PU_DISCARD_BUF; 468 break; 469 default: 470 /* copy item unmodified */ 471 rvP = picodata_copy_item(wa->inBuf, 472 wa->inLen, wa->outBuf, 473 wa->outBufSize, &wa->outLen); 474 break; 475 } 476 477 if (PICO_OK == rvP) { 478 wa->inLen = 0; 479 wa->procState = WA_STEPSTATE_FEED; 480 /* uncomment next to split into two steps */ 481 /* return PICODATA_PU_ATOMIC; */ 482 } else if (PICO_WARN_PU_DISCARD_BUF == rvP) { 483 /* discard input buffer and get a new item */ 484 PICODBG_INFO(("skipping OTHER item")); 485 /* picoos_emRaiseWarning(this->common->em, 486 PICO_WARN_PU_DISCARD_BUF, NULL, NULL); 487 */ 488 wa->inLen = 0; 489 wa->procState = WA_STEPSTATE_COLLECT; 490 return PICODATA_PU_BUSY; 491 } else { 492 /* PICO_EXC_BUF_OVERFLOW <- overflow in outbuf 493 PICO_ERR_OTHER <- no valid item in inbuf 494 or return from processWordgraph 495 */ 496 PICODBG_ERROR(("problem processing item", rvP)); 497 picoos_emRaiseException(this->common->em, rvP, 498 NULL, NULL); 499 return PICODATA_PU_ERROR; 500 } 501 502 } else { /* could not get iteminfo */ 503 /* PICO_EXC_BUF_OVERFLOW <- overflow in outbuf 504 PICO_ERR_OTHER <- no valid item in inbuf 505 */ 506 PICODBG_ERROR(("problem getting item info, " 507 "discard buffer content")); 508 wa->inLen = 0; 509 wa->procState = WA_STEPSTATE_COLLECT; 510 picoos_emRaiseException(this->common->em, rv, 511 NULL, NULL); 512 return PICODATA_PU_ERROR; 513 } 514 515 } else if (wa->inLen == 0) { /* no item in inBuf */ 516 PICODBG_INFO(("no item in inBuf")); 517 /* wa->inLen = 0;*/ 518 wa->procState = WA_STEPSTATE_COLLECT; 519 return PICODATA_PU_BUSY; 520 521 } else { /* no valid item in inBuf */ 522 /* bad state/item, discard buffer content */ 523 PICODBG_WARN(("no valid item, discard buffer content")); 524 picoos_emRaiseWarning(this->common->em, 525 PICO_WARN_PU_IRREG_ITEM, NULL, NULL); 526 picoos_emRaiseWarning(this->common->em, 527 PICO_WARN_PU_DISCARD_BUF, NULL, NULL); 528 wa->inLen = 0; 529 wa->procState = WA_STEPSTATE_COLLECT; 530 return PICODATA_PU_BUSY; 531 } 532 break; 533 534 535 /* feed state: copy item in internal outBuf to output charBuf */ 536 case WA_STEPSTATE_FEED: 537 538 /* check that item fits in cb should not be needed */ 539 rv = picodata_cbPutItem(this->cbOut, wa->outBuf, 540 wa->outLen, numBytesOutput); 541 542 PICODATA_INFO_ITEM(this->voice->kbArray[PICOKNOW_KBID_DBG], 543 (picoos_uint8 *)"wana: ", wa->outBuf, 544 wa->outLen); 545 546 PICODBG_DEBUG(("put item, status: %d", rv)); 547 if (PICO_OK == rv) { 548 wa->outLen = 0; 549 wa->procState = WA_STEPSTATE_COLLECT; 550 return PICODATA_PU_BUSY; 551 } else if (PICO_EXC_BUF_OVERFLOW == rv) { 552 PICODBG_INFO(("feeding, overflow, PICODATA_PU_OUT_FULL")); 553 return PICODATA_PU_OUT_FULL; 554 } else if ((PICO_EXC_BUF_UNDERFLOW == rv) 555 || (PICO_ERR_OTHER == rv)) { 556 PICODBG_WARN(("feeding problem, discarding item")); 557 wa->outLen = 0; 558 wa->procState = WA_STEPSTATE_COLLECT; 559 picoos_emRaiseWarning(this->common->em, rv, NULL,NULL); 560 return PICODATA_PU_BUSY; 561 } 562 break; 563 564 default: 565 break; 566 567 } /* switch */ 568 569 } /* while */ 570 571 /* should be never reached */ 572 PICODBG_ERROR(("reached end of function")); 573 picoos_emRaiseException(this->common->em, PICO_ERR_OTHER, NULL, NULL); 574 return PICODATA_PU_ERROR; 575 } 576 577 #ifdef __cplusplus 578 } 579 #endif 580 581 582 /* end */ 583