Home | History | Annotate | Download | only in lib
      1 /*
      2  * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *     http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 /**
     17  * @file picotok.h
     18  *
     19  * Copyright (C) 2008-2009 SVOX AG, Baslerstr. 30, 8048 Zuerich, Switzerland
     20  * All rights reserved.
     21  *
     22  * History:
     23  * - 2009-04-20 -- initial version
     24  *
     25  */
     26 
     27 
     28 /** @addtogroup picotok
     29 In the following, an item (itemtype, iteminfo1, iteminfo2, content) is
     30 written as TYPE(INFO1,INFO2)content.
     31 
     32 input
     33 =====
     34 
     35 - UTF8 text
     36 
     37 limitations: currently only German umlauts are supported in addition to ASCII
     38 
     39 
     40 minimal input size (before processing starts)
     41 ==================
     42 
     43 processing (i.e. tokenization) starts when
     44 - a 'PICO_EOF' char is received (which happens whenever the cbIn buffer is empty)
     45 - the tok-internal buffer is full
     46 
     47 
     48 items output
     49 ============
     50 
     51 processing the character stream can result in one of the
     52 following items:
     53 -> WORDGRAPH(NA,NA)graph    <- mapped to lower case; incl. 1-2 digit numbers (0-99)
     54 -> OTHER(NA,NA)string       <- skip or spell
     55 -> PUNC(PUNCtype,PUNCsubtype)
     56 -> CMD(CMDtype,CMDsubtype)args
     57 
     58 with
     59 - PUNCtype %d
     60     PICODATA_ITEMINFO1_PUNC_SENTEND
     61     PICODATA_ITEMINFO1_PUNC_PHRASEEND
     62 - PUNCsubtype %d
     63     PICODATA_ITEMINFO2_PUNC_SENT_T
     64     PICODATA_ITEMINFO2_PUNC_SENT_Q
     65     PICODATA_ITEMINFO2_PUNC_SENT_E
     66     PICODATA_ITEMINFO2_PUNC_PHRASE
     67     (used later: PICODATA_ITEMINFO2_PUNC_PHRASE_FORCED)
     68 - CMDtype %d
     69     PICODATA_ITEMINFO1_CMD_FLUSH    (no args)
     70     ? PICODATA_ITEMINFO1_CMD_PLAY ? (not yet)
     71 - CMDsubtype %d
     72     PICODATA_ITEMINFO2_NA
     73     ? PICODATA_ITEMINFO2_CMD_PLAY_G2P ? (not yet)
     74 - graph, len>0, utf8 graphemes, %s
     75 - string, len>0, can be any string with printable ascii characters, %s
     76 
     77 
     78 other limitations
     79 =================
     80 
     81 - item size: header plus len=256 (valid for Pico in general)
     82  */
     83 
     84 
     85 #ifndef PICOTOK_H_
     86 #define PICOTOK_H_
     87 
     88 #include "picoos.h"
     89 #include "picodata.h"
     90 #include "picorsrc.h"
     91 
     92 #ifdef __cplusplus
     93 extern "C" {
     94 #endif
     95 #if 0
     96 }
     97 #endif
     98 
     99 
    100 
    101 picodata_ProcessingUnit picotok_newTokenizeUnit( /* prototype only; presumably creates the tokenizer ("tok") processing unit described in the file comment -- definition not visible here */
    102         picoos_MemoryManager mm,       /* memory manager handle; presumably used for the unit's allocations -- TODO confirm */
    103         picoos_Common common,          /* picoos "common" handle; exact use not visible in this header */
    104         picodata_CharBuffer cbIn,      /* input char buffer; per the file comment the input is UTF8 text and an empty cbIn results in a 'PICO_EOF' char */
    105         picodata_CharBuffer cbOut,     /* output char buffer; presumably receives the items listed in the file comment -- TODO confirm */
    106         picorsrc_Voice voice);         /* voice resource handle; exact use not visible in this header */
    107 
    108 #define PICOTOK_OUTBUF_SIZE 256 /* NOTE(review): presumably the tokenizer's output buffer size; the value equals the "len=256" item-size limit noted in the file comment -- confirm usage and units in the implementation */
    109 
    110 #ifdef __cplusplus
    111 }
    112 #endif
    113 
    114 
    115 #endif /*PICOTOK_H_*/
    116