Home | History | Annotate | Download | only in crec
      1 /*---------------------------------------------------------------------------*
      2  *  text_parser.c  *
      3  *                                                                           *
      4  *  Copyright 2007, 2008 Nuance Communciations, Inc.                               *
      5  *                                                                           *
      6  *  Licensed under the Apache License, Version 2.0 (the 'License');          *
      7  *  you may not use this file except in compliance with the License.         *
      8  *                                                                           *
      9  *  You may obtain a copy of the License at                                  *
     10  *      http://www.apache.org/licenses/LICENSE-2.0                           *
     11  *                                                                           *
     12  *  Unless required by applicable law or agreed to in writing, software      *
     13  *  distributed under the License is distributed on an 'AS IS' BASIS,        *
     14  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
     15  *  See the License for the specific language governing permissions and      *
     16  *  limitations under the License.                                           *
     17  *                                                                           *
     18  *---------------------------------------------------------------------------*/
     19 
     20 #include"pstdio.h"
     21 #include"srec_context.h"
     22 #include"astar.h"
     23 
     24 #include "passert.h"
     25 #include "portable.h"
     26 
     27 
     28 #define MAX_LOCAL_LEN 256 /* size of the local char buffers holding a word or a transcription copy */
     29 #define PARSE_PASS 0 /* returned by the path checks when the transcription is accepted */
     30 #define PARSE_FAIL 1 /* returned by the path checks when the transcription is rejected */
     31 
     32 
     33 static int check_word_path(srec_context* context, arc_token* atok,
     34                            const char* transcription, int tlen)
     35 {
     36   const char    *wd, *p;
     37   char          *q;
     38   arc_token*    next_atok;
     39   wordID        wdID;
     40   int           q_position;
     41 
     42   if ( strlen ( transcription ) >= MAX_LOCAL_LEN - 1)
     43   {
     44     PLogError("Transcription too long [%s]\n", transcription);
     45     return PARSE_FAIL;
     46   }
     47 
     48   while (1) {
     49     char copy_of_word[MAX_LOCAL_LEN]; /* save heap on recursive function */
     50 
     51     /* wd points to the first char of last word */
     52     wd = transcription;
     53     if (tlen > 0)
     54     {
     55       for (wd = transcription + tlen - 1; wd > transcription; wd--)
     56       {
     57         if (*wd == ' ')
     58         {
     59           wd++;
     60           break;
     61         }
     62       }
     63     }
     64     for (p = wd, q = copy_of_word; ; p++, q++)
     65     {
     66       q_position = q - copy_of_word;
     67       if (q_position < 0 || (size_t)q_position >= MAX_LOCAL_LEN)
     68       {
     69         PLogError("Word too long in transcription [%s]\n", transcription);
     70         return PARSE_FAIL;
     71       }
     72       *q = *p;
     73       if (*p == ' ' || *p == '\0')
     74       {
     75         *q = 0;
     76         break;
     77       }
     78     }
     79     wdID = wordmap_find_index(context->olabels, copy_of_word);
     80 
     81     if (wdID < MAXwordID)
     82     {
     83       next_atok = get_arc_for_word(atok, wdID, context, context->beg_silence_word);
     84     }
     85     else
     86     {
     87       next_atok = get_arc_for_word_without_slot_annotation(atok, wd, context, context->beg_silence_word);
     88       if (!next_atok) return PARSE_FAIL;
     89     }
     90 
     91     if (!next_atok) return PARSE_FAIL;
     92 
     93     int whether_final_atok = 0;
     94     arc_token* tmp;
     95     for (tmp = ARC_TOKEN_PTR(context->arc_token_list, next_atok->first_next_arc); tmp != NULL;
     96          tmp = ARC_TOKEN_PTR(context->arc_token_list, tmp->next_token_index))
     97     {
     98       if (tmp->ilabel == MAXwordID) whether_final_atok = 1;
     99     }
    100 
    101     if (wd == transcription && whether_final_atok) return PARSE_PASS;
    102     if (wd == transcription) return PARSE_FAIL;
    103     tlen--;
    104     while (transcription[tlen] != ' ' && tlen > 0) tlen--;
    105 
    106     atok = next_atok;
    107   }
    108 }
    109 
    110 int FST_CheckPath_Simple(srec_context* context, const char* transcription)
    111 {
    112   arc_token* atok = &context->arc_token_list[0];
    113   int transcription_len = strlen(transcription);
    114   int rc;
    115 
    116   for (; transcription_len > 0; transcription_len--)
    117     if (transcription[transcription_len-1] != ' ') break;
    118   rc = check_word_path(context, atok, transcription, transcription_len);
    119   return rc;
    120 }
    121 
    122 int FST_CheckPath_Complex(srec_context* context, const char* transcription,
    123                           char* literal, size_t max_literal_len)
    124 {
    125   int i, j, rc;
    126   int num_spaces;
    127   char copy_of_transcription[MAX_LOCAL_LEN];
    128   char* spaces[24], *p; /* can't go too high here!! */
    129   ASSERT(strlen(transcription) < MAX_LOCAL_LEN);
    130 
    131   strcpy(copy_of_transcription, transcription);
    132   for (num_spaces = 0, p = copy_of_transcription; *p; p++)
    133   {
    134     if (*p == ' ')
    135     {
    136       if ((size_t)num_spaces >= sizeof(spaces) / sizeof(char*))
    137       {
    138         PLogError("FST_CheckPath_Complex() failed on too many words\n");
    139         return PARSE_FAIL;
    140       }
    141       spaces[num_spaces++] = p;
    142     }
    143   }
    144 
    145   if (num_spaces == 0)
    146   {
    147     rc = FST_CheckPath_Simple(context, transcription);
    148     if (rc == PARSE_PASS)
    149     {
    150       ASSERT(strlen(copy_of_transcription) < max_literal_len);
    151       strcpy(literal, copy_of_transcription);
    152     }
    153     return rc;
    154   }
    155 
    156   for (i = 0; i < (1 << num_spaces); i++)
    157   {
    158     /* find the space pointers */
    159     for (j = 0; j < num_spaces; j++)
    160       *spaces[j] = i & (1 << j) ? '_' : ' ';
    161     /* check each word, potentially within a rule! */
    162     for (p = strtok(copy_of_transcription, " "); p; p = strtok(NULL, " "))
    163     {
    164       wordID k, wdid = wordmap_find_index(context->olabels, p);
    165       if (wdid < MAXwordID) continue;
    166       for (k = 1; k < context->olabels->num_slots; k++)
    167       {
    168         wdid = wordmap_find_index_in_rule(context->olabels, p, k);
    169         if (wdid < MAXwordID) break;
    170       }
    171       if (wdid == MAXwordID)
    172         goto next_i;
    173     }
    174     /* fix the nulls back */
    175     for (j = 0; j < num_spaces; j++)
    176       *spaces[j] = i & (1 << j) ? '_' : ' ';
    177     rc = FST_CheckPath_Simple(context, copy_of_transcription);
    178     if (rc == PARSE_PASS)
    179     {
    180       ASSERT(strlen(copy_of_transcription) < max_literal_len);
    181       strcpy(literal, copy_of_transcription);
    182       return rc;
    183     }
    184 next_i:
    185     continue;
    186   }
    187   return PARSE_FAIL;
    188 }
    189 
    190 static void clean_up_sentence(char* s);
    191 
    192 int FST_CheckPath(srec_context* context, const char* transcription,
    193                   char* literal, size_t max_literal_len)
    194 {
    195   char mytranscription[256];
    196   passert(strlen(transcription) < sizeof(mytranscription));
    197   strcpy(mytranscription, transcription);
    198   clean_up_sentence(mytranscription);
    199   if (!context->arc_token_list)
    200     return 2;
    201   else
    202     return FST_CheckPath_Complex(context, mytranscription, literal, max_literal_len);
    203 }
    204 
    205 static void clean_up_sentence(char* s)
    206 {
    207   char* p, *q;
    208   if (0) printf("sentence: '%s'\n", s);
    209   /* change speech codes to spaces */
    210   for (p = s; *p; p++)
    211   {
    212     if (*p == '[')
    213       for (;*p && *p != ']'; p++)
    214         *p = ' ';
    215     if (*p == ']') *p = ' ';
    216   }
    217   /* trim leading spaces */
    218   for (p = s; *p == ' ';)
    219     for (q = p; *q; q++) *q = *(q + 1);
    220   /* trim middle spaces */
    221   for (p = s; p && *p;)
    222   {
    223     if (!*p) break;
    224     p = strchr(p, ' ');
    225     if (!p) break;
    226     for (;*(p + 1) == ' ';)
    227       for (q = p; *q; q++) *q = *(q + 1);
    228     p++;
    229   }
    230   /* trim ending spaces */
    231   for (p = s + strlen(s); p != s;)
    232     if (*(--p) == ' ') *p = 0;
    233     else break;
    234 
    235   if (0) printf("clean_sentence: '%s'\n", s);
    236 }
    237 
    238 
    239 
    240