1 /*---------------------------------------------------------------------------* 2 * text_parser.c * 3 * * 4 * Copyright 2007, 2008 Nuance Communciations, Inc. * 5 * * 6 * Licensed under the Apache License, Version 2.0 (the 'License'); * 7 * you may not use this file except in compliance with the License. * 8 * * 9 * You may obtain a copy of the License at * 10 * http://www.apache.org/licenses/LICENSE-2.0 * 11 * * 12 * Unless required by applicable law or agreed to in writing, software * 13 * distributed under the License is distributed on an 'AS IS' BASIS, * 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * 15 * See the License for the specific language governing permissions and * 16 * limitations under the License. * 17 * * 18 *---------------------------------------------------------------------------*/ 19 20 #include"pstdio.h" 21 #include"srec_context.h" 22 #include"astar.h" 23 24 #include "passert.h" 25 #include "portable.h" 26 27 28 #define MAX_LOCAL_LEN 256 29 #define PARSE_PASS 0 30 #define PARSE_FAIL 1 31 32 33 static int check_word_path(srec_context* context, arc_token* atok, 34 const char* transcription, int tlen) 35 { 36 const char *wd, *p; 37 char *q; 38 arc_token* next_atok; 39 wordID wdID; 40 int q_position; 41 42 if ( strlen ( transcription ) >= MAX_LOCAL_LEN - 1) 43 { 44 PLogError("Transcription too long [%s]\n", transcription); 45 return PARSE_FAIL; 46 } 47 48 while (1) { 49 char copy_of_word[MAX_LOCAL_LEN]; /* save heap on recursive function */ 50 51 /* wd points to the first char of last word */ 52 wd = transcription; 53 if (tlen > 0) 54 { 55 for (wd = transcription + tlen - 1; wd > transcription; wd--) 56 { 57 if (*wd == ' ') 58 { 59 wd++; 60 break; 61 } 62 } 63 } 64 for (p = wd, q = copy_of_word; ; p++, q++) 65 { 66 q_position = q - copy_of_word; 67 if (q_position < 0 || (size_t)q_position >= MAX_LOCAL_LEN) 68 { 69 PLogError("Word too long in transcription [%s]\n", transcription); 70 return PARSE_FAIL; 71 } 72 *q = *p; 73 if (*p == ' ' || *p == '\0') 74 { 75 *q = 0; 76 break; 77 } 78 } 79 wdID = wordmap_find_index(context->olabels, copy_of_word); 80 81 if (wdID < MAXwordID) 82 { 83 next_atok = get_arc_for_word(atok, wdID, context, context->beg_silence_word); 84 } 85 else 86 { 87 next_atok = get_arc_for_word_without_slot_annotation(atok, wd, context, context->beg_silence_word); 88 if (!next_atok) return PARSE_FAIL; 89 } 90 91 if (!next_atok) return PARSE_FAIL; 92 93 int whether_final_atok = 0; 94 arc_token* tmp; 95 for (tmp = ARC_TOKEN_PTR(context->arc_token_list, next_atok->first_next_arc); tmp != NULL; 96 tmp = ARC_TOKEN_PTR(context->arc_token_list, tmp->next_token_index)) 97 { 98 if (tmp->ilabel == MAXwordID) whether_final_atok = 1; 99 } 100 101 if (wd == transcription && whether_final_atok) return PARSE_PASS; 102 if (wd == transcription) return PARSE_FAIL; 103 tlen--; 104 while (transcription[tlen] != ' ' && tlen > 0) tlen--; 105 106 atok = next_atok; 107 } 108 } 109 110 int FST_CheckPath_Simple(srec_context* context, const char* transcription) 111 { 112 arc_token* atok = &context->arc_token_list[0]; 113 int transcription_len = strlen(transcription); 114 int rc; 115 116 for (; transcription_len > 0; transcription_len--) 117 if (transcription[transcription_len-1] != ' ') break; 118 rc = check_word_path(context, atok, transcription, transcription_len); 119 return rc; 120 } 121 122 int FST_CheckPath_Complex(srec_context* context, const char* transcription, 123 char* literal, size_t max_literal_len) 124 { 125 int i, j, rc; 126 int num_spaces; 127 char copy_of_transcription[MAX_LOCAL_LEN]; 128 char* spaces[24], *p; /* can't go too high here!! */ 129 ASSERT(strlen(transcription) < MAX_LOCAL_LEN); 130 131 strcpy(copy_of_transcription, transcription); 132 for (num_spaces = 0, p = copy_of_transcription; *p; p++) 133 { 134 if (*p == ' ') 135 { 136 if ((size_t)num_spaces >= sizeof(spaces) / sizeof(char*)) 137 { 138 PLogError("FST_CheckPath_Complex() failed on too many words\n"); 139 return PARSE_FAIL; 140 } 141 spaces[num_spaces++] = p; 142 } 143 } 144 145 if (num_spaces == 0) 146 { 147 rc = FST_CheckPath_Simple(context, transcription); 148 if (rc == PARSE_PASS) 149 { 150 ASSERT(strlen(copy_of_transcription) < max_literal_len); 151 strcpy(literal, copy_of_transcription); 152 } 153 return rc; 154 } 155 156 for (i = 0; i < (1 << num_spaces); i++) 157 { 158 /* find the space pointers */ 159 for (j = 0; j < num_spaces; j++) 160 *spaces[j] = i & (1 << j) ? '_' : ' '; 161 /* check each word, potentially within a rule! */ 162 for (p = strtok(copy_of_transcription, " "); p; p = strtok(NULL, " ")) 163 { 164 wordID k, wdid = wordmap_find_index(context->olabels, p); 165 if (wdid < MAXwordID) continue; 166 for (k = 1; k < context->olabels->num_slots; k++) 167 { 168 wdid = wordmap_find_index_in_rule(context->olabels, p, k); 169 if (wdid < MAXwordID) break; 170 } 171 if (wdid == MAXwordID) 172 goto next_i; 173 } 174 /* fix the nulls back */ 175 for (j = 0; j < num_spaces; j++) 176 *spaces[j] = i & (1 << j) ? '_' : ' '; 177 rc = FST_CheckPath_Simple(context, copy_of_transcription); 178 if (rc == PARSE_PASS) 179 { 180 ASSERT(strlen(copy_of_transcription) < max_literal_len); 181 strcpy(literal, copy_of_transcription); 182 return rc; 183 } 184 next_i: 185 continue; 186 } 187 return PARSE_FAIL; 188 } 189 190 static void clean_up_sentence(char* s); 191 192 int FST_CheckPath(srec_context* context, const char* transcription, 193 char* literal, size_t max_literal_len) 194 { 195 char mytranscription[256]; 196 passert(strlen(transcription) < sizeof(mytranscription)); 197 strcpy(mytranscription, transcription); 198 clean_up_sentence(mytranscription); 199 if (!context->arc_token_list) 200 return 2; 201 else 202 return FST_CheckPath_Complex(context, mytranscription, literal, max_literal_len); 203 } 204 205 static void clean_up_sentence(char* s) 206 { 207 char* p, *q; 208 if (0) printf("sentence: '%s'\n", s); 209 /* change speech codes to spaces */ 210 for (p = s; *p; p++) 211 { 212 if (*p == '[') 213 for (;*p && *p != ']'; p++) 214 *p = ' '; 215 if (*p == ']') *p = ' '; 216 } 217 /* trim leading spaces */ 218 for (p = s; *p == ' ';) 219 for (q = p; *q; q++) *q = *(q + 1); 220 /* trim middle spaces */ 221 for (p = s; p && *p;) 222 { 223 if (!*p) break; 224 p = strchr(p, ' '); 225 if (!p) break; 226 for (;*(p + 1) == ' ';) 227 for (q = p; *q; q++) *q = *(q + 1); 228 p++; 229 } 230 /* trim ending spaces */ 231 for (p = s + strlen(s); p != s;) 232 if (*(--p) == ' ') *p = 0; 233 else break; 234 235 if (0) printf("clean_sentence: '%s'\n", s); 236 } 237 238 239 240