1 /*---------------------------------------------------------------------------* 2 * voc_read.c * 3 * * 4 * Copyright 2007, 2008 Nuance Communciations, Inc. * 5 * * 6 * Licensed under the Apache License, Version 2.0 (the 'License'); * 7 * you may not use this file except in compliance with the License. * 8 * * 9 * You may obtain a copy of the License at * 10 * http://www.apache.org/licenses/LICENSE-2.0 * 11 * * 12 * Unless required by applicable law or agreed to in writing, software * 13 * distributed under the License is distributed on an 'AS IS' BASIS, * 14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * 15 * See the License for the specific language governing permissions and * 16 * limitations under the License. * 17 * * 18 *---------------------------------------------------------------------------*/ 19 20 21 #ifndef _RTT 22 #include <stdio.h> 23 #endif 24 #include <stdlib.h> 25 #include <math.h> 26 #include <assert.h> 27 28 #if defined(__cplusplus) && defined(_MSC_VER) 29 extern "C" 30 { 31 #include <string.h> 32 } 33 #else 34 #include <string.h> 35 #endif 36 37 #include <sys/types.h> 38 #include <sys/stat.h> 39 #ifdef _WIN32 40 #define stat _stat 41 #else 42 #include <unistd.h> 43 #endif 44 45 46 #include <fcntl.h> 47 #include <sys/mman.h> 48 49 #include <zipfile/zipfile.h> 50 51 52 #include "hmmlib.h" 53 #include "duk_io.h" 54 #include "LCHAR.h" 55 #include "portable.h" 56 57 #include "memmove.h" 58 59 static const char voc_read[] = "$Id: voc_read.c,v 1.14.6.18 2008/03/05 21:18:44 dahan Exp $"; 60 61 62 #define cr_or_nl(ch) ((ch) == '\n' || (ch) == '\r') 63 64 65 #ifndef _RTT 66 67 /** 68 * Read word models and their phoneme transcriptions from .ok or .voc files. 69 * returns -1 on error 70 */ 71 int read_word_transcription(const LCHAR* basename, vocab_info* voc, ESR_Locale* locale) 72 { 73 const char *ok; 74 ESR_ReturnCode rc; 75 int result; 76 int i; 77 char token[256]; 78 79 ASSERT(voc); 80 81 if (basename == NULL || strlen(basename) == 0) { 82 PLogError("Error: invalid arg to read_word_transcription()\n"); 83 goto CLEANUP; 84 } 85 86 if (mmap_zip(basename, (void**)&voc->ok_file_data, (size_t*)&voc->ok_file_data_length)) { 87 PLogError("read_word_transcription: mmap_zip failed for %s\n", basename); 88 goto CLEANUP; 89 } 90 91 /* this assumption eliminates simplifies bounds checking when parsing */ 92 if (!cr_or_nl(voc->ok_file_data[voc->ok_file_data_length - 1])) { 93 PLogError(L("read_word_transcription: last character in %s not newline\n"), basename); 94 goto CLEANUP; 95 } 96 97 /* set up point to walk the data */ 98 ok = voc->ok_file_data; 99 100 /* verify the header */ 101 i = 0; 102 while (*ok != '=') { 103 if (cr_or_nl(*ok)) { 104 PLogError(L("%s was missing '=' in #LANG=en-us header"), basename); 105 goto CLEANUP; 106 } 107 token[i++] = *ok++; 108 } 109 token[i] = 0; 110 ok++; 111 CHKLOG(rc, lstrcasecmp(token, L("#lang"), &result)); 112 if (result != 0) 113 { 114 PLogError(L("%s was missing #LANG=en-us header"), basename); 115 goto CLEANUP; 116 } 117 i = 0; 118 while (!cr_or_nl(*ok)) token[i++] = *ok++; 119 token[i] = 0; 120 ok++; 121 CHKLOG(rc, ESR_str2locale(token, locale)); 122 123 /* set up first and last entries */ 124 voc->first_entry = strchr(voc->ok_file_data, '\n') + 1; 125 voc->last_entry = voc->ok_file_data + voc->ok_file_data_length - 2; 126 while (*voc->last_entry != '\n') voc->last_entry--; /* header forces termination */ 127 voc->last_entry++; 128 129 /* determine if there are any upper case entries */ 130 voc->hasUpper = 1; 131 while (ok < voc->ok_file_data + voc->ok_file_data_length) { 132 int ch = *ok; 133 if ('A' <= ch && ch <= 'Z') { 134 voc->hasUpper = 1; 135 break; 136 } 137 else if ('Z' < ch) { 138 voc->hasUpper = 0; 139 break; 140 } 141 /* scan to the next entry */ 142 while (*ok++ != '\n') ; 143 } 144 145 return 0; 146 147 CLEANUP: 148 delete_word_transcription(voc); 149 150 PLogError(L("read_word_transcription: failed to read '%s'"), basename); 151 152 return -1; 153 } 154 #endif 155 156 /* the label is terminated with 0 and the entry terminated with ' ' */ 157 static int kompare(const char* label, const char* entry) { 158 while (*label == *entry) { 159 label++; 160 entry++; 161 } 162 return (*label ? *label : ' ') - *entry; 163 } 164 165 int get_prons(const vocab_info* voc, const char* label, char* prons, int prons_len) { 166 int num_prons; 167 const char* low; 168 const char* middle; 169 const char* high; 170 171 //PLogError(L("get_prons '%s'"), label); 172 173 /* dictionaries are usually lower case, so do this for speed */ 174 if (!voc->hasUpper && 'A' <= *label && *label <= 'Z') return 0; 175 176 /* binary search to find matching entry */ 177 low = voc->first_entry; 178 high = voc->last_entry; 179 while (1) { 180 /* pick a point in the middle and align to next entry */ 181 middle = low + ((high - low) >> 1) - 1; 182 while (*middle++ != '\n') ; 183 184 /* compare 'label' to 'middle' */ 185 int diff = kompare(label, middle); 186 if (diff == 0) break; 187 188 /* nothing found */ 189 if (low == high) return 0; 190 191 /* 'middle' aligned to 'high', so move 'high' down */ 192 if (middle == high) { 193 high -= 2; 194 while (*high != '\n') high--; 195 high++; 196 continue; 197 } 198 199 if (diff > 0) low = middle; 200 else high = middle; 201 } 202 203 /* back up to find the first entry equal to 'label' */ 204 low = middle; 205 while (voc->first_entry < low) { 206 const char* lo; 207 for (lo = low - 2; *lo != '\n'; lo--) ; 208 lo++; 209 if (kompare(label, lo)) break; 210 low = lo; 211 } 212 213 /* move forward to the last entry equal to 'label' */ 214 high = middle; 215 while (high < voc->last_entry) { 216 const char* hi; 217 for (hi = high; *hi != '\n'; hi++) ; 218 hi++; 219 if (kompare(label, hi)) break; 220 high = hi; 221 } 222 223 /* loop over all the entries */ 224 num_prons = 0; 225 while (low <= high) { 226 /* scan over the label */ 227 while (*low++ != ' ') ; 228 229 /* skip the whitespace */ 230 while (*low == ' ') low++; 231 232 /* copy the pron */ 233 while (*low != '\n') { 234 if (--prons_len <= 2) return -1; 235 *prons++ = *low++; 236 } 237 *prons++ = 0; 238 low++; 239 num_prons++; 240 } 241 *prons++ = 0; 242 243 return num_prons; 244 } 245 246 void delete_word_transcription(vocab_info* voc) 247 { 248 ASSERT(voc); 249 250 voc->first_entry = 0; 251 voc->last_entry = 0; 252 if (voc->ok_file_data) munmap_zip(voc->ok_file_data, voc->ok_file_data_length); 253 voc->ok_file_data = NULL; 254 voc->ok_file_data_length = 0; 255 } 256 257 258 /**************************************************/ 259 /* may want to move these functions to 'portable' */ 260 /**************************************************/ 261 262 static int endeql(const char* string, const char* end) { 263 return strlen(end) <= strlen(string) && !strcmp(string + strlen(string) - strlen(end), end); 264 } 265 266 /* decompress_entry requires an oversize destination buffer, so... */ 267 static size_t inflateSize(size_t size) { 268 return size + size / 1000 + 1; 269 } 270 271 int mmap_zip(const char* fname, void** buf, size_t* size) { 272 int fd = -1; 273 struct stat statbuf; 274 zipfile_t zf = 0; 275 zipentry_t ze = 0; 276 char entryname[FILENAME_MAX]; 277 size_t size2 = 0; 278 void* buf2 = 0; 279 280 /* open data file, determine size, map it, and close fd */ 281 fd = open(fname, O_RDONLY); 282 if (fd < 0) goto FAILED; 283 284 /* determine length */ 285 if (fstat(fd, &statbuf) < 0) goto FAILED; 286 287 /* mmap it */ 288 *size = statbuf.st_size; 289 *buf = mmap(0, inflateSize(statbuf.st_size), PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); 290 if (*buf == MAP_FAILED) goto FAILED; 291 292 /* close fd, since we can */ 293 close(fd); 294 fd = -1; 295 296 /* if not a zip file, we are done! */ 297 if (!endeql(fname, ".zip")) return 0; 298 299 /* set up zipfiler */ 300 zf = init_zipfile(*buf, *size); 301 if (!zf) goto FAILED; 302 303 /* get entry */ 304 strcpy(entryname, strrchr(fname, '/') ? strrchr(fname, '/') + 1 : fname); 305 entryname[strlen(entryname) - strlen(".zip")] = 0; 306 ze = lookup_zipentry(zf, entryname); 307 if (!ze) goto FAILED; 308 309 /* mmap anon memory to hold unzipped entry */ 310 size2 = get_zipentry_size(ze); 311 buf2 = mmap(0, inflateSize(size2), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0); 312 if (buf2 == (void*)-1) goto FAILED; 313 314 /* unzip entry */ 315 if (decompress_zipentry(ze, buf2, size2)) goto FAILED; 316 317 /* release unzipper */ 318 release_zipfile(zf); 319 zf = 0; 320 321 /* release mmapped file */ 322 munmap(*buf, inflateSize(*size)); 323 324 /* set return values */ 325 *buf = buf2; 326 *size = size2; 327 328 return 0; 329 330 FAILED: 331 if (fd != -1) close(fd); 332 if (zf) release_zipfile(zf); 333 if (buf2) munmap(buf2, inflateSize(size2)); 334 if (*buf && *buf != (void*)-1) munmap(*buf, inflateSize(*size)); 335 *buf = 0; 336 *size = 0; 337 return -1; 338 } 339 340 int munmap_zip(void* buf, size_t size) { 341 return munmap(buf, inflateSize(size)); 342 } 343 344