Home | History | Annotate | Download | only in clib
      1 /*---------------------------------------------------------------------------*
      2  *  voc_read.c  *
      3  *                                                                           *
      4  *  Copyright 2007, 2008 Nuance Communciations, Inc.                               *
      5  *                                                                           *
      6  *  Licensed under the Apache License, Version 2.0 (the 'License');          *
      7  *  you may not use this file except in compliance with the License.         *
      8  *                                                                           *
      9  *  You may obtain a copy of the License at                                  *
     10  *      http://www.apache.org/licenses/LICENSE-2.0                           *
     11  *                                                                           *
     12  *  Unless required by applicable law or agreed to in writing, software      *
     13  *  distributed under the License is distributed on an 'AS IS' BASIS,        *
     14  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. *
     15  *  See the License for the specific language governing permissions and      *
     16  *  limitations under the License.                                           *
     17  *                                                                           *
     18  *---------------------------------------------------------------------------*/
     19 
     20 
     21 #ifndef _RTT
     22 #include <stdio.h>
     23 #endif
     24 #include <stdlib.h>
     25 #include <math.h>
     26 #include <assert.h>
     27 
     28 #if defined(__cplusplus) && defined(_MSC_VER)
     29 extern "C"
     30 {
     31 #include <string.h>
     32 }
     33 #else
     34 #include <string.h>
     35 #endif
     36 
     37 #include <sys/types.h>
     38 #include <sys/stat.h>
     39 #ifdef _WIN32
     40 #define stat _stat
     41 #else
     42 #include <unistd.h>
     43 #endif
     44 
     45 
     46 #include <fcntl.h>
     47 #include <sys/mman.h>
     48 
     49 #include <zipfile/zipfile.h>
     50 
     51 
     52 #include "hmmlib.h"
     53 #include "duk_io.h"
     54 #include "LCHAR.h"
     55 #include "portable.h"
     56 
     57 #include "memmove.h"
     58 
/* Revision identification string for this source file, in the
 * `$Id$` keyword format used by RCS/CVS. */
static const char voc_read[] = "$Id: voc_read.c,v 1.14.6.18 2008/03/05 21:18:44 dahan Exp $";


/* True when 'ch' is a line feed or a carriage return.  The argument may be
 * evaluated twice, so do not pass an expression with side effects. */
#define cr_or_nl(ch) ((ch) == '\n' || (ch) == '\r')
     63 
     64 
     65 #ifndef _RTT
     66 
     67 /**
     68  *  Read word models and their phoneme transcriptions from .ok or .voc files.
     69  *  returns -1 on error
     70  */
     71 int read_word_transcription(const LCHAR* basename, vocab_info* voc, ESR_Locale* locale)
     72 {
     73   const char *ok;
     74   ESR_ReturnCode rc;
     75   int result;
     76   int i;
     77   char token[256];
     78 
     79   ASSERT(voc);
     80 
     81   if (basename == NULL || strlen(basename) == 0) {
     82     PLogError("Error: invalid arg to read_word_transcription()\n");
     83     goto CLEANUP;
     84   }
     85 
     86   if (mmap_zip(basename, (void**)&voc->ok_file_data, (size_t*)&voc->ok_file_data_length)) {
     87     PLogError("read_word_transcription: mmap_zip failed for %s\n", basename);
     88     goto CLEANUP;
     89   }
     90 
     91   /* this assumption eliminates simplifies bounds checking when parsing */
     92   if (!cr_or_nl(voc->ok_file_data[voc->ok_file_data_length - 1])) {
     93     PLogError(L("read_word_transcription: last character in %s not newline\n"), basename);
     94     goto CLEANUP;
     95   }
     96 
     97   /* set up point to walk the data */
     98   ok = voc->ok_file_data;
     99 
    100   /* verify the header */
    101   i = 0;
    102   while (*ok != '=') {
    103     if (cr_or_nl(*ok)) {
    104       PLogError(L("%s was missing '=' in #LANG=en-us header"), basename);
    105       goto CLEANUP;
    106     }
    107     token[i++] = *ok++;
    108   }
    109   token[i] = 0;
    110   ok++;
    111   CHKLOG(rc, lstrcasecmp(token, L("#lang"), &result));
    112   if (result != 0)
    113   {
    114     PLogError(L("%s was missing #LANG=en-us header"), basename);
    115     goto CLEANUP;
    116   }
    117   i = 0;
    118   while (!cr_or_nl(*ok)) token[i++] = *ok++;
    119   token[i] = 0;
    120   ok++;
    121   CHKLOG(rc, ESR_str2locale(token, locale));
    122 
    123   /* set up first and last entries */
    124   voc->first_entry = strchr(voc->ok_file_data, '\n') + 1;
    125   voc->last_entry = voc->ok_file_data + voc->ok_file_data_length - 2;
    126   while (*voc->last_entry != '\n') voc->last_entry--; /* header forces termination */
    127   voc->last_entry++;
    128 
    129   /* determine if there are any upper case entries */
    130   voc->hasUpper = 1;
    131   while (ok < voc->ok_file_data + voc->ok_file_data_length) {
    132     int ch = *ok;
    133     if ('A' <= ch && ch <= 'Z') {
    134       voc->hasUpper = 1;
    135       break;
    136     }
    137     else if ('Z' < ch) {
    138       voc->hasUpper = 0;
    139       break;
    140     }
    141     /* scan to the next entry */
    142     while (*ok++ != '\n') ;
    143   }
    144 
    145   return 0;
    146 
    147 CLEANUP:
    148   delete_word_transcription(voc);
    149 
    150   PLogError(L("read_word_transcription: failed to read '%s'"), basename);
    151 
    152   return -1;
    153 }
    154 #endif
    155 
    156 /* the label is terminated with 0 and the entry terminated with ' ' */
    157 static int kompare(const char* label, const char* entry) {
    158   while (*label == *entry) {
    159     label++;
    160     entry++;
    161   }
    162   return (*label ? *label : ' ') - *entry;
    163 }
    164 
    165 int get_prons(const vocab_info* voc, const char* label, char* prons, int prons_len) {
    166   int num_prons;
    167   const char* low;
    168   const char* middle;
    169   const char* high;
    170 
    171   //PLogError(L("get_prons '%s'"), label);
    172 
    173   /* dictionaries are usually lower case, so do this for speed */
    174   if (!voc->hasUpper && 'A' <= *label && *label <= 'Z') return 0;
    175 
    176   /* binary search to find matching entry */
    177   low = voc->first_entry;
    178   high = voc->last_entry;
    179   while (1) {
    180     /* pick a point in the middle and align to next entry */
    181     middle = low + ((high - low) >> 1) - 1;
    182     while (*middle++ != '\n') ;
    183 
    184     /* compare 'label' to 'middle' */
    185     int diff = kompare(label, middle);
    186     if (diff == 0) break;
    187 
    188     /* nothing found */
    189     if (low == high) return 0;
    190 
    191     /* 'middle' aligned to 'high', so move 'high' down */
    192     if (middle == high) {
    193       high -= 2;
    194       while (*high != '\n') high--;
    195       high++;
    196       continue;
    197     }
    198 
    199     if (diff > 0) low = middle;
    200     else high = middle;
    201   }
    202 
    203   /* back up to find the first entry equal to 'label' */
    204   low = middle;
    205   while (voc->first_entry < low) {
    206     const char* lo;
    207     for (lo = low - 2; *lo != '\n'; lo--) ;
    208     lo++;
    209     if (kompare(label, lo)) break;
    210     low = lo;
    211   }
    212 
    213   /* move forward to the last entry equal to 'label' */
    214   high = middle;
    215   while (high < voc->last_entry) {
    216     const char* hi;
    217     for (hi = high; *hi != '\n'; hi++) ;
    218     hi++;
    219     if (kompare(label, hi)) break;
    220     high = hi;
    221   }
    222 
    223   /* loop over all the entries */
    224   num_prons = 0;
    225   while (low <= high) {
    226     /* scan over the label */
    227     while (*low++ != ' ') ;
    228 
    229     /* skip the whitespace */
    230     while (*low == ' ') low++;
    231 
    232     /* copy the pron */
    233     while (*low != '\n') {
    234       if (--prons_len <= 2) return -1;
    235       *prons++ = *low++;
    236     }
    237     *prons++ = 0;
    238     low++;
    239     num_prons++;
    240   }
    241   *prons++ = 0;
    242 
    243   return num_prons;
    244 }
    245 
    246 void delete_word_transcription(vocab_info* voc)
    247 {
    248   ASSERT(voc);
    249 
    250   voc->first_entry = 0;
    251   voc->last_entry = 0;
    252   if (voc->ok_file_data) munmap_zip(voc->ok_file_data, voc->ok_file_data_length);
    253   voc->ok_file_data = NULL;
    254   voc->ok_file_data_length = 0;
    255 }
    256 
    257 
    258 /**************************************************/
    259 /* may want to move these functions to 'portable' */
    260 /**************************************************/
    261 
    262 static int endeql(const char* string, const char* end) {
    263   return strlen(end) <= strlen(string) && !strcmp(string + strlen(string) - strlen(end), end);
    264 }
    265 
    266 /* decompress_entry requires an oversize destination buffer, so... */
    267 static size_t inflateSize(size_t size) {
    268   return size + size / 1000 + 1;
    269 }
    270 
    271 int mmap_zip(const char* fname, void** buf, size_t* size) {
    272     int fd = -1;
    273     struct stat statbuf;
    274     zipfile_t zf = 0;
    275     zipentry_t ze = 0;
    276     char entryname[FILENAME_MAX];
    277     size_t size2 = 0;
    278     void* buf2 = 0;
    279 
    280     /* open data file, determine size, map it, and close fd */
    281     fd = open(fname, O_RDONLY);
    282     if (fd < 0) goto FAILED;
    283 
    284     /* determine length */
    285     if (fstat(fd, &statbuf) < 0) goto FAILED;
    286 
    287     /* mmap it */
    288     *size = statbuf.st_size;
    289     *buf = mmap(0, inflateSize(statbuf.st_size), PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
    290     if (*buf == MAP_FAILED) goto FAILED;
    291 
    292     /* close fd, since we can */
    293     close(fd);
    294     fd = -1;
    295 
    296     /* if not a zip file, we are done! */
    297     if (!endeql(fname, ".zip")) return 0;
    298 
    299     /* set up zipfiler */
    300     zf = init_zipfile(*buf, *size);
    301     if (!zf) goto FAILED;
    302 
    303     /* get entry */
    304     strcpy(entryname, strrchr(fname, '/') ? strrchr(fname, '/') + 1 : fname);
    305     entryname[strlen(entryname) - strlen(".zip")] = 0;
    306     ze = lookup_zipentry(zf, entryname);
    307     if (!ze) goto FAILED;
    308 
    309     /* mmap anon memory to hold unzipped entry */
    310     size2 = get_zipentry_size(ze);
    311     buf2 = mmap(0, inflateSize(size2), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANON, -1, 0);
    312     if (buf2 == (void*)-1) goto FAILED;
    313 
    314     /* unzip entry */
    315     if (decompress_zipentry(ze, buf2, size2)) goto FAILED;
    316 
    317     /* release unzipper */
    318     release_zipfile(zf);
    319     zf = 0;
    320 
    321     /* release mmapped file */
    322     munmap(*buf, inflateSize(*size));
    323 
    324     /* set return values */
    325     *buf = buf2;
    326     *size = size2;
    327 
    328     return 0;
    329 
    330 FAILED:
    331     if (fd != -1) close(fd);
    332     if (zf) release_zipfile(zf);
    333     if (buf2) munmap(buf2, inflateSize(size2));
    334     if (*buf && *buf != (void*)-1) munmap(*buf, inflateSize(*size));
    335     *buf = 0;
    336     *size = 0;
    337     return -1;
    338 }
    339 
    340 int munmap_zip(void* buf, size_t size) {
    341     return munmap(buf, inflateSize(size));
    342 }
    343 
    344