Home | History | Annotate | Download | only in dictionary
      1 /*
      2  * Copyright (C) 2007 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License"); you may not
      5  * use this file except in compliance with the License. You may obtain a copy of
      6  * the License at
      7  *
      8  * http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
     12  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
     13  * License for the specific language governing permissions and limitations under
     14  * the License.
     15  */
     16 
     17 
     18 //
     19 // This converts the data found at http://www.speech.cs.cmu.edu/cgi-bin/cmudict
     20 // into the *.ok format used by Nuance.
     21 // We use the file c0.6, which corresponds to (v. 0.6).
     22 //
     23 // to run: make cmu2nuance && ./cmu2nuance <c0.6 >c0.6.ok
     24 //
     25 // TODO: look at generation of 'L', ')', and ','
     26 //
     27 
     28 #include <stdio.h>
     29 #include <string.h>
     30 #include <ctype.h>
     31 
     32 
     33 static const char* xlate(const char* phone, const char* cmu, const char* nuance) {
     34   int ncmu = strlen(cmu);
     35   if (strncmp(phone, cmu, ncmu) || !isspace(phone[ncmu])) return NULL;
     36   fputs(nuance, stdout);
     37   return phone + strlen(cmu);
     38 }
     39 
     40 
     41 int main(int argc, const char* argv[]) {
     42   char line[200];
     43 
     44   fputs("#LANG=EN-US\n", stdout);
     45 
     46   for (int lineno = 1; NULL != fgets(line, sizeof(line), stdin); lineno++)
     47   {
     48     if (line[0] == '#') continue;
     49     if (line[0] == 0) continue;
     50     if (!isalnum(line[0])) {
     51       fprintf(stderr, "warning: ignoring line %d - %s", lineno, line);
     52       continue;
     53     }
     54 
     55     const char* p = line;
     56 
     57     // parse name, echoing in lower case and skipping (2) suffix
     58     while (!isspace(*p)) {
     59       if (*p == 0) {
     60         fprintf(stderr, "can't read name at line %d\n", lineno);
     61         break;
     62       }
     63       if (p[0] == '(' && isdigit(p[1]) && p[2] == ')' && isspace(p[3])) {
     64         p += 3;
     65         break;
     66       }
     67       fputc(tolower(*p), stdout);
     68       p++;
     69     }
     70     fputc(' ', stdout);
     71 
     72     // loop over whitespace delimited phonemes
     73     while (1) {
     74       // skip leading whitespace
     75       while (isspace(*p)) p++;
     76       if (*p == 0) break;
     77 
     78       const char* next = 0;
     79       if (
     80         (next=xlate(p, "AA1 R", ")r")) ||   // odd     AA D
     81         (next=xlate(p, "AA0", "o")) ||   // odd     AA D
     82         (next=xlate(p, "AA1", "o")) ||   // odd     AA D
     83         (next=xlate(p, "AA2", "o")) ||   // odd     AA D
     84 
     85         (next=xlate(p, "AE0", "a")) ||   // at      AE T
     86         (next=xlate(p, "AE1", "a")) ||   // at      AE T
     87         (next=xlate(p, "AE2", "a")) ||   // at      AE T
     88 
     89 //        (next=xlate(p, "AH0 L", "L")) || // drops accuracy by 1%
     90         (next=xlate(p, "AH0 N", "~")) ||   // hut     HH AH T - from jean
     91         (next=xlate(p, "AH0 M", "}")) ||   // hut     HH AH T - from jean
     92         (next=xlate(p, "AH0", "@")) ||   // hut     HH AH T - from jean
     93         (next=xlate(p, "AH1", "u")) ||   // hut     HH AH T
     94         (next=xlate(p, "AH2", "u")) ||   // hut     HH AH T
     95 
     96         (next=xlate(p, "AO0", "{")) ||   // ought   AO T
     97         (next=xlate(p, "AO1", "{")) ||   // ought   AO T
     98         (next=xlate(p, "AO2", "{")) ||   // ought   AO T
     99 
    100         (next=xlate(p, "AW0", "?")) ||   // cow     K AW
    101         (next=xlate(p, "AW1", "?")) ||   // cow     K AW
    102         (next=xlate(p, "AW2", "?")) ||   // cow     K AW
    103 
    104         (next=xlate(p, "AY0", "I")) ||   // hide    HH AY D
    105         (next=xlate(p, "AY1", "I")) ||   // hide    HH AY D
    106         (next=xlate(p, "AY2", "I")) ||   // hide    HH AY D
    107 
    108         (next=xlate(p, "B"  , "b")) ||   // be      B IY
    109         (next=xlate(p, "CH" , "C")) ||   // cheese  CH IY Z
    110         (next=xlate(p, "D"  , "d")) ||   // dee     D IY
    111         (next=xlate(p, "DH" , "D")) ||   // thee    DH IY
    112 
    113         (next=xlate(p, "EH1 R", ",r")) ||   // Ed      EH D
    114         (next=xlate(p, "EH0", "c")) ||   // Ed      EH D - from jean
    115         (next=xlate(p, "EH1", "e")) ||   // Ed      EH D
    116         (next=xlate(p, "EH2", "e")) ||   // Ed      EH D
    117 
    118         (next=xlate(p, "ER0", "P")) ||   // hurt    HH ER T
    119         (next=xlate(p, "ER1", "V")) ||   // hurt    HH ER T
    120         (next=xlate(p, "ER2", "V")) ||   // hurt    HH ER T
    121 
    122         (next=xlate(p, "EY0", "A")) ||   // ate     EY T
    123         (next=xlate(p, "EY1", "A")) ||   // ate     EY T
    124         (next=xlate(p, "EY2", "A")) ||   // ate     EY T
    125 
    126         (next=xlate(p, "F"  , "f")) ||   // fee     F IY
    127         (next=xlate(p, "G"  , "g")) ||   // green   G R IY N
    128         (next=xlate(p, "HH" , "h")) ||   // he      HH IY
    129 
    130         (next=xlate(p, "IH0", "6")) ||   // it      IH T
    131         (next=xlate(p, "IH1", "i")) ||   // it      IH T
    132         (next=xlate(p, "IH2", "i")) ||   // it      IH T
    133 
    134         (next=xlate(p, "IY0", "/")) ||   // eat     IY T - from jean
    135         (next=xlate(p, "IY1", "E")) ||   // eat     IY T
    136         (next=xlate(p, "IY2", "E")) ||   // eat     IY T
    137 
    138         (next=xlate(p, "JH" , "j")) ||   // gee     JH IY
    139         (next=xlate(p, "K"  , "k")) ||   // key     K IY
    140         (next=xlate(p, "L"  , "l")) ||   // lee     L IY
    141         (next=xlate(p, "M"  , "m")) ||   // me      M IY
    142         (next=xlate(p, "N"  , "n")) ||   // knee    N IY
    143         (next=xlate(p, "NG" , "N")) ||   // ping    P IH NG
    144 
    145         (next=xlate(p, "OW0", "]")) ||   // oat     OW T
    146         (next=xlate(p, "OW1", "O")) ||   // oat     OW T
    147         (next=xlate(p, "OW2", "O")) ||   // oat     OW T
    148 
    149         (next=xlate(p, "OY0", "<")) ||   // toy     T OY
    150         (next=xlate(p, "OY1", "<")) ||   // toy     T OY
    151         (next=xlate(p, "OY2", "<")) ||   // toy     T OY
    152 
    153         (next=xlate(p, "P"  , "p")) ||   // pee     P IY
    154         (next=xlate(p, "R"  , "r")) ||   // read    R IY D
    155         (next=xlate(p, "S"  , "s")) ||   // sea     S IY
    156         (next=xlate(p, "SH" , "S")) ||   // she     SH IY
    157         (next=xlate(p, "T"  , "t")) ||   // tea     T IY
    158         (next=xlate(p, "TH" , "T")) ||   // theta   TH EY T AH
    159 
    160         (next=xlate(p, "UH0", "q")) ||   // hood    HH UH D
    161         (next=xlate(p, "UH1", "q")) ||   // hood    HH UH D
    162         (next=xlate(p, "UH2", "q")) ||   // hood    HH UH D
    163 
    164         (next=xlate(p, "UW0", "U")) ||   // two     T UW
    165         (next=xlate(p, "UW1", "U")) ||   // two     T UW
    166         (next=xlate(p, "UW2", "U")) ||   // two     T UW
    167 
    168         (next=xlate(p, "V"  , "v")) ||   // vee     V IY
    169         (next=xlate(p, "W"  , "w")) ||   // we      W IY
    170         (next=xlate(p, "Y"  , "y")) ||   // yield   Y IY L D
    171         (next=xlate(p, "Z"  , "z")) ||   // zee     Z IY
    172         (next=xlate(p, "ZH" , "Z")) ||   // seizure S IY ZH ER
    173         0) {
    174         p = next;
    175       }
    176       else {
    177         fprintf(stderr, "can't pronounce line %d: %s", lineno, p);
    178         break;
    179       }
    180 
    181     }
    182 
    183     fputc('\n', stdout);
    184 
    185   }
    186 }
    187