Home | History | Annotate | Download | only in transforms
      1 <?xml version="1.0" encoding="UTF-8" ?>
      2 <!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd">
      3 <!--
      4 Copyright © 1991-2013 Unicode, Inc.
      5 CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
      6 For terms of use, see http://www.unicode.org/copyright.html
      7 -->
      8 <supplementalData>
      9 	<version number="$Revision: 12263 $"/>
     10 	<transforms>
     11 		<transform source="Latn" target="Kana" direction="both" alias="Latin-Katakana und-Kana-t-und-latn" backwardAlias="Katakana-Latin und-Latn-t-und-kana">
     12 			<tRule>
     13 # note: a global filter is more efficient, but MUST include all source chars
     14 #:: [\u0000-\u007E  - - - [:Latin:][:Katakana:] [:nonspacing mark:]] ;
     15 # MINIMAL FILTER GENERATED FOR: Latin-Katakana
     16 ### WARNING -- must add width filter, both here and below!!! ###
     17 :: [[-\u1160----\u3000--------][',.A-Za-z~-------------------------]] ;
     18 :: [:Latin:] fullwidth-halfwidth ();
     19 :: NFD (NFC);
     20 :: Lower ();    # whenever transliterating from cased to uncased script, include this
     21 # :: NFD () ;   # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
     22 # Uses modified Hepburn. Small changes to make it unambiguous.
     23 # | Kunrei-shiki: Hepburn/MHepburn
     24 # | ------------------------------
     25 # | si: shi
     26 # | si ~ya: sha
     27 # | si ~yu: shu
     28 # | si ~yo: sho
     29 # | zi: ji
     30 # | zi ~ya: ja
     31 # | zi ~yu: ju
     32 # | zi ~yo: jo
     33 # | ti: chi
     34 # | ti ~ya: cha
     35 # | ti ~yu: chu
     36 # | ti ~yo: cho
     37 # | tu: tsu
     38 # | di: ji/dji
     39 # | du: zu/dzu
     40 # | hu: fu
     41 # | For foreign words:
     42 # | -----------------
     43 # | se ~i si
     44 # | si ~e she
     45 # |
     46 # | ze ~i zi
     47 # | zi ~e je
     48 # |
     49 # | te ~i ti
     50 # | ti ~e che
     51 # | te ~u tu
     52 # |
     53 # | de ~i di
     54 # | de ~u du
     55 # | de ~i di
     56 # |
     57 # | he ~u: hu
     58 # | hu ~a fa
     59 # | hu ~i fi
     60 # | hu ~e he
     61 # | hu ~o ho
     62 # Most small forms are generated, but if necessary
     63 # explicit small forms are given with ~a, ~ya, etc.
     64 #------------------------------------------------------
     65 # Variables
     66 $vowel = [aeiou] ;
     67 $consonant = [bcdfghjklmnpqrstvwxyz] ;
     68 $macron =  ;
     69 # Variables used for doubled-consonants with tsu
     70 $kana = [-] ;
     71 $voice = [];
     72 $semivoice = [];
     73 $k_start = [] ;
     74 $s_start = [] ;
     75 $j_start = [] $voice ;
     76 $t_start = [] ;
     77 $n_start = [] ;
     78 $h_start = [] ;
     79 $f_start = [] ;
     80 $m_start = [] ;
     81 $y_start = [] ;
     82 $r_start = [] ;
     83 $w_start = [] ;
     84 $v_start = [] ;
     85 $voweled_basekana = [---] ;
     86 # if ン is followed by $n_quoter, then it needs an
     87 # apostrophe after its romaji form to disambiguate it.
     88 # e.g., ン ア != ナ, so represent as "n'a", not "na".
     89 $n_quoter  =  [             ] ;
     90 $small_y = [] ;
     91 $iteration =  ;
     92 #------------------------------------------------------
     93 # katakana rules
     94 # Punctuation
     95 '.'  ;
     96 ','  ;
     97 # ' ' } [a-z]  ; # delete spaces before latin
     98 # ' '  [^' '-] {} ['-] ; #insert spaces before hiragana
     99 # Iteration Mark
    100 # Copy previous letter  marks
    101 # TODO
    102 # | $1 $1  ($kana [[:M:]$voice$semivoice]?) $iteration
    103 # Specials for katakana -- not shared with hiragana
    104 va   ;
    105 vi   ;
    106 ve   ;
    107 vo   ;
    108 '~ka'   ;
    109 '~ke'   ;
    110 # ~~~ begin shared rules ~~~
    111 #special
    112 ya  '~';
    113 yi  '~' ;
    114 yu  '~';
    115 ye  '~';
    116 yo  '~';
    117 #normal
    118 a   ;
    119 b | '~'   } $small_y ;
    120 by } $vowel   | '~y' ;
    121 ba   ;
    122 bi   ;
    123 bu   ;
    124 be   ;
    125 bo   ;
    126 c } i  | s ;
    127 c } e  | s ;
    128 da   ;
    129 di   ;
    130 du   ;
    131 de   ;
    132 do   ;
    133 dzu   ;
    134 dja   ;
    135 dji'~i'   ; # liu
    136 dju   ;
    137 dje   ;
    138 djo   ;
    139 dji   ;
    140 dj  } $vowel   | '~y' ;
    141 # TODO: QUESTION: use  instead of dj, dz
    142 cha   ;
    143 chi'~i'   ; # liu
    144 chu   ;
    145 che   ;
    146 cho   ;
    147 chi   ;
    148 ch } $vowel   | '~y' ;
    149 e   ;
    150 g | '~'  } $small_y ;
    151 gy  } $vowel   | '~y' ;
    152 ga   ;
    153 gi   ;
    154 gu   ;
    155 ge   ;
    156 go   ;
    157 i   ;
    158 # j  } $vowel   | '~y' ;
    159 ja   ;
    160 ji'~i'   ; # liu
    161 ju   ;
    162 je   ;
    163 jo   ;
    164 ji   ;
    165 k | '~'  } $small_y ;
    166 ky  } $vowel   | '~y' ;
    167 ka   ;
    168 ki   ;
    169 ku   ;
    170 ke   ;
    171 ko   ;
    172 m | '~'  } $small_y ;
    173 my  } $vowel   | '~y' ;
    174 ma   ;
    175 mi   ;
    176 mu   ;
    177 me   ;
    178 mo   ;
    179 m } [pbfv]   ;
    180 n | '~'   } $small_y ;
    181 ny  } $vowel   | '~y' ;
    182 na   ;
    183 ni   ;
    184 nu   ;
    185 ne   ;
    186 no   ;
    187 o   ;
    188 p | '~'   } $small_y ;
    189 py  } $vowel   | '~y' ;
    190 pa   ;
    191 pi   ;
    192 pu   ;
    193 pe   ;
    194 po   ;
    195 h | '~'   } $small_y ;
    196 hy  } $vowel   | '~y' ;
    197 ha   ;
    198 hi   ;
    199 hu   ;
    200 he   ;
    201 ho   ;
    202 # f | '~'   } $small_y ;
    203 # f } $vowel   | '~' ;
    204 fa   ;
    205 fi   ;
    206 fe   ;
    207 fo   ;
    208 fu   ;
    209 r | '~'   } $small_y ;
    210 ry  } $vowel   | '~y' ;
    211 ra   ;
    212 ri   ;
    213 ru   ;
    214 re   ;
    215 ro   ;
    216 za   ;
    217 zi   ;
    218 zu   ;
    219 ze   ;
    220 zo   ;
    221 sa   ;
    222 si   ;
    223 su   ;
    224 se   ;
    225 so   ;
    226 sha   ;
    227 shi'~i'   ; # liu
    228 shu   ;
    229 she   ;
    230 sho   ;
    231 shi   ;
    232 sh } $vowel   | '~y' ;
    233 ta   ;
    234 ti   ;
    235 tu   ;
    236 te   ;
    237 to   ;
    238 tsu   ;
    239 # v  } $vowel   | '~' ;
    240 #'v~a'   ; # liu
    241 #'v~i'   ; # liu
    242 #'v~e'   ; # liu
    243 #'v~o'   ; # liu
    244 vu   ;
    245 u   ;
    246 # w  } $vowel   | '~' ;
    247 wa   ;
    248 wi   ;
    249 wu   ;
    250 we   ;
    251 wo   ;
    252 ya   ;
    253 yi   ;
    254 yu   ;
    255 ye   ;
    256 yo   ;
    257 # double consonants
    258 #specials
    259 s } sh   ;
    260 t } ch   ;
    261 #voiced
    262 j } j   } $j_start ;
    263 b } b   } [$h_start$f_start] $voice;
    264 d } d   } $t_start $voice;
    265 g } g   } $k_start $voice;
    266 p } p   } [$h_start$f_start] $semivoice;
    267 # v } v   } []  $voice ;
    268 z } z   } $s_start $voice;
    269 v } v   } $v_start;
    270 # normal
    271 k } k   } $k_start ;
    272 m } m   } $m_start ;
    273 n } n   } $n_start ;
    274 h } h   } $h_start ;
    275 f } f   } $f_start ;
    276 r } r   } $r_start ;
    277 t } t   } $t_start ;
    278 s } s   } $s_start ;
    279 w } w    } $w_start;
    280 y } y   } $y_start;
    281 # completeness
    282 x } x   ;
    283 c } k   ;
    284 c } c   ;
    285 c } q   ;
    286 l } l   ;
    287 q } q   ;
    288 # y } y   ;
    289 # w } w   ;
    290 # prolonged vowel mark. this indicates a doubling of
    291 # the preceding vowel sound
    292 #a  a {  ; # liu
    293 #e  e {  ; # liu
    294 #i  i {  ; # liu
    295 #o  o {  ; # liu
    296 #u  u {  ; # liu
    297 $macron   ;
    298 # small forms
    299 '~a'   ;
    300 '~i'   ;
    301 '~u'   ;
    302 '~e'   ;
    303 '~o'   ;
    304 '~tsu'   ;
    305 '~wa'   ;
    306 '~ya'   ;
    307 '~yi'   ;
    308 '~yu'   ;
    309 '~ye'   ;
    310 '~yo'   ;
    311 # iteration marks
    312 # TODO: make more accurate
    313 j $1  sh (y* $vowel) {$voice ;
    314 dj $1  ch (y* $vowel) {$voice ;
    315 dz $1  ts (y* $vowel) {$voice ;
    316 g $1  k (y* $vowel) {$voice ;
    317 z $1  s (y* $vowel) {$voice ;
    318 d $1  t (y* $vowel) {$voice ;
    319 h $1  b (y* $vowel) {$voice ;
    320 v $1  w (y* $vowel) {$voice ;
    321 sh $1  sh (y* $vowel) {$voice ;
    322 j $1  j (y* $vowel) {$voice ;
    323 ch $1  ch (y* $vowel) {$voice ;
    324 dj $1  dj(y* $vowel) {$voice ;
    325 ts $1  ts (y* $vowel) {$voice ;
    326 dz $1  dz (y* $vowel) {$voice ;
    327 $1  ($consonant y* $vowel) {$voice? ;
    328 $1  (.) { $voice? ; # otherwise repeat last character
    329   $voice? ; # delete if no characters found
    330 # h- rule: lengthens vowel if not followed by a vowel.
    331 # At the point this is applied, latin [cons]?vowel sequences
    332 # have been converted to katakana in NFD form.
    333 $voweled_basekana [\u3099 \u309A]? { h   ;
    334 # one-way latin-to-kana rules. these do not occur in
    335 # well-formed romaji representing actual japanese text.
    336 # their purpose is to make all romaji map to kana of
    337 # some sort.
    338 # the following are not really necessary, but produce
    339 # slightly more natural results.
    340 cy   ;
    341 dy   ;
    342 hy   ;
    343 sy   ;
    344 ty   ;
    345 zy   ;
    346 h   ;
    347 # isolated consonants listed here so as not to mask
    348 # longer rules above.
    349 ch  ;
    350 sh   ;
    351 dz   ;
    352 dj  ;
    353 b   ;
    354 d   ;
    355 g   ;
    356 k   ;
    357 m   ;
    358 n''   } $n_quoter ;
    359 n   ;
    360 p   ;
    361 r   ;
    362 s   ;
    363 t   ;
    364 y   ;
    365 z   ;
    366 v   ;
    367 f  ;
    368 j   ;
    369 w  ;
    370   | ss ;
    371   | e ;
    372   | d ;
    373   | u ;
    374   | th ;
    375 # simple substitutions using backup
    376 c  | k ;
    377 l  | r ;
    378 q  | k ;
    379 x  | ks ;
    380 # ~~~ END shared rules ~~~
    381 #------------------------------------------------------
    382 # Final cleanup
    383 '~'  ; # delete stray tildes between letters
    384 [:Katakana:] { '' } [:Latin:]  ; # delete stray quotes between letters
    385 # [[:Nonspacing Mark:]-[-]]  ; # delete any non-spacing marks that we didn't use
    386 :: NFC (NFD) ;
    387 :: ([[:Katakana:][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] halfwidth-fullwidth);
    388 # note: a global filter is more efficient, but MUST include all source chars!!
    389 #:: ([\u0000-\u007E  - - - [:Latin:][:Katakana:] [:nonspacing mark:]]);
    390 # MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
    391 :: ( [[\ -~--------][~---------][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] ) ;
    392 # eof
    393 			</tRule>
    394 		</transform>
    395 	</transforms>
    396 </supplementalData>
    397