1 <?xml version="1.0" encoding="UTF-8" ?> 2 <!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd"> 3 <!-- 4 Copyright 1991-2013 Unicode, Inc. 5 CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/) 6 For terms of use, see http://www.unicode.org/copyright.html 7 --> 8 <supplementalData> 9 <version number="$Revision: 12263 $"/> 10 <transforms> 11 <transform source="Latn" target="Kana" direction="both" alias="Latin-Katakana und-Kana-t-und-latn" backwardAlias="Katakana-Latin und-Latn-t-und-kana"> 12 <tRule> 13 # note: a global filter is more efficient, but MUST include all source chars 14 #:: [\u0000-\u007E - - - [:Latin:][:Katakana:] [:nonspacing mark:]] ; 15 # MINIMAL FILTER GENERATED FOR: Latin-Katakana 16 ### WARNING -- must add width filter, both here and below!!! ### 17 :: [[-\u1160----\u3000--------][',.A-Za-z~-------------------------]] ; 18 :: [:Latin:] fullwidth-halfwidth (); 19 :: NFD (NFC); 20 :: Lower (); # whenever transliterating from cased to uncased script, include this 21 # :: NFD () ; # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese 22 # Uses modified Hepburn. Small changes to make unambiguous. 23 # | Kunrei-shiki: Hepburn/MHepburn 24 # | ------------------------------ 25 # | si: shi 26 # | si ~ya: sha 27 # | si ~yu: shu 28 # | si ~yo: sho 29 # | zi: ji 30 # | zi ~ya: ja 31 # | zi ~yu: ju 32 # | zi ~yo: jo 33 # | ti: chi 34 # | ti ~ya: cha 35 # | ti ~yu: chu 36 # | ti ~yu: cho 37 # | tu: tsu 38 # | di: ji/dji 39 # | du: zu/dzu 40 # | hu: fu 41 # | For foreign words: 42 # | ----------------- 43 # | se ~i si 44 # | si ~e she 45 # | 46 # | ze ~i zi 47 # | zi ~e je 48 # | 49 # | te ~i ti 50 # | ti ~e che 51 # | te ~u tu 52 # | 53 # | de ~i di 54 # | de ~u du 55 # | de ~i di 56 # | 57 # | he ~u: hu 58 # | hu ~a fa 59 # | hu ~i fi 60 # | hu ~e he 61 # | hu ~o ho 62 # Most small forms are generated, but if necessary 63 # explicit small forms are given with ~a, ~ya, etc. 64 #------------------------------------------------------ 65 # Variables 66 $vowel = [aeiou] ; 67 $consonant = [bcdfghjklmnpqrstvwxyz] ; 68 $macron = ; 69 # Variables used for doubled-consonants with tsu 70 $kana = [-] ; 71 $voice = []; 72 $semivoice = []; 73 $k_start = [] ; 74 $s_start = [] ; 75 $j_start = [] $voice ; 76 $t_start = [] ; 77 $n_start = [] ; 78 $h_start = [] ; 79 $f_start = [] ; 80 $m_start = [] ; 81 $y_start = [] ; 82 $r_start = [] ; 83 $w_start = [] ; 84 $v_start = [] ; 85 $voweled_basekana = [---] ; 86 # if is followed by $n_quoter, then it needs an 87 # apostrophe after its romaji form to disambiguate it. 88 # e.g., ! = , so represent as "n'a", not "na". 89 $n_quoter = [ ] ; 90 $small_y = [] ; 91 $iteration = ; 92 #------------------------------------------------------ 93 # katakana rules 94 # Punctuation 95 '.' ; 96 ',' ; 97 # ' ' } [a-z] ; # delete spaces before latin 98 # ' ' [^' '-] {} ['-] ; #insert spaces before hiragana 99 # Iteration Mark 100 # Copy previous letter marks 101 # TODO 102 # | $1 $1 ($kana [[:M:]$voice$semivoice]?) $iteration 103 # Specials for katakana -- not shared with hiragana 104 va ; 105 vi ; 106 ve ; 107 vo ; 108 '~ka' ; 109 '~ke' ; 110 # ~~~ begin shared rules ~~~ 111 #special 112 ya '~'; 113 yi '~' ; 114 yu '~'; 115 ye '~'; 116 yo '~'; 117 #normal 118 a ; 119 b | '~' } $small_y ; 120 by } $vowel | '~y' ; 121 ba ; 122 bi ; 123 bu ; 124 be ; 125 bo ; 126 c } i | s ; 127 c } e | s ; 128 da ; 129 di ; 130 du ; 131 de ; 132 do ; 133 dzu ; 134 dja ; 135 dji'~i' ; # liu 136 dju ; 137 dje ; 138 djo ; 139 dji ; 140 dj } $vowel | '~y' ; 141 # TODO: QUESTION: use instead of dj, dz 142 cha ; 143 chi'~i' ; # liu 144 chu ; 145 che ; 146 cho ; 147 chi ; 148 ch } $vowel | '~y' ; 149 e ; 150 g | '~' } $small_y ; 151 gy } $vowel | '~y' ; 152 ga ; 153 gi ; 154 gu ; 155 ge ; 156 go ; 157 i ; 158 # j } $vowel | '~y' ; 159 ja ; 160 ji'~i' ; # liu 161 ju ; 162 je ; 163 jo ; 164 ji ; 165 k | '~' } $small_y ; 166 ky } $vowel | '~y' ; 167 ka ; 168 ki ; 169 ku ; 170 ke ; 171 ko ; 172 m | '~' } $small_y ; 173 my } $vowel | '~y' ; 174 ma ; 175 mi ; 176 mu ; 177 me ; 178 mo ; 179 m } [pbfv] ; 180 n | '~' } $small_y ; 181 ny } $vowel | '~y' ; 182 na ; 183 ni ; 184 nu ; 185 ne ; 186 no ; 187 o ; 188 p | '~' } $small_y ; 189 py } $vowel | '~y' ; 190 pa ; 191 pi ; 192 pu ; 193 pe ; 194 po ; 195 h | '~' } $small_y ; 196 hy } $vowel | '~y' ; 197 ha ; 198 hi ; 199 hu ; 200 he ; 201 ho ; 202 # f | '~' } $small_y ; 203 # f } $vowel | '~' ; 204 fa ; 205 fi ; 206 fe ; 207 fo ; 208 fu ; 209 r | '~' } $small_y ; 210 ry } $vowel | '~y' ; 211 ra ; 212 ri ; 213 ru ; 214 re ; 215 ro ; 216 za ; 217 zi ; 218 zu ; 219 ze ; 220 zo ; 221 sa ; 222 si ; 223 su ; 224 se ; 225 so ; 226 sha ; 227 shi'~i' ; # liu 228 shu ; 229 she ; 230 sho ; 231 shi ; 232 sh } $vowel | '~y' ; 233 ta ; 234 ti ; 235 tu ; 236 te ; 237 to ; 238 tsu ; 239 # v } $vowel | '~' ; 240 #'v~a' ; # liu 241 #'v~i' ; # liu 242 #'v~e' ; # liu 243 #'v~o' ; # liu 244 vu ; 245 u ; 246 # w } $vowel | '~' ; 247 wa ; 248 wi ; 249 wu ; 250 we ; 251 wo ; 252 ya ; 253 yi ; 254 yu ; 255 ye ; 256 yo ; 257 # double consonants 258 #specials 259 s } sh ; 260 t } ch ; 261 #voiced 262 j } j } $j_start ; 263 b } b } [$h_start$f_start] $voice; 264 d } d } $t_start $voice; 265 g } g } $k_start $voice; 266 p } p } [$h_start$f_start] $semivoice; 267 # v } v } [] $voice ; 268 z } z } $s_start $voice; 269 v } v } $v_start; 270 # normal 271 k } k } $k_start ; 272 m } m } $m_start ; 273 n } n } $n_start ; 274 h } h } $h_start ; 275 f } f } $f_start ; 276 r } r } $r_start ; 277 t } t } $t_start ; 278 s } s } $s_start ; 279 w } w } $w_start; 280 y } y } $y_start; 281 # completeness 282 x } x ; 283 c } k ; 284 c } c ; 285 c } q ; 286 l } l ; 287 q } q ; 288 # y } y ; 289 # w } w ; 290 # prolonged vowel mark. this indicates a doubling of 291 # the preceding vowel sound 292 #a a { ; # liu 293 #e e { ; # liu 294 #i i { ; # liu 295 #o o { ; # liu 296 #u u { ; # liu 297 $macron ; 298 # small forms 299 '~a' ; 300 '~i' ; 301 '~u' ; 302 '~e' ; 303 '~o' ; 304 '~tsu' ; 305 '~wa' ; 306 '~ya' ; 307 '~yi' ; 308 '~yu' ; 309 '~ye' ; 310 '~yo' ; 311 # iteration marks 312 # TODO: make more accurate 313 j $1 sh (y* $vowel) {$voice ; 314 dj $1 ch (y* $vowel) {$voice ; 315 dz $1 ts (y* $vowel) {$voice ; 316 g $1 k (y* $vowel) {$voice ; 317 z $1 s (y* $vowel) {$voice ; 318 d $1 t (y* $vowel) {$voice ; 319 h $1 b (y* $vowel) {$voice ; 320 v $1 w (y* $vowel) {$voice ; 321 sh $1 sh (y* $vowel) {$voice ; 322 j $1 j (y* $vowel) {$voice ; 323 ch $1 ch (y* $vowel) {$voice ; 324 dj $1 dj(y* $vowel) {$voice ; 325 ts $1 ts (y* $vowel) {$voice ; 326 dz $1 dz (y* $vowel) {$voice ; 327 $1 ($consonant y* $vowel) {$voice? ; 328 $1 (.) { $voice? ; # otherwise repeat last character 329 $voice? ; # delete if no characters found 330 # h- rule: lengthens vowel if not followed by a vowel. 331 # At the point this is applied, latin [cons]?vowel sequences 332 # have been converted to katakana in NFD form. 333 $voweled_basekana [\u3099 \u309A]? { h ; 334 # one-way latin- kana rules. these do not occur in 335 # well-formed romaji representing actual japanese text. 336 # their purpose is to make all romaji map to kana of 337 # some sort. 338 # the following are not really necessary, but produce 339 # slightly more natural results. 340 cy ; 341 dy ; 342 hy ; 343 sy ; 344 ty ; 345 zy ; 346 h ; 347 # isolated consonants listed here so as not to mask 348 # longer rules above. 349 ch ; 350 sh ; 351 dz ; 352 dj ; 353 b ; 354 d ; 355 g ; 356 k ; 357 m ; 358 n'' } $n_quoter ; 359 n ; 360 p ; 361 r ; 362 s ; 363 t ; 364 y ; 365 z ; 366 v ; 367 f ; 368 j ; 369 w ; 370 | ss ; 371 | e ; 372 | d ; 373 | u ; 374 | th ; 375 # simple substitutions using backup 376 c | k ; 377 l | r ; 378 q | k ; 379 x | ks ; 380 # ~~~ END shared rules ~~~ 381 #------------------------------------------------------ 382 # Final cleanup 383 '~' ; # delete stray tildes between letters 384 [:Katakana:] { '' } [:Latin:] ; # delete stray quotes between letters 385 # [[:Nonspacing Mark:]-[-]] ; # delete any non-spacing marks that we didn't use 386 :: NFC (NFD) ; 387 :: ([[:Katakana:][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] halfwidth-fullwidth); 388 # note: a global filter is more efficient, but MUST include all source chars!! 389 #:: ([\u0000-\u007E - - - [:Latin:][:Katakana:] [:nonspacing mark:]]); 390 # MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD 391 :: ( [[\ -~--------][~---------][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] ) ; 392 # eof 393 </tRule> 394 </transform> 395 </transforms> 396 </supplementalData> 397