common/transforms/Latin-Katakana.xml

<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd">
<!--
Copyright © 1991-2013 Unicode, Inc.
CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
For terms of use, see http://www.unicode.org/copyright.html
-->
<supplementalData>
	<version number="$Revision: 12263 $"/>
	<transforms>
		<transform source="Latn" target="Kana" direction="both" alias="Latin-Katakana und-Kana-t-und-latn" backwardAlias="Katakana-Latin und-Latn-t-und-kana">
			<tRule>
# Forward preamble. Each '::' line names a transform step; per the LDML rule
# syntax the parenthesized part is the step used in the reverse direction.
# Steps visible here: source filter, Latin width folding, NFD, lowercasing.
# note: a global filter is more efficient, but MUST include all source chars
#:: [\u0000-\u007E  - - - [:Latin:][:Katakana:] [:nonspacing mark:]] ;
# MINIMAL FILTER GENERATED FOR: Latin-Katakana
### WARNING -- must add width filter, both here and below!!! ###
:: [[-\u1160----\u3000--------][',.A-Za-z~-------------------------]] ;
# Fold fullwidth Latin letters to their halfwidth forms before matching.
:: [:Latin:] fullwidth-halfwidth ();
# Decompose on the way in; the reverse direction recomposes with NFC.
:: NFD (NFC);
:: Lower ();    # whenever transliterating from cased to uncased script, include this
# :: NFD () ;   # this would catch the odd cases where a lowercase is not in NFD, but none are important for Japanese
# Uses modified Hepburn, with small changes to make the romanization unambiguous.
# | Kunrei-shiki: Hepburn/MHepburn
# | ------------------------------
# | si: shi
# | si ~ya: sha
# | si ~yu: shu
# | si ~yo: sho
# | zi: ji
# | zi ~ya: ja
# | zi ~yu: ju
# | zi ~yo: jo
# | ti: chi
# | ti ~ya: cha
# | ti ~yu: chu
# | ti ~yo: cho
# | tu: tsu
# | di: ji/dji
# | du: zu/dzu
# | hu: fu
# | For foreign words:
# | -----------------
# | se ~i si
# | si ~e she
# |
# | ze ~i zi
# | zi ~e je
# |
# | te ~i ti
# | ti ~e che
# | te ~u tu
# |
# | de ~i di
# | de ~u du
# | de ~i di
# |
# | he ~u: hu
# | hu ~a fa
# | hu ~i fi
# | hu ~e he
# | hu ~o ho
# Most small forms are generated, but if necessary
# explicit small forms are given with ~a, ~ya, etc.
#------------------------------------------------------
# Variables
# Named character classes referenced by the rules further down.
# $vowel and $consonant cover the lowercase ASCII letters only; input has
# already been lowercased by the preamble.
$vowel = [aeiou] ;
$consonant = [bcdfghjklmnpqrstvwxyz] ;
$macron =  ;
# Variables used for doubled-consonants with tsu
# $voice and $semivoice are appended to the *_start classes by the
# doubled-consonant rules to select voiced / semi-voiced syllables.
$kana = [-] ;
$voice = [];
$semivoice = [];
# One class per romaji initial; each is used as trailing context by the
# matching doubled-consonant rule (for example "k } k ... } $k_start").
$k_start = [] ;
$s_start = [] ;
$j_start = [] $voice ;
$t_start = [] ;
$n_start = [] ;
$h_start = [] ;
$f_start = [] ;
$m_start = [] ;
$y_start = [] ;
$r_start = [] ;
$w_start = [] ;
$v_start = [] ;
# Context class for the h- (vowel lengthening) rule.
$voweled_basekana = [---] ;
# if ン is followed by $n_quoter, then it needs an
# apostrophe after its romaji form to disambiguate it.
# e.g., ン ア != ナ, so represent as &quot;n'a&quot;, not &quot;na&quot;.
$n_quoter  =  [             ] ;
# Context class for the "consonant | '~'" rules (small y-row kana).
$small_y = [] ;
# Referenced only by the commented-out iteration-mark TODO rule.
$iteration =  ;
#------------------------------------------------------
# katakana rules
# Conversion rules start here. Literal text is quoted with '...'; a leading
# '~' inside a romaji key marks an explicitly small kana form.
# Punctuation
# Full stop and comma.
'.'  ;
','  ;
# ' ' } [a-z]  ; # delete spaces before latin
# ' '  [^' '-] {} ['-] ; #insert spaces before hiragana
# Iteration Mark
# Copy previous letter  marks
# TODO
# | $1 $1  ($kana [[:M:]$voice$semivoice]?) $iteration
# Specials for katakana -- not shared with hiragana
# v + vowel syllables (vu is handled in the shared rules), followed by the
# explicitly small ka and ke.
va   ;
vi   ;
ve   ;
vo   ;
'~ka'   ;
'~ke'   ;
# ~~~ begin shared rules ~~~
# Rules in this region are kept identical for the katakana and hiragana
# variants. Rule order is significant: earlier rules win, so longer keys are
# listed before the shorter keys they would otherwise be masked by.
#special
# y-row keys paired with a literal '~'.
ya  '~';
yi  '~' ;
yu  '~';
ye  '~';
yo  '~';
#normal
a   ;
# Pattern repeated for most consonants below:
#   X | '~' ... } $small_y   pairs consonant X with a small y-row kana
#   Xy } $vowel ... | '~y'   handles X + y + vowel by emitting the i-row kana
#                            and re-scanning '~y' (cursor '|' backs up)
b | '~'   } $small_y ;
by } $vowel   | '~y' ;
ba   ;
bi   ;
bu   ;
be   ;
bo   ;
# c before i or e is rewritten as s and re-scanned.
c } i  | s ;
c } e  | s ;
da   ;
di   ;
du   ;
de   ;
do   ;
# dz / dj spellings (see the di: ji/dji and du: zu/dzu rows in the header table).
dzu   ;
dja   ;
dji'~i'   ; # liu
dju   ;
dje   ;
djo   ;
dji   ;
dj  } $vowel   | '~y' ;
# TODO: QUESTION: use  instead of dj, dz
# ch + vowel; the longer chi'~i' key must precede plain chi.
cha   ;
chi'~i'   ; # liu
chu   ;
che   ;
cho   ;
chi   ;
ch } $vowel   | '~y' ;
# Shared rules, continued: e, g, i, j, k, m, n, o, p, h, f, r.
# Consonant groups repeat the same shape: a "X | '~' } $small_y" rule, an
# "Xy } $vowel | '~y'" rule, then the five plain X + vowel syllables.
e   ;
g | '~'  } $small_y ;
gy  } $vowel   | '~y' ;
ga   ;
gi   ;
gu   ;
ge   ;
go   ;
i   ;
# j  } $vowel   | '~y' ;
# j + vowel; the longer ji'~i' key must precede plain ji.
ja   ;
ji'~i'   ; # liu
ju   ;
je   ;
jo   ;
ji   ;
k | '~'  } $small_y ;
ky  } $vowel   | '~y' ;
ka   ;
ki   ;
ku   ;
ke   ;
ko   ;
m | '~'  } $small_y ;
my  } $vowel   | '~y' ;
ma   ;
mi   ;
mu   ;
me   ;
mo   ;
# m immediately before p, b, f or v gets a dedicated rule.
m } [pbfv]   ;
n | '~'   } $small_y ;
ny  } $vowel   | '~y' ;
na   ;
ni   ;
nu   ;
ne   ;
no   ;
o   ;
p | '~'   } $small_y ;
py  } $vowel   | '~y' ;
pa   ;
pi   ;
pu   ;
pe   ;
po   ;
h | '~'   } $small_y ;
hy  } $vowel   | '~y' ;
ha   ;
hi   ;
hu   ;
he   ;
ho   ;
# The generic f rules are disabled; only the explicit f + vowel keys remain.
# f | '~'   } $small_y ;
# f } $vowel   | '~' ;
fa   ;
fi   ;
fe   ;
fo   ;
fu   ;
r | '~'   } $small_y ;
ry  } $vowel   | '~y' ;
ra   ;
ri   ;
ru   ;
re   ;
ro   ;
# Shared rules, continued: z, s, sh, t, ts, v, u, w, y.
za   ;
zi   ;
zu   ;
ze   ;
zo   ;
sa   ;
si   ;
su   ;
se   ;
so   ;
# sh + vowel; the longer shi'~i' key must precede plain shi.
sha   ;
shi'~i'   ; # liu
shu   ;
she   ;
sho   ;
shi   ;
sh } $vowel   | '~y' ;
ta   ;
ti   ;
tu   ;
te   ;
to   ;
tsu   ;
# The generic v rules and the 'v~X' keys are disabled; va/vi/ve/vo are
# covered by the katakana-only specials, leaving just vu here.
# v  } $vowel   | '~' ;
#'v~a'   ; # liu
#'v~i'   ; # liu
#'v~e'   ; # liu
#'v~o'   ; # liu
vu   ;
u   ;
# w  } $vowel   | '~' ;
wa   ;
wi   ;
wu   ;
we   ;
wo   ;
# Plain y-row keys (the '~'-paired variants appear under "special" earlier).
ya   ;
yi   ;
yu   ;
ye   ;
yo   ;
# double consonants
# Each rule matches the first letter of a doubled consonant, using the
# second letter as following context ('}'). Where a second '}' appears, the
# class after it is the trailing context from the *_start variables.
#specials
# s before sh, and t before ch, count as doubled consonants.
s } sh   ;
t } ch   ;
#voiced
# Voiced and semi-voiced initials reuse the unvoiced *_start class followed
# by $voice or $semivoice.
j } j   } $j_start ;
b } b   } [$h_start$f_start] $voice;
d } d   } $t_start $voice;
g } g   } $k_start $voice;
p } p   } [$h_start$f_start] $semivoice;
# v } v   } []  $voice ;
z } z   } $s_start $voice;
v } v   } $v_start;
# normal
k } k   } $k_start ;
m } m   } $m_start ;
n } n   } $n_start ;
h } h   } $h_start ;
f } f   } $f_start ;
r } r   } $r_start ;
t } t   } $t_start ;
s } s   } $s_start ;
w } w    } $w_start;
y } y   } $y_start;
# completeness
# Letters with no kana row of their own (x, c, l, q) still get a
# doubled-consonant rule so that every input maps to something.
x } x   ;
c } k   ;
c } c   ;
c } q   ;
l } l   ;
q } q   ;
# y } y   ;
# w } w   ;
# prolonged vowel mark. this indicates a doubling of
# the preceding vowel sound
# The per-vowel variants below are disabled; only the $macron rule is active.
#a  a {  ; # liu
#e  e {  ; # liu
#i  i {  ; # liu
#o  o {  ; # liu
#u  u {  ; # liu
$macron   ;
# small forms
# Explicit small kana, keyed by '~' + romaji. These are the targets of the
# '~y' and '~i' sequences that earlier rules emit for re-scanning.
'~a'   ;
'~i'   ;
'~u'   ;
'~e'   ;
'~o'   ;
'~tsu'   ;
'~wa'   ;
'~ya'   ;
'~yi'   ;
'~yu'   ;
'~ye'   ;
'~yo'   ;
# iteration marks
# TODO: make more accurate
# Each rule captures "y* $vowel" (or a whole syllable) in a ( ) group and
# refers to it as $1; '{' closes the preceding context. Rules that end in
# $voice (no '?') require the voicing class; the last three take it optionally.
# Pairs of an unvoiced initial with its voiced counterpart come first.
j $1  sh (y* $vowel) {$voice ;
dj $1  ch (y* $vowel) {$voice ;
dz $1  ts (y* $vowel) {$voice ;
g $1  k (y* $vowel) {$voice ;
z $1  s (y* $vowel) {$voice ;
d $1  t (y* $vowel) {$voice ;
h $1  b (y* $vowel) {$voice ;
v $1  w (y* $vowel) {$voice ;
# Same initial on both sides.
sh $1  sh (y* $vowel) {$voice ;
j $1  j (y* $vowel) {$voice ;
ch $1  ch (y* $vowel) {$voice ;
dj $1  dj(y* $vowel) {$voice ;
ts $1  ts (y* $vowel) {$voice ;
dz $1  dz (y* $vowel) {$voice ;
# Fallbacks, from most to least specific.
$1  ($consonant y* $vowel) {$voice? ;
$1  (.) { $voice? ; # otherwise repeat last character
  $voice? ; # delete if no characters found
# h- rule: lengthens vowel if not followed by a vowel.
# At the point this is applied, latin [cons]?vowel sequences
# have been converted to katakana in NFD form.
# The optional [\u3099 \u309A] allows a combining voiced or semi-voiced
# sound mark between the base kana and the h.
$voweled_basekana [\u3099 \u309A]? { h   ;
# one-way latin-  kana rules. these do not occur in
# well-formed romaji representing actual japanese text.
# their purpose is to make all romaji map to kana of
# some sort.
# the following are not really necessary, but produce
# slightly more natural results.
# consonant + y clusters that were not consumed by an earlier rule.
cy   ;
dy   ;
hy   ;
sy   ;
ty   ;
zy   ;
# A remaining bare h.
h   ;
# isolated consonants listed here so as not to mask
# longer rules above.
# Digraphs come first so that ch/sh/dz/dj are not split into single letters.
ch  ;
sh   ;
dz   ;
dj  ;
b   ;
d   ;
g   ;
k   ;
m   ;
# n followed by an apostrophe, with $n_quoter as trailing context; must
# precede the bare n rule.
n''   } $n_quoter ;
n   ;
p   ;
r   ;
s   ;
t   ;
y   ;
z   ;
v   ;
f  ;
j   ;
w  ;
# Rewrites to a plain romaji sequence; the cursor '|' sits before the
# replacement so it is scanned again by the rules above.
  | ss ;
  | e ;
  | d ;
  | u ;
  | th ;
# simple substitutions using backup
# c, l, q and x have no kana row of their own, so they are rewritten to
# k, r, k and ks and then re-scanned.
c  | k ;
l  | r ;
q  | k ;
x  | ks ;
# ~~~ END shared rules ~~~
#------------------------------------------------------
# Final cleanup
'~'  ; # delete stray tildes between letters
[:Katakana:] { '' } [:Latin:]  ; # delete stray quotes between letters
# [[:Nonspacing Mark:]-[-]]  ; # delete any non-spacing marks that we didn't use
# Recompose the output; the reverse direction decomposes with NFD instead.
:: NFC (NFD) ;
# Reverse direction only (whole step is parenthesized): widen halfwidth
# katakana and the listed sound / prolonged-sound marks to fullwidth.
:: ([[:Katakana:][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] halfwidth-fullwidth);
# note: a global filter is more efficient, but MUST include all source chars!!
#:: ([\u0000-\u007E  - - - [:Latin:][:Katakana:] [:nonspacing mark:]]);
# MINIMAL FILTER GENERATED FOR: Latin-Katakana BACKWARD
# Source filter for the reverse (Katakana to Latin) direction.
:: ( [[\ -~--------][~---------][\u309B\u309C\u30A0\u30FC\uFF70\uFF9E\uFF9F]] ) ;
# eof
			</tRule>
		</transform>
	</transforms>
</supplementalData>