Home | History | Annotate | Download | only in transforms
      1 <?xml version="1.0" encoding="UTF-8" ?>
      2 <!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd">
      3 <!--
      4 Copyright © 1991-2018 Unicode, Inc.
      5 CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
      6 For terms of use, see http://www.unicode.org/copyright.html
      7 -->
      8 <supplementalData>
      9 	<version number="$Revision: 14381 $"/>
     10 	<transforms>
     11 		<transform source="Zawgyi" target="my" direction="forward" alias="my-t-my-s0-zawgyi">
     12 			<tRule><![CDATA[
     13 # This transform converts Zawgyi "encoded" Burmese into proper
     14 # unicode. Zawgyi is a popular encoding scheme in Myanmar. It uses
     15 # the Myanmar unicode range but assigns different characters or
     16 # glyphs to some codepoints. In addition to the character mapping,
     17 # there is reordering of codepoints needed to match the expected
     18 # unicode order. This reordering is context-based.
     19 #
     20 # This transform is done in two main stages:
     21 # (1) Map all Zawgyi codepoints to their Unicode counterpart.
     22 # (2) Perform reordering.
     23 
     24 # Character classes: non-digits (outside Myanmar digits U+1040-U+1049) and Unicode code points.
     25 $nondigits = [^\u1040-\u1049];
     26 $consonant = [\u1000-\u1021];
     27 $vowelsign = [\u102B-\u1030\u1032];  # Unicode vowel signs except E (1031)
     28 $vowelsAndConsonants = [\u1000-\u102a];
     29 
     30 $umedial = [\u103B-\u103E];    # Medial codepoints in Unicode
     31 $vowelmedial = [\u102B-\u1030\u1032\u1036\u1037\u103A-\u103F];  # Union of vowel signs and medials, plus 1036 and 1037
     32 $ukinzi = \u1004\u103A\u1039;  # Codepoints representing kinzi in Unicode
     33 
     34 # Zawgyi medial ra has multiple representations
     35 $zmedialra = [\u103B\u107E-\u1084];
     36 
     37 $wspace = [\u0020\u00a0\u1680\u2000-\u200d\u2060\u202f\u205f\u3000\ufeff];
     38 
     39 
     40 ####
     41 #### STAGE 1: CODEPOINT MAPPING FROM ZAWGYI TO UNICODE
     42 ####
     43 
     44 # Kinzi (predefined ligatures)
     45 # Move base character to the right
     46 ($consonant) \u103A \u1064 → $ukinzi $1 \u103B;
     47 ($consonant) \u1064 → $ukinzi $1;
     48 \u1064 → $ukinzi;
     49 
     50 # Special cases moving base character to right before vowel signs
     51 ($consonant) \u108B → $ukinzi $1 \u102D;
     52 ($consonant) \u108C → $ukinzi $1 \u102E;
     53 ($consonant) \u108D → $ukinzi $1 \u1036;
     54 
     55 # Special cases moving Kinzi block to left
     56 ($consonant) \u103A \u1033 \u108B → $ukinzi $1 \u103B \u102D \u102F;
     57 ($consonant) \u103A \u108b → $ukinzi $1 \u103B \u102D ;
     58 ($consonant) \u103A \u108C → $ukinzi $1 \u103B \u102E ;
     59 ($consonant) \u103A \u108D → $ukinzi $1 \u103B \u1036 ;
     60 ($consonant) \u103A \u108e → $1 \u103B \u102D \u1036 ;  # no kinzi emitted for 108e
     61 
     62 \u108B → $ukinzi \u102D ;  # fallbacks when no consonant precedes
     63 \u108C → $ukinzi \u102E ;
     64 \u108D → $ukinzi \u1036 ;
     65 
     66 # Consonants (only the ones that have to change)
     67 \u106A → \u1009 ;  # NYA
     68 \u106B → \u100A ;
     69 \u108F → \u1014 ;
     70 \u1090 → \u101B ;
     71 \u1086 → \u103F ;
     72 
     73 # yapin
     74 [\u103A\u107d] → \u103B ;  # set members are juxtaposed; a '|' here would be a literal member
     75 
     76 # yayit
     77 ($zmedialra)+ → \u103C ;
     78 
     79 # wasway
     80 \u103C* \u108A → \u103D \u103E;  # To avoid duplicate medials
     81 \u103C → \u103D ;
     82 
     83 # hatoh
     84 [\u103D\u1087] → \u103E ;  # set members are juxtaposed; a '|' here would be a literal member
     85 \u1088 → \u103E \u102F ;
     86 \u1089 → \u103E \u1030 ;
     87 
     88 # Vowels
     89 \u1033 → \u102F ;
     90 \u1034 → \u1030 ;
     91 
     92 # asat
     93 \u1039 → \u103A ;
     94 
     95 # lower dot
     96 [\u1094\u1095] → \u1037 ;
     97 
     98 # Special cases for 1025 vs 1009: 1025 before asat or a stacked form becomes 1009.
     99 \u1025 \u1039 → \u1009 \u103a;
    100 \u1025 \u1061 → \u1009 \u1039 \u1001;
    101 \u1025 \u1062 → \u1009 \u1039 \u1002;
    102 \u1025 \u1065 → \u1009 \u1039 \u1005;
    103 \u1025 \u1068 → \u1009 \u1039 \u1007;
    104 \u1025 \u1076 → \u1009 \u1039 \u1013;
    105 \u1025 \u1078 → \u1009 \u1039 \u1015;
    106 \u1025 \u107A → \u1009 \u1039 \u1017;
    107 \u1025 \u1079 → \u1009 \u1039 \u1016;
    108 
    109 # Stacked Consonants: each Zawgyi subjoined form becomes 1039 + base consonant.
    110 \u105A → \u102B \u103A ;
    111 \u1060 → \u1039 \u1000 ;
    112 \u1061 → \u1039 \u1001 ;
    113 \u1062 → \u1039 \u1002 ;
    114 \u1063 → \u1039 \u1003 ;
    115 \u1065 → \u1039 \u1005 ;
    116 [\u1066\u1067] → \u1039 \u1006 ;
    117 \u1068 → \u1039 \u1007 ;
    118 \u1069 → \u1039 \u1008 ;
    119 \u106C → \u1039 \u100B ;
    120 \u106D → \u1039 \u100C ;
    121 \u1070 → \u1039 \u100F ;
    122 [\u1071\u1072] → \u1039 \u1010 ;
    123 \u1096 → \u1039 \u1010 \u103D;
    124 [\u1073\u1074] → \u1039 \u1011 ;
    125 \u1075 → \u1039 \u1012 ;
    126 \u1076 → \u1039 \u1013 ;
    127 \u1077 → \u1039 \u1014 ;
    128 \u1078 → \u1039 \u1015 ;
    129 \u1079 → \u1039 \u1016 ;
    130 \u107A → \u1039 \u1017 ;
    131 [\u107B\u1093] → \u1039 \u1018 ;
    132 \u107C → \u1039 \u1019 ;
    133 \u1085 → \u1039 \u101C ;
    134 \u108E → \u102D \u1036 ;
    135 
    136 # Pre-defined ligatures: one Zawgyi codepoint expands to a Unicode sequence.
    137 \u106E → \u100D\u1039\u100D ;
    138 \u106F → \u100D\u1039\u100E ;
    139 \u1091 → \u100F\u1039\u100D ;
    140 \u1092 → \u100B\u1039\u100C ;
    141 \u1097 → \u100B\u1039\u100B ;
    142 \u104E → \u104E\u1004\u103A\u1038 ;
    143 
    144 
    145 ####
    146 #### STAGE 1.01: Digits 0 and 4 used instead of letters
    147 # Case of MYANMAR digit being used instead of a letter
    148 # Lone digit zero and four at start
    149 ::Null;
    150 ^ \u1040 ($nondigits) → \u101D $1;
    151 ^ \u1044 ($nondigits) → | \u104E $1 ;  # '|' puts the cursor before the output
    152 
    153 # Lone digit zero or four at end
    154 ($nondigits) \u1040 $ → $1 \u101D;
    155 ($nondigits) \u1044 $ → $1 \u104e;
    156 
    157 # Evowel and dependent vowel signs before 0 or 4 only
    158 #   -> convert to the consonant.
    159 ([\u102b-\u103f]) \u1040 ($nondigits) → $1 \u101d $2;
    160 ([\u102b-\u103f]) \u1044 ($nondigits) → $1 \u104E $2;
    161 
    162 
    163 ####
    164 #### STAGE 1.1: Strip spaces immediately before combining characters.
    165 ####   Move e-vowel after consonants and medials
    166 ####   Now every codepoint is Unicode. This starts conversion
    167 ####   from semi-visual order to logical order.
    168 ####
    169 ::Null;
    170 
    171 # Don't remove spaces before E vowel (1031) or medial Ra (103c) at this stage
    172 ($wspace) \u1037 > \u1037 $1;
    173 ($wspace+) ([\u102b-\u1030\u1032-\u103b\u103d\u103e]) → $2;
    174 
    175 # Remove a duplicate early
    176 \u1037+ → \u1037;
    177 
    178 # Move e-vowel after medials and consonants.
    179 \u1031+ $ukinzi ($consonant) > $ukinzi $1 \u1031;
    180 \u1031+ \u1037+ ($consonant) > $1 \u1031 \u1037 ;
    181 \u1031+ \u103c ($consonant) > $1 \u103c \u1031;
    182 
    183 # Move medials other than 103c before the 1031. Leave 103c for
    184 # the next consonant.
    185 \u1031+ ($consonant) ([\u103b\u103d\u103e]+) > $1 $2 \u1031;
    186 \u1031+ ($vowelsAndConsonants) > $1 \u1031;
    187 
    188 
    189 ####
    190 #### STAGE 2: POST REORDERING RULES FOR UNICODE RENDERING
    191 ####
    192 ::Null;
    193 
    194 \u103b \u103a > \u103a \u103b;
    195 
    196 # Simpler replacements for Zawgyi 1025
    197 \u1025 \u102E → \u1026;
    198 
    199 # Asat and dot below reordering, to Unicode NFC.
    200 \u103A\u1037 → \u1037\u103A;
    201 
    202 # Reorder some vowel signs
    203 \u1036 ($umedial*) ($vowelsign+) → $1 $2 \u1036 ;
    204 ([\u102B\u102C\u102F\u1030]) ([\u102D\u102E\u1032]) → $2 $1;
    205 
    206 # Move ra medial which precedes consonant, but not other medials.
    207 \u103C ($consonant) → $1 \u103C;
    208 
    209 
    210 ####
    211 #### Stage 3
    212 #### Move medials after a stacked (\u1039) consonant, and \u1036 after medials.
    213 ::Null;
    214 
    215 ($umedial) \u1039 ($consonant) > \u1039 $2 $1;
    216 
    217 \u103C \u103A \u1039 ($consonant) → \u103A \u1039 $1 \u103C;
    218 
    219 \u1036 ($umedial+) → $1 \u1036;
    220 
    221 
    222 ####
    223 #### Stage 4
    224 #### Reordering medials, dot below, contractions, E sign, and asat.
    225 ::Null;
    226 
    227 # Reorder the medials
    228 ([\u103C\u103D\u103E]+) \u103B → \u103B $1;
    229 ([\u103D\u103E]+) \u103C → \u103C $1;
    230 \u103E\u103D → \u103D\u103E ;
    231 
    232 # Contractions with vowel signs
    233 ([\u1031]+) ($vowelsign*) \u1039 ($consonant) → \u1039 $3 $1 $2;
    234 ($vowelsign+) \u1039 ($consonant) → \u1039 $2 $1;
    235 
    236 # Move vowel sign E \u1031 after medials, but not across consonants
    237 ($umedial*) ([\u1031]+) ($umedial*) → $1 $3 $2;
    238 
    239 # Reorder dot below after medials and vowel diacritics
    240 \u1037 ([\u102D-\u1030\u1032\u1036\u103b-\u103e]+) → $1 \u1037;
    241 
    242 # Move vowel signs after medials
    243 ($vowelsign+) ($umedial+) → $2 $1;
    244 
    245 # Reorder modifiers and asat
    246 ($consonant) ([\u102B-\u1032\u1036\u103B-\u103E]) \u103A ($consonant) → $1 \u103A $2 $3;
    247 
    248 
    249 ####
    250 #### Stage 5.  More reorderings
    251 #### Vowel signs after medials, sort medials, visarga after other signs.
    252 ####
    253 ::Null;
    254 
    255 # Replace CA + YA with JHA after moving other things beyond the medials.
    256 \u1005 \u103b → \u1008;
    257 
    258 # More moving vowel signs after medials
    259 ([\u102b-\u1032]) ($umedial) → $2 $1;
    260 
    261 # Sort the medials
    262 ([\u103C\u103D\u103E]) \u103B → \u103B $1;
    263 ([\u103D\u103E]) \u103C → \u103C $1;
    264 \u103E\u103D → \u103D\u103E ;
    265 
    266 # Move visarga after other signs
    267 \u1038 ($vowelmedial) → $1 \u1038;
    268 
    269 # Reorder
    270 \u1036 \u102f → \u102f \u1036;
    271 
    272 
    273 ###
    274 ### Stage 6
    275 ### Finish conflicting and extra diacritics. Remove some white space
    276 ###
    277 ::Null;
    278 
    279 # Fix duplicate combiners
    280 \u102D \u102D+ → \u102D;
    281 \u102E \u102E+ → \u102E;
    282 \u102F \u102F+ → \u102F;
    283 \u1030 \u1030+ → \u1030;
    284 \u1032 \u1032+ → \u1032;
    285 \u1036 \u1036+ → \u1036;
    286 \u1037 \u1037+ → \u1037;
    287 \u1039 \u1039+ → \u1039;
    288 \u103a \u103a+ → \u103a;
    289 \u103b \u103b+ → \u103b;
    290 \u103c \u103c+ → \u103c;
    291 \u103d \u103d+ → \u103d;
    292 \u103e \u103e+ → \u103e; # http://unicode.org/cldr/trac/ticket/10386
    293 
    294 # Fix overlapping signs
    295 \u102F [\u1030\u103a] → \u102F;
    296 \u102D \u102E → \u102E;
    297 
    298 # Remove space directly before diacritics.
    299 ($wspace)+ ([\u102b-\u1032\u1036-\u103e]) → $2;
    300 
    301 # Remove ZWSP at start and end
    302 ^ \u200b+ → ;
    303 \u200b+ $ → ;
    304 
    305 # Fix multiple spaces around ZWSP to single ZWSP.
    306 $wspace* \u200b $wspace* → \u200b;
    307 			]]></tRule>
    308 		</transform>
    309 	</transforms>
    310 </supplementalData>
    311