<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd">
<!--
Copyright 1991-2018 Unicode, Inc.
CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
For terms of use, see http://www.unicode.org/copyright.html
-->
<supplementalData>
  <version number="$Revision: 14381 $"/>
  <transforms>
    <transform source="Zawgyi" target="my" direction="forward" alias="my-t-my-s0-zawgyi">
      <tRule><![CDATA[
# This transform converts Zawgyi "encoded" Burmese into proper
# unicode. Zawgyi is a popular encoding scheme in Myanmar. It uses
# the Myanmar unicode range but assigns different characters or
# glyphs to some codepoints. In addition to the character mapping,
# there is reordering of codepoints needed to match the expected
# unicode order. This reordering is context-based.
#
# This transform is done in two main stages:
# (1) Map all Zawgyi codepoints to their Unicode counterpart.
# (2) Perform reordering.
#
# Each "::Null;" line below ends one pass over the text and starts the
# next, so rules after it see the complete output of the rules before it.

# Modern Burmese digits & Unicode code points.
$nondigits = [^\u1040-\u1049];
$consonant = [\u1000-\u1021];
$vowelsign = [\u102B-\u1030\u1032];  # Unicode vowel signs except E (1031)
$vowelsAndConsonants = [\u1000-\u102a];

$umedial = [\u103B-\u103E];  # Medial codepoints in Unicode
$vowelmedial = [\u102B-\u1030\u1032\u1036\u1037\u103A-\u103F];  # Union of vowel signs and medials
$ukinzi = \u1004\u103A\u1039;  # Codepoints representing kinzi in Unicode

# Zawgyi medial ra has multiple representations
$zmedialra = [\u103B\u107E-\u1084];

$wspace = [\u0020\u00a0\u1680\u2000-\u200d\u2060\u202f\u205f\u3000\ufeff];


####
#### STAGE 1: CODEPOINT MAPPING FROM ZAWGYI TO UNICODE
####

# Kinzi (predefined ligatures)
# Move base character to the right
($consonant) \u103A \u1064 > $ukinzi $1 \u103B;
($consonant) \u1064 > $ukinzi $1;
\u1064 > $ukinzi;

# Special cases moving base character to right before vowel signs
($consonant) \u108B > $ukinzi $1 \u102D;
($consonant) \u108C > $ukinzi $1 \u102E;
($consonant) \u108D > $ukinzi $1 \u1036;

# Special cases moving Kinzi block to left
($consonant) \u103A \u1033 \u108B > $ukinzi $1 \u103B \u102D \u102F;
($consonant) \u103A \u108b > $ukinzi $1 \u103B \u102D ;
($consonant) \u103A \u108C > $ukinzi $1 \u103B \u102E ;
($consonant) \u103A \u108D > $ukinzi $1 \u103B \u1036 ;
($consonant) \u103A \u108e > $1 \u103B \u102D \u1036 ;

\u108B > $ukinzi \u102D ;
\u108C > $ukinzi \u102E ;
\u108D > $ukinzi \u1036 ;

# Consonants (only the ones that have to change)
\u106A > \u1009 ;  # NYA
\u106B > \u100A ;
\u108F > \u1014 ;
\u1090 > \u101B ;
\u1086 > \u103F ;

# yapin
# Set members are listed by juxtaposition; a "|" inside the brackets
# would be a literal pipe character, not an alternation.
[\u103A\u107d] > \u103B ;

# yayit
($zmedialra)+ > \u103C ;

# wasway
\u103C* \u108A > \u103D \u103E;  # To avoid duplicate medials
\u103C > \u103D ;

# hatoh
[\u103D\u1087] > \u103E ;
\u1088 > \u103E \u102F ;
\u1089 > \u103E \u1030 ;

# Vowels
\u1033 > \u102F ;
\u1034 > \u1030 ;

# asat
\u1039 > \u103A ;

# lower dot
[\u1094\u1095] > \u1037 ;

# Special cases for 1025 vs 1009;
\u1025 \u1039 > \u1009 \u103a;
\u1025 \u1061 > \u1009 \u1039 \u1001;
\u1025 \u1062 > \u1009 \u1039 \u1002;
\u1025 \u1065 > \u1009 \u1039 \u1005;
\u1025 \u1068 > \u1009 \u1039 \u1007;
\u1025 \u1076 > \u1009 \u1039 \u1013;
\u1025 \u1078 > \u1009 \u1039 \u1015;
\u1025 \u107A > \u1009 \u1039 \u1017;
\u1025 \u1079 > \u1009 \u1039 \u1016;

# Stacked Consonants
\u105A > \u102B \u103A ;
\u1060 > \u1039 \u1000 ;
\u1061 > \u1039 \u1001 ;
\u1062 > \u1039 \u1002 ;
\u1063 > \u1039 \u1003 ;
\u1065 > \u1039 \u1005 ;
[\u1066\u1067] > \u1039 \u1006 ;
\u1068 > \u1039 \u1007 ;
\u1069 > \u1039 \u1008 ;
\u106C > \u1039 \u100B ;
\u106D > \u1039 \u100C ;
\u1070 > \u1039 \u100F ;
[\u1071\u1072] > \u1039 \u1010 ;
\u1096 > \u1039 \u1010 \u103D;
[\u1073\u1074] > \u1039 \u1011 ;
\u1075 > \u1039 \u1012 ;
\u1076 > \u1039 \u1013 ;
\u1077 > \u1039 \u1014 ;
\u1078 > \u1039 \u1015 ;
\u1079 > \u1039 \u1016 ;
\u107A > \u1039 \u1017 ;
[\u107B\u1093] > \u1039 \u1018 ;
\u107C > \u1039 \u1019 ;
\u1085 > \u1039 \u101C ;
\u108E > \u102D \u1036 ;

# Pre-defined ligatures
\u106E > \u100D\u1039\u100D ;
\u106F > \u100D\u1039\u100E ;
\u1091 > \u100F\u1039\u100D ;
\u1092 > \u100B\u1039\u100C ;
\u1097 > \u100B\u1039\u100B ;
\u104E > \u104E\u1004\u103A\u1038 ;


####
#### STAGE 1.01: Digits 0 and 4 used instead of letters
# Case of MYANMAR digit being used instead of a letter
# Lone digit zero and four at start
::Null;
^ \u1040 ($nondigits) > \u101D $1;
# NOTE(review): the "|" on the output side sets the cursor before the
# replacement, so the replaced text is rescanned by this pass. The
# parallel rule for \u1040 above has no cursor marker -- confirm the
# asymmetry is intended.
^ \u1044 ($nondigits) > | \u104E $1 ;

# Lone digit zero or four at end
($nondigits) \u1040 $ > $1 \u101D;
($nondigits) \u1044 $ > $1 \u104e;

# Evowel and dependent vowel signs before 0 or 4 only
# -> convert to the consonant.
([\u102b-\u103f]) \u1040 ($nondigits) > $1 \u101d $2;
([\u102b-\u103f]) \u1044 ($nondigits) > $1 \u104E $2;


####
#### STAGE 1.1: Strip spaces immediately before combining characters.
#### Move e-vowel after consonants and medials
#### Now every codepoint is Unicode. This starts conversion
#### from semi-visual order to logical order.
####
::Null;

# Don't remove spaces before E vowel or medial Ra at this stage
# (the set below deliberately omits \u1031 and \u103c).
($wspace) \u1037 > \u1037 $1;
($wspace+) ([\u102b-\u1030\u1032-\u103b\u103d\u103e]) > $2;

# Remove a duplicate early
\u1037+ > \u1037;

# Move e-vowel after medials and consonants.
\u1031+ $ukinzi ($consonant) > $ukinzi $1 \u1031;
\u1031+ \u1037+ ($consonant) > $1 \u1031 \u1037 ;
\u1031+ \u103c ($consonant) > $1 \u103c \u1031;

# Move medials other than 103c before the 1031. Leave 103c for
# the next consonant.
\u1031+ ($consonant) ([\u103b\u103d\u103e]+) > $1 $2 \u1031;
\u1031+ ($vowelsAndConsonants) > $1 \u1031;


####
#### STAGE 2: POST REORDERING RULES FOR UNICODE RENDERING
####
::Null;

\u103b \u103a > \u103a \u103b;

# Simpler replacements for Zawgyi 1025
\u1025 \u102E > \u1026;

# Asat and dot below reordering, to Unicode NFC.
\u103A\u1037 > \u1037\u103A;

# Reorder some vowel signs
\u1036 ($umedial*) ($vowelsign+) > $1 $2 \u1036 ;
([\u102B\u102C\u102F\u1030]) ([\u102D\u102E\u1032]) > $2 $1;

# Move ra medial which precedes consonant, but not other medials.
\u103C ($consonant) > $1 \u103C;


####
#### Stage 3
#### Move \u1036, and \u103C after consonants.
::Null;

($umedial) \u1039 ($consonant) > \u1039 $2 $1;

\u103C \u103A \u1039 ($consonant) > \u103A \u1039 $1 \u103C;

\u1036 ($umedial+) > $1 \u1036;


####
#### Stage 4
#### Reordering medials, dot below, contractions, E sign, and asat.
::Null;

# Reorder the medials
([\u103C\u103D\u103E]+) \u103B > \u103B $1;
([\u103D\u103E]+) \u103C > \u103C $1;
\u103E\u103D > \u103D\u103E ;

# Contractions with vowel signs
([\u1031]+) ($vowelsign*) \u1039 ($consonant) > \u1039 $3 $1 $2;
($vowelsign+) \u1039 ($consonant) > \u1039 $2 $1;

# Move vowel sign E \u1031 after medials, but not across consonants
($umedial*) ([\u1031]+) ($umedial*) > $1 $3 $2;

# Reorder dot below after medials and vowel diacritics
\u1037 ([\u102D-\u1030\u1032\u1036\u103b-\u103e]+) > $1 \u1037;

# Move vowel signs after medials
($vowelsign+) ($umedial+) > $2 $1;

# Reorder modifiers and asat
($consonant) ([\u102B-\u1032\u1036\u103B-\u103E]) \u103A ($consonant) > $1 \u103A $2 $3;


####
#### Stage 5. More reorderings
#### Vowel signs after medials, sort medials,
####
::Null;

# Replace CA + YA with JHA after moving other things beyond the medials.
\u1005 \u103b > \u1008;

# More moving vowel signs after medials
([\u102b-\u1032]) ($umedial) > $2 $1;

# Sort the medials
([\u103C\u103D\u103E]) \u103B > \u103B $1;
([\u103D\u103E]) \u103C > \u103C $1;
\u103E\u103D > \u103D\u103E ;

# Move visarga after other signs
\u1038 ($vowelmedial) > $1 \u1038;

# Reorder
\u1036 \u102f > \u102f \u1036;


###
### Stage 6
### Finish conflicting and extra diacritics. Remove some white space
###
::Null;

# Fix duplicate combiners
\u102D \u102D+ > \u102D;
\u102E \u102E+ > \u102E;
\u102F \u102F+ > \u102F;
\u1030 \u1030+ > \u1030;
\u1032 \u1032+ > \u1032;
\u1036 \u1036+ > \u1036;
\u1037 \u1037+ > \u1037;
\u1039 \u1039+ > \u1039;
\u103a \u103a+ > \u103a;
\u103b \u103b+ > \u103b;
\u103c \u103c+ > \u103c;
\u103d \u103d+ > \u103d;
\u103e \u103e+ > \u103e;  # http://unicode.org/cldr/trac/ticket/10386

# Fix overlapping signs
\u102F [\u1030\u103a] > \u102F;
\u102D \u102E > \u102E;

# Remove space directly before diacritics.
($wspace)+ ([\u102b-\u1032\u1036-\u103e]) > $2;

# Remove ZWSP at start and end
^ \u200b+ > ;
\u200b+ $ > ;

# Fix multiple spaces around ZWSP to single ZWSP.
$wspace* \u200b $wspace* > \u200b;
      ]]></tRule>
    </transform>
  </transforms>
</supplementalData>