#!/bin/sh
# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# References:
#   http://encoding.spec.whatwg.org/#euc-jp
#   http://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932
#   http://www.iana.org/assignments/charset-reg/CP51932
#   Table 3-64 in CJKV Information Processing 2/e.

# Download the following two files, run this script in the
# source/data/mappings directory and save the result to euc-jp-html5.ucm:
#   http://encoding.spec.whatwg.org/index-jis0208.txt
#   http://encoding.spec.whatwg.org/index-jis0212.txt
#
# The script is written for POSIX sh (matching the shebang): functions use
# the `name() { ...; }` form, loops use shell arithmetic instead of `seq`,
# and the awk programs use decimal constants only.

# Print the fixed .ucm header (comment banner, converter attributes and the
# MBCS state table) followed by the opening CHARMAP line.
# The here-document delimiter is quoted so that the body is emitted
# verbatim, with no parameter or command expansion.
preamble() {
  cat <<'PREAMBLE'
# ***************************************************************************
# *
# * Copyright (C) 1995-2014, International Business Machines
# * Corporation and others. All Rights Reserved.
# *
# * Generated per the algorithm for EUC-JP
# * described at http://encoding.spec.whatwg.org/#euc-jp.
# *
# ***************************************************************************
<code_set_name> "euc-jp-html5"
<char_name_mask> "AXXXX"
<mb_cur_max> 3
<mb_cur_min> 1
<uconv_class> "MBCS"
<subchar> \xF4\xFE
<subchar1> \x1A
<icu:charsetFamily> "ASCII"

<icu:state> 0-7f, 8e:2, 8f:3, a1-fe:1
<icu:state> a1-fe
<icu:state> a1-e2
<icu:state> a1-fe:1, a1:4, a3-a5:4, a8:4, ac-af:4, ee-f2:4, f4-fe:4
<icu:state> a1-fe.u

CHARMAP
PREAMBLE
}

# Print the 128 round-trip ASCII mappings, one per line, e.g.
#   <U0000> \x00 |0
ascii() {
  i=0
  while [ "$i" -le 127 ]
  do
    printf '<U%04X> \\x%02X |0\n' "$i" "$i"
    i=$((i + 1))
  done
}


# Map 0x8E 0x[A1-DF] to U+FF61 to U+FF9F (half-width katakana),
# as round-trip ('|0') entries.
half_width_kana() {
  # 161 = 0xA1, 223 = 0xDF
  i=161
  while [ "$i" -le 223 ]
  do
    # 65377 = 0xFF61, 161 = 0xA1
    printf '<U%04X> \\x8E\\x%02X |0\n' "$((i + 65377 - 161))" "$i"
    i=$((i + 1))
  done
}


# index-jis0208.txt has index pointers larger than the size of
# the encoding space available in 2-byte Graphic plane of ISO-2022-based
# encoding (94 x 94 = 8836, i.e. valid pointers are 0..8835). We have to
# exclude the larger ones because they're for Shift-JIS.
# In addition, index-jis0208.txt has 10 pairs of duplicate mapping entries.
# All the bi-directional mapping entries come *before* the uni-directional
# (EUC-JP to Unicode) entries so that we put '|3' if we have seen
# the same Unicode code point earlier in the list. According to the definition
# of 'index pointer' in the W3C encoding spec, it's the first entry in the
# file for a given Unicode code point.
#
# Reads index-jis0208.txt from the current directory. For every data line
# ("<pointer> 0x<code point> ...") it prints
#   <Uxxxx> \xLL\xTT |f
# where LL = pointer / 94 + 0xA1 and TT = pointer % 94 + 0xA1.
jis208() {
  # 161 = 0xA1. Decimal is used because hexadecimal constants in awk
  # program text are not portable across awk implementations.
  # The bound is '< 8836': pointer 8836 would produce lead byte 0xFF,
  # which is outside the a1-fe range declared in the state table.
  awk '
    !/^#/ && !/^$/ && $1 < 8836 {
      printf("<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),
             int($1 / 94) + 161, $1 % 94 + 161,
             ($2 in seen) ? 3 : 0)
      seen[$2] = 1
    }
  ' index-jis0208.txt
}

# JIS X 0212 is for decoding only (use '|3' to denote that).
# Print the JIS X 0212 mappings as three-byte sequences 0x8F <lead> <trail>.
# Reads index-jis0212.txt from the current directory; every data line
# ("<pointer> 0x<code point> ...") becomes
#   <Uxxxx> \x8F\xLL\xTT |3
# with LL = pointer / 94 + 0xA1 and TT = pointer % 94 + 0xA1. All entries
# carry the '|3' flag.
jis212() {
  # 161 = 0xA1. Decimal is used because hexadecimal constants in awk
  # program text are not portable across awk implementations.
  awk '
    !/^#/ && !/^$/ {
      printf("<U%4s> \\x8F\\x%02X\\x%02X |3\n", substr($2, 3),
             int($1 / 94) + 161, $1 % 94 + 161)
    }
  ' index-jis0212.txt
}

# Emit every mapping line in generation order (not yet sorted or
# de-duplicated): ASCII, half-width kana, JIS X 0208, JIS X 0212, and two
# extra '|1' entries for U+00A5 and U+203E.
unsorted_table() {
  ascii
  half_width_kana
  jis208
  jis212
  # NOTE(review): decode_only_extra is not defined in the part of the file
  # visible here. If it is not defined elsewhere, this call only prints a
  # "not found" error to stderr and contributes no mappings -- confirm and
  # either define it or drop the call.
  decode_only_extra
  # printf '%s\n' is used instead of echo because echo's treatment of
  # backslash sequences such as \x5C differs between shells.
  printf '%s\n' '<U00A5> \x5C |1'
  printf '%s\n' '<U203E> \x7E |1'
}

# Driver: header, then the sorted and de-duplicated mapping lines, then the
# CHARMAP terminator.
# NOTE(review): the order produced by sort depends on the current locale;
# consider pinning it (e.g. LC_ALL=C) if byte-for-byte reproducible output
# is required.
preamble
unsorted_table | sort | uniq
printf '%s\n' 'END CHARMAP'