Home | History | Annotate | Download | only in scripts
      1 #!/bin/sh
      2 # Copyright (c) 2014 The Chromium Authors. All rights reserved.
      3 # Use of this source code is governed by a BSD-style license that can be
      4 # found in the LICENSE file.
      5 
      6 # References:
      7 #   http://encoding.spec.whatwg.org/#euc-jp
      8 #   http://legacy-encoding.sourceforge.jp/wiki/index.php?cp51932
      9 #   http://www.iana.org/assignments/charset-reg/CP51932
     10 #   Table 3-64 in CJKV Information Processing 2/e.
     11 
# Download the following two files into the source/data/mappings directory,
# run this script from that directory, and save its output to
# euc-jp-html5.ucm:
     14 #   http://encoding.spec.whatwg.org/index-jis0208.txt
     15 #   http://encoding.spec.whatwg.org/index-jis0212.txt
     16 
     17 function preamble {
     18 cat <<PREAMBLE
     19 # ***************************************************************************
     20 # *
     21 # *   Copyright (C) 1995-2014, International Business Machines
     22 # *   Corporation and others.  All Rights Reserved.
     23 # *
     24 # *   Generated per the algorithm for EUC-JP
     25 # *   described at http://encoding.spec.whatwg.org/#euc-jp.
     26 # *
     27 # ***************************************************************************
     28 <code_set_name>               "euc-jp-html5"
     29 <char_name_mask>              "AXXXX"
     30 <mb_cur_max>                  3
     31 <mb_cur_min>                  1
     32 <uconv_class>                 "MBCS"
     33 <subchar>                     \xF4\xFE
     34 <subchar1>                    \x1A
     35 <icu:charsetFamily>           "ASCII"
     36 
     37 <icu:state>                   0-7f, 8e:2, 8f:3, a1-fe:1
     38 <icu:state>                   a1-fe
     39 <icu:state>                   a1-e2
     40 <icu:state>                   a1-fe:1, a1:4, a3-a5:4, a8:4, ac-af:4, ee-f2:4, f4-fe:4
     41 <icu:state>                   a1-fe.u
     42 
     43 CHARMAP
     44 PREAMBLE
     45 }
     46 
     47 #<U0000> \x00 |0
     48 function ascii {
     49   for i in $(seq 0 127)
     50   do
     51     printf '<U%04X> \\x%02X |0\n' $i $i
     52   done
     53 }
     54 
     55 
     56 # Map 0x8E 0x[A1-DF] to U+FF61 to U+FF9F
     57 function half_width_kana {
     58   for i in $(seq 0xA1 0xDF)
     59   do
     60     # 65377 = 0xFF61, 161 = 0xA1
     61     printf '<U%04X> \\x8E\\x%02X |0\n' $(($i + 65377 - 161))  $i
     62   done
     63 }
     64 
     65 
     66 # index-jis0208.txt has index pointers larger than the size of
     67 # the encoding space available in 2-byte Graphic plane of ISO-2022-based
     68 # encoding (94 x 94 = 8836). We have to exclude them because they're for
     69 # Shift-JIS.
     70 # In addition, index-jis0208.txt has 10 pairs of duplicate mapping entries.
     71 # All the bi-directional mapping entries come *before* the uni-directional
     72 # (EUC-JP to Unicode) entries so that we put '|3' if we have seen
     73 # the same Unicode code point earlier in the list. According to the definition
     74 # of 'index pointer' in the W3C encoding spec, it's the first entry in the
     75 # file for a given Unicode code point.
     76 
     77 function jis208 {
     78   awk '!/^#/ && !/^$/ && $1 <= 8836  \
     79        { printf ("<U%4s> \\x%02X\\x%02X |%d\n", substr($2, 3),\
     80                  $1 / 94 + 0xA1, $1 % 94 + 0xA1,\
     81                  ($2 in uset) ? 3 : 0); \
     82          uset[$2] = 1;
     83        }' \
     84   index-jis0208.txt
     85 }
     86 
     87 # JIS X 212 is for decoding only (use '|3' to denote that).
     88 
     89 function jis212 {
     90   awk '!/^#/ && !/^$/ \
     91        { printf ("<U%4s> \\x8F\\x%02X\\x%02X |3\n", substr($2, 3),\
     92                  $1 / 94 + 0xA1, $1 % 94 + 0xA1);}' \
     93   index-jis0212.txt
     94 }
     95 
     96 function unsorted_table {
     97   ascii
     98   half_width_kana
     99   jis208
    100   jis212
    101   decode_only_extra
    102   echo '<U00A5> \x5C |1'
    103   echo '<U203E> \x7E |1'
    104 }
    105 
    106 preamble
    107 unsorted_table | sort  | uniq
    108 echo 'END CHARMAP'
    109