Home | History | Annotate | Download | only in dictionary
      1 # Step 04 - generate Java literals.
      2 #
# Java byte-code has ridiculous restrictions. There is no such thing as an
# "array literal" - those are implemented as a series of data[x] = y
# assignments; as a consequence, an N-byte array will use 7N bytes in the
# class, plus N bytes in the instantiated variable. Also, no literal can be
# longer than 64KiB.
      7 #
      8 # To keep dictionary data compact both in source code and in compiled format
      9 # we use the following tricks:
     10 #  * use String as a data container
#  * store only the lowest 7 bits, i.e. all characters fit the ASCII table;
#    this allows efficient conversion to a byte array; also, ASCII characters
#    use only 1 byte of memory (UTF-8 encoding)
#  * RLE-compress the sequence of 8th bits
     15 #
     16 # This script generates literals used in Java code.
     17 
     18 bin_path = "dictionary.bin"
     19 
     20 with open(bin_path, "rb") as raw:
     21   data = raw.read()
     22 
     23 low = []
     24 hi = []
     25 is_skip = True
     26 skip_flip_offset = 36
     27 cntr = skip_flip_offset
     28 for b in data:
     29   value = ord(b)
     30   low.append(chr(value & 0x7F))
     31   if is_skip:
     32     if value < 0x80:
     33       cntr += 1
     34     else:
     35       is_skip = False
     36       hi.append(unichr(cntr))
     37       cntr = skip_flip_offset + 1
     38   else:
     39     if value >= 0x80:
     40       cntr += 1
     41     else:
     42       is_skip = True
     43       hi.append(unichr(cntr))
     44       cntr = skip_flip_offset + 1
     45 hi.append(unichr(cntr))
     46 
     47 low0 = low[0 : len(low) // 2]
     48 low1 = low[len(low) // 2 : len(low)]
     49 
     50 def escape(chars):
     51   result = []
     52   for c in chars:
     53     if "\r" == c:
     54       result.append("\\r")
     55     elif "\n" == c:
     56       result.append("\\n")
     57     elif "\t" == c:
     58       result.append("\\t")
     59     elif "\"" == c:
     60       result.append("\\\"")
     61     elif "\\" == c:
     62       result.append("\\\\")
     63     elif ord(c) < 32 or ord(c) >= 127:
     64       result.append("\\u%04X" % ord(c))
     65     else:
     66       result.append(c);
     67   return result
     68 
     69 
     70 source_code = [
     71     "  private static final String DATA0 = \"", "".join(escape(low0)), "\";\n",
     72     "  private static final String DATA1 = \"", "".join(escape(low1)), "\";\n",
     73     "  private static final String SKIP_FLIP = \"", "".join(escape(hi)), "\";\n"
     74 ]
     75 
     76 src_path = "DictionaryData.inc.java"
     77 
     78 with open(src_path, "w") as source:
     79   source.write("".join(source_code))
     80