Home | History | Annotate | Download | only in parser
      1 #!/usr/bin/env python
      2 # Copyright (c) 2010 Google Inc. All rights reserved.
      3 #
      4 # Redistribution and use in source and binary forms, with or without
      5 # modification, are permitted provided that the following conditions are
      6 # met:
      7 # 
      8 #     * Redistributions of source code must retain the above copyright
      9 # notice, this list of conditions and the following disclaimer.
     10 #     * Redistributions in binary form must reproduce the above
     11 # copyright notice, this list of conditions and the following disclaimer
     12 # in the documentation and/or other materials provided with the
     13 # distribution.
     14 #     * Neither the name of Google Inc. nor the names of its
     15 # contributors may be used to endorse or promote products derived from
     16 # this software without specific prior written permission.
     17 # 
     18 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
     19 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
     20 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
     21 # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
     22 # OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
     23 # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
     24 # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
     25 # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
     26 # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     27 # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     28 # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     29 
     30 """This python script creates the raw data that is our entity
     31 database. The representation is one string database containing all
     32 strings we could need, and then a mapping from offset+length -> entity
     33 data. That is compact, easy to use and efficient."""
     34 
     35 import csv
     36 import os.path
     37 import string
     38 import sys
     39 
     40 ENTITY = 0
     41 VALUE = 1
     42 
     43 def convert_value_to_int(value):
     44     if not value:
     45         return "0";
     46     assert(value[0] == "U")
     47     assert(value[1] == "+")
     48     return "0x" + value[2:]
     49 
     50 
     51 def offset_table_entry(offset):
     52     return "    &staticEntityTable[%s]," % offset
     53 
     54 
     55 program_name = os.path.basename(__file__)
     56 if len(sys.argv) < 4 or sys.argv[1] != "-o":
     57     # Python 3, change to: print("Usage: %s -o OUTPUT_FILE INPUT_FILE" % program_name, file=sys.stderr)
     58     sys.stderr.write("Usage: %s -o OUTPUT_FILE INPUT_FILE\n" % program_name)
     59     exit(1)
     60 
     61 output_path = sys.argv[2]
     62 input_path = sys.argv[3]
     63 
     64 with open(input_path) as html_entity_names_file:
     65     entries = list(csv.reader(html_entity_names_file))
     66 
     67 entries.sort(key = lambda entry: entry[ENTITY])
     68 entity_count = len(entries)
     69 
     70 output_file = open(output_path, "w")
     71 
     72 output_file.write("""/*
     73  * Copyright (C) 2010 Google, Inc. All Rights Reserved.
     74  *
     75  * Redistribution and use in source and binary forms, with or without
     76  * modification, are permitted provided that the following conditions
     77  * are met:
     78  * 1. Redistributions of source code must retain the above copyright
     79  *    notice, this list of conditions and the following disclaimer.
     80  * 2. Redistributions in binary form must reproduce the above copyright
     81  *    notice, this list of conditions and the following disclaimer in the
     82  *    documentation and/or other materials provided with the distribution.
     83  *
     84  * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
     85  * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     86  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
     87  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL APPLE INC. OR
     88  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
     89  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
     90  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
     91  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
     92  * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     93  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
     94  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
     95  */
     96 
     97 // THIS FILE IS GENERATED BY core/html/parser/create-html-entity-table
     98 // DO NOT EDIT (unless you are a ninja)!
     99 
    100 #include "config.h"
    101 #include "core/html/parser/HTMLEntityTable.h"
    102 
    103 namespace blink {
    104 
    105 namespace {
    106 """)
    107 
    108 assert len(entries) > 0, "Code assumes a non-empty entity array."
    109 def check_ascii(entity_string):
    110     for c in entity_string:
    111         code = ord(c)
    112         assert 0 <= code <= 127, (c + " is not ASCII. Need to change type " +
    113                                   "of storage from LChar to UChar to support " +
    114                                   "this entity.")
    115 
    116 output_file.write("static const LChar staticEntityStringStorage[] = {\n")
    117 output_file.write("'")
    118 all_data = ""
    119 entity_offset = 0
    120 first_output = True
    121 saved_by_reusing = 0
    122 for entry in entries:
    123     check_ascii(entry[ENTITY])
    124     # Reuse substrings from earlier entries. This saves 1-2000
    125     # characters, but it's O(n^2) and not very smart. The optimal
    126     # solution has to solve the "Shortest Common Superstring" problem
    127     # and that is NP-Complete or worse.
    128     #
    129     # This would be even more efficient if we didn't store the
    130     # semi-colon in the array but as a bit in the entry.
    131     entity = entry[ENTITY]
    132     already_existing_offset = all_data.find(entity)
    133     if already_existing_offset != -1:
    134         # Reusing space.
    135         this_offset = already_existing_offset
    136         saved_by_reusing += len(entity)
    137     else:
    138         if not first_output:
    139             output_file.write(",\n'")
    140         first_output = False
    141 
    142         # Try the end of the string and see if we can reuse that to
    143         # fit the start of the new entity.
    144         data_to_add = entity
    145         this_offset = entity_offset
    146         for truncated_len in range(len(entity) - 1, 0, -1):
    147             if all_data.endswith(entity[:truncated_len]):
    148                 data_to_add = entity[truncated_len:]
    149                 this_offset = entity_offset - truncated_len
    150                 saved_by_reusing += truncated_len
    151                 break
    152 
    153         output_file.write("', '".join(data_to_add))
    154         all_data += data_to_add
    155         output_file.write("'")
    156         entity_offset += len(data_to_add)
    157     assert len(entry) == 2, "We will use slot [2] in the list for the offset."
    158     assert this_offset < 32768 # Stored in a 16 bit short.
    159     entry.append(this_offset)
    160 
    161 output_file.write("};\n")
    162 
    163 index = {}
    164 for offset, entry in enumerate(entries):
    165     starting_letter = entry[ENTITY][0]
    166     if starting_letter not in index:
    167         index[starting_letter] = offset
    168 
    169 output_file.write("""
    170 static const HTMLEntityTableEntry staticEntityTable[%s] = {\n""" % entity_count)
    171 
    172 for entry in entries:
    173     values = entry[VALUE].split(' ')
    174     assert len(values) <= 2, values
    175     output_file.write('    { %s, %s, %s, %s }, // &%s\n' % (
    176         convert_value_to_int(values[0]),
    177         convert_value_to_int(values[1] if len(values) >= 2 else ""),
    178         entry[2],
    179         len(entry[ENTITY]),
    180         entry[ENTITY],
    181         ))
    182 
    183 output_file.write("""};
    184 
    185 """)
    186 
    187 output_file.write("""
    188 }
    189 """)
    190 
    191 output_file.write("static const short uppercaseOffset[] = {\n")
    192 for letter in string.ascii_uppercase:
    193     output_file.write("%d,\n" % index[letter])
    194 output_file.write("%d\n" % index['a'])
    195 output_file.write("""};
    196 
    197 static const short lowercaseOffset[] = {\n""")
    198 for letter in string.ascii_lowercase:
    199     output_file.write("%d,\n" % index[letter])
    200 output_file.write("%d\n" % entity_count)
    201 output_file.write("""};
    202 
    203 const LChar* HTMLEntityTable::entityString(const HTMLEntityTableEntry& entry)
    204 {
    205     return staticEntityStringStorage + entry.entityOffset;
    206 }
    207 
    208 LChar HTMLEntityTableEntry::lastCharacter() const
    209 {
    210     return HTMLEntityTable::entityString(*this)[length - 1];
    211 }
    212 
    213 const HTMLEntityTableEntry* HTMLEntityTable::firstEntryStartingWith(UChar c)
    214 {
    215     if (c >= 'A' && c <= 'Z')
    216         return &staticEntityTable[uppercaseOffset[c - 'A']];
    217     if (c >= 'a' && c <= 'z')
    218         return &staticEntityTable[lowercaseOffset[c - 'a']];
    219     return 0;
    220 }
    221 
    222 const HTMLEntityTableEntry* HTMLEntityTable::lastEntryStartingWith(UChar c)
    223 {
    224     if (c >= 'A' && c <= 'Z')
    225         return &staticEntityTable[uppercaseOffset[c - 'A' + 1]] - 1;
    226     if (c >= 'a' && c <= 'z')
    227         return &staticEntityTable[lowercaseOffset[c - 'a' + 1]] - 1;
    228     return 0;
    229 }
    230 
    231 const HTMLEntityTableEntry* HTMLEntityTable::firstEntry()
    232 {
    233     return &staticEntityTable[0];
    234 }
    235 
    236 const HTMLEntityTableEntry* HTMLEntityTable::lastEntry()
    237 {
    238     return &staticEntityTable[%s - 1];
    239 }
    240 
    241 }
    242 """ % entity_count)
    243