#!/usr/bin/env python
# Copyright (c) 2010 Google Inc. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""This python script creates the raw data that is our entity
database. The representation is one string database containing all
strings we could need, and then a mapping from offset+length -> entity
data. That is compact, easy to use and efficient.

Usage: create-html-entity-table -o OUTPUT_FILE INPUT_FILE

INPUT_FILE is a CSV file with two columns per row: the entity name and
its value, written as one or two "U+XXXX" code points separated by a
single space. OUTPUT_FILE receives the generated C++ source.
"""

import csv
import os.path
import string
import sys

# Column positions within one row of the input CSV.
ENTITY = 0
VALUE = 1

# Offsets into the string storage are stored in a 16 bit short.
_MAX_STRING_OFFSET = 32768

_GENERATED_FILE_HEADER = """/*
 * Copyright (C) 2010 Google, Inc. All Rights Reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY APPLE INC. ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL APPLE INC. OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

// THIS FILE IS GENERATED BY core/html/parser/create-html-entity-table
// DO NOT EDIT (unless you are a ninja)!

#include "config.h"
#include "core/html/parser/HTMLEntityTable.h"

namespace blink {

namespace {
"""

# Closes the lowercaseOffset array and emits the accessor functions.
# The single %s is replaced with the number of entities.
_GENERATED_FILE_FOOTER = """};

const LChar* HTMLEntityTable::entityString(const HTMLEntityTableEntry& entry)
{
    return staticEntityStringStorage + entry.entityOffset;
}

LChar HTMLEntityTableEntry::lastCharacter() const
{
    return HTMLEntityTable::entityString(*this)[length - 1];
}

const HTMLEntityTableEntry* HTMLEntityTable::firstEntryStartingWith(UChar c)
{
    if (c >= 'A' && c <= 'Z')
        return &staticEntityTable[uppercaseOffset[c - 'A']];
    if (c >= 'a' && c <= 'z')
        return &staticEntityTable[lowercaseOffset[c - 'a']];
    return 0;
}

const HTMLEntityTableEntry* HTMLEntityTable::lastEntryStartingWith(UChar c)
{
    if (c >= 'A' && c <= 'Z')
        return &staticEntityTable[uppercaseOffset[c - 'A' + 1]] - 1;
    if (c >= 'a' && c <= 'z')
        return &staticEntityTable[lowercaseOffset[c - 'a' + 1]] - 1;
    return 0;
}

const HTMLEntityTableEntry* HTMLEntityTable::firstEntry()
{
    return &staticEntityTable[0];
}

const HTMLEntityTableEntry* HTMLEntityTable::lastEntry()
{
    return &staticEntityTable[%s - 1];
}

}
"""


def _require(condition, message):
    """Raise AssertionError(message) unless condition is true.

    Used instead of the assert statement so that input validation is not
    stripped when the script runs under "python -O".
    """
    if not condition:
        raise AssertionError(message)


def convert_value_to_int(value):
    """Return the C++ integer literal for one "U+XXXX" code point string.

    An empty value yields "0"; otherwise the "U+" prefix is replaced by
    "0x". Raises AssertionError if a non-empty value lacks the prefix.
    """
    if not value:
        return "0"
    _require(value.startswith("U+"),
             "Expected a U+XXXX code point, got %r" % (value,))
    return "0x" + value[2:]


def offset_table_entry(offset):
    """Return a C++ initializer line pointing at staticEntityTable[offset]."""
    return "    &staticEntityTable[%s]," % offset


def check_ascii(entity_string):
    """Raise AssertionError if entity_string has a character above 127.

    The generated string storage uses LChar, so every entity name must be
    ASCII.
    """
    for c in entity_string:
        if not 0 <= ord(c) <= 127:
            raise AssertionError(c + " is not ASCII. Need to change type " +
                                 "of storage from LChar to UChar to support " +
                                 "this entity.")


def _pack_entity_names(entries):
    """Lay the entity names out in one shared character array.

    Returns (rows, offsets): rows are the quoted, comma-separated character
    runs to emit (one per entity that needed new characters), and offsets
    holds, for each entry in order, where its name starts in the array.
    """
    rows = []
    offsets = []
    all_data = ""
    for entry in entries:
        entity = entry[ENTITY]
        check_ascii(entity)
        # Reuse substrings from earlier entries. This saves 1-2000
        # characters, but it's O(n^2) and not very smart. The optimal
        # solution has to solve the "Shortest Common Superstring" problem
        # and that is NP-Complete or worse.
        #
        # This would be even more efficient if we didn't store the
        # semi-colon in the array but as a bit in the entry.
        this_offset = all_data.find(entity)
        if this_offset == -1:
            # Try the end of the string and see if we can reuse that to
            # fit the start of the new entity.
            data_to_add = entity
            this_offset = len(all_data)
            for truncated_len in range(len(entity) - 1, 0, -1):
                if all_data.endswith(entity[:truncated_len]):
                    data_to_add = entity[truncated_len:]
                    this_offset -= truncated_len
                    break
            rows.append("'" + "', '".join(data_to_add) + "'")
            all_data += data_to_add
        _require(this_offset < _MAX_STRING_OFFSET,
                 "Offset %d of %r does not fit in a 16 bit short."
                 % (this_offset, entity))
        offsets.append(this_offset)
    return rows, offsets


def _first_index_by_letter(entries):
    """Map each starting letter to the index of its first (sorted) entry."""
    index = {}
    for position, entry in enumerate(entries):
        index.setdefault(entry[ENTITY][0], position)
    return index


def _first_index(index, letter):
    """Look up letter in index, raising a descriptive KeyError if absent."""
    try:
        return index[letter]
    except KeyError:
        raise KeyError("No entity starts with %r; the offset tables need "
                       "at least one entity for every ASCII letter."
                       % (letter,))


def generate_entity_table_source(entries):
    """Return the complete generated C++ source for the given entries.

    entries is an iterable of [entity, value] rows as read from the input
    CSV. The rows are sorted by entity name here; the argument itself is
    not modified. Raises AssertionError for malformed rows and KeyError if
    some ASCII letter starts no entity.
    """
    entries = list(entries)
    _require(len(entries) > 0, "Code assumes a non-empty entity array.")
    for entry in entries:
        _require(len(entry) == 2,
                 "Expected exactly two columns (entity, value), got %r"
                 % (entry,))
        _require(entry[ENTITY], "Entity names must not be empty.")
    entries.sort(key=lambda entry: entry[ENTITY])
    entity_count = len(entries)

    storage_rows, offsets = _pack_entity_names(entries)
    index = _first_index_by_letter(entries)

    chunks = [_GENERATED_FILE_HEADER]

    chunks.append("static const LChar staticEntityStringStorage[] = {\n")
    chunks.append(",\n".join(storage_rows))
    chunks.append("};\n")

    chunks.append("""
static const HTMLEntityTableEntry staticEntityTable[%s] = {\n""" % entity_count)
    for entry, offset in zip(entries, offsets):
        values = entry[VALUE].split(' ')
        _require(len(values) <= 2, values)
        chunks.append('    { %s, %s, %s, %s }, // &%s\n' % (
            convert_value_to_int(values[0]),
            convert_value_to_int(values[1] if len(values) >= 2 else ""),
            offset,
            len(entry[ENTITY]),
            entry[ENTITY],
        ))
    chunks.append("""};

""")

    # Close the anonymous namespace.
    chunks.append("""
}
""")

    # Each offset array carries one extra trailing element so that the
    # generated lastEntryStartingWith() can read the slot after 'Z' / 'z':
    # the first lowercase entry ends the uppercase range, and the entity
    # count ends the lowercase range.
    chunks.append("static const short uppercaseOffset[] = {\n")
    for letter in string.ascii_uppercase:
        chunks.append("%d,\n" % _first_index(index, letter))
    chunks.append("%d\n" % _first_index(index, 'a'))
    chunks.append("""};

static const short lowercaseOffset[] = {\n""")
    for letter in string.ascii_lowercase:
        chunks.append("%d,\n" % _first_index(index, letter))
    chunks.append("%d\n" % entity_count)
    chunks.append(_GENERATED_FILE_FOOTER % entity_count)

    return "".join(chunks)


def main(argv=None):
    """Run the generator; returns the process exit status.

    argv defaults to sys.argv and must look like
    [program, "-o", OUTPUT_FILE, INPUT_FILE].
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 4 or argv[1] != "-o":
        program_name = os.path.basename(__file__)
        sys.stderr.write("Usage: %s -o OUTPUT_FILE INPUT_FILE\n" % program_name)
        return 1

    output_path = argv[2]
    input_path = argv[3]

    with open(input_path) as html_entity_names_file:
        # csv.reader yields an empty list for a blank line; skip those.
        entries = [row for row in csv.reader(html_entity_names_file) if row]

    # Generate everything before touching the output so that a validation
    # failure never leaves a truncated generated file behind.
    generated_source = generate_entity_table_source(entries)

    with open(output_path, "w") as output_file:
        output_file.write(generated_source)
    return 0


if __name__ == "__main__":
    sys.exit(main())