package org.unicode.cldr.tool;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Stack;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.unicode.cldr.tool.Option.Options;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CLDRTool;
import org.unicode.cldr.util.ChainedMap;
import org.unicode.cldr.util.ChainedMap.M4;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.Counter;
import org.unicode.cldr.util.DtdData;
import org.unicode.cldr.util.DtdData.Attribute;
import org.unicode.cldr.util.DtdData.Element;
import org.unicode.cldr.util.DtdType;
import org.unicode.cldr.util.Pair;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.RegexUtilities;
import org.unicode.cldr.util.SimpleHtmlParser;
import org.unicode.cldr.util.SimpleHtmlParser.Type;
import org.unicode.cldr.util.TransliteratorUtilities;

import com.google.common.collect.ImmutableSet;
import com.ibm.icu.dev.util.CollectionUtilities;
import com.ibm.icu.impl.Relation;
import com.ibm.icu.impl.Row.R4;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.util.Output;
import com.ibm.icu.util.ULocale;

/**
 * Command-line checker for HTML specification files.
 *
 * <p>For every file matching the target regex it parses the HTML, tracks the element stack,
 * collects the headings ({@code h1}-{@code h5}) and table captions that follow the "Contents"
 * heading, verifies their section numbers and anchors, reports tables without a caption, and
 * prints a generated table of contents. When the target path contains "ldml", the
 * {@code <!ELEMENT ...>} / {@code <!ATTLIST ...>} declarations quoted in the files are also
 * compared against the DTD data.
 *
 * <p>The process exits with status 1 when any error was recorded, otherwise with status 0.
 */
@CLDRTool(alias = "checkhtmlfiles", description = "Look for errors in CLDR documentation tools", hidden = "Used for CLDR process")
public class CheckHtmlFiles {

    /** Elements that are discarded from the element stack when a closing tag is being matched. */
    static final Set<String> NOPOP = new HashSet<>(Arrays.asList("br", "img", "link", "meta", "!doctype", "hr", "col", "input"));

    /** Parser event types that are not echoed in the {@link Verbosity#all} event log. */
    static final EnumSet<Type> SUPPRESS = EnumSet.of(
        Type.ELEMENT, Type.ELEMENT_START, Type.ELEMENT_END, Type.ELEMENT_POP,
        Type.ATTRIBUTE, Type.ATTRIBUTE_CONTENT);

    final static Options myOptions = new Options();
    final static Writer LOG = new OutputStreamWriter(System.out);
    /** Heading text that starts with a dotted section number; group 1 is the number plus trailing spaces. */
    static Pattern WELLFORMED_HEADER = PatternCache.get("\\s*(\\d+(\\.\\d+)*\\s*).*");
    /** Heading texts that are allowed to appear without a section number. */
    static Pattern SUPPRESS_SECTION_NUMBER = PatternCache.get(
        "(Annex [A-Z]: .*)" +
            "|(Appendix [A-Z].*)" +
            "|(.*Migrati(on|ng).*)" +
            "|Step \\d+.*" +
            "|Example \\d+.*" +
            "|D\\d+\\.\\s.*" +
            "|References" +
            "|Acknowledge?ments" +
            "|Rights to .*Images" +
            "|Modifications" +
            "|(Revision \\d+\\.?)");
    /** Headings matching this pattern are dropped entirely by {@link HeadingInfoList#add}. */
    static Pattern SUPPRESS_REVISION = PatternCache.get("Revision \\d+\\.?");
    static Pattern SPACES = PatternCache.get("\\s+");

    /** Command-line options; each constant registers itself with {@link #myOptions}. */
    enum MyOptions {
        // old(".*", Settings.OTHER_WORKSPACE_DIRECTORY + "cldr-archive/cldr-22.1/specs/ldml/tr35\\.html", "source data (regex)"),
        target(".*", CLDRPaths.BASE_DIRECTORY + "specs" + File.separator + "ldml" + File.separator +
            "tr35(-.*)?\\.html", "target data (regex); ucd for Unicode docs; "
                + "for others use the format -t ${workspace_loc}/unicode-draft/reports/tr51/tr51.html"),
        verbose(".*", "none", "verbose debugging messages"),
        // contents(".*", CLDRPaths.BASE_DIRECTORY + "specs/ldml/tr35(-.*)?\\.html", "generate contents"),
        // /cldr-archive
        ;

        // boilerplate
        final Option option;

        MyOptions(String argumentPattern, String defaultArgument, String helpText) {
            option = myOptions.add(this, argumentPattern, defaultArgument, helpText);
        }
    }

    /** Amount of debugging output written while parsing. */
    enum Verbosity {
        none, element, all;

        /**
         * Parses a verbosity name case-insensitively.
         *
         * @param input the option value, may be null
         * @return the matching constant, or {@link #none} for null input
         * @throws IllegalArgumentException if the name matches no constant
         */
        static Verbosity of(String input) {
            return input == null ? Verbosity.none : Verbosity.valueOf(input.toLowerCase(Locale.ROOT));
        }
    }

    static Verbosity verbose;
    static boolean doContents;
    static boolean isLdml;

    /**
     * Entry point: parses the options, processes all matching files and exits with status 1
     * if any fatal or non-fatal error was counted, otherwise with status 0.
     *
     * @param args command-line arguments, see {@link MyOptions}
     * @throws IOException if a file cannot be read
     * @throws IllegalArgumentException if no file matches the target
     */
    public static void main(String[] args) throws IOException {
        System.out.println("First do a replace of <a\\s+name=\"([^\"]*)\"\\s*> by <a name=\"$1\" href=\"#$1\">");
        System.out.println("Then check for all links with no anchors: <a([^>]*)></a>");
        System.out.println("Then check for all links that don't start with name or href <a (?!href|name)");

        myOptions.parse(MyOptions.target, args, true);
        verbose = Verbosity.of(MyOptions.verbose.option.getValue());

        String targetString = MyOptions.target.option.getValue();
        if (targetString.contains("ldml")) {
            isLdml = true;
        }
        // "ucd" and "security" are shorthands for fixed sets of reports.
        if (targetString.equalsIgnoreCase("ucd")) {
            targetString = CLDRPaths.BASE_DIRECTORY + "../unicode-draft/reports/tr(\\d+)/tr(\\d+).html";
        } else if (targetString.equalsIgnoreCase("security")) {
            targetString = CLDRPaths.BASE_DIRECTORY + "../unicode-draft/reports/tr(3[69])/tr(3[69]).html";
        }
        Data target = new Data().getSentences(targetString);
        if (target.count == 0) {
            throw new IllegalArgumentException("No files matched with " + targetString);
        }

        if (isLdml) {
            checkForDtd(target);
        }

        System.out.println("*TOTAL COUNTS*  files:" + target.count + ", fatal errors:" + target.totalFatalCount + ", nonfatal errors:"
            + target.totalErrorCount);
        if (target.totalFatalCount > 0 || target.totalErrorCount > 0) {
            System.exit(1); // give an error status
        }

        System.exit(0);

        //        Data source = new Data().getSentences(MyOptions.old.option.getValue());
        //        String file = MyOptions.target.option.getValue();
        //
        //        Data target = new Data().getSentences(file);
        //
        //        int missingCount = 0, extraCount = 0;
        //        int line = 0;
        //        for (String sentence : source) {
        //            ++line;
        //            long sourceCount = source.getCount(sentence);
        //            long targetCount = target.getCount(sentence);
        //            if (targetCount == 0) {
        //                System.out.println(line + "\tMISSING:\t" + sourceCount + "" + targetCount + "\t" + sentence);
        //                ++missingCount;
        //            }
        //        }
        //        line = 0;
        //        for (String sentence : target) {
        //            ++line;
        //            long sourceCount = source.getCount(sentence);
        //            long targetCount = target.getCount(sentence);
        //            if (sourceCount == 0) {
        //                System.out.println(line + "\tEXTRA:\t" + targetCount + "" + sourceCount + "\t" + sentence);
        //                ++extraCount;
        //            }
        //        }
        //        System.out.println("Missing:\t" + missingCount);
        //        System.out.println("Extra:\t" + extraCount);
    }

    /** Attribute names that are ignored when comparing the DTDs against the documents. */
    private static final Set<String> SKIP_ATTR = ImmutableSet.of("draft", "alt", "references", "cldrVersion", "unicodeVersion");

    /**
     * Compares the DTD declarations quoted in the parsed files with the non-deprecated elements
     * and attributes of every DTD type except {@link DtdType#ldmlICU}, and prints one
     * tab-separated line per difference (see {@link Comparison}).
     *
     * @param target the parsed files, whose {@code dtdItems} hold the quoted declarations
     */
    private static void checkForDtd(Data target) {
        // element name -> DTD declaration string -> DTD type
        M4<String, String, DtdType, Boolean> typeToElements = ChainedMap.of(new TreeMap(), new TreeMap(), new TreeMap(), Boolean.class);
        for (DtdType type : DtdType.values()) {
            if (type == DtdType.ldmlICU) {
                continue;
            }
            DtdData dtdData = DtdData.getInstance(type);
            for (Element element : dtdData.getElements()) {
                if (element.isDeprecated()
                    || element.equals(dtdData.PCDATA)
                    || element.equals(dtdData.ANY)) {
                    continue;
                }
                typeToElements.put(element.name, element.toDtdString(), type, Boolean.TRUE);
            }
            for (Attribute attribute : dtdData.getAttributes()) {
                if (attribute.isDeprecated() || SKIP_ATTR.contains(attribute.name)) {
                    continue;
                }
                typeToElements.put(attribute.element.name, attribute.appendDtdString(new StringBuilder()).toString(), type, Boolean.TRUE);
            }
        }

        // Declarations are matched ignoring spaces: space-stripped form -> text as found in the files.
        final Map<String, String> skeletonToInFile = new HashMap<>();
        // Starts as everything found in the files; matched items are removed, leaving the extras.
        Relation<String, String> extra = new Relation(new TreeMap(), TreeSet.class);
        for (R4<String, String, String, Boolean> elementItem : target.dtdItems.rows()) {
            String element = elementItem.get1();
            String item = elementItem.get2();
            extra.put(element, item);
            skeletonToInFile.put(item.replace(" ", ""), item);
        }

        ChainedMap.M4<String, String, DtdType, Comparison> status = ChainedMap.of(new TreeMap(), new TreeMap(), new TreeMap(), Comparison.class);
        for (R4<String, String, DtdType, Boolean> entry : typeToElements.rows()) {
            final String element = entry.get0();
            final String key = entry.get1();
            final DtdType dtdType = entry.get2();
            String realKey = skeletonToInFile.get(key.replace(" ", ""));
            if (realKey == null) {
                status.put(element, key, dtdType, Comparison.missing);
            } else if (!extra.remove(element, realKey)) {
                status.put(element, key, dtdType, Comparison.no_rem);
            }
        }
        // Extras have no DTD type of their own; ldmlICU is the placeholder that is blanked on output.
        for (Entry<String, String> extraItem : extra.entrySet()) {
            status.put(extraItem.getKey(), extraItem.getValue(), DtdType.ldmlICU, Comparison.extra);
        }

        TreeSet<String> reverse = new TreeSet<>(Collections.reverseOrder());
        for (Entry<String, Map<String, Map<DtdType, Comparison>>> entry1 : status) {
            String element = entry1.getKey();
            final Map<String, Map<DtdType, Comparison>> itemToDtdTypeToComparison = entry1.getValue();
            reverse.clear();
            reverse.addAll(itemToDtdTypeToComparison.keySet());
            for (String item : reverse) {
                Map<DtdType, Comparison> typeToComparison = itemToDtdTypeToComparison.get(item);
                for (Entry<DtdType, Comparison> entry2 : typeToComparison.entrySet()) {
                    System.out.println(element
                        + "\t" + entry2.getValue()
                        + "\t" + CldrUtility.ifSame(entry2.getKey(), DtdType.ldmlICU, "")
                        + "\t" + item);
                }
            }
        }
    }

    /**
     * Result of comparing a DTD declaration with the documents: {@code missing} = in the DTD but
     * not in any file, {@code extra} = in a file but not matched by the DTD, {@code no_rem} = a
     * matching text exists but could not be removed from the per-element set of file items.
     */
    enum Comparison {
        missing, extra, no_rem
    }

    static Pattern WHITESPACE = PatternCache.get("[\\s]+");
    /** A heading that starts with a literal "Section N:" / "Section N -" prefix. */
    static Pattern BADSECTION = PatternCache.get("^\\s*(\\d+\\s*)?Section\\s*\\d+\\s*[-:]\\s*");

    /** Elements that force a line break in the extracted text. */
    static final Set<String> FORCEBREAK = new HashSet<String>(Arrays.asList(
        "table", "div", "blockquote",
        "p", "br", "td", "th", "h1", "h2", "h3", "h4", "h5", "li"));

    // enum ContentsElements {h1, h2, h3, h4, h5, caption}

    /** Elements that become entries of the generated table of contents. */
    static final Set<String> DO_CONTENTS = new HashSet<String>(Arrays.asList(
        "h1", "h2", "h3", "h4", "h5", "caption"));

    /**
     * A dotted section number such as 5.3.1, stored as one counter per nesting level.
     * Index 0 corresponds to {@code h2}, index 1 to {@code h3}, and so on. A zero counter
     * terminates the number.
     */
    static class Levels implements Comparable<Levels> {
        final int[] levels = new int[10];
        final int h2_start;

        /**
         * @param h2_start initial value of the top-level counter; the first {@code h2} gets
         *            number {@code h2_start + 1}
         */
        public Levels(int h2_start) {
            levels[0] = h2_start; // special adjustment of starting header level
            this.h2_start = h2_start;
        }

        public Levels() {
            this(0);
        }

        /**
         * Advances to the next section number at the given heading level, resetting all deeper
         * counters. h2 = level 0, h3 is level 1, etc.
         *
         * @param level the heading level as in the tag name (2 for {@code h2}); must be at least
         *            2 and below {@code levels.length + 2}
         * @param missingLevel set to true if an intermediate level between the top level and
         *            {@code level} has no number yet, else to false
         * @return this object, modified
         */
        Levels next(int level, Output<Boolean> missingLevel) {
            level -= 2; // h2 = level 0
            missingLevel.value = false;
            if (levels[0] < h2_start) {
                missingLevel.value = true;
            }
            for (int i = 1; i < level; ++i) {
                if (levels[i] == 0) {
                    missingLevel.value = true;
                }
            }
            levels[level]++;
            for (int i = level + 1; i < levels.length; ++i) {
                levels[i] = 0;
            }
            return this;
        }

        /**
         * @return the index of the last non-zero counter before the first zero one, or -1 if the
         *         first counter is zero
         */
        public int getDepth() {
            for (int i = 0; i < levels.length; ++i) {
                if (levels[i] == 0) {
                    return i - 1;
                }
            }
            // Every slot is in use; previously this ran off the end of the array.
            return levels.length - 1;
        }

        /** @return the counters up to the first zero, joined with '.'; empty if the first is zero */
        @Override
        public String toString() {
            StringBuilder b = new StringBuilder();
            for (int i = 0; i < levels.length; ++i) {
                int level = levels[i];
                if (level == 0) {
                    break;
                }
                if (b.length() != 0) {
                    b.append('.');
                }
                b.append(level);
            }
            return b.toString();
        }

        /**
         * Parses a dotted number such as "5.3.1". Parsing stops at the first character that is
         * neither an ASCII digit nor '.', or when there are more components than slots.
         *
         * @param group the text to parse
         * @return a new instance holding the parsed counters
         */
        public static Levels parse(String group) {
            Levels result = new Levels();
            int currentLevel = 0;
            for (int i = 0; i < group.length(); ++i) {
                char ch = group.charAt(i);
                if (ch == '.') {
                    if (++currentLevel >= result.levels.length) {
                        break; // more components than we can store
                    }
                } else if (ch < '0' || ch > '9') {
                    // The old test compared the already-shifted value with '9' (57), so that
                    // characters ':'..'i' were accumulated as if they were digits.
                    break;
                } else {
                    result.levels[currentLevel] = result.levels[currentLevel] * 10 + (ch - '0');
                }
            }
            return result;
        }

        /** Compares counter by counter, most significant level first. */
        @Override
        public int compareTo(Levels other) {
            for (int i = 0; i < levels.length; ++i) {
                if (levels[i] != other.levels[i]) {
                    return levels[i] < other.levels[i] ? -1 : 1;
                }
            }
            return 0;
        }

        /** Copies all counters (but not {@code h2_start}) from {@code other}. */
        public void set(Levels other) {
            System.arraycopy(other.levels, 0, levels, 0, levels.length);
        }
    }

    /** One heading or table caption: its text, anchor ids and assigned section number. */
    static class HeadingInfo {
        private Levels levels = new Levels();
        private String text = "";
        private Set<String> ids = new LinkedHashSet<String>();
        private boolean suppressSection;
        private boolean isHeader;

        // temporary
        private int level;

        /**
         * Records whether this is a real heading or a caption, and its level.
         *
         * @param headingLabel the lower-case element name, e.g. "h3" or "caption"
         * @param lastHeading the previously recorded heading, whose level a caption inherits;
         *            may be null if nothing has been recorded yet
         */
        public void setLevel(String headingLabel, HeadingInfo lastHeading) {
            isHeader = !headingLabel.equals("caption");
            if (isHeader) {
                level = headingLabel.charAt(1) - '0';
            } else if (lastHeading != null) {
                level = lastHeading.level;
            }
            // else: a caption before any recorded heading keeps its current level
            // instead of failing with a NullPointerException.
        }

        /** @return the heading as corrected HTML, with a double link on the first id */
        @Override
        public String toString() {
            // <h3><a name="Identity_Elements" href="#Identity_Elements">5.3 Identity Elements</a></h3>
            String id = ids.isEmpty() ? "NOID" : ids.iterator().next();
            StringBuilder result = new StringBuilder("<").append(getLabel())
                .append("<a name=\"").append(id).append("\" href=\"#").append(id).append("\">")
                .append(!isHeader ? "" : suppressSection ? "" : levels + " ")
                .append(TransliteratorUtilities.toHTML.transform(text))
                .append("</a>");
            // Any further ids are kept as empty named anchors.
            boolean first = true;
            for (String id2 : ids) {
                if (first) {
                    first = false;
                } else {
                    result.append("<a name=\"").append(id2).append("\"></a>");
                }
            }
            return result.append("</").append(getLabel()).toString();
        }

        /** @return the tag name followed by '&gt;', e.g. "h3&gt;" or "caption&gt;" */
        public String getLabel() {
            return isHeader ? "h" + level + ">" : "caption>";
        }

        /**
         * @return the opening {@code <li>} of the table-of-contents entry for this heading; the
         *         caller writes the closing tag
         */
        public String toHeader() {
            String id = ids.iterator().next();
            String prefix;
            if (!isHeader) {
                prefix = text.contains("Table") || text.contains("Figure") ? "" : "Table: ";
            } else {
                prefix = suppressSection ? "" : levels + " ";
            }
            return "<li>"
                + prefix
                + "<a href=\"#" + id + "\">"
                + TransliteratorUtilities.toHTML.transform(text)
                + "</a>";
        }

        /**
         * Appends element content to the heading text, decoding HTML entities and collapsing
         * whitespace runs to single spaces.
         *
         * @param toAppend raw element content
         */
        public void addText(String toAppend) {
            String temp = TransliteratorUtilities.fromHTML.transform(toAppend);
            if (text.isEmpty()) {
                text = temp.startsWith(" ") ? temp.substring(1) : temp;
            } else {
                text += temp;
            }
            text = SPACES.matcher(text).replaceAll(" "); // clean up all spaces; make more efficient later
            // used to trim, but we need to retain space between elements. So only trim the start, and later, the end
        }

        public boolean isContents() {
            return text.startsWith("Contents");
        }

        void addId(String id) {
            this.ids.add(id);
        }

        /**
         * Assigns the expected section number, strips any number already present in the text,
         * and records an error line if the heading is not in the expected form.
         *
         * @param line line number used in the error message
         * @param levels the expected section number; its counters are copied
         * @param errors receives at most one message describing all problems found
         */
        public void setLevels(int line, Levels levels, Set<String> errors) {
            this.levels.set(levels);
            String error = "";
            if (badSectionMatcher.reset(text).find()) {
                text = text.substring(badSectionMatcher.end());
                error += "Extra 'Section...' at start; ";
            }
            if (isHeader) {
                if (!headerMatcher.reset(text).matches()) {
                    if (!SUPPRESS_SECTION_NUMBER.matcher(text).matches()) {
                        error += "Missing section numbers; ";
                    }
                } else {
                    // The matcher keeps referring to the old text, so group(1) below is still valid.
                    text = text.substring(headerMatcher.end(1));
                    if (text.startsWith(".")) {
                        text = text.substring(1).trim();
                        error += "Extra . at start; ";
                    }
                    Levels parsedLevels = Levels.parse(headerMatcher.group(1));
                    if (levels.compareTo(parsedLevels) != 0) {
                        error += "Section numbers mismatch, was " + parsedLevels + "; ";
                    }
                }
            }
            if (ids.isEmpty()) {
                // Synthesize an id from the text so that a corrected heading can be suggested.
                addId(text.trim().replaceAll("[^A-Za-z0-9]+", "_"));
                error += "Missing double link";
            }
            if (!error.isEmpty()) {
                errors.add(this + "\t<!-- " + line + ": " + error + " -->");
            }
            suppressSection = SUPPRESS_SECTION_NUMBER.matcher(text).matches();
        }

        /** Adds one count per id of this heading to {@code idCounter}. */
        public void addIds(Counter<String> idCounter) {
            for (String id : ids) {
                idCounter.add(id, 1);
            }
        }

        /**
         * Removes a single trailing space from the text.
         *
         * @return this object
         */
        public HeadingInfo fixText() {
            if (text.endsWith(" ")) {
                text = text.substring(0, text.length() - 1);
            }
            return this;
        }
    }

    // Shared, reusable matchers; they make HeadingInfo.setLevels non-reentrant.
    static Matcher headerMatcher = WELLFORMED_HEADER.matcher("");
    static Matcher badSectionMatcher = BADSECTION.matcher("");

    /** The headings of one file in document order, together with the errors found in them. */
    static class HeadingInfoList {
        Levels lastBuildLevel;
        private Set<String> errors = new LinkedHashSet<String>();
        Output<Boolean> missingLevel = new Output<Boolean>(false);
        private String fileName;
        ArrayList<HeadingInfo> list = new ArrayList<>();

        /**
         * @param fileName name used as prefix in error output
         * @param h2_START starting value for the top-level section counter, see {@link Levels}
         */
        public HeadingInfoList(String fileName, int h2_START) {
            this.fileName = fileName;
            lastBuildLevel = new Levels(h2_START);
        }

        /**
         * Numbers and validates a heading and appends it to the list. "Revision N" headings are
         * ignored.
         *
         * @param line line number for error messages
         * @param h the heading to add
         * @return false if the heading was ignored, otherwise the result of the list add
         */
        public boolean add(int line, HeadingInfo h) {
            h.fixText();
            if (SUPPRESS_REVISION.matcher(h.text).matches()) {
                return false;
            }
            // Real headings advance the running section number; captions reuse the current one.
            if (h.isHeader) {
                h.setLevels(line, lastBuildLevel.next(h.level, missingLevel), errors);
            } else {
                h.setLevels(line, lastBuildLevel, errors);
            }
            if (missingLevel.value) {
                errors.add("FATAL: Missing Level in: " + h);
            }
            return list.add(h);
        }

        static final String PAD = "\t";

        /**
         * Prints the generated table of contents as nested {@code <ul class="toc">} lists and
         * records a fatal error for every id that occurs more than once.
         *
         * @throws IllegalArgumentException if the generated list tags do not balance
         */
        public void listContents() {

            System.out.print("\n\t\t<!-- START Generated TOC: CheckHtmlFiles -->");
            Counter<String> idCounter = new Counter<String>();

            int lastLevel = new Levels().getDepth();
            String pad = PAD;
            int ulCount = 0;
            int liCount = 0;
            for (HeadingInfo h : list) {
                h.addIds(idCounter);
                // Captions are listed one level below the heading they belong to.
                final int depth = h.levels.getDepth() + (h.isHeader ? 0 : 1);
                int levelDiff = depth - lastLevel;
                lastLevel = depth;
                if (levelDiff > 0) {
                    System.out.println();
                    for (int i = 0; i < levelDiff; ++i) {
                        pad += PAD;
                        System.out.println(pad + "<ul class=\"toc\">");
                        ++ulCount;
                    }
                    pad += PAD;
                } else if (levelDiff < 0) {
                    System.out.println("</li>");
                    --liCount;
                    for (int i = 0; i > levelDiff; --i) {
                        pad = pad.substring(PAD.length());
                        System.out.println(pad + "</ul>");
                        --ulCount;
                        pad = pad.substring(PAD.length());
                        System.out.println(pad + "</li>");
                        --liCount;
                    }
                } else {
                    System.out.println("</li>");
                    --liCount;
                }

                System.out.print(pad + h.toHeader());
                ++liCount;

                // <li>1.1 <a href="#Conformance">Conformance</a></li>

                // <ul class="toc">
                // <li>1 <a href="#Introduction">Introduction</a>
                //   <ul class="toc">
                //     <li>1.1 <a href="#Conformance">Conformance</a>
                //     </li>
                //     ...
                //   </ul>
                // </li>
            }

            // finish up and make sure we are balanced

            int levelDiff = -lastLevel;
            System.out.println("</li>");
            --liCount;
            for (int i = 0; i > levelDiff; --i) {
                pad = pad.substring(PAD.length());
                System.out.println(pad + "</ul>");
                --ulCount;
                pad = pad.substring(PAD.length());
                System.out.println(pad + "</li>");
                --liCount;
            }
            pad = pad.substring(PAD.length());
            System.out.println(pad + "</ul>");
            System.out.println(pad + "<!-- END Generated TOC: CheckHtmlFiles -->");
            --ulCount;
            if (liCount != 0 || ulCount != 0) {
                throw new IllegalArgumentException("Mismatched counts in generated contents, li:" + liCount + ", ul:" + ulCount);
            }
            for (String id : idCounter) {
                long count = idCounter.get(id);
                if (count != 1) {
                    errors.add("FATAL: Non-Unique ID: " + id);
                }
            }
        }

        /**
         * Prints the errors. If there are fatal errors only those are printed, otherwise all
         * errors are. A file without any captured heading counts as one fatal error.
         *
         * @return fatal error count
         */
        public int showErrors() {
            int fatalCount = 0;
            if (!errors.isEmpty()) {
                System.out.println("\n*ERRORS*\n");
                for (String error : errors) {
                    if (error.startsWith("FATAL:")) {
                        System.out.println(fileName + "\t" + error);
                        fatalCount++;
                    }
                }
                if (fatalCount == 0) {
                    for (String error : errors) {
                        System.out.println(fileName + "\t" + error);
                    }
                }
            }
            if (this.list.isEmpty()) {
                System.out.println("No header items (eg <h2>) captured.");
                fatalCount = 1;
            }
            return fatalCount;
        }

        /**
         * @return total number of errors, fatal or not
         */
        public int totalErrorCount() {
            return errors.size();
        }
    }

    /** An open element together with the line on which it was opened. */
    static class ElementLine {
        final String element;
        final int line;

        public ElementLine(String element, int line) {
            this.element = element;
            this.line = line;
        }

        @Override
        public String toString() {
            return element + '[' + line + ']';
        }
    }

    /**
     * Accumulated results over all processed files: the sentences of the text, the DTD
     * declarations quoted in it, and file and error counts. Iterates over the sentences.
     */
    static class Data implements Iterable<String> {
        private static final Pattern ELEMENT_ATTLIST = Pattern.compile("<!(ELEMENT|ATTLIST)\\s+(\\S+)[^>]*>");
        List<String> sentences = new ArrayList<String>();
        /** file name -> declared element name -> full declaration text */
        M4<String, String, String, Boolean> dtdItems = ChainedMap.of(
            new LinkedHashMap<String, Object>(),
            new TreeMap<String, Object>(),
            new TreeMap<String, Object>(), Boolean.class);
        Counter<String> hashedSentences = new Counter<String>();
        int count = 0;
        int totalErrorCount = 0;
        int totalFatalCount = 0;

        /**
         * Processes all files below a base directory whose canonical path matches a regex.
         * The part of {@code fileRegex} up to the last file separator before the first '(' is
         * taken literally as the base directory; the remainder is a regex.
         *
         * @param fileRegex base directory plus file-name regex
         * @return this object
         * @throws IOException if a file cannot be read
         * @throws IllegalArgumentException if {@code fileRegex} has no usable base directory, or
         *             the base directory does not exist
         */
        public Data getSentences(String fileRegex) throws IOException {
            String base;
            String regex;
            try {
                int firstParen = fileRegex.indexOf('(');
                if (firstParen < 0) {
                    firstParen = fileRegex.length();
                }
                int lastSlash = fileRegex.lastIndexOf(File.separatorChar, firstParen);
                base = fileRegex.substring(0, lastSlash);
                regex = fileRegex.substring(lastSlash + 1);
            } catch (RuntimeException e) {
                throw new IllegalArgumentException("Target file must be in special format. " +
                    "Up to the first path part /.../ containing a parenthesis is constant, and the rest is a regex.", e);
            }

            //File sourceFile = new File(fileRegex);
            File sourceDirectory = new File(base);
            if (!sourceDirectory.exists()) {
                throw new IllegalArgumentException("Can't find " + sourceDirectory);
            }
            String canonicalBase = sourceDirectory.getCanonicalPath();
            String pathRegex = canonicalBase + File.separator + regex;
            // Escape backslash path separators, but keep an escaped dot (\.) from the regex part.
            pathRegex = pathRegex.replace("\\", "\\\\");
            pathRegex = pathRegex.replace("\\\\.", "\\.");
            Matcher m = PatternCache.get(pathRegex).matcher("");
            System.out.println("Matcher: " + m);

            return getSentences(sourceDirectory, m);
        }

        /**
         * Recursively processes every file under {@code sourceDirectory} whose canonical path
         * matches {@code m}. Files are read as UTF-8.
         *
         * @param sourceDirectory directory to scan
         * @param m matcher applied to each canonical file path
         * @return this object
         * @throws IOException if a file cannot be read
         * @throws IllegalArgumentException if the directory cannot be listed
         */
        public Data getSentences(File sourceDirectory, Matcher m) throws IOException {
            //System.out.println("Processing:\t" + sourceDirectory);
            File[] children = sourceDirectory.listFiles();
            if (children == null) {
                // listFiles() returns null for a non-directory or on an I/O error.
                throw new IllegalArgumentException("Can't list files in " + sourceDirectory);
            }
            for (File file : children) {
                if (file.isDirectory()) {
                    getSentences(file, m);
                    continue;
                }
                String fileString = file.getCanonicalFile().toString();
                File fileCanonical = new File(fileString);
                if (!m.reset(fileString).matches()) {
                    if (verbose == Verbosity.all) {
                        System.out.println("Skipping: " + RegexUtilities.showMismatch(m, fileString)
                            + "\t" + sourceDirectory);
                    }
                    continue;
                }

                // fileString is already the full canonical path.
                System.out.println("\nProcessing:\t" + fileString);

                // Paths containing "tr18" start the top-level counter at -1, so the first h2 is section 0.
                int H2_START = fileString.contains("tr18") ? -1 : 0;
                try (Reader in = new InputStreamReader(new FileInputStream(fileCanonical), StandardCharsets.UTF_8)) {
                    parseFile(fileCanonical, H2_START, in);
                }
            }
            return this;
        }

        SimpleHtmlParser parser = new SimpleHtmlParser();

        /**
         * Parses one HTML file: checks element nesting, collects headings and captions after the
         * "Contents" heading, gathers DTD declarations and sentences from the text, and prints
         * the errors followed by the generated table of contents (if there are no fatal errors).
         *
         * @param fileCanonical the file being parsed; used for its name in messages
         * @param H2_START starting value for the top-level section counter
         * @param in reader positioned at the start of the file contents
         * @throws IOException if reading fails
         */
        public void parseFile(File fileCanonical, int H2_START, Reader in) throws IOException {
            Matcher wsMatcher = WHITESPACE.matcher("");
            ++count;
            // SimpleHtmlParser parser = new SimpleHtmlParser().setReader(in);
            parser.setReader(in);
            StringBuilder buffer = new StringBuilder();
            StringBuilder content = new StringBuilder();
            HeadingInfo heading = new HeadingInfo();
            final String fileName = fileCanonical.getName();
            HeadingInfoList headingInfoList = new HeadingInfoList(fileName, H2_START);
            Stack<ElementLine> elementStack = new Stack<>();
            Stack<Pair<String, String>> attributeStack = new Stack<>();
            String contentString;
            boolean inHeading = false;
            boolean inPop = false;
            boolean inAnchor = false;
            boolean haveContents = false;
            HeadingInfo lastHeading = null;
            // for detecting missing captions
            boolean pushedTable = false;
            boolean checkCaption = false;
            List<Integer> captionWarnings = new ArrayList<Integer>();

            main: while (true) {
                int lineCount = parser.getLineCount();
                Type x = parser.next(content);
                if (verbose == Verbosity.all && !SUPPRESS.contains(x)) {
                    LOG.write(parser.getLineCount() + "\t" + x + ":\t" + content + "");
                    //SimpleHtmlParser.writeResult(x, content, LOG);
                    LOG.write("\n");
                    LOG.flush();
                }
                switch (x) {
                case QUOTE:
                    // A "nocaption" quote after <table> switches off the missing-caption warning.
                    contentString = content.toString().toLowerCase(Locale.ENGLISH).trim();
                    if (contentString.equalsIgnoreCase("nocaption")) {
                        pushedTable = false;
                    }
                    break;
                case ATTRIBUTE:
                    contentString = content.toString().toLowerCase(Locale.ENGLISH);
                    // Inside a heading, the value of a name/id attribute is an anchor id.
                    inAnchor = inHeading && (contentString.equals("name") || contentString.equals("id"));
                    attributeStack.add(new Pair<String, String>(contentString, null));
                    break;
                case ATTRIBUTE_CONTENT:
                    contentString = content.toString().toLowerCase(Locale.ENGLISH);
                    if (inAnchor) {
                        heading.addId(content.toString()); // ids keep their original case
                    }
                    Pair<String, String> lastAttribute = attributeStack.peek();
                    if (lastAttribute.getSecond() != null) {
                        System.out.println(lineCount + "\tDouble Attribute: " + contentString + ", peek=" + lastAttribute);
                    } else {
                        lastAttribute.setSecond(contentString);
                    }
                    break;
                case ELEMENT:
                    contentString = content.toString().toLowerCase(Locale.ENGLISH);
                    if (inPop) {
                        // Drop NOPOP elements from the top of the stack before matching the closing tag.
                        while (!elementStack.isEmpty() && NOPOP.contains(elementStack.peek().element)) {
                            elementStack.pop();
                        }
                        // An empty stack (stray closing tag) is reported like any other mismatch.
                        if (elementStack.isEmpty() || !elementStack.peek().element.equals(contentString)) {
                            System.out.println(lineCount
                                + "\tCouldn't pop: " + contentString
                                + ", " + showElementStack(elementStack));
                        } else {
                            elementStack.pop();
                        }
                    } else {
                        // check that the first element following a table is a caption
                        if (pushedTable && !"caption".equals(contentString)) {
                            captionWarnings.add(lineCount);
                        }
                        elementStack.push(new ElementLine(contentString, lineCount));
                        pushedTable = checkCaption && "table".equals(contentString);
                        if (!checkCaption && "h3".equals(contentString)) { // h3 around Summary in standard format
                            checkCaption = true;
                        }
                    }
                    if (verbose != Verbosity.none) {
                        LOG.write(parser.getLineCount() + "\telem:\t" + showElementStack(elementStack) + "\n");
                        LOG.flush();
                    }
                    if (FORCEBREAK.contains(contentString)) {
                        buffer.append("\n");
                    }
                    if (DO_CONTENTS.contains(contentString)) {
                        if (inPop) {
                            if (inHeading) {
                                inHeading = false;
                                // Only headings after the "Contents" heading are recorded.
                                if (heading.isContents()) {
                                    haveContents = true;
                                } else if (haveContents) {
                                    headingInfoList.add(parser.getLineCount(), heading);
                                    lastHeading = heading;
                                }
                                heading = new HeadingInfo();
                            }
                        } else {
                            heading.setLevel(contentString, lastHeading);
                            inHeading = true;
                        }
                    }
                    break;
                case ELEMENT_START:
                    inPop = false;
                    break;
                case ELEMENT_END:
                    if (verbose == Verbosity.all && !attributeStack.isEmpty()) {
                        LOG.write(parser.getLineCount() + "\tattr:\t" + showAttributeStack(attributeStack) + System.lineSeparator());
                        LOG.flush();
                    }
                    attributeStack.clear();
                    inPop = false;
                    break;
                case ELEMENT_POP:
                    inPop = true;
                    break;
                case ELEMENT_CONTENT:
                    // Collapse whitespace runs and turn no-break spaces into ordinary spaces.
                    contentString = wsMatcher.reset(content).replaceAll(" ").replace("\u00a0", " ");
                    buffer.append(contentString.indexOf('&') >= 0
                        ? TransliteratorUtilities.fromHTML.transform(contentString)
                        : contentString);
                    if (inHeading) {
                        heading.addText(contentString);
                    }
                    break;
                case DONE:
                    break main;
                default:
                    break; // skip everything else.
                }
            }

            // get DTD elements
            Matcher m = ELEMENT_ATTLIST.matcher(buffer);
            while (m.find()) {
                dtdItems.put(fileName, m.group(2), m.group(), true);
                //System.out.println(fileName + "\t" + m.group());
            }

            // Split the text into sentences and count them.
            BreakIterator sentenceBreak = BreakIterator.getSentenceInstance(ULocale.ENGLISH);
            String bufferString = normalizeWhitespace(buffer);
            sentenceBreak.setText(bufferString);
            int last = 0;
            for (int pos = sentenceBreak.next(); pos != BreakIterator.DONE; pos = sentenceBreak.next()) {
                String sentence = bufferString.substring(last, pos).trim();
                last = pos;
                if (sentence.isEmpty()) {
                    continue;
                }
                hashedSentences.add(sentence, 1);
                sentences.add(sentence);
            }

            if (!captionWarnings.isEmpty()) {
                System.out.println("WARNING: Missing <caption> on the following lines: "
                    + "\n " + CollectionUtilities.join(captionWarnings, ", ")
                    + "\n\tTo fix, add <caption> after the <table>, such as:"
                    + "\n\t\t<table>"
                    + "\n\t\t\t<caption>Private Use Codes in CLDR</a></caption>"
                    + "\n\tOften the sentence just before the <table> can be made into the caption."
                    + "\n\tThe next time you run this program, youll be prompted with double-links."
                    + "\n\tIf it really shouldn't have a caption, add <!-- nocaption --> after the <table> instead.");
            }
            int fatalCount = headingInfoList.showErrors();
            totalFatalCount += fatalCount;
            totalErrorCount += headingInfoList.totalErrorCount();
            if (fatalCount == 0) {
                headingInfoList.listContents();
            } else {
                System.out.println("\nFix fatal errors in " + fileCanonical + " before contents can be generated");
            }
        }

        /** @return the attributes as {@code [@name='value']...}, for debugging output */
        private String showAttributeStack(Stack<Pair<String, String>> attributeStack) {
            StringBuilder result = new StringBuilder();
            for (Pair<String, String> s : attributeStack) {
                result.append("[@");
                result.append(s.getFirst());
                final String second = s.getSecond();
                if (second != null) {
                    result.append("='");
                    result.append(second);
                    result.append("'");
                }
                result.append("]");
            }
            return result.toString();
        }

        /** @return the open elements, outermost first, as {@code /name[line]/name[line]...} */
        private String showElementStack(Stack<ElementLine> elementStack) {
            StringBuilder result = new StringBuilder();
            for (ElementLine s : elementStack) {
                result.append('/').append(s);
            }
            return result.toString();
        }

        /**
         * Return string after collapsing multiple whitespace containing '\\n' to '\\n',
         * and otherwise 'space'. The result is trimmed.
         *
         * @param input the text to normalize
         * @return the normalized text
         */
        private String normalizeWhitespace(CharSequence input) {
            Matcher m = WHITESPACE.matcher(input);
            StringBuilder buffer = new StringBuilder();
            int last = 0;
            while (m.find()) {
                buffer.append(input.subSequence(last, m.start()));
                last = m.end();
                buffer.append(m.group().indexOf('\n') >= 0 ? '\n' : ' ');
            }
            buffer.append(input.subSequence(last, input.length()));
            return buffer.toString().trim();
        }

        /** @return how many times the sentence occurred in the processed files */
        public long getCount(String sentence) {
            return hashedSentences.getCount(sentence);
        }

        @Override
        public Iterator<String> iterator() {
            return sentences.iterator();
        }
    }
}