Home | History | Annotate | Download | only in tool
      1 package org.unicode.cldr.tool;
      2 
import java.io.File;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Reader;
import java.io.Writer;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.Stack;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.unicode.cldr.tool.Option.Options;
import org.unicode.cldr.util.CLDRPaths;
import org.unicode.cldr.util.CLDRTool;
import org.unicode.cldr.util.ChainedMap;
import org.unicode.cldr.util.ChainedMap.M4;
import org.unicode.cldr.util.CldrUtility;
import org.unicode.cldr.util.Counter;
import org.unicode.cldr.util.DtdData;
import org.unicode.cldr.util.DtdData.Attribute;
import org.unicode.cldr.util.DtdData.Element;
import org.unicode.cldr.util.DtdType;
import org.unicode.cldr.util.Pair;
import org.unicode.cldr.util.PatternCache;
import org.unicode.cldr.util.RegexUtilities;
import org.unicode.cldr.util.SimpleHtmlParser;
import org.unicode.cldr.util.SimpleHtmlParser.Type;
import org.unicode.cldr.util.TransliteratorUtilities;

import com.google.common.collect.ImmutableSet;
import com.ibm.icu.dev.util.CollectionUtilities;
import com.ibm.icu.impl.Relation;
import com.ibm.icu.impl.Row.R4;
import com.ibm.icu.text.BreakIterator;
import com.ibm.icu.util.Output;
import com.ibm.icu.util.ULocale;
     54 
     55 @CLDRTool(alias = "checkhtmlfiles", description = "Look for errors in CLDR documentation tools", hidden = "Used for CLDR process")
     56 public class CheckHtmlFiles {
     57 
     58     static final Set<String> NOPOP = new HashSet<>(Arrays.asList("br", "img", "link", "meta", "!doctype", "hr", "col", "input"));
     59 
     60     static final EnumSet<Type> SUPPRESS = EnumSet.of(
     61         Type.ELEMENT, Type.ELEMENT_START, Type.ELEMENT_END, Type.ELEMENT_POP,
     62         Type.ATTRIBUTE, Type.ATTRIBUTE_CONTENT);
     63 
     64     final static Options myOptions = new Options();
     65     final static Writer LOG = new OutputStreamWriter(System.out);
     66     static Pattern WELLFORMED_HEADER = PatternCache.get("\\s*(\\d+(\\.\\d+)*\\s*).*");
     67     static Pattern SUPPRESS_SECTION_NUMBER = PatternCache.get(
     68         "(Annex [A-Z]: .*)" +
     69             "|(Appendix [A-Z].*)" +
     70             "|(.*Migrati(on|ng).*)" +
     71             "|Step \\d+.*" +
     72             "|Example \\d+.*" +
     73             "|D\\d+\\.\\s.*" +
     74             "|References" +
     75             "|Acknowledge?ments" +
     76             "|Rights to .*Images" +
     77             "|Modifications" +
     78             "|(Revision \\d+\\.?)");
     79     static Pattern SUPPRESS_REVISION = PatternCache.get("Revision \\d+\\.?");
     80     static Pattern SPACES = PatternCache.get("\\s+");
     81 
     82     enum MyOptions {
     83 //        old(".*", Settings.OTHER_WORKSPACE_DIRECTORY + "cldr-archive/cldr-22.1/specs/ldml/tr35\\.html", "source data (regex)"),
     84         target(".*", CLDRPaths.BASE_DIRECTORY + "specs" + File.separator + "ldml" + File.separator +
     85             "tr35(-.*)?\\.html", "target data (regex); ucd for Unicode docs; "
     86                 + "for others use the format -t ${workspace_loc}/unicode-draft/reports/tr51/tr51.html"), verbose(".*", "none", "verbose debugging messages"),
     87 //        contents(".*", CLDRPaths.BASE_DIRECTORY + "specs/ldml/tr35(-.*)?\\.html", "generate contents"),
     88         // /cldr-archive
     89         ;
     90 
     91         // boilerplate
     92         final Option option;
     93 
     94         MyOptions(String argumentPattern, String defaultArgument, String helpText) {
     95             option = myOptions.add(this, argumentPattern, defaultArgument, helpText);
     96         }
     97     }
     98 
     99     enum Verbosity {
    100         none, element, all;
    101         static Verbosity of(String input) {
    102             return input == null ? Verbosity.none : Verbosity.valueOf(input.toLowerCase(Locale.ROOT));
    103         }
    104     }
    105 
    /** Debug output level; assigned in {@code main} from the {@code verbose} option. */
    static Verbosity verbose;
    /** NOTE(review): not assigned or read in the visible part of the file - presumably enables TOC generation; confirm. */
    static boolean doContents;
    /** Set by {@code main} when the target string contains "ldml"; enables the DTD comparison. */
    static boolean isLdml;
    109 
    110     public static void main(String[] args) throws IOException {
    111         System.out.println("First do a replace of <a\\s+name=\"([^\"]*)\"\\s*> by <a name=\"$1\" href=\"#$1\">");
    112         System.out.println("Then check for all links with no anchors: <a([^>]*)></a>");
    113         System.out.println("Then check for all links that don't start with name or href <a (?!href|name)");
    114 
    115         myOptions.parse(MyOptions.target, args, true);
    116         verbose = Verbosity.of(MyOptions.verbose.option.getValue());
    117 
    118         String targetString = MyOptions.target.option.getValue();
    119         if (targetString.contains("ldml")) {
    120             isLdml = true;
    121         }
    122         if (targetString.equalsIgnoreCase("ucd")) {
    123             targetString = CLDRPaths.BASE_DIRECTORY + "../unicode-draft/reports/tr(\\d+)/tr(\\d+).html";
    124         } else if (targetString.equalsIgnoreCase("security")) {
    125             targetString = CLDRPaths.BASE_DIRECTORY + "../unicode-draft/reports/tr(3[69])/tr(3[69]).html";
    126         }
    127         Data target = new Data().getSentences(targetString);
    128         if (target.count == 0) {
    129             throw new IllegalArgumentException("No files matched with " + targetString);
    130         }
    131 
    132         if (isLdml) {
    133             checkForDtd(target);
    134         }
    135 
    136         System.out.println("*TOTAL COUNTS*  files:" + target.count + ", fatal errors:" + target.totalFatalCount + ", nonfatal errors:"
    137             + target.totalErrorCount);
    138         if (target.totalFatalCount > 0 || target.totalErrorCount > 0) {
    139             System.exit(1); // give an error status
    140         }
    141 
    142         System.exit(0);
    143 
    144 //        Data source = new Data().getSentences(MyOptions.old.option.getValue());
    145 //        String file = MyOptions.target.option.getValue();
    146 //
    147 //        Data target = new Data().getSentences(file);
    148 //
    149 //        int missingCount = 0, extraCount = 0;
    150 //        int line = 0;
    151 //        for (String sentence : source) {
    152 //            ++line;
    153 //            long sourceCount = source.getCount(sentence);
    154 //            long targetCount = target.getCount(sentence);
    155 //            if (targetCount == 0) {
    156 //                System.out.println(line + "\tMISSING:\t" + sourceCount + "" + targetCount + "\t" + sentence);
    157 //                ++missingCount;
    158 //            }
    159 //        }
    160 //        line = 0;
    161 //        for (String sentence : target) {
    162 //            ++line;
    163 //            long sourceCount = source.getCount(sentence);
    164 //            long targetCount = target.getCount(sentence);
    165 //            if (sourceCount == 0) {
    166 //                System.out.println(line + "\tEXTRA:\t" + targetCount + "" + sourceCount + "\t" + sentence);
    167 //                ++extraCount;
    168 //            }
    169 //        }
    170 //        System.out.println("Missing:\t" + missingCount);
    171 //        System.out.println("Extra:\t" + extraCount);
    172     }
    173 
    174     private static final Set<String> SKIP_ATTR = ImmutableSet.of("draft", "alt", "references", "cldrVersion", "unicodeVersion");
    175 
    176     private static void checkForDtd(Data target) {
    177         M4<String, String, DtdType, Boolean> typeToElements = ChainedMap.of(new TreeMap(), new TreeMap(), new TreeMap(), Boolean.class);
    178         for (DtdType type : DtdType.values()) {
    179             if (type == DtdType.ldmlICU) continue;
    180             DtdData dtdData = DtdData.getInstance(type);
    181             Set<Element> elements = dtdData.getElements();
    182             for (Element element : elements) {
    183                 if (element.isDeprecated()
    184                     || element.equals(dtdData.PCDATA)
    185                     || element.equals(dtdData.ANY)) continue;
    186                 typeToElements.put(element.name, element.toDtdString(), type, Boolean.TRUE);
    187             }
    188             Set<Attribute> attributes = dtdData.getAttributes();
    189             for (Attribute attribute : attributes) {
    190                 if (attribute.isDeprecated()) continue;
    191                 if (SKIP_ATTR.contains(attribute.name)) {
    192                     continue;
    193                 }
    194                 typeToElements.put(attribute.element.name, attribute.appendDtdString(new StringBuilder()).toString(), type, Boolean.TRUE);
    195             }
    196         }
    197         final Map<String, String> skeletonToInFile = new HashMap<>();
    198         Relation<String, String> extra = new Relation(new TreeMap(), TreeSet.class);
    199         for (R4<String, String, String, Boolean> elementItem : target.dtdItems.rows()) {
    200             String file = elementItem.get0();
    201             String element = elementItem.get1();
    202             String item = elementItem.get2();
    203             extra.put(element, item);
    204             skeletonToInFile.put(item.replace(" ", ""), item);
    205         }
    206         ChainedMap.M4<String, String, DtdType, Comparison> status = ChainedMap.of(new TreeMap(), new TreeMap(), new TreeMap(), Comparison.class);
    207         for (R4<String, String, DtdType, Boolean> entry : typeToElements.rows()) {
    208             final String element = entry.get0();
    209             final String key = entry.get1();
    210             final DtdType dtdType = entry.get2();
    211             String spaceless = key.replace(" ", "");
    212             String realKey = skeletonToInFile.get(spaceless);
    213             if (realKey == null) {
    214                 status.put(element, key, dtdType, Comparison.missing);
    215             } else {
    216                 boolean found = extra.remove(element, realKey);
    217                 if (!found) {
    218                     status.put(element, key, dtdType, Comparison.no_rem);
    219                 }
    220             }
    221         }
    222         for (Entry<String, String> extraItem : extra.entrySet()) {
    223             status.put(extraItem.getKey(), extraItem.getValue(), DtdType.ldmlICU, Comparison.extra);
    224         }
    225         TreeSet<String> reverse = new TreeSet<>(Collections.reverseOrder());
    226         for (Entry<String, Map<String, Map<DtdType, Comparison>>> entry1 : status) {
    227             String element = entry1.getKey();
    228             reverse.clear();
    229             final Map<String, Map<DtdType, Comparison>> itemToDtdTypeToComparison = entry1.getValue();
    230             reverse.addAll(itemToDtdTypeToComparison.keySet());
    231             for (String item : reverse) {
    232                 Map<DtdType, Comparison> typeToComparison = itemToDtdTypeToComparison.get(item);
    233                 for (Entry<DtdType, Comparison> entry2 : typeToComparison.entrySet()) {
    234                     System.out.println(element
    235                         + "\t" + entry2.getValue()
    236                         + "\t" + CldrUtility.ifSame(entry2.getKey(), DtdType.ldmlICU, "")
    237                         + "\t" + item);
    238                 }
    239             }
    240         }
    241     }
    242 
    /** Outcome of comparing one DTD declaration with the declarations found in the checked files. */
    enum Comparison {
        /** The declaration exists in the DTD but was not found in the files. */
        missing,
        /** The declaration was found in the files but matched nothing in the DTD. */
        extra,
        /** A matching text was found, but it could not be removed from the found-in-file set for this element. */
        no_rem
    }
    246 
    247     static Pattern WHITESPACE = PatternCache.get("[\\s]+");
    248     static Pattern BADSECTION = PatternCache.get("^\\s*(\\d+\\s*)?Section\\s*\\d+\\s*[-:]\\s*");
    249 
    250     static final Set<String> FORCEBREAK = new HashSet<String>(Arrays.asList(
    251         "table", "div", "blockquote",
    252         "p", "br", "td", "th", "h1", "h2", "h3", "h4", "h5", "li"));
    253 
    254 //    enum ContentsElements {h1, h2, h3, h4, h5, caption}
    255 
    256     static final Set<String> DO_CONTENTS = new HashSet<String>(Arrays.asList(
    257         "h1", "h2", "h3", "h4", "h5", "caption"));
    258 
    259     static class Levels implements Comparable<Levels> {
    260         final int[] levels = new int[10];
    261         final int h2_start;
    262 
    263         public Levels(int h2_start) {
    264             levels[0] = h2_start; // special adjustment of starting header level
    265             this.h2_start = h2_start;
    266         }
    267 
    268         public Levels() {
    269             this(0);
    270         }
    271 
    272         /**
    273          * h2 = level 0, h3 is level 1, etc.
    274          * @param level
    275          * @return
    276          */
    277         Levels next(int level, Output<Boolean> missingLevel) {
    278             level -= 2; // h2 = level 0
    279             missingLevel.value = false;
    280             if (levels[0] < h2_start) {
    281                 missingLevel.value = true;
    282             }
    283             for (int i = 1; i < level; ++i) {
    284                 if (levels[i] == 0) {
    285                     missingLevel.value = true;
    286                 }
    287             }
    288             levels[level]++;
    289             for (int i = level + 1; i < levels.length; ++i) {
    290                 levels[i] = 0;
    291             }
    292             return this;
    293         }
    294 
    295         public int getDepth() {
    296             for (int i = 0;; ++i) {
    297                 int level = levels[i];
    298                 if (level == 0) {
    299                     return i - 1;
    300                 }
    301             }
    302         }
    303 
    304         @Override
    305         public String toString() {
    306             StringBuilder b = new StringBuilder();
    307             for (int i = 0;; ++i) {
    308                 int level = levels[i];
    309                 if (level == 0) {
    310                     return b.toString();
    311                 }
    312                 if (b.length() != 0) {
    313                     b.append('.');
    314                 }
    315                 b.append(level);
    316             }
    317         }
    318 
    319         public static Levels parse(String group) {
    320             Levels result = new Levels();
    321             int currentLevel = 0;
    322             for (int i = 0; i < group.length(); ++i) {
    323                 char ch = group.charAt(i);
    324                 if (ch == '.') {
    325                     currentLevel++;
    326                 } else {
    327                     ch -= '0';
    328                     if (ch > '9') {
    329                         break;
    330                     }
    331                     result.levels[currentLevel] = result.levels[currentLevel] * 10 + ch;
    332                 }
    333             }
    334             return result;
    335         }
    336 
    337         @Override
    338         public int compareTo(Levels other) {
    339             for (int i = 0; i < levels.length; ++i) {
    340                 if (levels[i] != other.levels[i]) {
    341                     return levels[i] < other.levels[i] ? -1 : 1;
    342                 }
    343             }
    344             return 0;
    345         }
    346 
    347         public void set(Levels other) {
    348             for (int i = 0; i < levels.length; ++i) {
    349                 levels[i] = other.levels[i];
    350             }
    351         }
    352     }
    353 
    354     static class HeadingInfo {
    355         private Levels levels = new Levels();
    356         private String text = "";
    357         private Set<String> ids = new LinkedHashSet<String>();
    358         private boolean suppressSection;
    359         private boolean isHeader;
    360 
    361         // temporary
    362         private int level;
    363 
    364         public void setLevel(String headingLabel, HeadingInfo lastHeading) {
    365             isHeader = !headingLabel.equals("caption");
    366             level = isHeader ? headingLabel.charAt(1) - '0' : lastHeading.level;
    367         }
    368 
    369         @Override
    370         public String toString() {
    371             //   <h3><a name="Identity_Elements" href="#Identity_Elements">5.3 Identity Elements</a></h3>
    372             String id = ids.isEmpty() ? "NOID" : ids.iterator().next();
    373             String result = "<" + getLabel()
    374                 + "<a name=\"" + id + "\" href=\"#" + id + "\">"
    375                 + (!isHeader ? "" : suppressSection ? "" : levels + " ")
    376                 + TransliteratorUtilities.toHTML.transform(text)
    377                 + "</a>";
    378             if (ids.size() > 1) {
    379                 boolean first = true;
    380                 for (String id2 : ids) {
    381                     if (first) {
    382                         first = false;
    383                     } else {
    384                         result += "<a name=\"" + id2 + "\"></a>";
    385                     }
    386                 }
    387             }
    388             return result + "</" + getLabel();
    389         }
    390 
    391         public String getLabel() {
    392             return isHeader ? "h" + level + ">" : "caption>";
    393         }
    394 
    395         public String toHeader() {
    396             String id = ids.iterator().next();
    397             return ("<li>"
    398                 + (!isHeader ? (text.contains("Table") || text.contains("Figure") ? "" : "Table: ") : suppressSection ? "" : levels + " ")
    399                 + "<a href=\"#" + id + "\">"
    400                 + TransliteratorUtilities.toHTML.transform(text)
    401                 + "</a>");
    402         }
    403 
    404         public void addText(String toAppend) {
    405             String temp = TransliteratorUtilities.fromHTML.transform(toAppend);
    406             if (text.isEmpty()) {
    407                 if (temp.startsWith(" ")) {
    408                     text = temp.substring(1);
    409                 } else {
    410                     text = temp;
    411                 }
    412             } else {
    413                 text += temp;
    414             }
    415             text = SPACES.matcher(text).replaceAll(" "); // clean up all spaces; make more efficient later
    416             // used to trim, but we need to retain space between elements. So only trim the start, and later, the end
    417         }
    418 
    419         public boolean isContents() {
    420             return text.toString().startsWith("Contents");
    421         }
    422 
    423         void addId(String id) {
    424             this.ids.add(id);
    425         }
    426 
    427         public void setLevels(int line, Levels levels, Set<String> errors) {
    428             this.levels.set(levels);
    429             String error = "";
    430             if (badSectionMatcher.reset(text).find()) {
    431                 text = text.substring(badSectionMatcher.end());
    432                 error += "Extra 'Section...' at start; ";
    433             }
    434             if (isHeader) {
    435                 if (!headerMatcher.reset(text).matches()) {
    436                     if (!SUPPRESS_SECTION_NUMBER.matcher(text).matches()) {
    437                         error += "Missing section numbers; ";
    438                     }
    439                 } else {
    440                     text = text.substring(headerMatcher.end(1));
    441                     if (text.startsWith(".")) {
    442                         text = text.substring(1).trim();
    443                         error += "Extra . at start; ";
    444                     }
    445                     Levels parsedLevels = Levels.parse(headerMatcher.group(1));
    446                     if (levels.compareTo(parsedLevels) != 0) {
    447                         error += "Section numbers mismatch, was " + parsedLevels + "; ";
    448                     }
    449                 }
    450             }
    451             if (ids.isEmpty()) {
    452                 addId(text.toString().trim().replaceAll("[^A-Za-z0-9]+", "_"));
    453                 error += "Missing double link";
    454             }
    455             if (!error.isEmpty()) {
    456                 errors.add(this + "\t<!-- " + line + ": " + error + " -->");
    457             }
    458             suppressSection = SUPPRESS_SECTION_NUMBER.matcher(text).matches();
    459         }
    460 
    461         public void addIds(Counter<String> idCounter) {
    462             for (String id : ids) {
    463                 idCounter.add(id, 1);
    464             }
    465         }
    466 
    467         public HeadingInfo fixText() {
    468             if (text.endsWith(" ")) {
    469                 text = text.substring(0, text.length() - 1);
    470             }
    471             return this;
    472         }
    473     }
    474 
    // Shared matchers that HeadingInfo.setLevels re-targets with reset(...) for every heading.
    // A Matcher carries match state, so these are only suitable for single-threaded use.
    static Matcher headerMatcher = WELLFORMED_HEADER.matcher("");
    static Matcher badSectionMatcher = BADSECTION.matcher("");
    477 
    478     static class HeadingInfoList {
    479         private static final long serialVersionUID = -6722150173224993960L;
    480         Levels lastBuildLevel;
    481         private Set<String> errors = new LinkedHashSet<String>();
    482         Output<Boolean> missingLevel = new Output<Boolean>(false);
    483         private String fileName;
    484         ArrayList<HeadingInfo> list = new ArrayList<>();
    485 
    486         public HeadingInfoList(String fileName, int h2_START) {
    487             this.fileName = fileName;
    488             lastBuildLevel = new Levels(h2_START);
    489         }
    490 
    491         public boolean add(int line, HeadingInfo h) {
    492             h.fixText();
    493             if (SUPPRESS_REVISION.matcher(h.text).matches()) {
    494                 return false;
    495             }
    496             if (h.isHeader) {
    497                 h.setLevels(line, lastBuildLevel.next(h.level, missingLevel), errors);
    498             } else {
    499                 h.setLevels(line, lastBuildLevel, errors);
    500             }
    501             if (missingLevel.value) {
    502                 errors.add("FATAL: Missing Level in: " + h);
    503             }
    504             return list.add(h);
    505         }
    506 
    507         static final String PAD = "\t";
    508 
    509         public void listContents() {
    510 
    511             System.out.print("\n\t\t<!-- START Generated TOC: CheckHtmlFiles -->");
    512             Counter<String> idCounter = new Counter<String>();
    513 
    514             int lastLevel = new Levels().getDepth();
    515             String pad = PAD;
    516             int ulCount = 0;
    517             int liCount = 0;
    518             for (HeadingInfo h : list) {
    519                 h.addIds(idCounter);
    520                 final int depth = h.levels.getDepth() + (h.isHeader ? 0 : 1);
    521                 int levelDiff = depth - lastLevel;
    522                 lastLevel = depth;
    523                 if (levelDiff > 0) {
    524                     System.out.println();
    525                     for (int i = 0; i < levelDiff; ++i) {
    526                         pad += PAD;
    527                         System.out.println(pad + "<ul class=\"toc\">");
    528                         ++ulCount;
    529                     }
    530                     pad += PAD;
    531                 } else if (levelDiff < 0) {
    532                     System.out.println("</li>");
    533                     --liCount;
    534                     for (int i = 0; i > levelDiff; --i) {
    535                         pad = pad.substring(PAD.length());
    536                         System.out.println(pad + "</ul>");
    537                         --ulCount;
    538                         pad = pad.substring(PAD.length());
    539                         System.out.println(pad + "</li>");
    540                         --liCount;
    541                     }
    542                 } else {
    543                     System.out.println("</li>");
    544                     --liCount;
    545                 }
    546 
    547                 System.out.print(pad + h.toHeader());
    548                 ++liCount;
    549 
    550                 //              <li>1.1 <a href="#Conformance">Conformance</a></li>
    551 
    552                 //                <ul class="toc">
    553                 //                <li>1 <a href="#Introduction">Introduction</a>
    554                 //                  <ul class="toc">
    555                 //                    <li>1.1 <a href="#Conformance">Conformance</a>
    556                 //                    </li>
    557                 //                    ...
    558                 //                  </ul>
    559                 //                </li>
    560             }
    561 
    562             // finish up and make sure we are balances
    563 
    564             int levelDiff = -lastLevel;
    565             System.out.println("</li>");
    566             --liCount;
    567             for (int i = 0; i > levelDiff; --i) {
    568                 pad = pad.substring(PAD.length());
    569                 System.out.println(pad + "</ul>");
    570                 --ulCount;
    571                 pad = pad.substring(PAD.length());
    572                 System.out.println(pad + "</li>");
    573                 --liCount;
    574             }
    575             pad = pad.substring(PAD.length());
    576             System.out.println(pad + "</ul>");
    577             System.out.println(pad + "<!-- END Generated TOC: CheckHtmlFiles -->");
    578             --ulCount;
    579             if (liCount != 0 || ulCount != 0) {
    580                 throw new IllegalArgumentException("Mismatched counts in generated contents, li:" + liCount + ", ul:" + ulCount);
    581             }
    582             for (String id : idCounter) {
    583                 long count = idCounter.get(id);
    584                 if (count != 1) {
    585                     errors.add("FATAL: Non-Unique ID: " + id);
    586                 }
    587             }
    588         }
    589 
    590         /**
    591          * Prints out errs
    592          * @return fatal err count
    593          */
    594         public int showErrors() {
    595             int fatalCount = 0;
    596             if (!errors.isEmpty()) {
    597                 System.out.println("\n*ERRORS*\n");
    598                 for (String error : errors) {
    599                     if (error.startsWith("FATAL:")) {
    600                         System.out.println(fileName + "\t" + error);
    601                         fatalCount++;
    602                     }
    603                 }
    604                 if (fatalCount == 0) {
    605                     for (String error : errors) {
    606                         System.out.println(fileName + "\t" + error);
    607                     }
    608                 }
    609             }
    610             if (this.list.size() == 0) {
    611                 System.out.println("No header items (eg <h2>) captured.");
    612                 fatalCount = 1;
    613             }
    614             return fatalCount;
    615         }
    616 
    617         /**
    618          * @return total number of errors
    619          */
    620         public int totalErrorCount() {
    621             return errors.size();
    622         }
    623     }
    624 
    625     static class ElementLine {
    626         final String element;
    627         final int line;
    628 
    629         public ElementLine(String element, int line) {
    630             super();
    631             this.element = element;
    632             this.line = line;
    633         }
    634 
    635         @Override
    636         public String toString() {
    637             return element + '[' + line + ']';
    638         }
    639     }
    640 
    641     static class Data implements Iterable<String> {
    642         private static final Pattern ELEMENT_ATTLIST = Pattern.compile("<!(ELEMENT|ATTLIST)\\s+(\\S+)[^>]*>");
    643         List<String> sentences = new ArrayList<String>();
    644         M4<String, String, String, Boolean> dtdItems = ChainedMap.of(
    645             new LinkedHashMap<String, Object>(),
    646             new TreeMap<String, Object>(),
    647             new TreeMap<String, Object>(), Boolean.class);
    648         Counter<String> hashedSentences = new Counter<String>();
    649         int count = 0;
    650         int totalErrorCount = 0;
    651         int totalFatalCount = 0;
    652 
    653         public Data getSentences(String fileRegex) throws IOException {
    654             String base;
    655             String regex;
    656             try {
    657                 int firstParen = fileRegex.indexOf('(');
    658                 if (firstParen < 0) {
    659                     firstParen = fileRegex.length();
    660                 }
    661                 int lastSlash = fileRegex.lastIndexOf(File.separatorChar, firstParen);
    662                 base = fileRegex.substring(0, lastSlash);
    663                 regex = fileRegex.substring(lastSlash + 1);
    664             } catch (Exception e) {
    665                 throw new IllegalArgumentException("Target file must be in special format. " +
    666                     "Up to the first path part /.../ containing a paragraph is constant, and the rest is a regex.");
    667             }
    668 
    669             //File sourceFile = new File(fileRegex);
    670             File sourceDirectory = new File(base);
    671             if (!sourceDirectory.exists()) {
    672                 throw new IllegalArgumentException("Can't find " + sourceDirectory);
    673             }
    674             String canonicalBase = sourceDirectory.getCanonicalPath();
    675             String FileRegex = canonicalBase + File.separator + regex;
    676             FileRegex = FileRegex.replace("\\", "\\\\");
    677             FileRegex = FileRegex.replace("\\\\.", "\\.");
    678             Matcher m = PatternCache.get(FileRegex).matcher("");
    679             System.out.println("Matcher: " + m);
    680 
    681             return getSentences(sourceDirectory, m);
    682         }
    683 
    684         public Data getSentences(File sourceDirectory, Matcher m) throws IOException {
    685             //System.out.println("Processing:\t" + sourceDirectory);
    686             for (File file : sourceDirectory.listFiles()) {
    687                 if (file.isDirectory()) {
    688                     getSentences(file, m);
    689                     continue;
    690                 }
    691                 String fileString = file.getCanonicalFile().toString();
    692                 File fileCanonical = new File(fileString);
    693                 if (!m.reset(fileString).matches()) {
    694                     if (verbose == Verbosity.all) {
    695                         System.out.println("Skipping: " + RegexUtilities.showMismatch(m, fileString)
    696                             + "\t" + sourceDirectory);
    697                     }
    698                     continue;
    699                 }
    700 
    701                 System.out.println("\nProcessing:\t" + sourceDirectory + File.separator + fileString);
    702 
    703                 int H2_START = fileString.contains("tr18") ? -1 : 0;
    704                 try (Reader in = new FileReader(fileCanonical)) {
    705                     parseFile(fileCanonical, H2_START, in);
    706                 }
    707             }
    708             return this;
    709         }
    710 
        // Single parser instance reused for every file; parseFile points it at each file's
        // Reader via setReader() before pulling tokens from it.
        SimpleHtmlParser parser = new SimpleHtmlParser();
    712 
    713         public void parseFile(File fileCanonical, int H2_START, Reader in) throws IOException {
    714             Matcher wsMatcher = WHITESPACE.matcher("");
    715             ++count;
    716             // SimpleHtmlParser parser = new SimpleHtmlParser().setReader(in);
    717             parser.setReader(in);
    718             StringBuilder buffer = new StringBuilder();
    719             StringBuilder content = new StringBuilder();
    720             HeadingInfo heading = new HeadingInfo();
    721             final String fileName = fileCanonical.getName();
    722             HeadingInfoList headingInfoList = new HeadingInfoList(fileName, H2_START);
    723             Stack<ElementLine> elementStack = new Stack<>();
    724             Stack<Pair<String, String>> attributeStack = new Stack<>();
    725             String contentString;
    726             boolean inHeading = false;
    727             boolean inPop = false;
    728             boolean inAnchor = false;
    729             boolean haveContents = false;
    730             HeadingInfo lastHeading = null;
    731             // for detecting missing captions
    732             boolean pushedTable = false;
    733             boolean checkCaption = false;
    734             List<Integer> captionWarnings = new ArrayList<Integer>();
    735 
    736             main: while (true) {
    737                 int lineCount = parser.getLineCount();
    738                 Type x = parser.next(content);
    739                 if (verbose == Verbosity.all && !SUPPRESS.contains(x)) {
    740                     LOG.write(parser.getLineCount() + "\t" + x + ":\t" + content + "");
    741                     //SimpleHtmlParser.writeResult(x, content, LOG);
    742                     LOG.write("\n");
    743                     LOG.flush();
    744                 }
    745                 switch (x) {
    746                 case QUOTE:
    747                     contentString = content.toString().toLowerCase(Locale.ENGLISH).trim();
    748                     if (contentString.equalsIgnoreCase("nocaption")) {
    749                         pushedTable = false;
    750                     }
    751                     break;
    752                 case ATTRIBUTE:
    753                     contentString = content.toString().toLowerCase(Locale.ENGLISH);
    754                     if (inHeading && (contentString.equals("name") || contentString.equals("id"))) {
    755                         inAnchor = true;
    756                     } else {
    757                         inAnchor = false;
    758                     }
    759                     attributeStack.add(new Pair<String, String>(contentString, null));
    760                     break;
    761                 case ATTRIBUTE_CONTENT:
    762                     contentString = content.toString().toLowerCase(Locale.ENGLISH);
    763                     if (inAnchor) {
    764                         heading.addId(content.toString());
    765                     }
    766                     Pair<String, String> lastAttribute = attributeStack.peek();
    767                     if (lastAttribute.getSecond() != null) {
    768                         System.out.println(lineCount + "\tDouble Attribute: " + contentString + ", peek=" + lastAttribute);
    769                     } else {
    770                         lastAttribute.setSecond(contentString);
    771                     }
    772                     break;
    773                 case ELEMENT:
    774                     contentString = content.toString().toLowerCase(Locale.ENGLISH);
    775                     if (inPop) {
    776                         ElementLine peek;
    777                         while (true) {
    778                             peek = elementStack.peek();
    779                             if (!NOPOP.contains(peek.element)) {
    780                                 break;
    781                             }
    782                             elementStack.pop();
    783                         }
    784                         if (!peek.element.equals(contentString)) {
    785                             System.out.println(lineCount
    786                                 + "\tCouldn't pop: " + contentString
    787                                 + ", " + showElementStack(elementStack));
    788                         } else {
    789                             elementStack.pop();
    790                         }
    791                     } else {
    792                         // check that the first element following a table is a caption
    793                         if (pushedTable && !"caption".equals(contentString)) {
    794                             captionWarnings.add(lineCount);
    795                         }
    796                         elementStack.push(new ElementLine(contentString, lineCount));
    797                         pushedTable = checkCaption && "table".equals(contentString);
    798                         if (!checkCaption && "h3".equals(contentString)) { // h3 around Summary in standard format
    799                             checkCaption = true;
    800                         }
    801                     }
    802                     if (verbose != Verbosity.none) {
    803                         LOG.write(parser.getLineCount() + "\telem:\t" + showElementStack(elementStack) + "\n");
    804                         LOG.flush();
    805                     }
    806                     if (FORCEBREAK.contains(contentString)) {
    807                         buffer.append("\n");
    808                     }
    809                     if (DO_CONTENTS.contains(contentString)) {
    810                         if (inPop) {
    811                             if (inHeading) {
    812                                 inHeading = false;
    813                                 if (heading.isContents()) {
    814                                     haveContents = true;
    815                                 } else if (haveContents) {
    816                                     headingInfoList.add(parser.getLineCount(), heading);
    817                                     lastHeading = heading;
    818                                 }
    819                                 heading = new HeadingInfo();
    820                             }
    821                         } else {
    822                             heading.setLevel(contentString, lastHeading);
    823                             inHeading = true;
    824                         }
    825                     }
    826                     break;
    827                 case ELEMENT_START:
    828                     inPop = false;
    829                     break;
    830                 case ELEMENT_END:
    831                     if (verbose == Verbosity.all && !attributeStack.isEmpty()) {
    832                         LOG.write(parser.getLineCount() + "\tattr:\t" + showAttributeStack(attributeStack) + System.lineSeparator());
    833                         LOG.flush();
    834                     }
    835                     attributeStack.clear();
    836                     inPop = false;
    837                     break;
    838                 case ELEMENT_POP:
    839                     inPop = true;
    840                     break;
    841                 case ELEMENT_CONTENT:
    842                     contentString = wsMatcher.reset(content).replaceAll(" ").replace("&nbsp;", " ");
    843                     buffer.append(contentString.indexOf('&') >= 0
    844                         ? TransliteratorUtilities.fromHTML.transform(contentString)
    845                         : contentString);
    846                     if (inHeading) {
    847                         heading.addText(contentString);
    848                     }
    849                     break;
    850                 case DONE:
    851                     break main;
    852                 default:
    853                     break; // skip everything else.
    854                 }
    855             }
    856 
    857             // get DTD elements
    858             Matcher m = ELEMENT_ATTLIST.matcher(buffer);
    859             while (m.find()) {
    860                 dtdItems.put(fileName, m.group(2), m.group(), true);
    861                 //System.out.println(fileName + "\t" + m.group());
    862             }
    863             BreakIterator sentenceBreak = BreakIterator.getSentenceInstance(ULocale.ENGLISH);
    864             String bufferString = normalizeWhitespace(buffer);
    865             sentenceBreak.setText(bufferString);
    866             int last = 0;
    867             while (true) {
    868                 int pos = sentenceBreak.next();
    869                 if (pos == BreakIterator.DONE) {
    870                     break;
    871                 }
    872                 String sentence = bufferString.substring(last, pos).trim();
    873                 last = pos;
    874                 if (sentence.isEmpty()) {
    875                     continue;
    876                 }
    877                 hashedSentences.add(sentence, 1);
    878                 sentences.add(sentence);
    879             }
    880             if (!captionWarnings.isEmpty()) {
    881                 System.out.println("WARNING: Missing <caption> on the following lines: "
    882                     + "\n    " + CollectionUtilities.join(captionWarnings, ", ")
    883                     + "\n\tTo fix, add <caption> after the <table>, such as:"
    884                     + "\n\t\t<table>"
    885                     + "\n\t\t\t<caption>Private Use Codes in CLDR</a></caption>"
    886                     + "\n\tOften the sentence just before the <table> can be made into the caption."
    887                     + "\n\tThe next time you run this program, youll be prompted with double-links."
    888                     + "\n\tIf it really shouldn't have a caption, add <!-- nocaption --> after the <table> instead.");
    889             }
    890             int fatalCount = headingInfoList.showErrors();
    891             totalFatalCount += fatalCount;
    892             totalErrorCount += headingInfoList.totalErrorCount();
    893             if (fatalCount == 0) {
    894                 headingInfoList.listContents();
    895             } else {
    896                 System.out.println("\nFix fatal errors in " + fileCanonical + " before contents can be generated");
    897             }
    898         }
    899 
    900         private String showAttributeStack(Stack<Pair<String, String>> attributeStack) {
    901             StringBuilder result = new StringBuilder();
    902             for (Pair<String, String> s : attributeStack) {
    903                 result.append("[@");
    904                 result.append(s.getFirst());
    905                 final String second = s.getSecond();
    906                 if (second != null) {
    907                     result.append("='");
    908                     result.append(second);
    909                     result.append("'");
    910                 }
    911                 result.append("]");
    912             }
    913             return result.toString();
    914         }
    915 
    916         private String showElementStack(Stack<ElementLine> elementStack) {
    917             StringBuilder result = new StringBuilder();
    918             for (ElementLine s : elementStack) {
    919                 result.append('/').append(s);
    920             }
    921             return result.toString();
    922         }
    923 
    924         /**
    925          * Return string after collapsing multiple whitespace containing '\\n' to '\\n',
    926          * and otherwise 'space'.
    927          * @param input
    928          * @return
    929          */
    930         private String normalizeWhitespace(CharSequence input) {
    931             Matcher m = WHITESPACE.matcher(input);
    932             StringBuilder buffer = new StringBuilder();
    933             int last = 0;
    934             while (m.find()) {
    935                 int start = m.start();
    936                 buffer.append(input.subSequence(last, start));
    937                 last = m.end();
    938                 String whiteString = m.group();
    939                 if (whiteString.indexOf('\n') >= 0) {
    940                     buffer.append('\n');
    941                 } else {
    942                     buffer.append(' ');
    943                 }
    944             }
    945             buffer.append(input.subSequence(last, input.length()));
    946             return buffer.toString().trim();
    947         }
    948 
    949         public long getCount(String sentence) {
    950             return hashedSentences.getCount(sentence);
    951         }
    952 
    953         @Override
    954         public Iterator<String> iterator() {
    955             return sentences.iterator();
    956         }
    957     }
    958 }
    959