Home | History | Annotate | Download | only in rbbi
      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html#License
      3 /*
      4  * Created on May 5, 2004
      5  *
      6  * Copyright (C) 2004-2016 International Business Machines Corporation and others.
      7  * All Rights Reserved.
      8  *
      9  */
     10 package com.ibm.icu.dev.test.rbbi;
     11 
     12 import java.io.IOException;
     13 import java.io.InputStream;
     14 import java.io.InputStreamReader;
     15 import java.util.Arrays;
     16 
     17 import org.junit.Test;
     18 import org.junit.runner.RunWith;
     19 import org.junit.runners.JUnit4;
     20 
     21 import com.ibm.icu.dev.test.TestFmwk;
     22 import com.ibm.icu.impl.Utility;
     23 import com.ibm.icu.lang.UCharacter;
     24 import com.ibm.icu.text.BreakIterator;
     25 import com.ibm.icu.text.RuleBasedBreakIterator;
     26 import com.ibm.icu.util.ULocale;
     27 
     28 
     29 /**
     30  * Rule based break iterator data driven test.
     31  *      Perform the tests from the file rbbitst.txt.
     32  *      The test data file is common to both ICU4C and ICU4J.
     33  *      See the data file for a description of the tests.
     34  *
     35  */
     36 @RunWith(JUnit4.class)
     37 public class RBBITestExtended extends TestFmwk {
     38 public RBBITestExtended() {
     39     }
     40 
     41 
     42 
     43 static class TestParams {
     44     BreakIterator   bi;
     45     StringBuilder   dataToBreak    = new StringBuilder();
     46     int[]           expectedBreaks = new int[4000];
     47     int[]           srcLine        = new int[4000];
     48     int[]           srcCol         = new int[4000];
     49     ULocale         currentLocale  = new ULocale("en_US");
     50 }
     51 
     52 
     53 @Test
     54 public void TestExtended() {
     55     TestParams     tp = new TestParams();
     56 
     57 
     58     //
     59     //  Open and read the test data file.
     60     //
     61     StringBuilder testFileBuf = new StringBuilder();
     62     InputStream is = null;
     63     try {
     64         is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
     65         if (is == null) {
     66             errln("Could not open test data file rbbitst.txt");
     67             return;
     68         }
     69         InputStreamReader isr = new InputStreamReader(is, "UTF-8");
     70         try {
     71             int c;
     72             int count = 0;
     73             for (;;) {
     74                 c = isr.read();
     75                 if (c < 0) {
     76                     break;
     77                 }
     78                 count++;
     79                 if (c == 0xFEFF && count == 1) {
     80                     // BOM in the test data file. Discard it.
     81                     continue;
     82                 }
     83 
     84                 testFileBuf.appendCodePoint(c);
     85             }
     86         } finally {
     87             isr.close();
     88         }
     89     } catch (IOException e) {
     90         errln(e.toString());
     91         try {
     92             is.close();
     93         } catch (IOException ignored) {
     94         }
     95         return;
     96     }
     97 
     98     String testString = testFileBuf.toString();
     99 
    100 
    101     final int  PARSE_COMMENT = 1;
    102     final int  PARSE_TAG     = 2;
    103     final int  PARSE_DATA    = 3;
    104     final int  PARSE_NUM     = 4;
    105     final int  PARSE_RULES   = 5;
    106 
    107     int parseState = PARSE_TAG;
    108 
    109     int savedState = PARSE_TAG;
    110 
    111     int    lineNum  = 1;
    112     int    colStart = 0;
    113     int    column   = 0;
    114     int    charIdx  = 0;
    115     int    i;
    116 
    117     int    tagValue = 0;       // The numeric value of a <nnn> tag.
    118 
    119     StringBuilder   rules = new StringBuilder();     // Holds rules from a <rules> ... </rules> block
    120     int             rulesFirstLine = 0;              // Line number of the start of current <rules> block
    121 
    122     int    len = testString.length();
    123 
    124     for (charIdx = 0; charIdx < len; ) {
    125         int c = testString.codePointAt(charIdx);
    126         charIdx++;
    127         if (c == '\r' && charIdx<len && testString.charAt(charIdx) == '\n') {
    128             // treat CRLF as a unit
    129             c = '\n';
    130             charIdx++;
    131         }
    132         if (c == '\n' || c == '\r') {
    133             lineNum++;
    134             colStart = charIdx;
    135         }
    136         column = charIdx - colStart + 1;
    137 
    138         switch (parseState) {
    139         case PARSE_COMMENT:
    140             if (c == 0x0a || c == 0x0d) {
    141                 parseState = savedState;
    142             }
    143             break;
    144 
    145         case PARSE_TAG:
    146             {
    147             if (c == '#') {
    148                 parseState = PARSE_COMMENT;
    149                 savedState = PARSE_TAG;
    150                 break;
    151             }
    152             if (UCharacter.isWhitespace(c)) {
    153                 break;
    154             }
    155            if (testString.startsWith("<word>", charIdx-1)) {
    156                 tp.bi = BreakIterator.getWordInstance(tp.currentLocale);
    157                 charIdx += 5;
    158                 break;
    159             }
    160             if (testString.startsWith("<char>", charIdx-1)) {
    161                 tp.bi = BreakIterator.getCharacterInstance(tp.currentLocale);
    162                 charIdx += 5;
    163                 break;
    164             }
    165             if (testString.startsWith("<line>", charIdx-1)) {
    166                 tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
    167                 charIdx += 5;
    168                 break;
    169             }
    170             if (testString.startsWith("<sent>", charIdx-1)) {
    171                 tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale);
    172                 charIdx += 5;
    173                 break;
    174             }
    175             if (testString.startsWith("<title>", charIdx-1)) {
    176                 tp.bi = BreakIterator.getTitleInstance(tp.currentLocale);
    177                 charIdx += 6;
    178                 break;
    179             }
    180             if (testString.startsWith("<rules>", charIdx-1) ||
    181                     testString.startsWith("<badrules>", charIdx-1)) {
    182                 charIdx = testString.indexOf('>', charIdx) + 1;
    183                 parseState = PARSE_RULES;
    184                 rules.setLength(0);
    185                 rulesFirstLine = lineNum;
    186                 break;
    187             }
    188 
    189             if (testString.startsWith("<locale ", charIdx-1)) {
    190                 int closeIndex = testString.indexOf(">", charIdx);
    191                 if (closeIndex < 0) {
    192                     errln("line" + lineNum + ": missing close on <locale  tag.");
    193                     break;
    194                 }
    195                 String localeName = testString.substring(charIdx+6, closeIndex);
    196                 localeName = localeName.trim();
    197                 tp.currentLocale = new ULocale(localeName);
    198                 charIdx = closeIndex+1;
    199                 break;
    200             }
    201             if (testString.startsWith("<data>", charIdx-1)) {
    202                 parseState = PARSE_DATA;
    203                 charIdx += 5;
    204                 tp.dataToBreak.setLength(0);
    205                 Arrays.fill(tp.expectedBreaks, 0);
    206                 Arrays.fill(tp.srcCol, 0);
    207                 Arrays.fill(tp.srcLine, 0);
    208                 break;
    209             }
    210 
    211             errln("line" + lineNum + ": Tag expected in test file.");
    212             return;
    213             //parseState = PARSE_COMMENT;
    214             //savedState = PARSE_DATA;
    215             }
    216 
    217         case PARSE_RULES:
    218             if (testString.startsWith("</rules>", charIdx-1)) {
    219                 charIdx += 7;
    220                 parseState = PARSE_TAG;
    221                 try {
    222                     tp.bi = new RuleBasedBreakIterator(rules.toString());
    223                 } catch (IllegalArgumentException e) {
    224                     errln(String.format("rbbitst.txt:%d  Error creating break iterator from rules.  %s", lineNum, e));
    225                 }
    226             } else if (testString.startsWith("</badrules>", charIdx-1)) {
    227                 charIdx += 10;
    228                 parseState = PARSE_TAG;
    229                 boolean goodRules = true;
    230                 try {
    231                     new RuleBasedBreakIterator(rules.toString());
    232                 } catch (IllegalArgumentException e) {
    233                     goodRules = false;
    234                 }
    235                 if (goodRules) {
    236                     errln(String.format(
    237                             "rbbitst.txt:%d  Expected, but did not get, a failure creating break iterator from rules.",
    238                             lineNum));
    239                 }
    240             } else {
    241                 rules.appendCodePoint(c);
    242             }
    243             break;
    244 
    245         case PARSE_DATA:
    246             if (c == '') {
    247                 int  breakIdx = tp.dataToBreak.length();
    248                 tp.expectedBreaks[breakIdx] = -1;
    249                 tp.srcLine[breakIdx]        = lineNum;
    250                 tp.srcCol[breakIdx]         = column;
    251                 break;
    252             }
    253 
    254             if (testString.startsWith("</data>", charIdx-1))  {
    255                 // Add final entry to mappings from break location to source file position.
    256                 //  Need one extra because last break position returned is after the
    257                 //    last char in the data, not at the last char.
    258                 int idx = tp.dataToBreak.length();
    259                 tp.srcLine[idx] = lineNum;
    260                 tp.srcCol[idx]  = column;
    261 
    262                 parseState = PARSE_TAG;
    263                 charIdx += 6;
    264 
    265                 // RUN THE TEST!
    266                 executeTest(tp);
    267                 break;
    268             }
    269 
    270            if (testString.startsWith("\\N{", charIdx-1)) {
    271                int nameEndIdx = testString.indexOf('}', charIdx);
    272                if (nameEndIdx == -1) {
    273                    errln("Error in named character in test file at line " + lineNum +
    274                            ", col " + column);
    275                }
    276                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
    277                 // Get the code point from the name and insert it into the test data.
    278                 String charName = testString.substring(charIdx+2, nameEndIdx);
    279                 c = UCharacter.getCharFromName(charName);
    280                 if (c == -1) {
    281                     errln("Error in named character in test file at line " + lineNum +
    282                             ", col " + column);
    283                 } else {
    284                     // Named code point was recognized.  Insert it
    285                     //   into the test data.
    286                     tp.dataToBreak.appendCodePoint(c);
    287                     for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
    288                         tp.srcLine[i] = lineNum;
    289                         tp.srcCol[i]  = column;
    290                     }
    291 
    292                  }
    293                 if (nameEndIdx > charIdx) {
    294                     charIdx = nameEndIdx+1;
    295                 }
    296                 break;
    297             }
    298 
    299             if (testString.startsWith("<>", charIdx-1)) {
    300                 charIdx++;
    301                 int  breakIdx = tp.dataToBreak.length();
    302                 tp.expectedBreaks[breakIdx] = -1;
    303                 tp.srcLine[breakIdx]        = lineNum;
    304                 tp.srcCol[breakIdx]         = column;
    305                 break;
    306             }
    307 
    308             if (c == '<') {
    309                 tagValue   = 0;
    310                 parseState = PARSE_NUM;
    311                 break;
    312             }
    313 
    314             if (c == '#' && column==3) {   // TODO:  why is column off so far?
    315                 parseState = PARSE_COMMENT;
    316                 savedState = PARSE_DATA;
    317                 break;
    318             }
    319 
    320             if (c == '\\') {
    321                 // Check for \ at end of line, a line continuation.
    322                 //     Advance over (discard) the newline
    323                 int cp = testString.codePointAt(charIdx);
    324                 if (cp == '\r' && charIdx<len && testString.codePointAt(charIdx+1) == '\n') {
    325                     // We have a CR LF
    326                     //  Need an extra increment of the input ptr to move over both of them
    327                     charIdx++;
    328                 }
    329                 if (cp == '\n' || cp == '\r') {
    330                     lineNum++;
    331                     column   = 0;
    332                     charIdx++;
    333                     colStart = charIdx;
    334                     break;
    335                 }
    336 
    337                 // Let unescape handle the back slash.
    338                 int  charIdxAr[] = new int[1];
    339                 charIdxAr[0] = charIdx;
    340                 cp = Utility.unescapeAt(testString, charIdxAr);
    341                 if (cp != -1) {
    342                     // Escape sequence was recognized.  Insert the char
    343                     //   into the test data.
    344                     charIdx = charIdxAr[0];
    345                     tp.dataToBreak.appendCodePoint(cp);
    346                     for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
    347                         tp.srcLine[i] = lineNum;
    348                         tp.srcCol[i]  = column;
    349                     }
    350 
    351                     break;
    352                 }
    353 
    354 
    355                 // Not a recognized backslash escape sequence.
    356                 // Take the next char as a literal.
    357                 //  TODO:  Should this be an error?
    358                 c = testString.codePointAt(charIdx);
    359                 charIdx = testString.offsetByCodePoints(charIdx, 1);
    360              }
    361 
    362             // Normal, non-escaped data char.
    363             tp.dataToBreak.appendCodePoint(c);
    364 
    365             // Save the mapping from offset in the data to line/column numbers in
    366             //   the original input file.  Will be used for better error messages only.
    367             //   If there's an expected break before this char, the slot in the mapping
    368             //     vector will already be set for this char; don't overwrite it.
    369             for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
    370                 tp.srcLine[i] = lineNum;
    371                 tp.srcCol[i]  = column;
    372             }
    373             break;
    374 
    375 
    376         case PARSE_NUM:
    377             // We are parsing an expected numeric tag value, like <1234>,
    378             //   within a chunk of data.
    379             if (UCharacter.isWhitespace(c)) {
    380                 break;
    381             }
    382 
    383             if (c == '>') {
    384                 // Finished the number.  Add the info to the expected break data,
    385                 //   and switch parse state back to doing plain data.
    386                 parseState = PARSE_DATA;
    387                 if (tagValue == 0) {
    388                     tagValue = -1;
    389                 }
    390                 int  breakIdx = tp.dataToBreak.length();
    391                 tp.expectedBreaks[breakIdx] = tagValue;
    392                 tp.srcLine[breakIdx]        = lineNum;
    393                 tp.srcCol[breakIdx]         = column;
    394                 break;
    395             }
    396 
    397             if (UCharacter.isDigit(c)) {
    398                 tagValue = tagValue*10 + UCharacter.digit(c);
    399                 break;
    400             }
    401 
    402             errln(String.format("Syntax Error in rbbitst.txt at line %d, col %d", lineNum, column));
    403             return;
    404         }
    405     }
    406 
    407     // Reached end of test file. Raise an error if parseState indicates that we are
    408     //   within a block that should have been terminated.
    409     if (parseState == PARSE_RULES) {
    410         errln(String.format("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
    411             lineNum, rulesFirstLine));
    412     }
    413     if (parseState == PARSE_DATA) {
    414         errln(String.format("rbbitst.txt:%d <data> block not closed.", lineNum));
    415     }
    416 }
    417 
    418 void executeTest(TestParams t) {
    419     // TODO: also rerun tests with a break iterator re-created from bi.getRules()
    420     //       and from bi.clone(). If in exhaustive mode only.
    421     int    bp;
    422     int    prevBP;
    423     int    i;
    424 
    425     if (t.bi == null) {
    426         return;
    427     }
    428 
    429     t.bi.setText(t.dataToBreak.toString());
    430     //
    431     //  Run the iterator forward
    432     //
    433     prevBP = -1;
    434     for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi.next()) {
    435         if (prevBP ==  bp) {
    436             // Fail for lack of forward progress.
    437             errln("Forward Iteration, no forward progress.  Break Pos=" + bp +
    438                     "  File line,col=" + t.srcLine[bp] + ", " + t.srcCol[bp]);
    439             break;
    440         }
    441 
    442         // Check that there were we didn't miss an expected break between the last one
    443         //  and this one.
    444         for (i=prevBP+1; i<bp; i++) {
    445             if (t.expectedBreaks[i] != 0) {
    446                 errln("Forward Iteration, break expected, but not found.  Pos=" + i +
    447                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
    448             }
    449         }
    450 
    451         // Check that the break we did find was expected
    452         if (t.expectedBreaks[bp] == 0) {
    453             errln("Forward Iteration, break found, but not expected.  Pos=" + bp +
    454                     "  File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
    455         } else {
    456             // The break was expected.
    457             //   Check that the {nnn} tag value is correct.
    458             int expectedTagVal = t.expectedBreaks[bp];
    459             if (expectedTagVal == -1) {
    460                 expectedTagVal = 0;
    461             }
    462             int line = t.srcLine[bp];
    463             int rs = t.bi.getRuleStatus();
    464             if (rs != expectedTagVal) {
    465                 errln("Incorrect status for forward break.  Pos = " + bp +
    466                         ".  File line,col = " + line + ", " + t.srcCol[bp] + "\n" +
    467                       "          Actual, Expected status = " + rs + ", " + expectedTagVal);
    468             }
    469             int[] fillInArray = new int[4];
    470             int numStatusVals = t.bi.getRuleStatusVec(fillInArray);
    471             assertTrue("", numStatusVals >= 1);
    472             assertEquals("", expectedTagVal, fillInArray[0]);
    473         }
    474 
    475 
    476         prevBP = bp;
    477     }
    478 
    479     // Verify that there were no missed expected breaks after the last one found
    480     for (i=prevBP+1; i<t.dataToBreak.length()+1; i++) {
    481         if (t.expectedBreaks[i] != 0) {
    482             errln("Forward Iteration, break expected, but not found.  Pos=" + i +
    483                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
    484        }
    485     }
    486 
    487 
    488     //
    489     //  Run the iterator backwards, verify that the same breaks are found.
    490     //
    491     prevBP = t.dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
    492     for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi.previous()) {
    493         if (prevBP ==  bp) {
    494             // Fail for lack of progress.
    495             errln("Reverse Iteration, no progress.  Break Pos=" + bp +
    496                     "File line,col=" + t.srcLine[bp] + " " +  t.srcCol[bp]);
    497             break;
    498         }
    499 
    500         // Check that we didn't miss an expected break between the last one
    501         //  and this one.  (UVector returns zeros for index out of bounds.)
    502         for (i=prevBP-1; i>bp; i--) {
    503             if (t.expectedBreaks[i] != 0) {
    504                 errln("Reverse Itertion, break expected, but not found.  Pos=" + i +
    505                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
    506             }
    507         }
    508 
    509         // Check that the break we did find was expected
    510         if (t.expectedBreaks[bp] == 0) {
    511             errln("Reverse Itertion, break found, but not expected.  Pos=" + bp +
    512                     "  File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
    513         } else {
    514             // The break was expected.
    515             //   Check that the {nnn} tag value is correct.
    516             int expectedTagVal = t.expectedBreaks[bp];
    517             if (expectedTagVal == -1) {
    518                 expectedTagVal = 0;
    519             }
    520             int line = t.srcLine[bp];
    521             int rs = t.bi.getRuleStatus();
    522             if (rs != expectedTagVal) {
    523                 errln("Incorrect status for reverse break.  Pos = " + bp +
    524                       "  File line,col= " + line + ", " + t.srcCol[bp] + "\n" +
    525                       "          Actual, Expected status = " + rs + ", " + expectedTagVal);
    526             }
    527         }
    528 
    529         prevBP = bp;
    530     }
    531 
    532     // Verify that there were no missed breaks prior to the last one found
    533     for (i=prevBP-1; i>=0; i--) {
    534         if (t.expectedBreaks[i] != 0) {
    535             errln("Reverse Itertion, break expected, but not found.  Pos=" + i +
    536                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
    537          }
    538     }
    539     // Check isBoundary()
    540     for (i=0; i<=t.dataToBreak.length(); i++) {
    541         boolean boundaryExpected = (t.expectedBreaks[i] != 0);
    542         boolean boundaryFound    = t.bi.isBoundary(i);
    543         if (boundaryExpected != boundaryFound) {
    544             errln("isBoundary(" + i + ") incorrect.\n" +
    545                   "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
    546                   "    Expected, Actual= " + boundaryExpected + ", " + boundaryFound);
    547         }
    548     }
    549 
    550     // Check following()
    551     for (i=0; i<=t.dataToBreak.length(); i++) {
    552         int actualBreak = t.bi.following(i);
    553         int expectedBreak = BreakIterator.DONE;
    554         for (int j=i+1; j < t.expectedBreaks.length; j++) {
    555             if (t.expectedBreaks[j] != 0) {
    556                 expectedBreak = j;
    557                 break;
    558             }
    559         }
    560         if (expectedBreak != actualBreak) {
    561             errln("following(" + i + ") incorrect.\n" +
    562                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
    563                     "    Expected, Actual= " + expectedBreak + ", " + actualBreak);
    564         }
    565     }
    566 
    567     // Check preceding()
    568     for (i=t.dataToBreak.length(); i>=0; i--) {
    569         int actualBreak = t.bi.preceding(i);
    570         int expectedBreak = BreakIterator.DONE;
    571 
    572         for (int j=i-1; j >= 0; j--) {
    573             if (t.expectedBreaks[j] != 0) {
    574                 expectedBreak = j;
    575                 break;
    576             }
    577         }
    578         if (expectedBreak != actualBreak) {
    579             errln("preceding(" + i + ") incorrect.\n" +
    580                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
    581                     "    Expected, Actual= " + expectedBreak + ", " + actualBreak);
    582         }
    583     }
    584 
    585 }
    586 
    587 
    588 
    589 
    590 }
    591