Home | History | Annotate | Download | only in rbbi
      1 /* GENERATED SOURCE. DO NOT MODIFY. */
      2 //  2016 and later: Unicode, Inc. and others.
      3 // License & terms of use: http://www.unicode.org/copyright.html#License
      4 /*
      5  * Created on May 5, 2004
      6  *
      7  * Copyright (C) 2004-2016 International Business Machines Corporation and others.
      8  * All Rights Reserved.
      9  *
     10  */
     11 package android.icu.dev.test.rbbi;
     12 
     13 import java.io.IOException;
     14 import java.io.InputStream;
     15 import java.io.InputStreamReader;
     16 import java.util.Arrays;
     17 
     18 import org.junit.Test;
     19 import org.junit.runner.RunWith;
     20 import org.junit.runners.JUnit4;
     21 
     22 import android.icu.dev.test.TestFmwk;
     23 import android.icu.impl.Utility;
     24 import android.icu.lang.UCharacter;
     25 import android.icu.text.BreakIterator;
     26 import android.icu.text.RuleBasedBreakIterator;
     27 import android.icu.util.ULocale;
     28 import android.icu.testsharding.MainTestShard;
     29 
     30 
     31 /**
     32  * Rule based break iterator data driven test.
     33  *      Perform the tests from the file rbbitst.txt.
     34  *      The test data file is common to both ICU4C and ICU4J.
     35  *      See the data file for a description of the tests.
     36  *
     37  */
     38 @MainTestShard
     39 @RunWith(JUnit4.class)
     40 public class RBBITestExtended extends TestFmwk {
     41 public RBBITestExtended() {
     42     }
     43 
     44 
     45 
     46 static class TestParams {
     47     BreakIterator   bi;
     48     StringBuilder   dataToBreak    = new StringBuilder();
     49     int[]           expectedBreaks = new int[4000];
     50     int[]           srcLine        = new int[4000];
     51     int[]           srcCol         = new int[4000];
     52     ULocale         currentLocale  = new ULocale("en_US");
     53 }
     54 
     55 
     56 @Test
     57 public void TestExtended() {
     58     TestParams     tp = new TestParams();
     59 
     60 
     61     //
     62     //  Open and read the test data file.
     63     //
     64     StringBuilder testFileBuf = new StringBuilder();
     65     InputStream is = null;
     66     try {
     67         is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
     68         if (is == null) {
     69             errln("Could not open test data file rbbitst.txt");
     70             return;
     71         }
     72         InputStreamReader isr = new InputStreamReader(is, "UTF-8");
     73         try {
     74             int c;
     75             int count = 0;
     76             for (;;) {
     77                 c = isr.read();
     78                 if (c < 0) {
     79                     break;
     80                 }
     81                 count++;
     82                 if (c == 0xFEFF && count == 1) {
     83                     // BOM in the test data file. Discard it.
     84                     continue;
     85                 }
     86 
     87                 testFileBuf.appendCodePoint(c);
     88             }
     89         } finally {
     90             isr.close();
     91         }
     92     } catch (IOException e) {
     93         errln(e.toString());
     94         try {
     95             is.close();
     96         } catch (IOException ignored) {
     97         }
     98         return;
     99     }
    100 
    101     String testString = testFileBuf.toString();
    102 
    103 
    104     final int  PARSE_COMMENT = 1;
    105     final int  PARSE_TAG     = 2;
    106     final int  PARSE_DATA    = 3;
    107     final int  PARSE_NUM     = 4;
    108     final int  PARSE_RULES   = 5;
    109 
    110     int parseState = PARSE_TAG;
    111 
    112     int savedState = PARSE_TAG;
    113 
    114     int    lineNum  = 1;
    115     int    colStart = 0;
    116     int    column   = 0;
    117     int    charIdx  = 0;
    118     int    i;
    119 
    120     int    tagValue = 0;       // The numeric value of a <nnn> tag.
    121 
    122     StringBuilder   rules = new StringBuilder();     // Holds rules from a <rules> ... </rules> block
    123     int             rulesFirstLine = 0;              // Line number of the start of current <rules> block
    124 
    125     int    len = testString.length();
    126 
    127     for (charIdx = 0; charIdx < len; ) {
    128         int c = testString.codePointAt(charIdx);
    129         charIdx++;
    130         if (c == '\r' && charIdx<len && testString.charAt(charIdx) == '\n') {
    131             // treat CRLF as a unit
    132             c = '\n';
    133             charIdx++;
    134         }
    135         if (c == '\n' || c == '\r') {
    136             lineNum++;
    137             colStart = charIdx;
    138         }
    139         column = charIdx - colStart + 1;
    140 
    141         switch (parseState) {
    142         case PARSE_COMMENT:
    143             if (c == 0x0a || c == 0x0d) {
    144                 parseState = savedState;
    145             }
    146             break;
    147 
    148         case PARSE_TAG:
    149             {
    150             if (c == '#') {
    151                 parseState = PARSE_COMMENT;
    152                 savedState = PARSE_TAG;
    153                 break;
    154             }
    155             if (UCharacter.isWhitespace(c)) {
    156                 break;
    157             }
    158            if (testString.startsWith("<word>", charIdx-1)) {
    159                 tp.bi = BreakIterator.getWordInstance(tp.currentLocale);
    160                 charIdx += 5;
    161                 break;
    162             }
    163             if (testString.startsWith("<char>", charIdx-1)) {
    164                 tp.bi = BreakIterator.getCharacterInstance(tp.currentLocale);
    165                 charIdx += 5;
    166                 break;
    167             }
    168             if (testString.startsWith("<line>", charIdx-1)) {
    169                 tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
    170                 charIdx += 5;
    171                 break;
    172             }
    173             if (testString.startsWith("<sent>", charIdx-1)) {
    174                 tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale);
    175                 charIdx += 5;
    176                 break;
    177             }
    178             if (testString.startsWith("<title>", charIdx-1)) {
    179                 tp.bi = BreakIterator.getTitleInstance(tp.currentLocale);
    180                 charIdx += 6;
    181                 break;
    182             }
    183             if (testString.startsWith("<rules>", charIdx-1) ||
    184                     testString.startsWith("<badrules>", charIdx-1)) {
    185                 charIdx = testString.indexOf('>', charIdx) + 1;
    186                 parseState = PARSE_RULES;
    187                 rules.setLength(0);
    188                 rulesFirstLine = lineNum;
    189                 break;
    190             }
    191 
    192             if (testString.startsWith("<locale ", charIdx-1)) {
    193                 int closeIndex = testString.indexOf(">", charIdx);
    194                 if (closeIndex < 0) {
    195                     errln("line" + lineNum + ": missing close on <locale  tag.");
    196                     break;
    197                 }
    198                 String localeName = testString.substring(charIdx+6, closeIndex);
    199                 localeName = localeName.trim();
    200                 tp.currentLocale = new ULocale(localeName);
    201                 charIdx = closeIndex+1;
    202                 break;
    203             }
    204             if (testString.startsWith("<data>", charIdx-1)) {
    205                 parseState = PARSE_DATA;
    206                 charIdx += 5;
    207                 tp.dataToBreak.setLength(0);
    208                 Arrays.fill(tp.expectedBreaks, 0);
    209                 Arrays.fill(tp.srcCol, 0);
    210                 Arrays.fill(tp.srcLine, 0);
    211                 break;
    212             }
    213 
    214             errln("line" + lineNum + ": Tag expected in test file.");
    215             return;
    216             //parseState = PARSE_COMMENT;
    217             //savedState = PARSE_DATA;
    218             }
    219 
    220         case PARSE_RULES:
    221             if (testString.startsWith("</rules>", charIdx-1)) {
    222                 charIdx += 7;
    223                 parseState = PARSE_TAG;
    224                 try {
    225                     tp.bi = new RuleBasedBreakIterator(rules.toString());
    226                 } catch (IllegalArgumentException e) {
    227                     errln(String.format("rbbitst.txt:%d  Error creating break iterator from rules.  %s", lineNum, e));
    228                 }
    229             } else if (testString.startsWith("</badrules>", charIdx-1)) {
    230                 charIdx += 10;
    231                 parseState = PARSE_TAG;
    232                 boolean goodRules = true;
    233                 try {
    234                     new RuleBasedBreakIterator(rules.toString());
    235                 } catch (IllegalArgumentException e) {
    236                     goodRules = false;
    237                 }
    238                 if (goodRules) {
    239                     errln(String.format(
    240                             "rbbitst.txt:%d  Expected, but did not get, a failure creating break iterator from rules.",
    241                             lineNum));
    242                 }
    243             } else {
    244                 rules.appendCodePoint(c);
    245             }
    246             break;
    247 
    248         case PARSE_DATA:
    249             if (c == '') {
    250                 int  breakIdx = tp.dataToBreak.length();
    251                 tp.expectedBreaks[breakIdx] = -1;
    252                 tp.srcLine[breakIdx]        = lineNum;
    253                 tp.srcCol[breakIdx]         = column;
    254                 break;
    255             }
    256 
    257             if (testString.startsWith("</data>", charIdx-1))  {
    258                 // Add final entry to mappings from break location to source file position.
    259                 //  Need one extra because last break position returned is after the
    260                 //    last char in the data, not at the last char.
    261                 int idx = tp.dataToBreak.length();
    262                 tp.srcLine[idx] = lineNum;
    263                 tp.srcCol[idx]  = column;
    264 
    265                 parseState = PARSE_TAG;
    266                 charIdx += 6;
    267 
    268                 // RUN THE TEST!
    269                 executeTest(tp);
    270                 break;
    271             }
    272 
    273            if (testString.startsWith("\\N{", charIdx-1)) {
    274                int nameEndIdx = testString.indexOf('}', charIdx);
    275                if (nameEndIdx == -1) {
    276                    errln("Error in named character in test file at line " + lineNum +
    277                            ", col " + column);
    278                }
    279                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
    280                 // Get the code point from the name and insert it into the test data.
    281                 String charName = testString.substring(charIdx+2, nameEndIdx);
    282                 c = UCharacter.getCharFromName(charName);
    283                 if (c == -1) {
    284                     errln("Error in named character in test file at line " + lineNum +
    285                             ", col " + column);
    286                 } else {
    287                     // Named code point was recognized.  Insert it
    288                     //   into the test data.
    289                     tp.dataToBreak.appendCodePoint(c);
    290                     for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
    291                         tp.srcLine[i] = lineNum;
    292                         tp.srcCol[i]  = column;
    293                     }
    294 
    295                  }
    296                 if (nameEndIdx > charIdx) {
    297                     charIdx = nameEndIdx+1;
    298                 }
    299                 break;
    300             }
    301 
    302             if (testString.startsWith("<>", charIdx-1)) {
    303                 charIdx++;
    304                 int  breakIdx = tp.dataToBreak.length();
    305                 tp.expectedBreaks[breakIdx] = -1;
    306                 tp.srcLine[breakIdx]        = lineNum;
    307                 tp.srcCol[breakIdx]         = column;
    308                 break;
    309             }
    310 
    311             if (c == '<') {
    312                 tagValue   = 0;
    313                 parseState = PARSE_NUM;
    314                 break;
    315             }
    316 
    317             if (c == '#' && column==3) {   // TODO:  why is column off so far?
    318                 parseState = PARSE_COMMENT;
    319                 savedState = PARSE_DATA;
    320                 break;
    321             }
    322 
    323             if (c == '\\') {
    324                 // Check for \ at end of line, a line continuation.
    325                 //     Advance over (discard) the newline
    326                 int cp = testString.codePointAt(charIdx);
    327                 if (cp == '\r' && charIdx<len && testString.codePointAt(charIdx+1) == '\n') {
    328                     // We have a CR LF
    329                     //  Need an extra increment of the input ptr to move over both of them
    330                     charIdx++;
    331                 }
    332                 if (cp == '\n' || cp == '\r') {
    333                     lineNum++;
    334                     column   = 0;
    335                     charIdx++;
    336                     colStart = charIdx;
    337                     break;
    338                 }
    339 
    340                 // Let unescape handle the back slash.
    341                 int  charIdxAr[] = new int[1];
    342                 charIdxAr[0] = charIdx;
    343                 cp = Utility.unescapeAt(testString, charIdxAr);
    344                 if (cp != -1) {
    345                     // Escape sequence was recognized.  Insert the char
    346                     //   into the test data.
    347                     charIdx = charIdxAr[0];
    348                     tp.dataToBreak.appendCodePoint(cp);
    349                     for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
    350                         tp.srcLine[i] = lineNum;
    351                         tp.srcCol[i]  = column;
    352                     }
    353 
    354                     break;
    355                 }
    356 
    357 
    358                 // Not a recognized backslash escape sequence.
    359                 // Take the next char as a literal.
    360                 //  TODO:  Should this be an error?
    361                 c = testString.codePointAt(charIdx);
    362                 charIdx = testString.offsetByCodePoints(charIdx, 1);
    363              }
    364 
    365             // Normal, non-escaped data char.
    366             tp.dataToBreak.appendCodePoint(c);
    367 
    368             // Save the mapping from offset in the data to line/column numbers in
    369             //   the original input file.  Will be used for better error messages only.
    370             //   If there's an expected break before this char, the slot in the mapping
    371             //     vector will already be set for this char; don't overwrite it.
    372             for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
    373                 tp.srcLine[i] = lineNum;
    374                 tp.srcCol[i]  = column;
    375             }
    376             break;
    377 
    378 
    379         case PARSE_NUM:
    380             // We are parsing an expected numeric tag value, like <1234>,
    381             //   within a chunk of data.
    382             if (UCharacter.isWhitespace(c)) {
    383                 break;
    384             }
    385 
    386             if (c == '>') {
    387                 // Finished the number.  Add the info to the expected break data,
    388                 //   and switch parse state back to doing plain data.
    389                 parseState = PARSE_DATA;
    390                 if (tagValue == 0) {
    391                     tagValue = -1;
    392                 }
    393                 int  breakIdx = tp.dataToBreak.length();
    394                 tp.expectedBreaks[breakIdx] = tagValue;
    395                 tp.srcLine[breakIdx]        = lineNum;
    396                 tp.srcCol[breakIdx]         = column;
    397                 break;
    398             }
    399 
    400             if (UCharacter.isDigit(c)) {
    401                 tagValue = tagValue*10 + UCharacter.digit(c);
    402                 break;
    403             }
    404 
    405             errln(String.format("Syntax Error in rbbitst.txt at line %d, col %d", lineNum, column));
    406             return;
    407         }
    408     }
    409 
    410     // Reached end of test file. Raise an error if parseState indicates that we are
    411     //   within a block that should have been terminated.
    412     if (parseState == PARSE_RULES) {
    413         errln(String.format("rbbitst.txt:%d <rules> block beginning at line %d is not closed.",
    414             lineNum, rulesFirstLine));
    415     }
    416     if (parseState == PARSE_DATA) {
    417         errln(String.format("rbbitst.txt:%d <data> block not closed.", lineNum));
    418     }
    419 }
    420 
    421 void executeTest(TestParams t) {
    422     // TODO: also rerun tests with a break iterator re-created from bi.getRules()
    423     //       and from bi.clone(). If in exhaustive mode only.
    424     int    bp;
    425     int    prevBP;
    426     int    i;
    427 
    428     if (t.bi == null) {
    429         return;
    430     }
    431 
    432     t.bi.setText(t.dataToBreak.toString());
    433     //
    434     //  Run the iterator forward
    435     //
    436     prevBP = -1;
    437     for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi.next()) {
    438         if (prevBP ==  bp) {
    439             // Fail for lack of forward progress.
    440             errln("Forward Iteration, no forward progress.  Break Pos=" + bp +
    441                     "  File line,col=" + t.srcLine[bp] + ", " + t.srcCol[bp]);
    442             break;
    443         }
    444 
    445         // Check that there were we didn't miss an expected break between the last one
    446         //  and this one.
    447         for (i=prevBP+1; i<bp; i++) {
    448             if (t.expectedBreaks[i] != 0) {
    449                 errln("Forward Iteration, break expected, but not found.  Pos=" + i +
    450                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
    451             }
    452         }
    453 
    454         // Check that the break we did find was expected
    455         if (t.expectedBreaks[bp] == 0) {
    456             errln("Forward Iteration, break found, but not expected.  Pos=" + bp +
    457                     "  File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
    458         } else {
    459             // The break was expected.
    460             //   Check that the {nnn} tag value is correct.
    461             int expectedTagVal = t.expectedBreaks[bp];
    462             if (expectedTagVal == -1) {
    463                 expectedTagVal = 0;
    464             }
    465             int line = t.srcLine[bp];
    466             int rs = t.bi.getRuleStatus();
    467             if (rs != expectedTagVal) {
    468                 errln("Incorrect status for forward break.  Pos = " + bp +
    469                         ".  File line,col = " + line + ", " + t.srcCol[bp] + "\n" +
    470                       "          Actual, Expected status = " + rs + ", " + expectedTagVal);
    471             }
    472             int[] fillInArray = new int[4];
    473             int numStatusVals = t.bi.getRuleStatusVec(fillInArray);
    474             assertTrue("", numStatusVals >= 1);
    475             assertEquals("", expectedTagVal, fillInArray[0]);
    476         }
    477 
    478 
    479         prevBP = bp;
    480     }
    481 
    482     // Verify that there were no missed expected breaks after the last one found
    483     for (i=prevBP+1; i<t.dataToBreak.length()+1; i++) {
    484         if (t.expectedBreaks[i] != 0) {
    485             errln("Forward Iteration, break expected, but not found.  Pos=" + i +
    486                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
    487        }
    488     }
    489 
    490 
    491     //
    492     //  Run the iterator backwards, verify that the same breaks are found.
    493     //
    494     prevBP = t.dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
    495     for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi.previous()) {
    496         if (prevBP ==  bp) {
    497             // Fail for lack of progress.
    498             errln("Reverse Iteration, no progress.  Break Pos=" + bp +
    499                     "File line,col=" + t.srcLine[bp] + " " +  t.srcCol[bp]);
    500             break;
    501         }
    502 
    503         // Check that we didn't miss an expected break between the last one
    504         //  and this one.  (UVector returns zeros for index out of bounds.)
    505         for (i=prevBP-1; i>bp; i--) {
    506             if (t.expectedBreaks[i] != 0) {
    507                 errln("Reverse Itertion, break expected, but not found.  Pos=" + i +
    508                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
    509             }
    510         }
    511 
    512         // Check that the break we did find was expected
    513         if (t.expectedBreaks[bp] == 0) {
    514             errln("Reverse Itertion, break found, but not expected.  Pos=" + bp +
    515                     "  File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
    516         } else {
    517             // The break was expected.
    518             //   Check that the {nnn} tag value is correct.
    519             int expectedTagVal = t.expectedBreaks[bp];
    520             if (expectedTagVal == -1) {
    521                 expectedTagVal = 0;
    522             }
    523             int line = t.srcLine[bp];
    524             int rs = t.bi.getRuleStatus();
    525             if (rs != expectedTagVal) {
    526                 errln("Incorrect status for reverse break.  Pos = " + bp +
    527                       "  File line,col= " + line + ", " + t.srcCol[bp] + "\n" +
    528                       "          Actual, Expected status = " + rs + ", " + expectedTagVal);
    529             }
    530         }
    531 
    532         prevBP = bp;
    533     }
    534 
    535     // Verify that there were no missed breaks prior to the last one found
    536     for (i=prevBP-1; i>=0; i--) {
    537         if (t.expectedBreaks[i] != 0) {
    538             errln("Reverse Itertion, break expected, but not found.  Pos=" + i +
    539                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
    540          }
    541     }
    542     // Check isBoundary()
    543     for (i=0; i<=t.dataToBreak.length(); i++) {
    544         boolean boundaryExpected = (t.expectedBreaks[i] != 0);
    545         boolean boundaryFound    = t.bi.isBoundary(i);
    546         if (boundaryExpected != boundaryFound) {
    547             errln("isBoundary(" + i + ") incorrect.\n" +
    548                   "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
    549                   "    Expected, Actual= " + boundaryExpected + ", " + boundaryFound);
    550         }
    551     }
    552 
    553     // Check following()
    554     for (i=0; i<=t.dataToBreak.length(); i++) {
    555         int actualBreak = t.bi.following(i);
    556         int expectedBreak = BreakIterator.DONE;
    557         for (int j=i+1; j < t.expectedBreaks.length; j++) {
    558             if (t.expectedBreaks[j] != 0) {
    559                 expectedBreak = j;
    560                 break;
    561             }
    562         }
    563         if (expectedBreak != actualBreak) {
    564             errln("following(" + i + ") incorrect.\n" +
    565                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
    566                     "    Expected, Actual= " + expectedBreak + ", " + actualBreak);
    567         }
    568     }
    569 
    570     // Check preceding()
    571     for (i=t.dataToBreak.length(); i>=0; i--) {
    572         int actualBreak = t.bi.preceding(i);
    573         int expectedBreak = BreakIterator.DONE;
    574 
    575         for (int j=i-1; j >= 0; j--) {
    576             if (t.expectedBreaks[j] != 0) {
    577                 expectedBreak = j;
    578                 break;
    579             }
    580         }
    581         if (expectedBreak != actualBreak) {
    582             errln("preceding(" + i + ") incorrect.\n" +
    583                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
    584                     "    Expected, Actual= " + expectedBreak + ", " + actualBreak);
    585         }
    586     }
    587 
    588 }
    589 
    590 
    591 
    592 
    593 }
    594