Home | History | Annotate | Download | only in rbbi
      1 /*
      2  * Created on May 5, 2004
      3  *
      4  * Copyright (C) 2004-2015 International Business Machines Corporation and others.
      5  * All Rights Reserved.
      6  *
      7  */
      8 package com.ibm.icu.dev.test.rbbi;
      9 
     10 import java.io.IOException;
     11 import java.io.InputStream;
     12 import java.io.InputStreamReader;
     13 import java.util.Arrays;
     14 
     15 import com.ibm.icu.dev.test.TestFmwk;
     16 import com.ibm.icu.impl.Utility;
     17 import com.ibm.icu.lang.UCharacter;
     18 import com.ibm.icu.text.BreakIterator;
     19 import com.ibm.icu.text.UTF16;
     20 import com.ibm.icu.util.ULocale;
     21 
     22 
     23 /**
     24  * Rule based break iterator data driven test.
     25  *      Perform the tests from the file rbbitst.txt.
     26  *      The test data file is common to both ICU4C and ICU4J.
     27  *      See the data file for a description of the tests.
     28  *
     29  */
     30 public class RBBITestExtended extends TestFmwk {
     31 
     32     public static void main(String[] args)throws Exception {
     33         new RBBITestExtended().run(args);
     34     }
     35 
     36 
     37 public RBBITestExtended() {
     38     }
     39 
     40 
     41 
     42 static class TestParams {
     43     BreakIterator   bi;
     44     StringBuffer    dataToBreak    = new StringBuffer();
     45     int[]           expectedBreaks = new int[1000];
     46     int[]           srcLine        = new int[1000];
     47     int[]           srcCol         = new int[1000];
     48     ULocale         currentLocale  = new ULocale("en_US");
     49 }
     50 
     51 
     52 public void TestExtended() {
     53     TestParams     tp = new TestParams();
     54 
     55 
     56     //
     57     //  Open and read the test data file.
     58     //
     59     StringBuffer testFileBuf = new StringBuffer();
     60     InputStream is = null;
     61     try {
     62         is = RBBITestExtended.class.getResourceAsStream("rbbitst.txt");
     63         if (is == null) {
     64             errln("Could not open test data file rbbitst.txt");
     65             return;
     66         }
     67         InputStreamReader isr = new InputStreamReader(is, "UTF-8");
     68         try {
     69             int c;
     70             int count = 0;
     71             for (;;) {
     72                 c = isr.read();
     73                 if (c < 0) {
     74                     break;
     75                 }
     76                 count++;
     77                 if (c == 0xFEFF && count == 1) {
     78                     // BOM in the test data file. Discard it.
     79                     continue;
     80                 }
     81 
     82                 UTF16.append(testFileBuf, c);
     83             }
     84         } finally {
     85             isr.close();
     86         }
     87     } catch (IOException e) {
     88         errln(e.toString());
     89         try {
     90             is.close();
     91         } catch (IOException ignored) {
     92         }
     93         return;
     94     }
     95 
     96     String testString = testFileBuf.toString();
     97 
     98 
     99     final int  PARSE_COMMENT = 1;
    100     final int  PARSE_TAG     = 2;
    101     final int  PARSE_DATA    = 3;
    102     final int  PARSE_NUM     = 4;
    103 
    104     int parseState = PARSE_TAG;
    105 
    106     int savedState = PARSE_TAG;
    107 
    108     final char CH_LF        = 0x0a;
    109     final char CH_CR        = 0x0d;
    110     final char CH_HASH      = 0x23;
    111     /*static const UChar CH_PERIOD    = 0x2e;*/
    112     final char CH_LT        = 0x3c;
    113     final char CH_GT        = 0x3e;
    114     final char CH_BACKSLASH = 0x5c;
    115     final char CH_BULLET    = 0x2022;
    116 
    117     int    lineNum  = 1;
    118     int    colStart = 0;
    119     int    column   = 0;
    120     int    charIdx  = 0;
    121     int    i;
    122 
    123     int    tagValue = 0;       // The numeric value of a <nnn> tag.
    124     int    len = testString.length();
    125 
    126     for (charIdx = 0; charIdx < len; ) {
    127         int  c = UTF16.charAt(testString, charIdx);
    128         charIdx++;
    129         if (c == CH_CR && charIdx<len && testString.charAt(charIdx) == CH_LF) {
    130             // treat CRLF as a unit
    131             c = CH_LF;
    132             charIdx++;
    133         }
    134         if (c == CH_LF || c == CH_CR) {
    135             lineNum++;
    136             colStart = charIdx;
    137         }
    138         column = charIdx - colStart + 1;
    139 
    140         switch (parseState) {
    141         case PARSE_COMMENT:
    142             if (c == 0x0a || c == 0x0d) {
    143                 parseState = savedState;
    144             }
    145             break;
    146 
    147         case PARSE_TAG:
    148             {
    149             if (c == CH_HASH) {
    150                 parseState = PARSE_COMMENT;
    151                 savedState = PARSE_TAG;
    152                 break;
    153             }
    154             if (UCharacter.isWhitespace(c)) {
    155                 break;
    156             }
    157            if (testString.startsWith("<word>", charIdx-1)) {
    158                 tp.bi = BreakIterator.getWordInstance(tp.currentLocale);
    159                 charIdx += 5;
    160                 break;
    161             }
    162             if (testString.startsWith("<char>", charIdx-1)) {
    163                 tp.bi = BreakIterator.getCharacterInstance(tp.currentLocale);
    164                 charIdx += 5;
    165                 break;
    166             }
    167             if (testString.startsWith("<line>", charIdx-1)) {
    168                 tp.bi = BreakIterator.getLineInstance(tp.currentLocale);
    169                 charIdx += 5;
    170                 break;
    171             }
    172             if (testString.startsWith("<sent>", charIdx-1)) {
    173                 tp.bi = BreakIterator.getSentenceInstance(tp.currentLocale);
    174                 charIdx += 5;
    175                 break;
    176             }
    177             if (testString.startsWith("<title>", charIdx-1)) {
    178                 tp.bi = BreakIterator.getTitleInstance(tp.currentLocale);
    179                 charIdx += 6;
    180                 break;
    181             }
    182             if (testString.startsWith("<locale ", charIdx-1)) {
    183                 int closeIndex = testString.indexOf(">", charIdx);
    184                 if (closeIndex < 0) {
    185                     errln("line" + lineNum + ": missing close on <locale  tag.");
    186                     break;
    187                 }
    188                 String localeName = testString.substring(charIdx+6, closeIndex);
    189                 localeName = localeName.trim();
    190                 tp.currentLocale = new ULocale(localeName);
    191                 charIdx = closeIndex+1;
    192                 break;
    193             }
    194             if (testString.startsWith("<data>", charIdx-1)) {
    195                 parseState = PARSE_DATA;
    196                 charIdx += 5;
    197                 tp.dataToBreak.setLength(0);
    198                 Arrays.fill(tp.expectedBreaks, 0);
    199                 Arrays.fill(tp.srcCol, 0);
    200                 Arrays.fill(tp.srcLine, 0);
    201                 break;
    202             }
    203 
    204             errln("line" + lineNum + ": Tag expected in test file.");
    205             return;
    206             //parseState = PARSE_COMMENT;
    207             //savedState = PARSE_DATA;
    208             }
    209 
    210         case PARSE_DATA:
    211             if (c == CH_BULLET) {
    212                 int  breakIdx = tp.dataToBreak.length();
    213                 tp.expectedBreaks[breakIdx] = -1;
    214                 tp.srcLine[breakIdx]        = lineNum;
    215                 tp.srcCol[breakIdx]         = column;
    216                 break;
    217             }
    218 
    219             if (testString.startsWith("</data>", charIdx-1))  {
    220                 // Add final entry to mappings from break location to source file position.
    221                 //  Need one extra because last break position returned is after the
    222                 //    last char in the data, not at the last char.
    223                 int idx = tp.dataToBreak.length();
    224                 tp.srcLine[idx] = lineNum;
    225                 tp.srcCol[idx]  = column;
    226 
    227                 parseState = PARSE_TAG;
    228                 charIdx += 6;
    229 
    230                 // RUN THE TEST!
    231                 executeTest(tp);
    232                 break;
    233             }
    234 
    235            if (testString.startsWith("\\N{", charIdx-1)) {
    236                int nameEndIdx = testString.indexOf('}', charIdx);
    237                if (nameEndIdx == -1) {
    238                    errln("Error in named character in test file at line " + lineNum +
    239                            ", col " + column);
    240                }
    241                 // Named character, e.g. \N{COMBINING GRAVE ACCENT}
    242                 // Get the code point from the name and insert it into the test data.
    243                 String charName = testString.substring(charIdx+2, nameEndIdx);
    244                 c = UCharacter.getCharFromName(charName);
    245                 if (c == -1) {
    246                     errln("Error in named character in test file at line " + lineNum +
    247                             ", col " + column);
    248                 } else {
    249                     // Named code point was recognized.  Insert it
    250                     //   into the test data.
    251                     UTF16.append(tp.dataToBreak, c);
    252                     for (i = tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
    253                         tp.srcLine[i] = lineNum;
    254                         tp.srcCol[i]  = column;
    255                     }
    256 
    257                  }
    258                 if (nameEndIdx > charIdx) {
    259                     charIdx = nameEndIdx+1;
    260                 }
    261                 break;
    262             }
    263 
    264             if (testString.startsWith("<>", charIdx-1)) {
    265                 charIdx++;
    266                 int  breakIdx = tp.dataToBreak.length();
    267                 tp.expectedBreaks[breakIdx] = -1;
    268                 tp.srcLine[breakIdx]        = lineNum;
    269                 tp.srcCol[breakIdx]         = column;
    270                 break;
    271             }
    272 
    273             if (c == CH_LT) {
    274                 tagValue   = 0;
    275                 parseState = PARSE_NUM;
    276                 break;
    277             }
    278 
    279             if (c == CH_HASH && column==3) {   // TODO:  why is column off so far?
    280                 parseState = PARSE_COMMENT;
    281                 savedState = PARSE_DATA;
    282                 break;
    283             }
    284 
    285             if (c == CH_BACKSLASH) {
    286                 // Check for \ at end of line, a line continuation.
    287                 //     Advance over (discard) the newline
    288                 int cp = UTF16.charAt(testString, charIdx);
    289                 if (cp == CH_CR && charIdx<len && UTF16.charAt(testString, charIdx+1) == CH_LF) {
    290                     // We have a CR LF
    291                     //  Need an extra increment of the input ptr to move over both of them
    292                     charIdx++;
    293                 }
    294                 if (cp == CH_LF || cp == CH_CR) {
    295                     lineNum++;
    296                     column   = 0;
    297                     charIdx++;
    298                     colStart = charIdx;
    299                     break;
    300                 }
    301 
    302                 // Let unescape handle the back slash.
    303                 int  charIdxAr[] = new int[1];
    304                 charIdxAr[0] = charIdx;
    305                 cp = Utility.unescapeAt(testString, charIdxAr);
    306                 if (cp != -1) {
    307                     // Escape sequence was recognized.  Insert the char
    308                     //   into the test data.
    309                     charIdx = charIdxAr[0];
    310                     UTF16.append(tp.dataToBreak, cp);
    311                     for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
    312                         tp.srcLine[i] = lineNum;
    313                         tp.srcCol[i]  = column;
    314                     }
    315 
    316                     break;
    317                 }
    318 
    319 
    320                 // Not a recognized backslash escape sequence.
    321                 // Take the next char as a literal.
    322                 //  TODO:  Should this be an error?
    323                 c = UTF16.charAt(testString,charIdx);
    324                 charIdx = UTF16.moveCodePointOffset(testString, charIdx, 1);
    325              }
    326 
    327             // Normal, non-escaped data char.
    328             UTF16.append(tp.dataToBreak, c);
    329 
    330             // Save the mapping from offset in the data to line/column numbers in
    331             //   the original input file.  Will be used for better error messages only.
    332             //   If there's an expected break before this char, the slot in the mapping
    333             //     vector will already be set for this char; don't overwrite it.
    334             for (i=tp.dataToBreak.length()-1; i>=0 && tp.srcLine[i]==0; i--) {
    335                 tp.srcLine[i] = lineNum;
    336                 tp.srcCol[i]  = column;
    337             }
    338             break;
    339 
    340 
    341         case PARSE_NUM:
    342             // We are parsing an expected numeric tag value, like <1234>,
    343             //   within a chunk of data.
    344             if (UCharacter.isWhitespace(c)) {
    345                 break;
    346             }
    347 
    348             if (c == CH_GT) {
    349                 // Finished the number.  Add the info to the expected break data,
    350                 //   and switch parse state back to doing plain data.
    351                 parseState = PARSE_DATA;
    352                 if (tagValue == 0) {
    353                     tagValue = -1;
    354                 }
    355                 int  breakIdx = tp.dataToBreak.length();
    356                 tp.expectedBreaks[breakIdx] = tagValue;
    357                 tp.srcLine[breakIdx]        = lineNum;
    358                 tp.srcCol[breakIdx]         = column;
    359                 break;
    360             }
    361 
    362             if (UCharacter.isDigit(c)) {
    363                 tagValue = tagValue*10 + UCharacter.digit(c);
    364                 break;
    365             }
    366 
    367             errln("Syntax Error in test file at line "+ lineNum +", col %d" + column);
    368             return;
    369 
    370             // parseState = PARSE_COMMENT;   // TODO: unreachable.  Don't stop on errors.
    371             // break;
    372         }
    373 
    374 
    375 
    376     }
    377 }
    378 
    379 void executeTest(TestParams t) {
    380     int    bp;
    381     int    prevBP;
    382     int    i;
    383 
    384     if (t.bi == null) {
    385         return;
    386     }
    387 
    388     t.bi.setText(t.dataToBreak.toString());
    389     //
    390     //  Run the iterator forward
    391     //
    392     prevBP = -1;
    393     for (bp = t.bi.first(); bp != BreakIterator.DONE; bp = t.bi.next()) {
    394         if (prevBP ==  bp) {
    395             // Fail for lack of forward progress.
    396             errln("Forward Iteration, no forward progress.  Break Pos=" + bp +
    397                     "  File line,col=" + t.srcLine[bp] + ", " + t.srcCol[bp]);
    398             break;
    399         }
    400 
    401         // Check that there were we didn't miss an expected break between the last one
    402         //  and this one.
    403         for (i=prevBP+1; i<bp; i++) {
    404             if (t.expectedBreaks[i] != 0) {
    405                 errln("Forward Iteration, break expected, but not found.  Pos=" + i +
    406                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
    407             }
    408         }
    409 
    410         // Check that the break we did find was expected
    411         if (t.expectedBreaks[bp] == 0) {
    412             errln("Forward Iteration, break found, but not expected.  Pos=" + bp +
    413                     "  File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
    414         } else {
    415             // The break was expected.
    416             //   Check that the {nnn} tag value is correct.
    417             int expectedTagVal = t.expectedBreaks[bp];
    418             if (expectedTagVal == -1) {
    419                 expectedTagVal = 0;
    420             }
    421             int line = t.srcLine[bp];
    422             int rs = t.bi.getRuleStatus();
    423             if (rs != expectedTagVal) {
    424                 errln("Incorrect status for forward break.  Pos = " + bp +
    425                         ".  File line,col = " + line + ", " + t.srcCol[bp] + "\n" +
    426                       "          Actual, Expected status = " + rs + ", " + expectedTagVal);
    427             }
    428             int[] fillInArray = new int[4];
    429             int numStatusVals = t.bi.getRuleStatusVec(fillInArray);
    430             assertTrue("", numStatusVals >= 1);
    431             assertEquals("", expectedTagVal, fillInArray[0]);
    432         }
    433 
    434 
    435         prevBP = bp;
    436     }
    437 
    438     // Verify that there were no missed expected breaks after the last one found
    439     for (i=prevBP+1; i<t.dataToBreak.length()+1; i++) {
    440         if (t.expectedBreaks[i] != 0) {
    441             errln("Forward Iteration, break expected, but not found.  Pos=" + i +
    442                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
    443        }
    444     }
    445 
    446 
    447     //
    448     //  Run the iterator backwards, verify that the same breaks are found.
    449     //
    450     prevBP = t.dataToBreak.length()+2;  // start with a phony value for the last break pos seen.
    451     for (bp = t.bi.last(); bp != BreakIterator.DONE; bp = t.bi.previous()) {
    452         if (prevBP ==  bp) {
    453             // Fail for lack of progress.
    454             errln("Reverse Iteration, no progress.  Break Pos=" + bp +
    455                     "File line,col=" + t.srcLine[bp] + " " +  t.srcCol[bp]);
    456             break;
    457         }
    458 
    459         // Check that we didn't miss an expected break between the last one
    460         //  and this one.  (UVector returns zeros for index out of bounds.)
    461         for (i=prevBP-1; i>bp; i--) {
    462             if (t.expectedBreaks[i] != 0) {
    463                 errln("Reverse Itertion, break expected, but not found.  Pos=" + i +
    464                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
    465             }
    466         }
    467 
    468         // Check that the break we did find was expected
    469         if (t.expectedBreaks[bp] == 0) {
    470             errln("Reverse Itertion, break found, but not expected.  Pos=" + bp +
    471                     "  File line,col= " + t.srcLine[bp] + ", " + t.srcCol[bp]);
    472         } else {
    473             // The break was expected.
    474             //   Check that the {nnn} tag value is correct.
    475             int expectedTagVal = t.expectedBreaks[bp];
    476             if (expectedTagVal == -1) {
    477                 expectedTagVal = 0;
    478             }
    479             int line = t.srcLine[bp];
    480             int rs = t.bi.getRuleStatus();
    481             if (rs != expectedTagVal) {
    482                 errln("Incorrect status for reverse break.  Pos=  " + bp +
    483                         "File line,col= " + line + ", " + t.srcCol[bp] + "\n" +
    484                       "          Actual, Expected status = " + rs + ", " + expectedTagVal);
    485             }
    486         }
    487 
    488         prevBP = bp;
    489     }
    490 
    491     // Verify that there were no missed breaks prior to the last one found
    492     for (i=prevBP-1; i>=0; i--) {
    493         if (t.expectedBreaks[i] != 0) {
    494             errln("Reverse Itertion, break expected, but not found.  Pos=" + i +
    495                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i]);
    496          }
    497     }
    498     // Check isBoundary()
    499     for (i=0; i<=t.dataToBreak.length(); i++) {
    500         boolean boundaryExpected = (t.expectedBreaks[i] != 0);
    501         boolean boundaryFound    = t.bi.isBoundary(i);
    502         if (boundaryExpected != boundaryFound) {
    503             errln("isBoundary(" + i + ") incorrect.\n" +
    504                   "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
    505                   "    Expected, Actual= " + boundaryExpected + ", " + boundaryFound);
    506         }
    507     }
    508 
    509     // Check following()
    510     for (i=0; i<=t.dataToBreak.length(); i++) {
    511         int actualBreak = t.bi.following(i);
    512         int expectedBreak = BreakIterator.DONE;
    513         for (int j=i+1; j < t.expectedBreaks.length; j++) {
    514             if (t.expectedBreaks[j] != 0) {
    515                 expectedBreak = j;
    516                 break;
    517             }
    518         }
    519         if (expectedBreak != actualBreak) {
    520             errln("following(" + i + ") incorrect.\n" +
    521                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
    522                     "    Expected, Actual= " + expectedBreak + ", " + actualBreak);
    523         }
    524     }
    525 
    526     // Check preceding()
    527     for (i=t.dataToBreak.length(); i>=0; i--) {
    528         int actualBreak = t.bi.preceding(i);
    529         int expectedBreak = BreakIterator.DONE;
    530 
    531         for (int j=i-1; j >= 0; j--) {
    532             if (t.expectedBreaks[j] != 0) {
    533                 expectedBreak = j;
    534                 break;
    535             }
    536         }
    537         if (expectedBreak != actualBreak) {
    538             errln("preceding(" + i + ") incorrect.\n" +
    539                     "  File line,col= " + t.srcLine[i] + ", " + t.srcCol[i] +
    540                     "    Expected, Actual= " + expectedBreak + ", " + actualBreak);
    541         }
    542     }
    543 
    544 }
    545 
    546 
    547 
    548 
    549 }
    550