Home | History | Annotate | Download | only in thaitest
      1 /*
      2  ******************************************************************************
      3  * Copyright (C) 1998-2003, 2006, International Business Machines Corporation *
      4  * and others. All Rights Reserved.                                           *
      5  ******************************************************************************
      6  */
      7 
      8 #include <errno.h>
      9 #include <stdio.h>
     10 #include <string.h>
     11 
     12 #include "unicode/utypes.h"
     13 #include "unicode/uchar.h"
     14 #include "unicode/uchriter.h"
     15 #include "unicode/brkiter.h"
     16 #include "unicode/locid.h"
     17 #include "unicode/unistr.h"
     18 #include "unicode/uniset.h"
     19 #include "unicode/ustring.h"
     20 
     21 /*
     22  * This program takes a Unicode text file containing Thai text with
     23  * spaces inserted where the word breaks are. It computes a copy of
     24  * the text without spaces and uses a word instance of a Thai BreakIterator
     25  * to compute the word breaks. The program reports any differences in the
     26  * breaks.
     27  *
     28  * NOTE: by its very nature, Thai word breaking is not exact, so it is
     29  * expected that this program will always report some differences.
     30  */
     31 
     32 /*
     33  * This class is a break iterator that counts words and spaces.
     34  */
     35 class SpaceBreakIterator
     36 {
     37 public:
     38     // The constructor:
     39     // text  - pointer to an array of UChars to iterate over
     40     // count - the number of UChars in text
     41     SpaceBreakIterator(const UChar *text, int32_t count);
     42 
     43     // the destructor
     44     ~SpaceBreakIterator();
     45 
     46     // return next break position
     47     int32_t next();
     48 
     49     // return current word count
     50     int32_t getWordCount();
     51 
     52     // return current space count
     53     int32_t getSpaceCount();
     54 
     55 private:
     56     // No arg constructor: private so clients can't call it.
     57     SpaceBreakIterator();
     58 
     59     // The underlying BreakIterator
     60     BreakIterator *fBreakIter;
     61 
     62     // address of the UChar array
     63     const UChar *fText;
     64 
     65     // number of UChars in fText
     66     int32_t fTextCount;
     67 
     68     // current word count
     69     int32_t fWordCount;
     70 
     71     // current space count
     72     int32_t fSpaceCount;
     73 
     74     // UnicodeSet of SA characters
     75     UnicodeSet fComplexContext;
     76 
     77     // true when fBreakIter has returned DONE
     78     UBool fDone;
     79 };
     80 
     81 /*
     82  * This is the main class. It compares word breaks and reports the differences.
     83  */
     84 class ThaiWordbreakTest
     85 {
     86 public:
     87     // The main constructor:
     88     // spaces       - pointer to a UChar array for the text with spaces
     89     // spaceCount   - the number of characters in the spaces array
     90     // noSpaces     - pointer to a UChar array for the text without spaces
     91     // noSpaceCount - the number of characters in the noSpaces array
     92     // verbose      - report all breaks if true, otherwise just report differences
     93     ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount, const UChar *noSpaces, int32_t noSpaceCount, UBool verbose);
     94     ~ThaiWordbreakTest();
     95 
     96     // returns the number of breaks that are in the spaces array
     97     // but aren't found in the noSpaces array
     98     int32_t getBreaksNotFound();
     99 
    100     // returns the number of breaks which are found in the noSpaces
    101     // array but aren't in the spaces array
    102     int32_t getInvalidBreaks();
    103 
    104     // returns the number of words found in the spaces array
    105     int32_t getWordCount();
    106 
    107     // reads the input Unicode text file:
    108     // fileName  - the path name of the file
    109     // charCount - set to the number of UChars read from the file
    110     // returns   - the address of the UChar array containing the characters
    111     static const UChar *readFile(char *fileName, int32_t &charCount);
    112 
    113     // removes spaces form the input UChar array:
    114     // spaces        - pointer to the input UChar array
    115     // count         - number of UChars in the spaces array
    116     // nonSpaceCount - the number of UChars in the result array
    117     // returns       - the address of the UChar array with spaces removed
    118     static const UChar *crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount);
    119 
    120 private:
    121     // The no arg constructor - private so clients can't call it
    122     ThaiWordbreakTest();
    123 
    124     // This does the actual comparison:
    125     // spaces - the address of the UChar array for the text with spaces
    126     // spaceCount - the number of UChars in the spaces array
    127     // noSpaces   - the address of the UChar array for the text without spaces
    128     // noSpaceCount - the number of UChars in the noSpaces array
    129     // returns      - true if all breaks match, FALSE otherwise
    130     UBool compareWordBreaks(const UChar *spaces, int32_t spaceCount,
    131                             const UChar *noSpaces, int32_t noSpaceCount);
    132 
    133     // helper method to report a break in the spaces
    134     // array that's not found in the noSpaces array
    135     void breakNotFound(int32_t br);
    136 
    137     // helper method to report a break that's found in
    138     // the noSpaces array that's not in the spaces array
    139     void foundInvalidBreak(int32_t br);
    140 
    141     // count of breaks in the spaces array that
    142     // aren't found in the noSpaces array
    143     int32_t fBreaksNotFound;
    144 
    145     // count of breaks found in the noSpaces array
    146     // that aren't in the spaces array
    147     int32_t fInvalidBreaks;
    148 
    149     // number of words found in the spaces array
    150     int32_t fWordCount;
    151 
    152     // report all breaks if true, otherwise just report differences
    153     UBool fVerbose;
    154 };
    155 
    156 /*
    157  * The main constructor: it calls compareWordBreaks and reports any differences
    158  */
    159 ThaiWordbreakTest::ThaiWordbreakTest(const UChar *spaces, int32_t spaceCount,
    160                                      const UChar *noSpaces, int32_t noSpaceCount, UBool verbose)
    161 : fBreaksNotFound(0), fInvalidBreaks(0), fWordCount(0), fVerbose(verbose)
    162 {
    163     compareWordBreaks(spaces, spaceCount, noSpaces, noSpaceCount);
    164 }
    165 
    166 /*
    167  * The no arg constructor
    168  */
    169 ThaiWordbreakTest::ThaiWordbreakTest()
    170 {
    171     // nothing
    172 }
    173 
    174 /*
    175  * The destructor
    176  */
    177 ThaiWordbreakTest::~ThaiWordbreakTest()
    178 {
    179     // nothing?
    180 }
    181 
    182 /*
    183  * returns the number of breaks in the spaces array
    184  * that aren't found in the noSpaces array
    185  */
    186 inline int32_t ThaiWordbreakTest::getBreaksNotFound()
    187 {
    188     return fBreaksNotFound;
    189 }
    190 
    191 /*
    192  * Returns the number of breaks found in the noSpaces
    193  * array that aren't in the spaces array
    194  */
    195 inline int32_t ThaiWordbreakTest::getInvalidBreaks()
    196 {
    197     return fInvalidBreaks;
    198 }
    199 
    200 /*
    201  * Returns the number of words found in the spaces array
    202  */
    203 inline int32_t ThaiWordbreakTest::getWordCount()
    204 {
    205     return fWordCount;
    206 }
    207 
    208 /*
    209  * This method does the acutal break comparison and reports the results.
    210  * It uses a SpaceBreakIterator to iterate over the text with spaces,
    211  * and a word instance of a Thai BreakIterator to iterate over the text
    212  * without spaces.
    213  */
    214 UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
    215                                            const UChar *noSpaces, int32_t noSpaceCount)
    216 {
    217     UBool result = TRUE;
    218     Locale thai("th");
    219     UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);
    220     UErrorCode status = U_ZERO_ERROR;
    221 
    222     BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);
    223     breakIter->adoptText(noSpaceIter);
    224 
    225     SpaceBreakIterator spaceIter(spaces, spaceCount);
    226 
    227     int32_t nextBreak = 0;
    228     int32_t nextSpaceBreak = 0;
    229     int32_t iterCount = 0;
    230 
    231     while (TRUE) {
    232         nextSpaceBreak = spaceIter.next();
    233         nextBreak = breakIter->next();
    234 
    235         if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
    236             if (nextBreak != BreakIterator::DONE) {
    237                 fprintf(stderr, "break iterator didn't end.\n");
    238             } else if (nextSpaceBreak != BreakIterator::DONE) {
    239                 fprintf(stderr, "premature break iterator end.\n");
    240             }
    241 
    242             break;
    243         }
    244 
    245         while (nextSpaceBreak != nextBreak &&
    246                nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
    247             if (nextSpaceBreak < nextBreak) {
    248                 breakNotFound(nextSpaceBreak);
    249                 result = FALSE;
    250                 nextSpaceBreak = spaceIter.next();
    251             } else if (nextSpaceBreak > nextBreak) {
    252                 foundInvalidBreak(nextBreak);
    253                 result = FALSE;
    254                 nextBreak = breakIter->next();
    255             }
    256         }
    257 
    258         if (fVerbose) {
    259             printf("%d   %d\n", nextSpaceBreak, nextBreak);
    260         }
    261     }
    262 
    263 
    264     fWordCount = spaceIter.getWordCount();
    265 
    266     delete breakIter;
    267 
    268     return result;
    269 }
    270 
    271 /*
    272  * Report a break that's in the text with spaces but
    273  * not found in the text without spaces.
    274  */
    275 void ThaiWordbreakTest::breakNotFound(int32_t br)
    276 {
    277     if (fVerbose) {
    278         printf("%d   ****\n", br);
    279     } else {
    280         fprintf(stderr, "break not found: %d\n", br);
    281     }
    282 
    283     fBreaksNotFound += 1;
    284 }
    285 
    286 /*
    287  * Report a break that's found in the text without spaces
    288  * that isn't in the text with spaces.
    289  */
    290 void ThaiWordbreakTest::foundInvalidBreak(int32_t br)
    291 {
    292     if (fVerbose) {
    293         printf("****   %d\n", br);
    294     } else {
    295         fprintf(stderr, "found invalid break: %d\n", br);
    296     }
    297 
    298     fInvalidBreaks += 1;
    299 }
    300 
    301 /*
    302  * Read the text from a file. The text must start with a Unicode Byte
    303  * Order Mark (BOM) so that we know what order to read the bytes in.
    304  */
    305 const UChar *ThaiWordbreakTest::readFile(char *fileName, int32_t &charCount)
    306 {
    307     FILE *f;
    308     int32_t fileSize;
    309 
    310     UChar *buffer;
    311     char *bufferChars;
    312 
    313     f = fopen(fileName, "rb");
    314 
    315     if( f == NULL ) {
    316         fprintf(stderr,"Couldn't open %s reason: %s \n", fileName, strerror(errno));
    317         return 0;
    318     }
    319 
    320     fseek(f, 0, SEEK_END);
    321     fileSize = ftell(f);
    322 
    323     fseek(f, 0, SEEK_SET);
    324     bufferChars = new char[fileSize];
    325 
    326     if(bufferChars == 0) {
    327         fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
    328         fclose(f);
    329         return 0;
    330     }
    331 
    332     fread(bufferChars, sizeof(char), fileSize, f);
    333     if( ferror(f) ) {
    334         fprintf(stderr,"Couldn't read %s reason: %s \n", fileName, strerror(errno));
    335         fclose(f);
    336         delete[] bufferChars;
    337         return 0;
    338     }
    339     fclose(f);
    340 
    341     UnicodeString myText(bufferChars, fileSize, "UTF-8");
    342 
    343     delete[] bufferChars;
    344 
    345     charCount = myText.length();
    346     buffer = new UChar[charCount];
    347     if(buffer == 0) {
    348         fprintf(stderr,"Couldn't get memory for reading %s reason: %s \n", fileName, strerror(errno));
    349         return 0;
    350     }
    351 
    352     myText.extract(1, myText.length(), buffer);
    353     charCount--;  // skip the BOM
    354     buffer[charCount] = 0;    // NULL terminate for easier reading in the debugger
    355 
    356     return buffer;
    357 }
    358 
    359 /*
    360  * Remove spaces from the input UChar array.
    361  *
    362  * We check explicitly for a Unicode code value of 0x0020
    363  * because Unicode::isSpaceChar returns true for CR, LF, etc.
    364  *
    365  */
    366 const UChar *ThaiWordbreakTest::crunchSpaces(const UChar *spaces, int32_t count, int32_t &nonSpaceCount)
    367 {
    368     int32_t i, out, spaceCount;
    369 
    370     spaceCount = 0;
    371     for (i = 0; i < count; i += 1) {
    372         if (spaces[i] == 0x0020 /*Unicode::isSpaceChar(spaces[i])*/) {
    373             spaceCount += 1;
    374         }
    375     }
    376 
    377     nonSpaceCount = count - spaceCount;
    378     UChar *noSpaces = new UChar[nonSpaceCount];
    379 
    380     if (noSpaces == 0) {
    381         fprintf(stderr, "Couldn't allocate memory for the space stripped text.\n");
    382         return 0;
    383     }
    384 
    385     for (out = 0, i = 0; i < count; i += 1) {
    386         if (spaces[i] != 0x0020 /*! Unicode::isSpaceChar(spaces[i])*/) {
    387             noSpaces[out++] = spaces[i];
    388         }
    389     }
    390 
    391     return noSpaces;
    392 }
    393 
    394 /*
    395  * Generate a text file with spaces in it from a file without.
    396  */
    397 int generateFile(const UChar *chars, int32_t length) {
    398     Locale root("");
    399     UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length);
    400     UErrorCode status = U_ZERO_ERROR;
    401 
    402     UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
    403     BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);
    404     breakIter->adoptText(noSpaceIter);
    405     char outbuf[1024];
    406     int32_t strlength;
    407     UChar bom = 0xFEFF;
    408 
    409     printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status));
    410     int32_t prevbreak = 0;
    411     while (U_SUCCESS(status)) {
    412         int32_t nextbreak = breakIter->next();
    413         if (nextbreak == BreakIterator::DONE) {
    414             break;
    415         }
    416         printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak],
    417                                     nextbreak-prevbreak, &status));
    418         if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])
    419             && complexContext.contains(chars[nextbreak])) {
    420             printf(" ");
    421         }
    422         prevbreak = nextbreak;
    423     }
    424 
    425     if (U_FAILURE(status)) {
    426         fprintf(stderr, "generate failed: %s\n", u_errorName(status));
    427         return status;
    428     }
    429     else {
    430         return 0;
    431     }
    432 }
    433 
    434 /*
    435  * The main routine. Read the command line arguments, read the text file,
    436  * remove the spaces, do the comparison and report the final results
    437  */
    438 int main(int argc, char **argv)
    439 {
    440     char *fileName = "space.txt";
    441     int arg = 1;
    442     UBool verbose = FALSE;
    443     UBool generate = FALSE;
    444 
    445     if (argc >= 2 && strcmp(argv[1], "-generate") == 0) {
    446         generate = TRUE;
    447         arg += 1;
    448     }
    449 
    450     if (argc >= 2 && strcmp(argv[1], "-verbose") == 0) {
    451         verbose = TRUE;
    452         arg += 1;
    453     }
    454 
    455     if (arg == argc - 1) {
    456         fileName = argv[arg++];
    457     }
    458 
    459     if (arg != argc) {
    460         fprintf(stderr, "Usage: %s [-verbose] [<file>]\n", argv[0]);
    461         return 1;
    462     }
    463 
    464     int32_t spaceCount, nonSpaceCount;
    465     const UChar *spaces, *noSpaces;
    466 
    467     spaces = ThaiWordbreakTest::readFile(fileName, spaceCount);
    468 
    469     if (spaces == 0) {
    470         return 1;
    471     }
    472 
    473     if (generate) {
    474         return generateFile(spaces, spaceCount);
    475     }
    476 
    477     noSpaces = ThaiWordbreakTest::crunchSpaces(spaces, spaceCount, nonSpaceCount);
    478 
    479     if (noSpaces == 0) {
    480         return 1;
    481     }
    482 
    483     ThaiWordbreakTest test(spaces, spaceCount, noSpaces, nonSpaceCount, verbose);
    484 
    485     printf("word count: %d\n", test.getWordCount());
    486     printf("breaks not found: %d\n", test.getBreaksNotFound());
    487     printf("invalid breaks found: %d\n", test.getInvalidBreaks());
    488 
    489     return 0;
    490 }
    491 
    492 /*
    493  * The main constructor. Clear all the counts and construct a default
    494  * word instance of a BreakIterator.
    495  */
    496 SpaceBreakIterator::SpaceBreakIterator(const UChar *text, int32_t count)
    497   : fBreakIter(0), fText(text), fTextCount(count), fWordCount(0), fSpaceCount(0), fDone(FALSE)
    498 {
    499     UCharCharacterIterator *iter = new UCharCharacterIterator(text, count);
    500     UErrorCode status = U_ZERO_ERROR;
    501     fComplexContext.applyPattern(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
    502     Locale root("");
    503 
    504     fBreakIter = BreakIterator::createWordInstance(root, status);
    505     fBreakIter->adoptText(iter);
    506 }
    507 
    508 SpaceBreakIterator::SpaceBreakIterator()
    509 {
    510     // nothing
    511 }
    512 
    513 /*
    514  * The destructor. delete the underlying BreakIterator
    515  */
    516 SpaceBreakIterator::~SpaceBreakIterator()
    517 {
    518     delete fBreakIter;
    519 }
    520 
    521 /*
    522  * Return the next break, counting words and spaces.
    523  */
    524 int32_t SpaceBreakIterator::next()
    525 {
    526     if (fDone) {
    527         return BreakIterator::DONE;
    528     }
    529 
    530     int32_t nextBreak;
    531     do {
    532         nextBreak = fBreakIter->next();
    533 
    534         if (nextBreak == BreakIterator::DONE) {
    535             fDone = TRUE;
    536             return BreakIterator::DONE;
    537         }
    538     }
    539     while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1])
    540             && fComplexContext.contains(fText[nextBreak]));
    541 
    542    int32_t result = nextBreak - fSpaceCount;
    543 
    544     if (nextBreak < fTextCount) {
    545         if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {
    546             fSpaceCount += fBreakIter->next() - nextBreak;
    547         }
    548     }
    549 
    550     fWordCount += 1;
    551 
    552     return result;
    553 }
    554 
    555 /*
    556  * Returns the current space count
    557  */
    558 int32_t SpaceBreakIterator::getSpaceCount()
    559 {
    560     return fSpaceCount;
    561 }
    562 
    563 /*
    564  * Returns the current word count
    565  */
    566 int32_t SpaceBreakIterator::getWordCount()
    567 {
    568     return fWordCount;
    569 }
    570 
    571 
    572