1 // 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html#License 3 /* 4 ********************************************************************** 5 * Copyright (c) 2002-2008, International Business Machines * 6 * Corporation and others. All Rights Reserved. * 7 ********************************************************************** 8 */ 9 package com.ibm.icu.dev.test.perf; 10 11 import java.io.FileInputStream; 12 import java.util.ArrayList; 13 14 public class BreakIteratorPerformanceTest extends PerfTest { 15 16 String fileContents; 17 18 com.ibm.icu.text.BreakIterator iSentenceIter; 19 com.ibm.icu.text.BreakIterator iWordIter; 20 com.ibm.icu.text.BreakIterator iLineIter; 21 com.ibm.icu.text.BreakIterator iCharacterIter; 22 java.text.BreakIterator jSentenceIter; 23 java.text.BreakIterator jWordIter; 24 java.text.BreakIterator jLineIter; 25 java.text.BreakIterator jCharacterIter; 26 String[] iSentences; 27 String[] iWords; 28 String[] iLines; 29 String[] iCharacters; 30 String[] jSentences; 31 String[] jWords; 32 String[] jLines; 33 String[] jCharacters; 34 35 public static void main(String[] args) throws Exception { 36 new BreakIteratorPerformanceTest().run(args); 37 } 38 39 protected void setup(String[] args) { 40 try { 41 // read in the input file, being careful with a possible BOM 42 FileInputStream in = new FileInputStream(fileName); 43 BOMFreeReader reader = new BOMFreeReader(in, encoding); 44 fileContents = new String(readToEOS(reader)); 45 46 // // get rid of any characters that may cause differences between ICU4J and Java BreakIterator 47 // // fileContents = fileContents.replaceAll("[\t\f\r\n\\-/ ]+", " "); 48 // String res = ""; 49 // StringTokenizer tokenizer = new StringTokenizer(fileContents, "\t\f\r\n-/ "); 50 // while (tokenizer.hasMoreTokens()) 51 // res += tokenizer.nextToken() + " "; 52 // fileContents = res.trim(); 53 54 // create the break iterators with respect to locale 55 if (locale == null) { 56 iSentenceIter = com.ibm.icu.text.BreakIterator.getSentenceInstance(); 57 iWordIter = com.ibm.icu.text.BreakIterator.getWordInstance(); 58 iLineIter = com.ibm.icu.text.BreakIterator.getLineInstance(); 59 iCharacterIter = com.ibm.icu.text.BreakIterator.getCharacterInstance(); 60 61 jSentenceIter = java.text.BreakIterator.getSentenceInstance(); 62 jWordIter = java.text.BreakIterator.getWordInstance(); 63 jLineIter = java.text.BreakIterator.getLineInstance(); 64 jCharacterIter = java.text.BreakIterator.getCharacterInstance(); 65 } else { 66 iSentenceIter = com.ibm.icu.text.BreakIterator.getSentenceInstance(locale); 67 iWordIter = com.ibm.icu.text.BreakIterator.getWordInstance(locale); 68 iLineIter = com.ibm.icu.text.BreakIterator.getLineInstance(locale); 69 iCharacterIter = com.ibm.icu.text.BreakIterator.getCharacterInstance(locale); 70 71 jSentenceIter = java.text.BreakIterator.getSentenceInstance(locale); 72 jWordIter = java.text.BreakIterator.getWordInstance(locale); 73 jLineIter = java.text.BreakIterator.getLineInstance(locale); 74 jCharacterIter = java.text.BreakIterator.getCharacterInstance(locale); 75 } 76 77 iSentences = init(iSentenceIter); 78 iWords = init(iWordIter); 79 iLines = init(iLineIter); 80 iCharacters = init(iCharacterIter); 81 jSentences = init(jSentenceIter); 82 jWords = init(jWordIter); 83 jLines = init(jLineIter); 84 jCharacters = init(jCharacterIter); 85 86 } catch (Exception ex) { 87 ex.printStackTrace(); 88 throw new RuntimeException(ex.getMessage()); 89 } 90 91 // we created some heavy objects, so lets try to clean up a little before running the tests 92 gc(); 93 } 94 95 private String[] init(com.ibm.icu.text.BreakIterator iter) { 96 // set the string to iterate on 97 iter.setText(fileContents); 98 99 // produce a token list 100 ArrayList tokenList = new ArrayList(); 101 int start = iter.first(); 102 for (int end = iter.next(); end != com.ibm.icu.text.BreakIterator.DONE; start = end, end = iter.next()) 103 tokenList.add(fileContents.substring(start, end)); 104 105 // return the token list as a string array 106 return (String[]) tokenList.toArray(new String[0]); 107 } 108 109 private String[] init(java.text.BreakIterator iter) { 110 // set the string to iterate on 111 iter.setText(fileContents); 112 113 // produce a token list 114 ArrayList tokenList = new ArrayList(); 115 int start = iter.first(); 116 for (int end = iter.next(); end != com.ibm.icu.text.BreakIterator.DONE; start = end, end = iter.next()) 117 tokenList.add(fileContents.substring(start, end)); 118 119 // return the token list as a string array 120 return (String[]) tokenList.toArray(new String[0]); 121 } 122 123 PerfTest.Function createTestICU(final com.ibm.icu.text.BreakIterator iIter, final String[] correct, 124 final String breakType) { 125 return new PerfTest.Function() { 126 public void call() { 127 int k = 0; 128 int start = iIter.first(); 129 for (int end = iIter.next(); end != com.ibm.icu.text.BreakIterator.DONE; start = end, end = iIter 130 .next()) 131 if (!correct[k++].equals(fileContents.substring(start, end))) 132 throw new RuntimeException("ICU4J BreakIterator gave the wrong answer for " + breakType + " " 133 + (k - 1) + " during the performance test. Cannot continue the performance test."); 134 if (k != correct.length) 135 throw new RuntimeException("ICU4J BreakIterator gave the wrong number of " + breakType 136 + "s during the performance test. Cannot continue the performance test."); 137 } 138 139 public long getOperationsPerIteration() { 140 return fileContents.length(); 141 } 142 }; 143 } 144 145 PerfTest.Function createTestJava(final java.text.BreakIterator jIter, final String[] correct, final String breakType) { 146 return new PerfTest.Function() { 147 public void call() { 148 int k = 0; 149 int start = jIter.first(); 150 for (int end = jIter.next(); end != java.text.BreakIterator.DONE; start = end, end = jIter.next()) 151 if (!correct[k++].equals(fileContents.substring(start, end))) 152 throw new RuntimeException("Java BreakIterator gave the wrong answer for " + breakType + " " 153 + (k - 1) + " during the performance test. Cannot continue the performance test."); 154 if (k != correct.length) 155 throw new RuntimeException("Java BreakIterator gave the wrong number of " + breakType 156 + "s during the performance test. Cannot continue the performance test."); 157 } 158 159 public long getOperationsPerIteration() { 160 return fileContents.length(); 161 } 162 }; 163 } 164 165 PerfTest.Function TestICUSentences() { 166 return createTestICU(iSentenceIter, iSentences, "sentence"); 167 } 168 169 PerfTest.Function TestICUWords() { 170 return createTestICU(iWordIter, iWords, "word"); 171 } 172 173 PerfTest.Function TestICULines() { 174 return createTestICU(iLineIter, iLines, "line"); 175 } 176 177 PerfTest.Function TestICUCharacters() { 178 return createTestICU(iCharacterIter, iCharacters, "character"); 179 } 180 181 PerfTest.Function TestJavaSentences() { 182 return createTestJava(jSentenceIter, jSentences, "sentence"); 183 } 184 185 PerfTest.Function TestJavaWords() { 186 return createTestJava(jWordIter, jWords, "word"); 187 } 188 189 PerfTest.Function TestJavaLines() { 190 return createTestJava(jLineIter, jLines, "line"); 191 } 192 193 PerfTest.Function TestJavaCharacters() { 194 return createTestJava(jCharacterIter, jCharacters, "character"); 195 } 196 } 197