1 /* 2 * Copyright (C) 2013 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.inputmethod.latin.utils; 18 19 import com.android.inputmethod.annotations.UsedForTesting; 20 21 import java.util.ArrayList; 22 23 /** 24 * Utility methods for parsing and serializing Comma-Separated Values. The public APIs of this 25 * utility class are {@link #split(String)}, {@link #split(int,String)}, {@link #join(String...)}, 26 * {@link #join(int,String...)}, and {@link #join(int,int[],String...)}. 27 * 28 * This class implements CSV parsing and serializing methods conforming to RFC 4180 with an 29 * exception: 30 * These methods can't handle new line code escaped in double quotes. 31 */ 32 @UsedForTesting 33 public final class CsvUtils { 34 private CsvUtils() { 35 // This utility class is not publicly instantiable. 36 } 37 38 public static final int SPLIT_FLAGS_NONE = 0x0; 39 /** 40 * A flag for {@link #split(int,String)}. If this flag is specified, the method will trim 41 * spaces around fields before splitting. Note that this behavior doesn't conform to RFC 4180. 42 */ 43 public static final int SPLIT_FLAGS_TRIM_SPACES = 0x1; 44 45 public static final int JOIN_FLAGS_NONE = 0x0; 46 /** 47 * A flag for {@link #join(int,String...)} and {@link #join(int,int[],String...)}. If this 48 * flag is specified, these methods surround each field with double quotes before joining. 49 */ 50 public static final int JOIN_FLAGS_ALWAYS_QUOTED = 0x1; 51 /** 52 * A flag for {@link #join(int,String...)} and {@link #join(int,int[],String...)}. If this 53 * flag is specified, these methods add an extra space just after the comma separator. Note that 54 * this behavior doesn't conform to RFC 4180. 55 */ 56 public static final int JOIN_FLAGS_EXTRA_SPACE = 0x2; 57 58 // Note that none of these characters match high or low surrogate characters, so we need not 59 // take care of matching by code point. 60 private static final char COMMA = ','; 61 private static final char SPACE = ' '; 62 private static final char QUOTE = '"'; 63 64 @SuppressWarnings("serial") 65 public static class CsvParseException extends RuntimeException { 66 public CsvParseException(final String message) { 67 super(message); 68 } 69 } 70 71 /** 72 * Find the first non-space character in the text. 73 * 74 * @param text the text to be searched. 75 * @param fromIndex the index to start the search from, inclusive. 76 * @return the index of the first occurrence of the non-space character in the 77 * <code>text</code> that is greater than or equal to <code>fromIndex</code>, or the length of 78 * the <code>text</code> if the character does not occur. 79 */ 80 private static int indexOfNonSpace(final String text, final int fromIndex) { 81 final int length = text.length(); 82 if (fromIndex < 0 || fromIndex > length) { 83 throw new IllegalArgumentException("text=" + text + " fromIndex=" + fromIndex); 84 } 85 int index = fromIndex; 86 while (index < length && text.charAt(index) == SPACE) { 87 index++; 88 } 89 return index; 90 } 91 92 /** 93 * Find the last non-space character in the text. 94 * 95 * @param text the text to be searched. 96 * @param fromIndex the index to start the search from, exclusive. 97 * @param toIndex the index to end the search at, inclusive. Usually <code>toIndex</code> 98 * points a non-space character. 99 * @return the index of the last occurrence of the non-space character in the 100 * <code>text</code>, exclusive. It is less than <code>fromIndex</code> and greater than 101 * <code>toIndex</code>, or <code>toIndex</code> if the character does not occur. 102 */ 103 private static int lastIndexOfNonSpace(final String text, final int fromIndex, 104 final int toIndex) { 105 if (toIndex < 0 || fromIndex > text.length() || fromIndex < toIndex) { 106 throw new IllegalArgumentException( 107 "text=" + text + " fromIndex=" + fromIndex + " toIndex=" + toIndex); 108 } 109 int index = fromIndex; 110 while (index > toIndex && text.charAt(index - 1) == SPACE) { 111 index--; 112 } 113 return index; 114 } 115 116 /** 117 * Find the index of a comma separator. The search takes account of quoted fields and escape 118 * quotes. 119 * 120 * @param text the text to be searched. 121 * @param fromIndex the index to start the search from, inclusive. 122 * @return the index of the comma separator, exclusive. 123 */ 124 private static int indexOfSeparatorComma(final String text, final int fromIndex) { 125 final int length = text.length(); 126 if (fromIndex < 0 || fromIndex > length) { 127 throw new IllegalArgumentException("text=" + text + " fromIndex=" + fromIndex); 128 } 129 final boolean isQuoted = (length - fromIndex > 0 && text.charAt(fromIndex) == QUOTE); 130 for (int index = fromIndex + (isQuoted ? 1 : 0); index < length; index++) { 131 final char c = text.charAt(index); 132 if (c == COMMA && !isQuoted) { 133 return index; 134 } 135 if (c == QUOTE) { 136 final int nextIndex = index + 1; 137 if (nextIndex < length && text.charAt(nextIndex) == QUOTE) { 138 // Quoted quote. 139 index = nextIndex; 140 continue; 141 } 142 // Closing quote. 143 final int endIndex = text.indexOf(COMMA, nextIndex); 144 return endIndex < 0 ? length : endIndex; 145 } 146 } 147 return length; 148 } 149 150 /** 151 * Removing any enclosing QUOTEs (U+0022), and convert any two consecutive QUOTEs into 152 * one QUOTE. 153 * 154 * @param text the CSV field text that may have enclosing QUOTEs and escaped QUOTE character. 155 * @return the text that has been removed enclosing quotes and converted two consecutive QUOTEs 156 * into one QUOTE. 157 */ 158 @UsedForTesting 159 /* private */ static String unescapeField(final String text) { 160 StringBuilder sb = null; 161 final int length = text.length(); 162 final boolean isQuoted = (length > 0 && text.charAt(0) == QUOTE); 163 int start = isQuoted ? 1 : 0; 164 int end = start; 165 while (start <= length && (end = text.indexOf(QUOTE, start)) >= start) { 166 final int nextIndex = end + 1; 167 if (nextIndex == length && isQuoted) { 168 // Closing quote. 169 break; 170 } 171 if (nextIndex < length && text.charAt(nextIndex) == QUOTE) { 172 if (!isQuoted) { 173 throw new CsvParseException("Escaped quote in text"); 174 } 175 // Quoted quote. 176 if (sb == null) { 177 sb = new StringBuilder(); 178 } 179 sb.append(text.substring(start, nextIndex)); 180 start = nextIndex + 1; 181 } else { 182 throw new CsvParseException( 183 isQuoted ? "Raw quote in quoted text" : "Raw quote in text"); 184 } 185 } 186 if (end < 0 && isQuoted) { 187 throw new CsvParseException("Unterminated quote"); 188 } 189 if (end < 0) { 190 end = length; 191 } 192 if (sb != null && start < length) { 193 sb.append(text.substring(start, end)); 194 } 195 return sb == null ? text.substring(start, end) : sb.toString(); 196 } 197 198 /** 199 * Split the CSV text into fields. The leading and trailing spaces of the each field can be 200 * trimmed optionally. 201 * 202 * @param splitFlags flags for split behavior. {@link #SPLIT_FLAGS_TRIM_SPACES} will trim 203 * spaces around each fields. 204 * @param line the text of CSV fields. 205 * @return the array of unescaped CVS fields. 206 * @throws CsvParseException 207 */ 208 @UsedForTesting 209 public static String[] split(final int splitFlags, final String line) throws CsvParseException { 210 final boolean trimSpaces = (splitFlags & SPLIT_FLAGS_TRIM_SPACES) != 0; 211 final ArrayList<String> fields = CollectionUtils.newArrayList(); 212 final int length = line.length(); 213 int start = 0; 214 do { 215 final int csvStart = trimSpaces ? indexOfNonSpace(line, start) : start; 216 final int end = indexOfSeparatorComma(line, csvStart); 217 final int csvEnd = trimSpaces ? lastIndexOfNonSpace(line, end, csvStart) : end; 218 final String csvText = unescapeField(line.substring(csvStart, csvEnd)); 219 fields.add(csvText); 220 start = end + 1; 221 } while (start <= length); 222 return fields.toArray(new String[fields.size()]); 223 } 224 225 @UsedForTesting 226 public static String[] split(final String line) throws CsvParseException { 227 return split(SPLIT_FLAGS_NONE, line); 228 } 229 230 /** 231 * Convert the raw CSV field text to the escaped text. It adds enclosing QUOTEs (U+0022) if the 232 * raw value contains any QUOTE or comma. Also it converts any QUOTE character into two 233 * consecutive QUOTE characters. 234 * 235 * @param text the raw CSV field text to be escaped. 236 * @param alwaysQuoted true if the escaped text should always be enclosed by QUOTEs. 237 * @return the escaped text. 238 */ 239 @UsedForTesting 240 /* private */ static String escapeField(final String text, final boolean alwaysQuoted) { 241 StringBuilder sb = null; 242 boolean needsQuoted = alwaysQuoted; 243 final int length = text.length(); 244 int indexToBeAppended = 0; 245 for (int index = indexToBeAppended; index < length; index++) { 246 final char c = text.charAt(index); 247 if (c == COMMA) { 248 needsQuoted = true; 249 } else if (c == QUOTE) { 250 needsQuoted = true; 251 if (sb == null) { 252 sb = new StringBuilder(); 253 } 254 sb.append(text.substring(indexToBeAppended, index)); 255 indexToBeAppended = index + 1; 256 sb.append(QUOTE); // escaping quote. 257 sb.append(QUOTE); // escaped quote. 258 } 259 } 260 if (sb != null && indexToBeAppended < length) { 261 sb.append(text.substring(indexToBeAppended)); 262 } 263 final String escapedText = (sb == null) ? text : sb.toString(); 264 return needsQuoted ? QUOTE + escapedText + QUOTE : escapedText; 265 } 266 267 private static final String SPACES = " "; 268 269 private static void padToColumn(final StringBuilder sb, final int column) { 270 int padding; 271 while ((padding = column - sb.length()) > 0) { 272 final String spaces = SPACES.substring(0, Math.min(padding, SPACES.length())); 273 sb.append(spaces); 274 } 275 } 276 277 /** 278 * Join CSV text fields with comma. The column positions of the fields can be specified 279 * optionally. Surround each fields with double quotes before joining. 280 * 281 * @param joinFlags flags for join behavior. {@link #JOIN_FLAGS_EXTRA_SPACE} will add an extra 282 * space after each comma separator. {@link #JOIN_FLAGS_ALWAYS_QUOTED} will always add 283 * surrounding quotes to each element. 284 * @param columnPositions the array of column positions of the fields. It can be shorter than 285 * <code>fields</code> or null. Note that specifying the array column positions of the fields 286 * doesn't conform to RFC 4180. 287 * @param fields the CSV text fields. 288 * @return the string of the joined and escaped <code>fields</code>. 289 */ 290 @UsedForTesting 291 public static String join(final int joinFlags, final int columnPositions[], 292 final String... fields) { 293 final boolean alwaysQuoted = (joinFlags & JOIN_FLAGS_ALWAYS_QUOTED) != 0; 294 final String separator = COMMA + ((joinFlags & JOIN_FLAGS_EXTRA_SPACE) != 0 ? " " : ""); 295 final StringBuilder sb = new StringBuilder(); 296 for (int index = 0; index < fields.length; index++) { 297 if (index > 0) { 298 sb.append(separator); 299 } 300 if (columnPositions != null && index < columnPositions.length) { 301 padToColumn(sb, columnPositions[index]); 302 } 303 final String escapedText = escapeField(fields[index], alwaysQuoted); 304 sb.append(escapedText); 305 } 306 return sb.toString(); 307 } 308 309 @UsedForTesting 310 public static String join(final int joinFlags, final String... fields) { 311 return join(joinFlags, null, fields); 312 } 313 314 @UsedForTesting 315 public static String join(final String... fields) { 316 return join(JOIN_FLAGS_NONE, null, fields); 317 } 318 } 319