Home | History | Annotate | Download | only in utils
      1 /*
      2  * Copyright (C) 2013 The Android Open Source Project
      3  *
      4  * Licensed under the Apache License, Version 2.0 (the "License");
      5  * you may not use this file except in compliance with the License.
      6  * You may obtain a copy of the License at
      7  *
      8  *      http://www.apache.org/licenses/LICENSE-2.0
      9  *
     10  * Unless required by applicable law or agreed to in writing, software
     11  * distributed under the License is distributed on an "AS IS" BASIS,
     12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13  * See the License for the specific language governing permissions and
     14  * limitations under the License.
     15  */
     16 
     17 package com.android.inputmethod.latin.utils;
     18 
     19 import com.android.inputmethod.annotations.UsedForTesting;
     20 
     21 import java.util.ArrayList;
     22 
     23 /**
     24  * Utility methods for parsing and serializing Comma-Separated Values. The public APIs of this
     25  * utility class are {@link #split(String)}, {@link #split(int,String)}, {@link #join(String...)},
     26  * {@link #join(int,String...)}, and {@link #join(int,int[],String...)}.
     27  *
     28  * This class implements CSV parsing and serializing methods conforming to RFC 4180 with an
     29  * exception:
     30  *  These methods can't handle new line code escaped in double quotes.
     31  */
     32 @UsedForTesting
     33 public final class CsvUtils {
     34     private CsvUtils() {
     35         // This utility class is not publicly instantiable.
     36     }
     37 
     38     public static final int SPLIT_FLAGS_NONE = 0x0;
     39     /**
     40      * A flag for {@link #split(int,String)}. If this flag is specified, the method will trim
     41      * spaces around fields before splitting. Note that this behavior doesn't conform to RFC 4180.
     42      */
     43     public static final int SPLIT_FLAGS_TRIM_SPACES  = 0x1;
     44 
     45     public static final int JOIN_FLAGS_NONE = 0x0;
     46     /**
     47      * A flag for {@link #join(int,String...)} and {@link #join(int,int[],String...)}. If this
     48      * flag is specified, these methods surround each field with double quotes before joining.
     49      */
     50     public static final int JOIN_FLAGS_ALWAYS_QUOTED = 0x1;
     51     /**
     52      * A flag for {@link #join(int,String...)} and {@link #join(int,int[],String...)}. If this
     53      * flag is specified, these methods add an extra space just after the comma separator. Note that
     54      * this behavior doesn't conform to RFC 4180.
     55      */
     56     public static final int JOIN_FLAGS_EXTRA_SPACE   = 0x2;
     57 
     58     // Note that none of these characters match high or low surrogate characters, so we need not
     59     // take care of matching by code point.
     60     private static final char COMMA = ',';
     61     private static final char SPACE = ' ';
     62     private static final char QUOTE = '"';
     63 
     64     @SuppressWarnings("serial")
     65     public static class CsvParseException extends RuntimeException {
     66         public CsvParseException(final String message) {
     67             super(message);
     68         }
     69     }
     70 
     71     /**
     72      * Find the first non-space character in the text.
     73      *
     74      * @param text the text to be searched.
     75      * @param fromIndex the index to start the search from, inclusive.
     76      * @return the index of the first occurrence of the non-space character in the
     77      * <code>text</code> that is greater than or equal to <code>fromIndex</code>, or the length of
     78      * the <code>text</code> if the character does not occur.
     79      */
     80     private static int indexOfNonSpace(final String text, final int fromIndex) {
     81         final int length = text.length();
     82         if (fromIndex < 0 || fromIndex > length) {
     83             throw new IllegalArgumentException("text=" + text + " fromIndex=" + fromIndex);
     84         }
     85         int index = fromIndex;
     86         while (index < length && text.charAt(index) == SPACE) {
     87             index++;
     88         }
     89         return index;
     90     }
     91 
     92     /**
     93      * Find the last non-space character in the text.
     94      *
     95      * @param text the text to be searched.
     96      * @param fromIndex the index to start the search from, exclusive.
     97      * @param toIndex the index to end the search at, inclusive. Usually <code>toIndex</code>
     98      * points a non-space character.
     99      * @return the index of the last occurrence of the non-space character in the
    100      * <code>text</code>, exclusive. It is less than <code>fromIndex</code> and greater than
    101      * <code>toIndex</code>, or <code>toIndex</code> if the character does not occur.
    102      */
    103     private static int lastIndexOfNonSpace(final String text, final int fromIndex,
    104             final int toIndex) {
    105         if (toIndex < 0 || fromIndex > text.length() || fromIndex < toIndex) {
    106             throw new IllegalArgumentException(
    107                     "text=" + text + " fromIndex=" + fromIndex + " toIndex=" + toIndex);
    108         }
    109         int index = fromIndex;
    110         while (index > toIndex && text.charAt(index - 1) == SPACE) {
    111             index--;
    112         }
    113         return index;
    114     }
    115 
    116     /**
    117      * Find the index of a comma separator. The search takes account of quoted fields and escape
    118      * quotes.
    119      *
    120      * @param text the text to be searched.
    121      * @param fromIndex the index to start the search from, inclusive.
    122      * @return the index of the comma separator, exclusive.
    123      */
    124     private static int indexOfSeparatorComma(final String text, final int fromIndex) {
    125         final int length = text.length();
    126         if (fromIndex < 0 || fromIndex > length) {
    127             throw new IllegalArgumentException("text=" + text + " fromIndex=" + fromIndex);
    128         }
    129         final boolean isQuoted = (length - fromIndex > 0 && text.charAt(fromIndex) == QUOTE);
    130         for (int index = fromIndex + (isQuoted ? 1 : 0); index < length; index++) {
    131             final char c = text.charAt(index);
    132             if (c == COMMA && !isQuoted) {
    133                 return index;
    134             }
    135             if (c == QUOTE) {
    136                 final int nextIndex = index + 1;
    137                 if (nextIndex < length && text.charAt(nextIndex) == QUOTE) {
    138                     // Quoted quote.
    139                     index = nextIndex;
    140                     continue;
    141                 }
    142                 // Closing quote.
    143                 final int endIndex = text.indexOf(COMMA, nextIndex);
    144                 return endIndex < 0 ? length : endIndex;
    145             }
    146         }
    147         return length;
    148     }
    149 
    150     /**
    151      * Removing any enclosing QUOTEs (U+0022), and convert any two consecutive QUOTEs into
    152      * one QUOTE.
    153      *
    154      * @param text the CSV field text that may have enclosing QUOTEs and escaped QUOTE character.
    155      * @return the text that has been removed enclosing quotes and converted two consecutive QUOTEs
    156      * into one QUOTE.
    157      */
    158     @UsedForTesting
    159     /* private */ static String unescapeField(final String text) {
    160         StringBuilder sb = null;
    161         final int length = text.length();
    162         final boolean isQuoted = (length > 0 && text.charAt(0) == QUOTE);
    163         int start = isQuoted ? 1 : 0;
    164         int end = start;
    165         while (start <= length && (end = text.indexOf(QUOTE, start)) >= start) {
    166             final int nextIndex = end + 1;
    167             if (nextIndex == length && isQuoted) {
    168                 // Closing quote.
    169                 break;
    170             }
    171             if (nextIndex < length && text.charAt(nextIndex) == QUOTE) {
    172                 if (!isQuoted) {
    173                     throw new CsvParseException("Escaped quote in text");
    174                 }
    175                 // Quoted quote.
    176                 if (sb == null) {
    177                     sb = new StringBuilder();
    178                 }
    179                 sb.append(text.substring(start, nextIndex));
    180                 start = nextIndex + 1;
    181             } else {
    182                 throw new CsvParseException(
    183                         isQuoted ? "Raw quote in quoted text" : "Raw quote in text");
    184             }
    185         }
    186         if (end < 0 && isQuoted) {
    187             throw new CsvParseException("Unterminated quote");
    188         }
    189         if (end < 0) {
    190             end = length;
    191         }
    192         if (sb != null && start < length) {
    193             sb.append(text.substring(start, end));
    194         }
    195         return sb == null ? text.substring(start, end) : sb.toString();
    196     }
    197 
    198     /**
    199      * Split the CSV text into fields. The leading and trailing spaces of the each field can be
    200      * trimmed optionally.
    201      *
    202      * @param splitFlags flags for split behavior. {@link #SPLIT_FLAGS_TRIM_SPACES} will trim
    203      * spaces around each fields.
    204      * @param line the text of CSV fields.
    205      * @return the array of unescaped CVS fields.
    206      * @throws CsvParseException
    207      */
    208     @UsedForTesting
    209     public static String[] split(final int splitFlags, final String line) throws CsvParseException {
    210         final boolean trimSpaces = (splitFlags & SPLIT_FLAGS_TRIM_SPACES) != 0;
    211         final ArrayList<String> fields = CollectionUtils.newArrayList();
    212         final int length = line.length();
    213         int start = 0;
    214         do {
    215             final int csvStart = trimSpaces ? indexOfNonSpace(line, start) : start;
    216             final int end = indexOfSeparatorComma(line, csvStart);
    217             final int csvEnd = trimSpaces ? lastIndexOfNonSpace(line, end, csvStart) : end;
    218             final String csvText = unescapeField(line.substring(csvStart, csvEnd));
    219             fields.add(csvText);
    220             start = end + 1;
    221         } while (start <= length);
    222         return fields.toArray(new String[fields.size()]);
    223     }
    224 
    225     @UsedForTesting
    226     public static String[] split(final String line) throws CsvParseException {
    227         return split(SPLIT_FLAGS_NONE, line);
    228     }
    229 
    230     /**
    231      * Convert the raw CSV field text to the escaped text. It adds enclosing QUOTEs (U+0022) if the
    232      * raw value contains any QUOTE or comma. Also it converts any QUOTE character into two
    233      * consecutive QUOTE characters.
    234      *
    235      * @param text the raw CSV field text to be escaped.
    236      * @param alwaysQuoted true if the escaped text should always be enclosed by QUOTEs.
    237      * @return the escaped text.
    238      */
    239     @UsedForTesting
    240     /* private */ static String escapeField(final String text, final boolean alwaysQuoted) {
    241         StringBuilder sb = null;
    242         boolean needsQuoted = alwaysQuoted;
    243         final int length = text.length();
    244         int indexToBeAppended = 0;
    245         for (int index = indexToBeAppended; index < length; index++) {
    246             final char c = text.charAt(index);
    247             if (c == COMMA) {
    248                 needsQuoted = true;
    249             } else if (c == QUOTE) {
    250                 needsQuoted = true;
    251                 if (sb == null) {
    252                     sb = new StringBuilder();
    253                 }
    254                 sb.append(text.substring(indexToBeAppended, index));
    255                 indexToBeAppended = index + 1;
    256                 sb.append(QUOTE); // escaping quote.
    257                 sb.append(QUOTE); // escaped quote.
    258             }
    259         }
    260         if (sb != null && indexToBeAppended < length) {
    261             sb.append(text.substring(indexToBeAppended));
    262         }
    263         final String escapedText = (sb == null) ? text : sb.toString();
    264         return needsQuoted ? QUOTE + escapedText + QUOTE : escapedText;
    265     }
    266 
    267     private static final String SPACES = "                    ";
    268 
    269     private static void padToColumn(final StringBuilder sb, final int column) {
    270         int padding;
    271         while ((padding = column - sb.length()) > 0) {
    272             final String spaces = SPACES.substring(0, Math.min(padding, SPACES.length()));
    273             sb.append(spaces);
    274         }
    275     }
    276 
    277     /**
    278      * Join CSV text fields with comma. The column positions of the fields can be specified
    279      * optionally. Surround each fields with double quotes before joining.
    280      *
    281      * @param joinFlags flags for join behavior. {@link #JOIN_FLAGS_EXTRA_SPACE} will add an extra
    282      * space after each comma separator. {@link #JOIN_FLAGS_ALWAYS_QUOTED} will always add
    283      * surrounding quotes to each element.
    284      * @param columnPositions the array of column positions of the fields. It can be shorter than
    285      * <code>fields</code> or null. Note that specifying the array column positions of the fields
    286      * doesn't conform to RFC 4180.
    287      * @param fields the CSV text fields.
    288      * @return the string of the joined and escaped <code>fields</code>.
    289      */
    290     @UsedForTesting
    291     public static String join(final int joinFlags, final int columnPositions[],
    292             final String... fields) {
    293         final boolean alwaysQuoted = (joinFlags & JOIN_FLAGS_ALWAYS_QUOTED) != 0;
    294         final String separator = COMMA + ((joinFlags & JOIN_FLAGS_EXTRA_SPACE) != 0 ? " " : "");
    295         final StringBuilder sb = new StringBuilder();
    296         for (int index = 0; index < fields.length; index++) {
    297             if (index > 0) {
    298                 sb.append(separator);
    299             }
    300             if (columnPositions != null && index < columnPositions.length) {
    301                 padToColumn(sb, columnPositions[index]);
    302             }
    303             final String escapedText = escapeField(fields[index], alwaysQuoted);
    304             sb.append(escapedText);
    305         }
    306         return sb.toString();
    307     }
    308 
    309     @UsedForTesting
    310     public static String join(final int joinFlags, final String... fields) {
    311         return join(joinFlags, null, fields);
    312     }
    313 
    314     @UsedForTesting
    315     public static String join(final String... fields) {
    316         return join(JOIN_FLAGS_NONE, null, fields);
    317     }
    318 }
    319