Home | History | Annotate | Download | only in Lib
      1 
      2 """
      3 csv.py - read/write/investigate CSV files
      4 """
      5 
      6 import re
      7 from functools import reduce
      8 from _csv import Error, __version__, writer, reader, register_dialect, \
      9                  unregister_dialect, get_dialect, list_dialects, \
     10                  field_size_limit, \
     11                  QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
     12                  __doc__
     13 from _csv import Dialect as _Dialect
     14 
     15 try:
     16     from cStringIO import StringIO
     17 except ImportError:
     18     from StringIO import StringIO
     19 
     20 __all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
     21             "Error", "Dialect", "__doc__", "excel", "excel_tab",
     22             "field_size_limit", "reader", "writer",
     23             "register_dialect", "get_dialect", "list_dialects", "Sniffer",
     24             "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
     25 
     26 class Dialect:
     27     """Describe an Excel dialect.
     28 
     29     This must be subclassed (see csv.excel).  Valid attributes are:
     30     delimiter, quotechar, escapechar, doublequote, skipinitialspace,
     31     lineterminator, quoting.
     32 
     33     """
     34     _name = ""
     35     _valid = False
     36     # placeholders
     37     delimiter = None
     38     quotechar = None
     39     escapechar = None
     40     doublequote = None
     41     skipinitialspace = None
     42     lineterminator = None
     43     quoting = None
     44 
     45     def __init__(self):
     46         if self.__class__ != Dialect:
     47             self._valid = True
     48         self._validate()
     49 
     50     def _validate(self):
     51         try:
     52             _Dialect(self)
     53         except TypeError, e:
     54             # We do this for compatibility with py2.3
     55             raise Error(str(e))
     56 
     57 class excel(Dialect):
     58     """Describe the usual properties of Excel-generated CSV files."""
     59     delimiter = ','
     60     quotechar = '"'
     61     doublequote = True
     62     skipinitialspace = False
     63     lineterminator = '\r\n'
     64     quoting = QUOTE_MINIMAL
     65 register_dialect("excel", excel)
     66 
     67 class excel_tab(excel):
     68     """Describe the usual properties of Excel-generated TAB-delimited files."""
     69     delimiter = '\t'
     70 register_dialect("excel-tab", excel_tab)
     71 
     72 
     73 class DictReader:
     74     def __init__(self, f, fieldnames=None, restkey=None, restval=None,
     75                  dialect="excel", *args, **kwds):
     76         self._fieldnames = fieldnames   # list of keys for the dict
     77         self.restkey = restkey          # key to catch long rows
     78         self.restval = restval          # default value for short rows
     79         self.reader = reader(f, dialect, *args, **kwds)
     80         self.dialect = dialect
     81         self.line_num = 0
     82 
     83     def __iter__(self):
     84         return self
     85 
     86     @property
     87     def fieldnames(self):
     88         if self._fieldnames is None:
     89             try:
     90                 self._fieldnames = self.reader.next()
     91             except StopIteration:
     92                 pass
     93         self.line_num = self.reader.line_num
     94         return self._fieldnames
     95 
     96     # Issue 20004: Because DictReader is a classic class, this setter is
     97     # ignored.  At this point in 2.7's lifecycle, it is too late to change the
     98     # base class for fear of breaking working code.  If you want to change
     99     # fieldnames without overwriting the getter, set _fieldnames directly.
    100     @fieldnames.setter
    101     def fieldnames(self, value):
    102         self._fieldnames = value
    103 
    104     def next(self):
    105         if self.line_num == 0:
    106             # Used only for its side effect.
    107             self.fieldnames
    108         row = self.reader.next()
    109         self.line_num = self.reader.line_num
    110 
    111         # unlike the basic reader, we prefer not to return blanks,
    112         # because we will typically wind up with a dict full of None
    113         # values
    114         while row == []:
    115             row = self.reader.next()
    116         d = dict(zip(self.fieldnames, row))
    117         lf = len(self.fieldnames)
    118         lr = len(row)
    119         if lf < lr:
    120             d[self.restkey] = row[lf:]
    121         elif lf > lr:
    122             for key in self.fieldnames[lr:]:
    123                 d[key] = self.restval
    124         return d
    125 
    126 
    127 class DictWriter:
    128     def __init__(self, f, fieldnames, restval="", extrasaction="raise",
    129                  dialect="excel", *args, **kwds):
    130         self.fieldnames = fieldnames    # list of keys for the dict
    131         self.restval = restval          # for writing short dicts
    132         if extrasaction.lower() not in ("raise", "ignore"):
    133             raise ValueError, \
    134                   ("extrasaction (%s) must be 'raise' or 'ignore'" %
    135                    extrasaction)
    136         self.extrasaction = extrasaction
    137         self.writer = writer(f, dialect, *args, **kwds)
    138 
    139     def writeheader(self):
    140         header = dict(zip(self.fieldnames, self.fieldnames))
    141         self.writerow(header)
    142 
    143     def _dict_to_list(self, rowdict):
    144         if self.extrasaction == "raise":
    145             wrong_fields = [k for k in rowdict if k not in self.fieldnames]
    146             if wrong_fields:
    147                 raise ValueError("dict contains fields not in fieldnames: "
    148                                  + ", ".join([repr(x) for x in wrong_fields]))
    149         return [rowdict.get(key, self.restval) for key in self.fieldnames]
    150 
    151     def writerow(self, rowdict):
    152         return self.writer.writerow(self._dict_to_list(rowdict))
    153 
    154     def writerows(self, rowdicts):
    155         rows = []
    156         for rowdict in rowdicts:
    157             rows.append(self._dict_to_list(rowdict))
    158         return self.writer.writerows(rows)
    159 
# Guard Sniffer's type checking against builds that exclude complex()
try:
    complex
except NameError:
    # No complex() in this build: fall back to float so has_header()'s
    # numeric-type probing still has a callable to try.
    complex = float
    165 
    166 class Sniffer:
    167     '''
    168     "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    169     Returns a Dialect object.
    170     '''
    171     def __init__(self):
    172         # in case there is more than one possible delimiter
    173         self.preferred = [',', '\t', ';', ' ', ':']
    174 
    175 
    176     def sniff(self, sample, delimiters=None):
    177         """
    178         Returns a dialect (or None) corresponding to the sample
    179         """
    180 
    181         quotechar, doublequote, delimiter, skipinitialspace = \
    182                    self._guess_quote_and_delimiter(sample, delimiters)
    183         if not delimiter:
    184             delimiter, skipinitialspace = self._guess_delimiter(sample,
    185                                                                 delimiters)
    186 
    187         if not delimiter:
    188             raise Error, "Could not determine delimiter"
    189 
    190         class dialect(Dialect):
    191             _name = "sniffed"
    192             lineterminator = '\r\n'
    193             quoting = QUOTE_MINIMAL
    194             # escapechar = ''
    195 
    196         dialect.doublequote = doublequote
    197         dialect.delimiter = delimiter
    198         # _csv.reader won't accept a quotechar of ''
    199         dialect.quotechar = quotechar or '"'
    200         dialect.skipinitialspace = skipinitialspace
    201 
    202         return dialect
    203 
    204 
    205     def _guess_quote_and_delimiter(self, data, delimiters):
    206         """
    207         Looks for text enclosed between two identical quotes
    208         (the probable quotechar) which are preceded and followed
    209         by the same character (the probable delimiter).
    210         For example:
    211                          ,'some text',
    212         The quote with the most wins, same with the delimiter.
    213         If there is no quotechar the delimiter can't be determined
    214         this way.
    215         """
    216 
    217         matches = []
    218         for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
    219                       '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
    220                       '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  # ,".*?"
    221                       '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
    222             regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
    223             matches = regexp.findall(data)
    224             if matches:
    225                 break
    226 
    227         if not matches:
    228             # (quotechar, doublequote, delimiter, skipinitialspace)
    229             return ('', False, None, 0)
    230         quotes = {}
    231         delims = {}
    232         spaces = 0
    233         for m in matches:
    234             n = regexp.groupindex['quote'] - 1
    235             key = m[n]
    236             if key:
    237                 quotes[key] = quotes.get(key, 0) + 1
    238             try:
    239                 n = regexp.groupindex['delim'] - 1
    240                 key = m[n]
    241             except KeyError:
    242                 continue
    243             if key and (delimiters is None or key in delimiters):
    244                 delims[key] = delims.get(key, 0) + 1
    245             try:
    246                 n = regexp.groupindex['space'] - 1
    247             except KeyError:
    248                 continue
    249             if m[n]:
    250                 spaces += 1
    251 
    252         quotechar = reduce(lambda a, b, quotes = quotes:
    253                            (quotes[a] > quotes[b]) and a or b, quotes.keys())
    254 
    255         if delims:
    256             delim = reduce(lambda a, b, delims = delims:
    257                            (delims[a] > delims[b]) and a or b, delims.keys())
    258             skipinitialspace = delims[delim] == spaces
    259             if delim == '\n': # most likely a file with a single column
    260                 delim = ''
    261         else:
    262             # there is *no* delimiter, it's a single column of quoted data
    263             delim = ''
    264             skipinitialspace = 0
    265 
    266         # if we see an extra quote between delimiters, we've got a
    267         # double quoted format
    268         dq_regexp = re.compile(
    269                                r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
    270                                {'delim':re.escape(delim), 'quote':quotechar}, re.MULTILINE)
    271 
    272 
    273 
    274         if dq_regexp.search(data):
    275             doublequote = True
    276         else:
    277             doublequote = False
    278 
    279         return (quotechar, doublequote, delim, skipinitialspace)
    280 
    281 
    282     def _guess_delimiter(self, data, delimiters):
    283         """
    284         The delimiter /should/ occur the same number of times on
    285         each row. However, due to malformed data, it may not. We don't want
    286         an all or nothing approach, so we allow for small variations in this
    287         number.
    288           1) build a table of the frequency of each character on every line.
    289           2) build a table of frequencies of this frequency (meta-frequency?),
    290              e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
    291              7 times in 2 rows'
    292           3) use the mode of the meta-frequency to determine the /expected/
    293              frequency for that character
    294           4) find out how often the character actually meets that goal
    295           5) the character that best meets its goal is the delimiter
    296         For performance reasons, the data is evaluated in chunks, so it can
    297         try and evaluate the smallest portion of the data possible, evaluating
    298         additional chunks as necessary.
    299         """
    300 
    301         data = filter(None, data.split('\n'))
    302 
    303         ascii = [chr(c) for c in range(127)] # 7-bit ASCII
    304 
    305         # build frequency tables
    306         chunkLength = min(10, len(data))
    307         iteration = 0
    308         charFrequency = {}
    309         modes = {}
    310         delims = {}
    311         start, end = 0, min(chunkLength, len(data))
    312         while start < len(data):
    313             iteration += 1
    314             for line in data[start:end]:
    315                 for char in ascii:
    316                     metaFrequency = charFrequency.get(char, {})
    317                     # must count even if frequency is 0
    318                     freq = line.count(char)
    319                     # value is the mode
    320                     metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
    321                     charFrequency[char] = metaFrequency
    322 
    323             for char in charFrequency.keys():
    324                 items = charFrequency[char].items()
    325                 if len(items) == 1 and items[0][0] == 0:
    326                     continue
    327                 # get the mode of the frequencies
    328                 if len(items) > 1:
    329                     modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
    330                                          items)
    331                     # adjust the mode - subtract the sum of all
    332                     # other frequencies
    333                     items.remove(modes[char])
    334                     modes[char] = (modes[char][0], modes[char][1]
    335                                    - reduce(lambda a, b: (0, a[1] + b[1]),
    336                                             items)[1])
    337                 else:
    338                     modes[char] = items[0]
    339 
    340             # build a list of possible delimiters
    341             modeList = modes.items()
    342             total = float(chunkLength * iteration)
    343             # (rows of consistent data) / (number of rows) = 100%
    344             consistency = 1.0
    345             # minimum consistency threshold
    346             threshold = 0.9
    347             while len(delims) == 0 and consistency >= threshold:
    348                 for k, v in modeList:
    349                     if v[0] > 0 and v[1] > 0:
    350                         if ((v[1]/total) >= consistency and
    351                             (delimiters is None or k in delimiters)):
    352                             delims[k] = v
    353                 consistency -= 0.01
    354 
    355             if len(delims) == 1:
    356                 delim = delims.keys()[0]
    357                 skipinitialspace = (data[0].count(delim) ==
    358                                     data[0].count("%c " % delim))
    359                 return (delim, skipinitialspace)
    360 
    361             # analyze another chunkLength lines
    362             start = end
    363             end += chunkLength
    364 
    365         if not delims:
    366             return ('', 0)
    367 
    368         # if there's more than one, fall back to a 'preferred' list
    369         if len(delims) > 1:
    370             for d in self.preferred:
    371                 if d in delims.keys():
    372                     skipinitialspace = (data[0].count(d) ==
    373                                         data[0].count("%c " % d))
    374                     return (d, skipinitialspace)
    375 
    376         # nothing else indicates a preference, pick the character that
    377         # dominates(?)
    378         items = [(v,k) for (k,v) in delims.items()]
    379         items.sort()
    380         delim = items[-1][1]
    381 
    382         skipinitialspace = (data[0].count(delim) ==
    383                             data[0].count("%c " % delim))
    384         return (delim, skipinitialspace)
    385 
    386 
    387     def has_header(self, sample):
    388         # Creates a dictionary of types of data in each column. If any
    389         # column is of a single type (say, integers), *except* for the first
    390         # row, then the first row is presumed to be labels. If the type
    391         # can't be determined, it is assumed to be a string in which case
    392         # the length of the string is the determining factor: if all of the
    393         # rows except for the first are the same length, it's a header.
    394         # Finally, a 'vote' is taken at the end for each column, adding or
    395         # subtracting from the likelihood of the first row being a header.
    396 
    397         rdr = reader(StringIO(sample), self.sniff(sample))
    398 
    399         header = rdr.next() # assume first row is header
    400 
    401         columns = len(header)
    402         columnTypes = {}
    403         for i in range(columns): columnTypes[i] = None
    404 
    405         checked = 0
    406         for row in rdr:
    407             # arbitrary number of rows to check, to keep it sane
    408             if checked > 20:
    409                 break
    410             checked += 1
    411 
    412             if len(row) != columns:
    413                 continue # skip rows that have irregular number of columns
    414 
    415             for col in columnTypes.keys():
    416 
    417                 for thisType in [int, long, float, complex]:
    418                     try:
    419                         thisType(row[col])
    420                         break
    421                     except (ValueError, OverflowError):
    422                         pass
    423                 else:
    424                     # fallback to length of string
    425                     thisType = len(row[col])
    426 
    427                 # treat longs as ints
    428                 if thisType == long:
    429                     thisType = int
    430 
    431                 if thisType != columnTypes[col]:
    432                     if columnTypes[col] is None: # add new column type
    433                         columnTypes[col] = thisType
    434                     else:
    435                         # type is inconsistent, remove column from
    436                         # consideration
    437                         del columnTypes[col]
    438 
    439         # finally, compare results against first row and "vote"
    440         # on whether it's a header
    441         hasHeader = 0
    442         for col, colType in columnTypes.items():
    443             if type(colType) == type(0): # it's a length
    444                 if len(header[col]) != colType:
    445                     hasHeader += 1
    446                 else:
    447                     hasHeader -= 1
    448             else: # attempt typecast
    449                 try:
    450                     colType(header[col])
    451                 except (ValueError, TypeError):
    452                     hasHeader += 1
    453                 else:
    454                     hasHeader -= 1
    455 
    456         return hasHeader > 0
    457