# Home | History | Annotate | Download | only in python2.7
      1 
      2 """
      3 csv.py - read/write/investigate CSV files
      4 """
      5 
      6 import re
      7 from functools import reduce
      8 from _csv import Error, __version__, writer, reader, register_dialect, \
      9                  unregister_dialect, get_dialect, list_dialects, \
     10                  field_size_limit, \
     11                  QUOTE_MINIMAL, QUOTE_ALL, QUOTE_NONNUMERIC, QUOTE_NONE, \
     12                  __doc__
     13 from _csv import Dialect as _Dialect
     14 
     15 try:
     16     from cStringIO import StringIO
     17 except ImportError:
     18     from StringIO import StringIO
     19 
# Public names re-exported for ``from csv import *``: the _csv C-level
# primitives (quoting constants, reader/writer, dialect registry) plus the
# pure-Python helper classes defined below in this module.
__all__ = [ "QUOTE_MINIMAL", "QUOTE_ALL", "QUOTE_NONNUMERIC", "QUOTE_NONE",
            "Error", "Dialect", "__doc__", "excel", "excel_tab",
            "field_size_limit", "reader", "writer",
            "register_dialect", "get_dialect", "list_dialects", "Sniffer",
            "unregister_dialect", "__version__", "DictReader", "DictWriter" ]
     25 
     26 class Dialect:
     27     """Describe an Excel dialect.
     28 
     29     This must be subclassed (see csv.excel).  Valid attributes are:
     30     delimiter, quotechar, escapechar, doublequote, skipinitialspace,
     31     lineterminator, quoting.
     32 
     33     """
     34     _name = ""
     35     _valid = False
     36     # placeholders
     37     delimiter = None
     38     quotechar = None
     39     escapechar = None
     40     doublequote = None
     41     skipinitialspace = None
     42     lineterminator = None
     43     quoting = None
     44 
     45     def __init__(self):
     46         if self.__class__ != Dialect:
     47             self._valid = True
     48         self._validate()
     49 
     50     def _validate(self):
     51         try:
     52             _Dialect(self)
     53         except TypeError, e:
     54             # We do this for compatibility with py2.3
     55             raise Error(str(e))
     56 
     57 class excel(Dialect):
     58     """Describe the usual properties of Excel-generated CSV files."""
     59     delimiter = ','
     60     quotechar = '"'
     61     doublequote = True
     62     skipinitialspace = False
     63     lineterminator = '\r\n'
     64     quoting = QUOTE_MINIMAL
     65 register_dialect("excel", excel)
     66 
     67 class excel_tab(excel):
     68     """Describe the usual properties of Excel-generated TAB-delimited files."""
     69     delimiter = '\t'
     70 register_dialect("excel-tab", excel_tab)
     71 
     72 
     73 class DictReader:
     74     def __init__(self, f, fieldnames=None, restkey=None, restval=None,
     75                  dialect="excel", *args, **kwds):
     76         self._fieldnames = fieldnames   # list of keys for the dict
     77         self.restkey = restkey          # key to catch long rows
     78         self.restval = restval          # default value for short rows
     79         self.reader = reader(f, dialect, *args, **kwds)
     80         self.dialect = dialect
     81         self.line_num = 0
     82 
     83     def __iter__(self):
     84         return self
     85 
     86     @property
     87     def fieldnames(self):
     88         if self._fieldnames is None:
     89             try:
     90                 self._fieldnames = self.reader.next()
     91             except StopIteration:
     92                 pass
     93         self.line_num = self.reader.line_num
     94         return self._fieldnames
     95 
     96     @fieldnames.setter
     97     def fieldnames(self, value):
     98         self._fieldnames = value
     99 
    100     def next(self):
    101         if self.line_num == 0:
    102             # Used only for its side effect.
    103             self.fieldnames
    104         row = self.reader.next()
    105         self.line_num = self.reader.line_num
    106 
    107         # unlike the basic reader, we prefer not to return blanks,
    108         # because we will typically wind up with a dict full of None
    109         # values
    110         while row == []:
    111             row = self.reader.next()
    112         d = dict(zip(self.fieldnames, row))
    113         lf = len(self.fieldnames)
    114         lr = len(row)
    115         if lf < lr:
    116             d[self.restkey] = row[lf:]
    117         elif lf > lr:
    118             for key in self.fieldnames[lr:]:
    119                 d[key] = self.restval
    120         return d
    121 
    122 
    123 class DictWriter:
    124     def __init__(self, f, fieldnames, restval="", extrasaction="raise",
    125                  dialect="excel", *args, **kwds):
    126         self.fieldnames = fieldnames    # list of keys for the dict
    127         self.restval = restval          # for writing short dicts
    128         if extrasaction.lower() not in ("raise", "ignore"):
    129             raise ValueError, \
    130                   ("extrasaction (%s) must be 'raise' or 'ignore'" %
    131                    extrasaction)
    132         self.extrasaction = extrasaction
    133         self.writer = writer(f, dialect, *args, **kwds)
    134 
    135     def writeheader(self):
    136         header = dict(zip(self.fieldnames, self.fieldnames))
    137         self.writerow(header)
    138 
    139     def _dict_to_list(self, rowdict):
    140         if self.extrasaction == "raise":
    141             wrong_fields = [k for k in rowdict if k not in self.fieldnames]
    142             if wrong_fields:
    143                 raise ValueError("dict contains fields not in fieldnames: " +
    144                                  ", ".join(wrong_fields))
    145         return [rowdict.get(key, self.restval) for key in self.fieldnames]
    146 
    147     def writerow(self, rowdict):
    148         return self.writer.writerow(self._dict_to_list(rowdict))
    149 
    150     def writerows(self, rowdicts):
    151         rows = []
    152         for rowdict in rowdicts:
    153             rows.append(self._dict_to_list(rowdict))
    154         return self.writer.writerows(rows)
    155 
# Guard Sniffer's type checking against builds that exclude complex()
# (Sniffer.has_header probes each cell with int/long/float/complex; on a
# build without complex(), fall back to float so the probe list still works)
try:
    complex
except NameError:
    complex = float
    161 
    162 class Sniffer:
    163     '''
    164     "Sniffs" the format of a CSV file (i.e. delimiter, quotechar)
    165     Returns a Dialect object.
    166     '''
    167     def __init__(self):
    168         # in case there is more than one possible delimiter
    169         self.preferred = [',', '\t', ';', ' ', ':']
    170 
    171 
    172     def sniff(self, sample, delimiters=None):
    173         """
    174         Returns a dialect (or None) corresponding to the sample
    175         """
    176 
    177         quotechar, doublequote, delimiter, skipinitialspace = \
    178                    self._guess_quote_and_delimiter(sample, delimiters)
    179         if not delimiter:
    180             delimiter, skipinitialspace = self._guess_delimiter(sample,
    181                                                                 delimiters)
    182 
    183         if not delimiter:
    184             raise Error, "Could not determine delimiter"
    185 
    186         class dialect(Dialect):
    187             _name = "sniffed"
    188             lineterminator = '\r\n'
    189             quoting = QUOTE_MINIMAL
    190             # escapechar = ''
    191 
    192         dialect.doublequote = doublequote
    193         dialect.delimiter = delimiter
    194         # _csv.reader won't accept a quotechar of ''
    195         dialect.quotechar = quotechar or '"'
    196         dialect.skipinitialspace = skipinitialspace
    197 
    198         return dialect
    199 
    200 
    201     def _guess_quote_and_delimiter(self, data, delimiters):
    202         """
    203         Looks for text enclosed between two identical quotes
    204         (the probable quotechar) which are preceded and followed
    205         by the same character (the probable delimiter).
    206         For example:
    207                          ,'some text',
    208         The quote with the most wins, same with the delimiter.
    209         If there is no quotechar the delimiter can't be determined
    210         this way.
    211         """
    212 
    213         matches = []
    214         for restr in ('(?P<delim>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?P=delim)', # ,".*?",
    215                       '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?P<delim>[^\w\n"\'])(?P<space> ?)',   #  ".*?",
    216                       '(?P<delim>>[^\w\n"\'])(?P<space> ?)(?P<quote>["\']).*?(?P=quote)(?:$|\n)',  # ,".*?"
    217                       '(?:^|\n)(?P<quote>["\']).*?(?P=quote)(?:$|\n)'):                            #  ".*?" (no delim, no space)
    218             regexp = re.compile(restr, re.DOTALL | re.MULTILINE)
    219             matches = regexp.findall(data)
    220             if matches:
    221                 break
    222 
    223         if not matches:
    224             # (quotechar, doublequote, delimiter, skipinitialspace)
    225             return ('', False, None, 0)
    226         quotes = {}
    227         delims = {}
    228         spaces = 0
    229         for m in matches:
    230             n = regexp.groupindex['quote'] - 1
    231             key = m[n]
    232             if key:
    233                 quotes[key] = quotes.get(key, 0) + 1
    234             try:
    235                 n = regexp.groupindex['delim'] - 1
    236                 key = m[n]
    237             except KeyError:
    238                 continue
    239             if key and (delimiters is None or key in delimiters):
    240                 delims[key] = delims.get(key, 0) + 1
    241             try:
    242                 n = regexp.groupindex['space'] - 1
    243             except KeyError:
    244                 continue
    245             if m[n]:
    246                 spaces += 1
    247 
    248         quotechar = reduce(lambda a, b, quotes = quotes:
    249                            (quotes[a] > quotes[b]) and a or b, quotes.keys())
    250 
    251         if delims:
    252             delim = reduce(lambda a, b, delims = delims:
    253                            (delims[a] > delims[b]) and a or b, delims.keys())
    254             skipinitialspace = delims[delim] == spaces
    255             if delim == '\n': # most likely a file with a single column
    256                 delim = ''
    257         else:
    258             # there is *no* delimiter, it's a single column of quoted data
    259             delim = ''
    260             skipinitialspace = 0
    261 
    262         # if we see an extra quote between delimiters, we've got a
    263         # double quoted format
    264         dq_regexp = re.compile(r"((%(delim)s)|^)\W*%(quote)s[^%(delim)s\n]*%(quote)s[^%(delim)s\n]*%(quote)s\W*((%(delim)s)|$)" % \
    265                                {'delim':delim, 'quote':quotechar}, re.MULTILINE)
    266 
    267 
    268 
    269         if dq_regexp.search(data):
    270             doublequote = True
    271         else:
    272             doublequote = False
    273 
    274         return (quotechar, doublequote, delim, skipinitialspace)
    275 
    276 
    277     def _guess_delimiter(self, data, delimiters):
    278         """
    279         The delimiter /should/ occur the same number of times on
    280         each row. However, due to malformed data, it may not. We don't want
    281         an all or nothing approach, so we allow for small variations in this
    282         number.
    283           1) build a table of the frequency of each character on every line.
    284           2) build a table of frequencies of this frequency (meta-frequency?),
    285              e.g.  'x occurred 5 times in 10 rows, 6 times in 1000 rows,
    286              7 times in 2 rows'
    287           3) use the mode of the meta-frequency to determine the /expected/
    288              frequency for that character
    289           4) find out how often the character actually meets that goal
    290           5) the character that best meets its goal is the delimiter
    291         For performance reasons, the data is evaluated in chunks, so it can
    292         try and evaluate the smallest portion of the data possible, evaluating
    293         additional chunks as necessary.
    294         """
    295 
    296         data = filter(None, data.split('\n'))
    297 
    298         ascii = [chr(c) for c in range(127)] # 7-bit ASCII
    299 
    300         # build frequency tables
    301         chunkLength = min(10, len(data))
    302         iteration = 0
    303         charFrequency = {}
    304         modes = {}
    305         delims = {}
    306         start, end = 0, min(chunkLength, len(data))
    307         while start < len(data):
    308             iteration += 1
    309             for line in data[start:end]:
    310                 for char in ascii:
    311                     metaFrequency = charFrequency.get(char, {})
    312                     # must count even if frequency is 0
    313                     freq = line.count(char)
    314                     # value is the mode
    315                     metaFrequency[freq] = metaFrequency.get(freq, 0) + 1
    316                     charFrequency[char] = metaFrequency
    317 
    318             for char in charFrequency.keys():
    319                 items = charFrequency[char].items()
    320                 if len(items) == 1 and items[0][0] == 0:
    321                     continue
    322                 # get the mode of the frequencies
    323                 if len(items) > 1:
    324                     modes[char] = reduce(lambda a, b: a[1] > b[1] and a or b,
    325                                          items)
    326                     # adjust the mode - subtract the sum of all
    327                     # other frequencies
    328                     items.remove(modes[char])
    329                     modes[char] = (modes[char][0], modes[char][1]
    330                                    - reduce(lambda a, b: (0, a[1] + b[1]),
    331                                             items)[1])
    332                 else:
    333                     modes[char] = items[0]
    334 
    335             # build a list of possible delimiters
    336             modeList = modes.items()
    337             total = float(chunkLength * iteration)
    338             # (rows of consistent data) / (number of rows) = 100%
    339             consistency = 1.0
    340             # minimum consistency threshold
    341             threshold = 0.9
    342             while len(delims) == 0 and consistency >= threshold:
    343                 for k, v in modeList:
    344                     if v[0] > 0 and v[1] > 0:
    345                         if ((v[1]/total) >= consistency and
    346                             (delimiters is None or k in delimiters)):
    347                             delims[k] = v
    348                 consistency -= 0.01
    349 
    350             if len(delims) == 1:
    351                 delim = delims.keys()[0]
    352                 skipinitialspace = (data[0].count(delim) ==
    353                                     data[0].count("%c " % delim))
    354                 return (delim, skipinitialspace)
    355 
    356             # analyze another chunkLength lines
    357             start = end
    358             end += chunkLength
    359 
    360         if not delims:
    361             return ('', 0)
    362 
    363         # if there's more than one, fall back to a 'preferred' list
    364         if len(delims) > 1:
    365             for d in self.preferred:
    366                 if d in delims.keys():
    367                     skipinitialspace = (data[0].count(d) ==
    368                                         data[0].count("%c " % d))
    369                     return (d, skipinitialspace)
    370 
    371         # nothing else indicates a preference, pick the character that
    372         # dominates(?)
    373         items = [(v,k) for (k,v) in delims.items()]
    374         items.sort()
    375         delim = items[-1][1]
    376 
    377         skipinitialspace = (data[0].count(delim) ==
    378                             data[0].count("%c " % delim))
    379         return (delim, skipinitialspace)
    380 
    381 
    382     def has_header(self, sample):
    383         # Creates a dictionary of types of data in each column. If any
    384         # column is of a single type (say, integers), *except* for the first
    385         # row, then the first row is presumed to be labels. If the type
    386         # can't be determined, it is assumed to be a string in which case
    387         # the length of the string is the determining factor: if all of the
    388         # rows except for the first are the same length, it's a header.
    389         # Finally, a 'vote' is taken at the end for each column, adding or
    390         # subtracting from the likelihood of the first row being a header.
    391 
    392         rdr = reader(StringIO(sample), self.sniff(sample))
    393 
    394         header = rdr.next() # assume first row is header
    395 
    396         columns = len(header)
    397         columnTypes = {}
    398         for i in range(columns): columnTypes[i] = None
    399 
    400         checked = 0
    401         for row in rdr:
    402             # arbitrary number of rows to check, to keep it sane
    403             if checked > 20:
    404                 break
    405             checked += 1
    406 
    407             if len(row) != columns:
    408                 continue # skip rows that have irregular number of columns
    409 
    410             for col in columnTypes.keys():
    411 
    412                 for thisType in [int, long, float, complex]:
    413                     try:
    414                         thisType(row[col])
    415                         break
    416                     except (ValueError, OverflowError):
    417                         pass
    418                 else:
    419                     # fallback to length of string
    420                     thisType = len(row[col])
    421 
    422                 # treat longs as ints
    423                 if thisType == long:
    424                     thisType = int
    425 
    426                 if thisType != columnTypes[col]:
    427                     if columnTypes[col] is None: # add new column type
    428                         columnTypes[col] = thisType
    429                     else:
    430                         # type is inconsistent, remove column from
    431                         # consideration
    432                         del columnTypes[col]
    433 
    434         # finally, compare results against first row and "vote"
    435         # on whether it's a header
    436         hasHeader = 0
    437         for col, colType in columnTypes.items():
    438             if type(colType) == type(0): # it's a length
    439                 if len(header[col]) != colType:
    440                     hasHeader += 1
    441                 else:
    442                     hasHeader -= 1
    443             else: # attempt typecast
    444                 try:
    445                     colType(header[col])
    446                 except (ValueError, TypeError):
    447                     hasHeader += 1
    448                 else:
    449                     hasHeader -= 1
    450 
    451         return hasHeader > 0
    452