Home | History | Annotate | Download | only in csv
      1 // Copyright 2011 The Go Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style
      3 // license that can be found in the LICENSE file.
      4 
      5 // Package csv reads and writes comma-separated values (CSV) files.
      6 //
      7 // A csv file contains zero or more records of one or more fields per record.
      8 // Each record is separated by the newline character. The final record may
      9 // optionally be followed by a newline character.
     10 //
     11 //	field1,field2,field3
     12 //
     13 // White space is considered part of a field.
     14 //
     15 // Carriage returns before newline characters are silently removed.
     16 //
     17 // Blank lines are ignored.  A line with only whitespace characters (excluding
     18 // the ending newline character) is not considered a blank line.
     19 //
     20 // Fields which start and stop with the quote character " are called
     21 // quoted-fields.  The beginning and ending quote are not part of the
     22 // field.
     23 //
     24 // The source:
     25 //
     26 //	normal string,"quoted-field"
     27 //
     28 // results in the fields
     29 //
     30 //	{`normal string`, `quoted-field`}
     31 //
     32 // Within a quoted-field a quote character followed by a second quote
     33 // character is considered a single quote.
     34 //
     35 //	"the ""word"" is true","a ""quoted-field"""
     36 //
     37 // results in
     38 //
     39 //	{`the "word" is true`, `a "quoted-field"`}
     40 //
     41 // Newlines and commas may be included in a quoted-field
     42 //
     43 //	"Multi-line
     44 //	field","comma is ,"
     45 //
     46 // results in
     47 //
     48 //	{`Multi-line
     49 //	field`, `comma is ,`}
     50 package csv
     51 
     52 import (
     53 	"bufio"
     54 	"bytes"
     55 	"errors"
     56 	"fmt"
     57 	"io"
     58 	"unicode"
     59 )
     60 
     61 // A ParseError is returned for parsing errors.
     62 // The first line is 1.  The first column is 0.
     63 type ParseError struct {
     64 	Line   int   // Line where the error occurred
     65 	Column int   // Column (rune index) where the error occurred
     66 	Err    error // The actual error
     67 }
     68 
     69 func (e *ParseError) Error() string {
     70 	return fmt.Sprintf("line %d, column %d: %s", e.Line, e.Column, e.Err)
     71 }
     72 
     73 // These are the errors that can be returned in ParseError.Error
     74 var (
     75 	ErrTrailingComma = errors.New("extra delimiter at end of line") // no longer used
     76 	ErrBareQuote     = errors.New("bare \" in non-quoted-field")
     77 	ErrQuote         = errors.New("extraneous \" in field")
     78 	ErrFieldCount    = errors.New("wrong number of fields in line")
     79 )
     80 
     81 // A Reader reads records from a CSV-encoded file.
     82 //
     83 // As returned by NewReader, a Reader expects input conforming to RFC 4180.
     84 // The exported fields can be changed to customize the details before the
     85 // first call to Read or ReadAll.
     86 //
     87 // Comma is the field delimiter.  It defaults to ','.
     88 //
     89 // Comment, if not 0, is the comment character. Lines beginning with the
     90 // Comment character are ignored.
     91 //
     92 // If FieldsPerRecord is positive, Read requires each record to
     93 // have the given number of fields.  If FieldsPerRecord is 0, Read sets it to
     94 // the number of fields in the first record, so that future records must
     95 // have the same field count.  If FieldsPerRecord is negative, no check is
     96 // made and records may have a variable number of fields.
     97 //
     98 // If LazyQuotes is true, a quote may appear in an unquoted field and a
     99 // non-doubled quote may appear in a quoted field.
    100 //
    101 // If TrimLeadingSpace is true, leading white space in a field is ignored.
    102 type Reader struct {
    103 	Comma            rune // field delimiter (set to ',' by NewReader)
    104 	Comment          rune // comment character for start of line
    105 	FieldsPerRecord  int  // number of expected fields per record
    106 	LazyQuotes       bool // allow lazy quotes
    107 	TrailingComma    bool // ignored; here for backwards compatibility
    108 	TrimLeadingSpace bool // trim leading space
    109 	line             int
    110 	column           int
    111 	r                *bufio.Reader
    112 	field            bytes.Buffer
    113 }
    114 
    115 // NewReader returns a new Reader that reads from r.
    116 func NewReader(r io.Reader) *Reader {
    117 	return &Reader{
    118 		Comma: ',',
    119 		r:     bufio.NewReader(r),
    120 	}
    121 }
    122 
    123 // error creates a new ParseError based on err.
    124 func (r *Reader) error(err error) error {
    125 	return &ParseError{
    126 		Line:   r.line,
    127 		Column: r.column,
    128 		Err:    err,
    129 	}
    130 }
    131 
    132 // Read reads one record from r.  The record is a slice of strings with each
    133 // string representing one field.
    134 func (r *Reader) Read() (record []string, err error) {
    135 	for {
    136 		record, err = r.parseRecord()
    137 		if record != nil {
    138 			break
    139 		}
    140 		if err != nil {
    141 			return nil, err
    142 		}
    143 	}
    144 
    145 	if r.FieldsPerRecord > 0 {
    146 		if len(record) != r.FieldsPerRecord {
    147 			r.column = 0 // report at start of record
    148 			return record, r.error(ErrFieldCount)
    149 		}
    150 	} else if r.FieldsPerRecord == 0 {
    151 		r.FieldsPerRecord = len(record)
    152 	}
    153 	return record, nil
    154 }
    155 
    156 // ReadAll reads all the remaining records from r.
    157 // Each record is a slice of fields.
    158 // A successful call returns err == nil, not err == EOF. Because ReadAll is
    159 // defined to read until EOF, it does not treat end of file as an error to be
    160 // reported.
    161 func (r *Reader) ReadAll() (records [][]string, err error) {
    162 	for {
    163 		record, err := r.Read()
    164 		if err == io.EOF {
    165 			return records, nil
    166 		}
    167 		if err != nil {
    168 			return nil, err
    169 		}
    170 		records = append(records, record)
    171 	}
    172 }
    173 
    174 // readRune reads one rune from r, folding \r\n to \n and keeping track
    175 // of how far into the line we have read.  r.column will point to the start
    176 // of this rune, not the end of this rune.
    177 func (r *Reader) readRune() (rune, error) {
    178 	r1, _, err := r.r.ReadRune()
    179 
    180 	// Handle \r\n here.  We make the simplifying assumption that
    181 	// anytime \r is followed by \n that it can be folded to \n.
    182 	// We will not detect files which contain both \r\n and bare \n.
    183 	if r1 == '\r' {
    184 		r1, _, err = r.r.ReadRune()
    185 		if err == nil {
    186 			if r1 != '\n' {
    187 				r.r.UnreadRune()
    188 				r1 = '\r'
    189 			}
    190 		}
    191 	}
    192 	r.column++
    193 	return r1, err
    194 }
    195 
    196 // skip reads runes up to and including the rune delim or until error.
    197 func (r *Reader) skip(delim rune) error {
    198 	for {
    199 		r1, err := r.readRune()
    200 		if err != nil {
    201 			return err
    202 		}
    203 		if r1 == delim {
    204 			return nil
    205 		}
    206 	}
    207 }
    208 
    209 // parseRecord reads and parses a single csv record from r.
    210 func (r *Reader) parseRecord() (fields []string, err error) {
    211 	// Each record starts on a new line.  We increment our line
    212 	// number (lines start at 1, not 0) and set column to -1
    213 	// so as we increment in readRune it points to the character we read.
    214 	r.line++
    215 	r.column = -1
    216 
    217 	// Peek at the first rune.  If it is an error we are done.
    218 	// If we support comments and it is the comment character
    219 	// then skip to the end of line.
    220 
    221 	r1, _, err := r.r.ReadRune()
    222 	if err != nil {
    223 		return nil, err
    224 	}
    225 
    226 	if r.Comment != 0 && r1 == r.Comment {
    227 		return nil, r.skip('\n')
    228 	}
    229 	r.r.UnreadRune()
    230 
    231 	// At this point we have at least one field.
    232 	for {
    233 		haveField, delim, err := r.parseField()
    234 		if haveField {
    235 			// If FieldsPerRecord is greater then 0 we can assume the final
    236 			// length of fields to be equal to FieldsPerRecord.
    237 			if r.FieldsPerRecord > 0 && fields == nil {
    238 				fields = make([]string, 0, r.FieldsPerRecord)
    239 			}
    240 			fields = append(fields, r.field.String())
    241 		}
    242 		if delim == '\n' || err == io.EOF {
    243 			return fields, err
    244 		} else if err != nil {
    245 			return nil, err
    246 		}
    247 	}
    248 }
    249 
    250 // parseField parses the next field in the record.  The read field is
    251 // located in r.field.  Delim is the first character not part of the field
    252 // (r.Comma or '\n').
    253 func (r *Reader) parseField() (haveField bool, delim rune, err error) {
    254 	r.field.Reset()
    255 
    256 	r1, err := r.readRune()
    257 	for err == nil && r.TrimLeadingSpace && r1 != '\n' && unicode.IsSpace(r1) {
    258 		r1, err = r.readRune()
    259 	}
    260 
    261 	if err == io.EOF && r.column != 0 {
    262 		return true, 0, err
    263 	}
    264 	if err != nil {
    265 		return false, 0, err
    266 	}
    267 
    268 	switch r1 {
    269 	case r.Comma:
    270 		// will check below
    271 
    272 	case '\n':
    273 		// We are a trailing empty field or a blank line
    274 		if r.column == 0 {
    275 			return false, r1, nil
    276 		}
    277 		return true, r1, nil
    278 
    279 	case '"':
    280 		// quoted field
    281 	Quoted:
    282 		for {
    283 			r1, err = r.readRune()
    284 			if err != nil {
    285 				if err == io.EOF {
    286 					if r.LazyQuotes {
    287 						return true, 0, err
    288 					}
    289 					return false, 0, r.error(ErrQuote)
    290 				}
    291 				return false, 0, err
    292 			}
    293 			switch r1 {
    294 			case '"':
    295 				r1, err = r.readRune()
    296 				if err != nil || r1 == r.Comma {
    297 					break Quoted
    298 				}
    299 				if r1 == '\n' {
    300 					return true, r1, nil
    301 				}
    302 				if r1 != '"' {
    303 					if !r.LazyQuotes {
    304 						r.column--
    305 						return false, 0, r.error(ErrQuote)
    306 					}
    307 					// accept the bare quote
    308 					r.field.WriteRune('"')
    309 				}
    310 			case '\n':
    311 				r.line++
    312 				r.column = -1
    313 			}
    314 			r.field.WriteRune(r1)
    315 		}
    316 
    317 	default:
    318 		// unquoted field
    319 		for {
    320 			r.field.WriteRune(r1)
    321 			r1, err = r.readRune()
    322 			if err != nil || r1 == r.Comma {
    323 				break
    324 			}
    325 			if r1 == '\n' {
    326 				return true, r1, nil
    327 			}
    328 			if !r.LazyQuotes && r1 == '"' {
    329 				return false, 0, r.error(ErrBareQuote)
    330 			}
    331 		}
    332 	}
    333 
    334 	if err != nil {
    335 		if err == io.EOF {
    336 			return true, 0, err
    337 		}
    338 		return false, 0, err
    339 	}
    340 
    341 	return true, r1, nil
    342 }
    343