In [2]:
import csv

reader

csv.reader(
csvfile, dialect='excel', **fmtparams
)

return a reader object which will iterate over lines in the give csvfile

  • if csvfile is a file object, it should be opened with newline=''

writer

csv.writer(
csvfile, dialect='excel', **fmtparams
)

return a writer object responsible for converting the user's data into delimited strings on the give file-like object.

  • if csvfile is a file object, it should be opened with newline=''

Dialect

Group parameters that control how to parse a csv file.

Get list of existing Dialect


In [5]:
csv.list_dialects()


Out[5]:
['excel', 'excel-tab', 'unix']
  • excel: microsoft excel, libreoffice
  • unix: quotes field with double-quotes and use \n as record separator

Register/Unregister a dialect

csv.register_dialect(name[, dialect[, **fmtparams]])
csv.unregister_dialect(name)

In [ ]:
# %load data/testdata.pipes
"Title 1"|"Title 2"|"Title 3"
1|"first line
second line"|08/18/07

In [6]:
csv.register_dialect('pipes', delimiter='|')
with open("data/testdata.pipes",'r') as f:
    reader = csv.reader(f,dialect='pipes')
    for row in reader:
        print(row)


['Title 1', 'Title 2', 'Title 3']
['1', 'first line\nsecond line', '08/18/07']

In [15]:
csv.unregister_dialect("pipes")

get_dialect

csv.get_dialect(name)

In [9]:
dialect_pipes = csv.get_dialect("pipes")

In [11]:
dialect_pipes.delimiter, dialect_pipes.doublequote


Out[11]:
('|', 1)

Dialect Paramters

CSV Dialect Parameters
Attribute Default Meaning
delimiter , Field separator (one character)
doublequote True Flag controlling whether quotechar instances are doubled
escapechar None Character used to indicate an escape sequence
lineterminator \r\n String used by writer to terminate a line
quotechar " String to surround fields containing special values (one character)
quoting QUOTE_MINIMAL Controls quoting behavior described earlier
skipinitialspace False Ignore whitespace after the field delimiter

Examples:


In [16]:
import csv
import sys

csv.register_dialect('escaped',
                     escapechar='\\',
                     doublequote=False,
                     quoting=csv.QUOTE_NONE,
                     )
csv.register_dialect('singlequote',
                     quotechar="'",
                     quoting=csv.QUOTE_ALL,
                     )

quoting_modes = {
    getattr(csv, n): n
    for n in dir(csv)
    if n.startswith('QUOTE_')
}

TEMPLATE = '''\
Dialect: "{name}"

  delimiter   = {dl!r:<6}    skipinitialspace = {si!r}
  doublequote = {dq!r:<6}    quoting          = {qu}
  quotechar   = {qc!r:<6}    lineterminator   = {lt!r}
  escapechar  = {ec!r:<6}
'''

for name in sorted(csv.list_dialects()):
    dialect = csv.get_dialect(name)

    print(TEMPLATE.format(
        name=name,
        dl=dialect.delimiter,
        si=dialect.skipinitialspace,
        dq=dialect.doublequote,
        qu=quoting_modes[dialect.quoting],
        qc=dialect.quotechar,
        lt=dialect.lineterminator,
        ec=dialect.escapechar,
    ))

    writer = csv.writer(sys.stdout, dialect=dialect)
    writer.writerow(
        ('col1', 1, '10/01/2010',
         'Special chars: " \' {} to parse'.format(
             dialect.delimiter))
    )
    print()


Dialect: "escaped"

  delimiter   = ','       skipinitialspace = 0
  doublequote = 0         quoting          = QUOTE_NONE
  quotechar   = '"'       lineterminator   = '\r\n'
  escapechar  = '\\'  

col1,1,10/01/2010,Special chars: \" ' \, to parse

Dialect: "excel"

  delimiter   = ','       skipinitialspace = 0
  doublequote = 1         quoting          = QUOTE_MINIMAL
  quotechar   = '"'       lineterminator   = '\r\n'
  escapechar  = None  

col1,1,10/01/2010,"Special chars: "" ' , to parse"

Dialect: "excel-tab"

  delimiter   = '\t'      skipinitialspace = 0
  doublequote = 1         quoting          = QUOTE_MINIMAL
  quotechar   = '"'       lineterminator   = '\r\n'
  escapechar  = None  

col1	1	10/01/2010	"Special chars: "" ' 	 to parse"

Dialect: "singlequote"

  delimiter   = ','       skipinitialspace = 0
  doublequote = 1         quoting          = QUOTE_ALL
  quotechar   = "'"       lineterminator   = '\r\n'
  escapechar  = None  

'col1','1','10/01/2010','Special chars: " '' , to parse'

Dialect: "unix"

  delimiter   = ','       skipinitialspace = 0
  doublequote = 1         quoting          = QUOTE_ALL
  quotechar   = '"'       lineterminator   = '\n'
  escapechar  = None  

"col1","1","10/01/2010","Special chars: "" ' , to parse"

Automatically Detect Dialects

for data where the dialect parameters are unknown, the Sniffer class can be used to make an educated guess. Sniffer class takes a sample of the input data and an optional argument giving the possible delimiter characters.


In [17]:
import csv
from io import StringIO
import textwrap

csv.register_dialect('escaped',
                     escapechar='\\',
                     doublequote=False,
                     quoting=csv.QUOTE_NONE)
csv.register_dialect('singlequote',
                     quotechar="'",
                     quoting=csv.QUOTE_ALL)

# Generate sample data for all known dialects
samples = []
for name in sorted(csv.list_dialects()):
    buffer = StringIO()
    dialect = csv.get_dialect(name)
    writer = csv.writer(buffer, dialect=dialect)
    writer.writerow(
        ('col1', 1, '10/01/2010',
         'Special chars " \' {} to parse'.format(
             dialect.delimiter))
    )
    samples.append((name, dialect, buffer.getvalue()))

# Guess the dialect for a given sample, and then use the results
# to parse the data.
sniffer = csv.Sniffer()
for name, expected, sample in samples:
    print('Dialect: "{}"'.format(name))
    print('In: {}'.format(sample.rstrip()))
    dialect = sniffer.sniff(sample, delimiters=',\t')
    reader = csv.reader(StringIO(sample), dialect=dialect)
    print('Parsed:\n  {}\n'.format(
          '\n  '.join(repr(r) for r in next(reader))))


Dialect: "escaped"
In: col1,1,10/01/2010,Special chars \" ' \, to parse
Parsed:
  'col1'
  '1'
  '10/01/2010'
  'Special chars \\" \' \\'
  ' to parse'

Dialect: "excel"
In: col1,1,10/01/2010,"Special chars "" ' , to parse"
Parsed:
  'col1'
  '1'
  '10/01/2010'
  'Special chars " \' , to parse'

Dialect: "excel-tab"
In: col1	1	10/01/2010	"Special chars "" ' 	 to parse"
Parsed:
  'col1'
  '1'
  '10/01/2010'
  'Special chars " \' \t to parse'

Dialect: "singlequote"
In: 'col1','1','10/01/2010','Special chars " '' , to parse'
Parsed:
  'col1'
  '1'
  '10/01/2010'
  'Special chars " \' , to parse'

Dialect: "unix"
In: "col1","1","10/01/2010","Special chars "" ' , to parse"
Parsed:
  'col1'
  '1'
  '10/01/2010'
  'Special chars " \' , to parse'

Using field name: DictReader/DictWriter

translate rows to dictionaries instead of lists

DictReader

csv.DictReader(csvfile, fieldnames=None, restkey=None, restval=None, dialect=’excel’, *args, **kwds)

In [21]:
with open("data/csv_data.csv",'r') as f:
    reader = csv.DictReader(f)
    for row in reader:
        print(row)


OrderedDict([('title', 'a'), ('number', '2'), ('price', '5.0')])
OrderedDict([('title', 'b'), ('number', '89'), ('price', '9.9')])
OrderedDict([('title', 'c'), ('number', '78'), ('price', '12.0')])

DictWriter

csv.DictWriter(csvfile, fieldnames, restval=”, extrasaction=’raise’, dialect=’excel’, *args, **kwds)

Note: The writer must be given a list of field names so it knows how to order the columns in the output.


In [22]:
with open('data/csv_dict_writer.csv', 'w') as csvfile:
    fieldnames = ['first_name', 'last_name']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerow({'first_name': 'Baked', 'last_name': 'Beans'})
    writer.writerow({'first_name': 'Lovely', 'last_name': 'Spam'})
    writer.writerow({'first_name': 'Wonderful', 'last_name': 'Spam'})

In [ ]:
# %load data/csv_dict_writer.csv
first_name,last_name
Baked,Beans
Lovely,Spam
Wonderful,Spam

Quoting

There are four different quoting options, defined as constants in the csv module.

  • QUOTE_ALL Quote everything, regardless of type.
  • QUOTE_MINIMAL Quote fields with special characters (anything that would confuse a parser configured with the same dialect and options). This is the default
  • QUOTE_NONNUMERIC Quote all fields that are not integers or floats. When used with the reader, input fields that are not quoted are converted to floats.
  • QUOTE_NONE Do not quote anything on output. When used with the reader, quote characters are included in the field values (normally, they are treated as delimiters and stripped).

In [ ]: