In [22]:
import numpy as np
import pandas as pd
import os
import sys
import matplotlib
%matplotlib inline

from pandas import Series, DataFrame

In [23]:
!cat ../data/ch3/ex1.csv

# !type ../data/ch3/ex1.csv


a,b,c,d,message
1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

Check version


In [24]:
pd.__version__


Out[24]:
u'0.19.1'

Simplest Way


In [25]:
data = pd.read_csv('../data/ch3/ex1.csv')
data


Out[25]:
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo

If no header


In [26]:
!cat ../data/ch3/ex2.csv

# !type ../data/ch3/ex2.csv


1,2,3,4,hello
5,6,7,8,world
9,10,11,12,foo

In [27]:
data3 = pd.read_csv('../data/ch3/ex2.csv', header = None)
# data2 = pd.read_csv('../data/ch3/ex2.csv', names=['a', 'b', 'c', 'd', 'message'])
data3


Out[27]:
0 1 2 3 4
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo

Only read specific columns


In [28]:
datan = pd.read_csv('../data/ch3/ex1.csv', usecols = ['a', 'b', 'c'])
datan


Out[28]:
a b c
0 1 2 3
1 5 6 7
2 9 10 11

Specify Index


In [29]:
names = ['a', 'b', 'c', 'd', 'message']
data4 = pd.read_csv('../data/ch3/ex2.csv', names=names, index_col='message')
data4


Out[29]:
a b c d
message
hello 1 2 3 4
world 5 6 7 8
foo 9 10 11 12

Only handle specified columns


In [30]:
data.a


Out[30]:
0    1
1    5
2    9
Name: a, dtype: int64

In [31]:
data['a']


Out[31]:
0    1
1    5
2    9
Name: a, dtype: int64

In [32]:
data[data.columns[[1, 2]]]


Out[32]:
b c
0 2 3
1 6 7
2 10 11

Missing Value Handling


In [33]:
!cat ../data/ch3/ex5.csv

#!type ../data/ch3/ex5.csv


something,a,b,c,d,message
one,1,2,3,4,NA
two,5,6,,8,world
three,9,10,11,12,foo

In [34]:
data5 = pd.read_csv('../data/ch3/ex5.csv')
# print data5.isnull()
# print data5.isnull().values
print data5[data5.isnull().values == True]


  something  a  b    c  d message
0       one  1  2  3.0  4     NaN
1       two  5  6  NaN  8   world

In [35]:
data6 = pd.read_csv('../data/ch3/ex5.csv', na_values = ['world'])
data6


Out[35]:
something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 NaN
2 three 9 10 11.0 12 foo

In [36]:
sentinels = {'message':['foo', 'NA'], 'something':['two']}
data7 = pd.read_csv('../data/ch3/ex5.csv', na_values = sentinels)
data7


Out[36]:
something a b c d message
0 one 1 2 3.0 4 NaN
1 NaN 5 6 NaN 8 world
2 three 9 10 11.0 12 NaN

Read Big File


In [37]:
data8 = pd.read_csv('../data/ch3/ex6.csv', nrows = 5)
data8


Out[37]:
one two three four key
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
2 -0.501840 0.659254 -0.421691 -0.057688 G
3 0.204886 1.074134 1.388361 -0.982404 R
4 0.354628 -0.133116 0.283763 -0.837063 Q

In [38]:
# chunksize specifies the number of rows in each chunk
chunker = pd.read_csv('../data/ch3/ex6.csv', chunksize = 1000)
tot = Series([])
for piece in chunker:
    tot = tot.add(piece['key'].value_counts(), fill_value = 0)
tot = tot.sort_values(ascending = False)
tot


Out[38]:
E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
V    328.0
I    327.0
U    326.0
P    324.0
D    320.0
A    320.0
R    318.0
Y    314.0
G    308.0
S    308.0
N    306.0
W    305.0
T    304.0
B    302.0
Z    288.0
C    286.0
4    171.0
6    166.0
7    164.0
8    162.0
3    162.0
5    157.0
2    152.0
0    151.0
9    150.0
1    146.0
dtype: float64

Write to CSV


In [39]:
data.to_csv('ch3-out1.csv')

In [40]:
data.to_csv('ch3-out2.csv', index = False)
data.to_csv('ch3-out3.csv', index = False, header = False, columns = ['a', 'b', 'c'])

data5.to_csv('../data/ch3/ex1.tsv', index = False, header = False, sep = '\t')
!cat '../data/ch3/ex1.tsv'


one	1	2	3.0	4	
two	5	6		8	world
three	9	10	11.0	12	foo

Read TSV


In [41]:
data = pd.read_csv('../data/ch3/ex1.tsv', sep = '\t')
data


Out[41]:
one 1 2 3.0 4 Unnamed: 5
0 two 5 6 NaN 8 world
1 three 9 10 11.0 12 foo

In [42]:
data = pd.read_table('../data/ch3/ex1.tsv')
data


Out[42]:
one 1 2 3.0 4 Unnamed: 5
0 two 5 6 NaN 8 world
1 three 9 10 11.0 12 foo

In [43]:
data5.to_csv('ch3-out4.tsv', sep = '\t', na_rep = 'NULL', index = False, header = False)
!cat 'ch3-out4.tsv'


one	1	2	3.0	4	NULL
two	5	6	NULL	8	world
three	9	10	11.0	12	foo