Chapter_06



In [8]:
import pandas as pd
import numpy as np
from pandas import DataFrame, Series

In [9]:
df = pd.read_csv('ex1.csv')
df


Out[9]:
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo

In [10]:
pd.read_table('ex1.csv', sep=',')


Out[10]:
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo

In [13]:
pd.read_csv('ex2.csv', header=None)


Out[13]:
0 1 2 3 4
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo

In [14]:
names = ['a', 'b', 'c', 'd', 'message']
pd.read_csv('ex2.csv', names = names, index_col = 'message')


Out[14]:
a b c d
message
hello 1 2 3 4
world 5 6 7 8
foo 9 10 11 12

In [18]:
parsed = pd.read_csv('csv_minindex.csv', index_col = ['key1', 'key2'])
parsed


Out[18]:
value1 value2
key1 key2
one a 1 2
b 3 4
c 5 6
d 7 8
two a 9 10
b 11 12
c 12 14
d 15 16

In [19]:
list(open('ex3.txt'))


Out[19]:
['        A      B      C\n',
 'aaa -0.22 -3.123  1.234\n',
 'bbb  0.92  3.123 -3.333\n',
 'ccc  1.22  1.222  3.444\n',
 'ddd  12.1  6.666  6.666\n']

In [21]:
result = pd.read_table('ex3.txt', sep='\s+')
result


Out[21]:
A B C
aaa -0.22 -3.123 1.234
bbb 0.92 3.123 -3.333
ccc 1.22 1.222 3.444
ddd 12.10 6.666 6.666

In [22]:
pd.read_csv('ex4.csv', skiprows=[0, 2, 3])


Out[22]:
a b c d message
0 1 2 3 4 hello
1 5 6 7 8 world
2 9 10 11 12 foo

In [23]:
result = pd.read_csv('ex5.csv')
result


Out[23]:
something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo

In [24]:
pd.isnull(result)


Out[24]:
something a b c d message
0 False False False False False True
1 False False False True False False
2 False False False False False False

In [30]:
result = pd.read_csv('ex5.csv', na_values=['NULL'])
result


Out[30]:
something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo

In [32]:
sentinels = {'message' : ['foo', 'NA'],
             'something' : ['two']}
pd.read_csv('ex5.csv', na_values=sentinels)


Out[32]:
something a b c d message
0 one 1 2 3.0 4 NaN
1 NaN 5 6 NaN 8 world
2 three 9 10 11.0 12 NaN

In [33]:
result = pd.read_csv('ex6.csv')
result


Out[33]:
one two three four key
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
2 -0.501840 0.659254 -0.421691 -0.057688 G
3 0.204886 1.074134 1.388361 -0.982404 R
4 0.354628 -0.133116 0.283763 -0.837063 Q
5 1.817480 0.742273 0.419395 -2.251035 Q
6 -0.776764 0.935518 -0.332872 -1.875641 U
7 -0.913135 1.530624 -0.572657 0.477252 K
8 0.358480 -0.497572 -0.367016 0.507702 S
9 -1.740877 -1.160417 -1.637830 2.172201 G
10 0.240564 -0.328249 1.252155 1.072796 8
11 0.764018 1.165476 -0.639544 1.495258 R
12 0.571035 -0.310537 0.582437 -0.298765 1
13 2.317658 0.430710 -1.334216 0.199679 P
14 1.547771 -1.119753 -2.277634 0.329586 J
15 -1.310608 0.401719 -1.000987 1.156708 E
16 -0.088496 0.634712 0.153324 0.415335 B
17 -0.018663 -0.247487 -1.446522 0.750938 A
18 -0.070127 -1.579097 0.120892 0.671432 F
19 -0.194678 -0.492039 2.359605 0.319810 H
20 -0.248618 0.868707 -0.492226 -0.717959 W
21 -1.091549 -0.867110 -0.647760 -0.832562 C
22 0.641404 -0.138822 -0.621963 -0.284839 C
23 1.216408 0.992687 0.165162 -0.069619 V
24 -0.564474 0.792832 0.747053 0.571675 I
25 1.759879 -0.515666 -0.230481 1.362317 S
26 0.126266 0.309281 0.382820 -0.239199 L
27 1.334360 -0.100152 -0.840731 -0.643967 6
28 -0.737620 0.278087 -0.053235 -0.950972 J
29 -1.148486 -0.986292 -0.144963 0.124362 Y
... ... ... ... ... ...
9970 0.633495 -0.186524 0.927627 0.143164 4
9971 0.308636 -0.112857 0.762842 -1.072977 1
9972 -1.627051 -0.978151 0.154745 -1.229037 Z
9973 0.314847 0.097989 0.199608 0.955193 P
9974 1.666907 0.992005 0.496128 -0.686391 S
9975 0.010603 0.708540 -1.258711 0.226541 K
9976 0.118693 -0.714455 -0.501342 -0.254764 K
9977 0.302616 -2.011527 -0.628085 0.768827 H
9978 -0.098572 1.769086 -0.215027 -0.053076 A
9979 -0.019058 1.964994 0.738538 -0.883776 F
9980 -0.595349 0.001781 -1.423355 -1.458477 M
9981 1.392170 -1.396560 -1.425306 -0.847535 H
9982 -0.896029 -0.152287 1.924483 0.365184 6
9983 -2.274642 -0.901874 1.500352 0.996541 N
9984 -0.301898 1.019906 1.102160 2.624526 I
9985 -2.548389 -0.585374 1.496201 -0.718815 D
9986 -0.064588 0.759292 -1.568415 -0.420933 E
9987 -0.143365 -1.111760 -1.815581 0.435274 2
9988 -0.070412 -1.055921 0.338017 -0.440763 X
9989 0.649148 0.994273 -1.384227 0.485120 Q
9990 -0.370769 0.404356 -1.051628 -1.050899 8
9991 -0.409980 0.155627 -0.818990 1.277350 W
9992 0.301214 -1.111203 0.668258 0.671922 A
9993 1.821117 0.416445 0.173874 0.505118 X
9994 0.068804 1.322759 0.802346 0.223618 H
9995 2.311896 -0.417070 -1.409599 -0.515821 L
9996 -0.479893 -0.650419 0.745152 -0.646038 E
9997 0.523331 0.787112 0.486066 1.093156 K
9998 -0.362559 0.598894 -1.843201 0.887292 G
9999 -0.096376 -1.012999 -0.657431 -0.573315 0

10000 rows × 5 columns


In [34]:
pd.read_csv('ex6.csv', nrows=5)


Out[34]:
one two three four key
0 0.467976 -0.038649 -0.295344 -1.824726 L
1 -0.358893 1.404453 0.704965 -0.200638 B
2 -0.501840 0.659254 -0.421691 -0.057688 G
3 0.204886 1.074134 1.388361 -0.982404 R
4 0.354628 -0.133116 0.283763 -0.837063 Q

In [36]:
chunker = pd.read_csv('ex6.csv', chunksize=1000)
chunker


Out[36]:
<pandas.io.parsers.TextFileReader at 0x10c34b2e8>

In [39]:
tot = Series([])
for piece in pd.read_csv('ex6.csv', chunksize=1000):
    tot = tot.add(piece['key'].value_counts(), fill_value=0)
tot = tot.sort_values(ascending = False)
tot


Out[39]:
E    368.0
X    364.0
L    346.0
O    343.0
Q    340.0
M    338.0
J    337.0
F    335.0
K    334.0
H    330.0
V    328.0
I    327.0
U    326.0
P    324.0
D    320.0
A    320.0
R    318.0
Y    314.0
G    308.0
S    308.0
N    306.0
W    305.0
T    304.0
B    302.0
Z    288.0
C    286.0
4    171.0
6    166.0
7    164.0
8    162.0
3    162.0
5    157.0
2    152.0
0    151.0
9    150.0
1    146.0
dtype: float64

In [40]:
data = pd.read_csv('ex5.csv')
data


Out[40]:
something a b c d message
0 one 1 2 3.0 4 NaN
1 two 5 6 NaN 8 world
2 three 9 10 11.0 12 foo

In [41]:
data.to_csv('out.csv')

In [42]:
!cat out.csv


,something,a,b,c,d,message
0,one,1,2,3.0,4,
1,two,5,6,,8,world
2,three,9,10,11.0,12,foo

In [44]:
import sys
data.to_csv(sys.stdout, sep='|')


|something|a|b|c|d|message
0|one|1|2|3.0|4|
1|two|5|6||8|world
2|three|9|10|11.0|12|foo

In [47]:
data.to_csv(sys.stdout, na_rep='NULL')


,something,a,b,c,d,message
0,one,1,2,3.0,4,NULL
1,two,5,6,NULL,8,world
2,three,9,10,11.0,12,foo

In [48]:
data.to_csv(sys.stdout, index = False, header = False)


one,1,2,3.0,4,
two,5,6,,8,world
three,9,10,11.0,12,foo

In [50]:
data.to_csv(sys.stdout, index = False, columns = ['a', 'b', 'c'])


a,b,c
1,2,3.0
5,6,
9,10,11.0

In [52]:
dates = pd.date_range('1/1/2000', periods = 7)
dates


Out[52]:
DatetimeIndex(['2000-01-01', '2000-01-02', '2000-01-03', '2000-01-04',
               '2000-01-05', '2000-01-06', '2000-01-07'],
              dtype='datetime64[ns]', freq='D')

In [54]:
ts = Series(np.arange(7), index=dates)
ts


Out[54]:
2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
Freq: D, dtype: int64

In [55]:
ts.to_csv('tseries.csv')

In [56]:
!cat tseries.csv


2000-01-01,0
2000-01-02,1
2000-01-03,2
2000-01-04,3
2000-01-05,4
2000-01-06,5
2000-01-07,6

In [57]:
Series.from_csv('tseries.csv', parse_dates=True)


Out[57]:
2000-01-01    0
2000-01-02    1
2000-01-03    2
2000-01-04    3
2000-01-05    4
2000-01-06    5
2000-01-07    6
dtype: int64

In [59]:
import csv
f = open('ex7.csv')
reader = csv.reader(f)
for line in reader:
    print(line)


['a', 'b', 'c']
['1', '2', '3']
['1', '2', '3', '4']

In [61]:
lines = list(csv.reader(open('ex7.csv')))
lines


Out[61]:
[['a', 'b', 'c'], ['1', '2', '3'], ['1', '2', '3', '4']]

In [63]:
header, values = lines[0], lines[1:]
data_dict = {h : v for h, v in zip(header, zip(*values))}
data_dict


Out[63]:
{'a': ('1', '1'), 'b': ('2', '2'), 'c': ('3', '3')}

In [64]:
obj = """
{"name": "Wes",
 "places_lived": ["United States", "Spain", "Germany"],
 "pet": null,
 "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
              {"name": "Katie", "age": 33, "pet": "Cisco"}]
}
"""

In [66]:
import json
result = json.loads(obj)
result


Out[66]:
{'name': 'Wes',
 'pet': None,
 'places_lived': ['United States', 'Spain', 'Germany'],
 'siblings': [{'age': 25, 'name': 'Scott', 'pet': 'Zuko'},
  {'age': 33, 'name': 'Katie', 'pet': 'Cisco'}]}

In [67]:
asjson = json.dumps(result)
asjson


Out[67]:
'{"name": "Wes", "places_lived": ["United States", "Spain", "Germany"], "pet": null, "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"}, {"name": "Katie", "age": 33, "pet": "Cisco"}]}'

In [69]:
siblings = DataFrame(result['siblings'], columns = ['name', 'age'])
siblings


Out[69]:
name age
0 Scott 25
1 Katie 33

In [71]:
from lxml.html import parse
from urllib2 import urlopen

parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
doc = parsed.getroot()
doc


---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-71-88d78579c550> in <module>()
      1 from lxml.html import parse
----> 2 from urllib2 import urlopen
      3 
      4 parsed = parse(urlopen('http://finance.yahoo.com/q/op?s=AAPL+Options'))
      5 doc = parsed.getroot()

ModuleNotFoundError: No module named 'urllib2'

In [74]:
frame = pd.read_csv('ex1.csv')

In [ ]: