6.26 - Pandas DataFrames, part I


In [1]:
import numpy as np
import pandas as pd

In [2]:
from numpy.random import randn

In [20]:
np.random.seed(101) # the seed random dataset...meaning the 'random' numbers will actually be the same as the video

Creating DataFrames


In [21]:
df = pd.DataFrame(randn(5,4),['a','b','c','d','e'],['w','x','y','z'])

In [22]:
df # nuhhh


Out[22]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 -0.319318 -0.848077 0.605965
c -2.018168 0.740122 0.528813 -0.589001
d 0.188695 -0.758872 -0.933237 0.955057
e 0.190794 1.978757 2.605967 0.683509

Selecting columns


In [23]:
df['w'] # each column is a Series tho


Out[23]:
a    2.706850
b    0.651118
c   -2.018168
d    0.188695
e    0.190794
Name: w, dtype: float64

In [24]:
type(df['w']) # o_0


Out[24]:
pandas.core.series.Series

In [25]:
type(df)


Out[25]:
pandas.core.frame.DataFrame

In [26]:
df.w # SQL-like notation


Out[26]:
a    2.706850
b    0.651118
c   -2.018168
d    0.188695
e    0.190794
Name: w, dtype: float64

In [27]:
df[['w','z']]


Out[27]:
w z
a 2.706850 0.503826
b 0.651118 0.605965
c -2.018168 -0.589001
d 0.188695 0.955057
e 0.190794 0.683509

Adding and dropping columns

adding


In [41]:
df['new'] = df['w'] + df['y']

In [42]:
df # oh dang we just made something!


Out[42]:
w x y z new
a 2.706850 0.628133 0.907969 0.503826 3.614819
b 0.651118 -0.319318 -0.848077 0.605965 -0.196959
c -2.018168 0.740122 0.528813 -0.589001 -1.489355
d 0.188695 -0.758872 -0.933237 0.955057 -0.744542
e 0.190794 1.978757 2.605967 0.683509 2.796762

dropping to new DataFrame


In [43]:
df.drop('new',1) # gotta define which axis to drop


Out[43]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 -0.319318 -0.848077 0.605965
c -2.018168 0.740122 0.528813 -0.589001
d 0.188695 -0.758872 -0.933237 0.955057
e 0.190794 1.978757 2.605967 0.683509

In [44]:
df # but wait! we didn't change the original


Out[44]:
w x y z new
a 2.706850 0.628133 0.907969 0.503826 3.614819
b 0.651118 -0.319318 -0.848077 0.605965 -0.196959
c -2.018168 0.740122 0.528813 -0.589001 -1.489355
d 0.188695 -0.758872 -0.933237 0.955057 -0.744542
e 0.190794 1.978757 2.605967 0.683509 2.796762

To modify the original, use the inplace=True option


In [45]:
df.drop('new',axis=1,inplace=True)

In [46]:
df


Out[46]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 -0.319318 -0.848077 0.605965
c -2.018168 0.740122 0.528813 -0.589001
d 0.188695 -0.758872 -0.933237 0.955057
e 0.190794 1.978757 2.605967 0.683509

In [48]:
df.drop('e') #dropping columns


Out[48]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 -0.319318 -0.848077 0.605965
c -2.018168 0.740122 0.528813 -0.589001
d 0.188695 -0.758872 -0.933237 0.955057

In [49]:
df.shape


Out[49]:
(5, 4)

Selecting rows


In [51]:
df.loc['a'] # rows are also series


Out[51]:
w    2.706850
x    0.628133
y    0.907969
z    0.503826
Name: a, dtype: float64

In [54]:
df.iloc[2]


Out[54]:
w   -2.018168
x    0.740122
y    0.528813
z   -0.589001
Name: c, dtype: float64

Selecting subsets


In [55]:
df.loc['b','y']


Out[55]:
-0.84807698340363147

In [56]:
df.loc['a':'b','x':'y'] # similar to matrix selection


Out[56]:
x y
a 0.628133 0.907969
b -0.319318 -0.848077

In [58]:
df.loc[['a','b'],['x','y']] # individual row-column selection


Out[58]:
x y
a 0.628133 0.907969
b -0.319318 -0.848077

6.27 - Pandas DataFrames, part II

Conditional selection


In [60]:
booldf = df > 0
booldf


Out[60]:
w x y z
a True True True True
b True False False True
c False True True False
d True False False True
e True True True True

In [63]:
df[booldf] # also `df[df>0]`


Out[63]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 NaN NaN 0.605965
c NaN 0.740122 0.528813 NaN
d 0.188695 NaN NaN 0.955057
e 0.190794 1.978757 2.605967 0.683509

In [64]:
df['w']>0


Out[64]:
a     True
b     True
c    False
d     True
e     True
Name: w, dtype: bool

In [66]:
df[df['w']>0] # oooooOOOOOooo


Out[66]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 -0.319318 -0.848077 0.605965
d 0.188695 -0.758872 -0.933237 0.955057
e 0.190794 1.978757 2.605967 0.683509

In [67]:
df[df['z']<0]


Out[67]:
w x y z
c -2.018168 0.740122 0.528813 -0.589001

In [71]:
resultdf = df[df['w']>0]
resultdf['x'] # two steps


Out[71]:
a    0.628133
b   -0.319318
d   -0.758872
e    1.978757
Name: x, dtype: float64

In [74]:
df[df['w']>0]['x'] # or ['x','y'] or ['x':'z']


Out[74]:
a    0.628133
b   -0.319318
d   -0.758872
e    1.978757
Name: x, dtype: float64

Selection using multiple conditions


In [76]:
df[(df['w']>0) & (df['y']>1)] # one ampersand, not `and`


Out[76]:
w x y z
e 0.190794 1.978757 2.605967 0.683509

In [77]:
df[(df['w']>0) | (df['y']>1)] # pipe operator for `or`


Out[77]:
w x y z
a 2.706850 0.628133 0.907969 0.503826
b 0.651118 -0.319318 -0.848077 0.605965
d 0.188695 -0.758872 -0.933237 0.955057
e 0.190794 1.978757 2.605967 0.683509

Changing the index


In [79]:
df.reset_index() # specify `inplace=True` to change original dataset


Out[79]:
index w x y z
0 a 2.706850 0.628133 0.907969 0.503826
1 b 0.651118 -0.319318 -0.848077 0.605965
2 c -2.018168 0.740122 0.528813 -0.589001
3 d 0.188695 -0.758872 -0.933237 0.955057
4 e 0.190794 1.978757 2.605967 0.683509

In [83]:
newind = 'CO ME NH MT NY'.split() # nice

In [84]:
df['States'] = newind

In [85]:
df.set_index('States')


Out[85]:
w x y z
States
CO 2.706850 0.628133 0.907969 0.503826
ME 0.651118 -0.319318 -0.848077 0.605965
NH -2.018168 0.740122 0.528813 -0.589001
MT 0.188695 -0.758872 -0.933237 0.955057
NY 0.190794 1.978757 2.605967 0.683509

6.28 - Pandas DataFrames, part III

Creating MultiIndex and index hierarchies


In [91]:
# Index Levels
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside,inside)) # this is cool and probably useful
hier_index = pd.MultiIndex.from_tuples(hier_index)

In [102]:
dfh = pd.DataFrame(randn(6,2),hier_index,('a','b'))

In [101]:
dfh # hnngggh


Out[101]:
a b
G1 1 0.302665 1.693723
2 -1.706086 -1.159119
3 -0.134841 0.390528
G2 1 0.166905 0.184502
2 0.807706 0.072960
3 0.638787 0.329646

In [100]:
dfh.loc['G2'].loc[2]


Out[100]:
a    0.807706
b    0.072960
Name: 2, dtype: float64

In [104]:
dfh.index.names = ['Groups','Num']
dfh


Out[104]:
a b
Groups Num
G1 1 -0.497104 -0.754070
2 -0.943406 0.484752
3 -0.116773 1.901755
G2 1 0.238127 1.996652
2 -0.993263 0.196800
3 -1.136645 0.000366

In [105]:
dfh.loc['G2'].loc[2]['b']


Out[105]:
0.19679950499134005

In [106]:
dfh.loc['G1'].loc[3]['a']


Out[106]:
-0.11677331646707445

Cross-sections


In [111]:
dfh.xs(1,level='Num')


Out[111]:
a b
Groups
G1 -0.497104 -0.754070
G2 0.238127 1.996652

In [ ]: