In [183]:
import pandas as pd
import numpy as np
In [184]:
from numpy.random import randn
# Seed NumPy's global RNG so the randn() draws below are the same on every fresh run.
np.random.seed(101)
In [185]:
# 5x4 frame of standard-normal draws: rows labeled A-E, columns labeled W-Z.
df = pd.DataFrame(
    data=randn(5, 4),
    index=list('ABCDE'),
    columns=list('WXYZ'),
)
In [186]:
df
Out[186]:
In [187]:
# Bracket notation with a single label selects one column, returned as a Series.
df['W']
Out[187]:
In [188]:
# Pass a list of column names to select several columns at once;
# the result is a DataFrame holding just those columns, in the order listed.
df[['W','Z']]
Out[188]:
In [189]:
# Attribute-style access (NOT RECOMMENDED!): it only works when the column
# name is a valid Python identifier and does not collide with an existing
# DataFrame attribute or method; prefer df['W'].
df.W
Out[189]:
DataFrame Columns are just Series
In [190]:
# Confirm the type of a single selected column (a pandas Series).
type(df['W'])
Out[190]:
Creating a new column:
In [191]:
# Assigning to a label that does not exist yet adds a column;
# here it holds the element-wise sum of columns W and Y.
df['new'] = df['W'] + df['Y']
In [192]:
# The frame now carries 'new' as an additional column.
df
Out[192]:
Removing Columns
In [193]:
# axis=1 targets columns. drop() returns a new frame without 'new';
# df itself is left untouched.
df.drop('new',axis=1)
Out[193]:
In [194]:
# Not inplace unless specified! The drop() above only returned a copy,
# so df still contains the 'new' column.
df
Out[194]:
In [195]:
# Remove 'new' from df for good by rebinding df to the frame drop() returns.
# Rebinding replaces inplace=True: the state change is visible in the
# assignment and the call stays chainable.
df = df.drop(columns='new')
In [196]:
# 'new' is now gone from df itself.
df
Out[196]:
Can also drop rows this way:
In [197]:
# axis=0 targets the row index. Returns a new frame without row 'E';
# df itself is left untouched.
df.drop('E',axis=0)
Out[197]:
Selecting Rows
In [198]:
# .loc selects by label: row 'A' comes back as a Series keyed by column name.
df.loc['A']
Out[198]:
Or select based on position instead of label
In [199]:
# .iloc selects by zero-based integer position: 2 is the third row (label 'C').
df.iloc[2]
Out[199]:
Selecting subset of rows and columns
In [200]:
# Row label first, then column label: returns the single value at ('B', 'Y').
df.loc['B','Y']
Out[200]:
In [201]:
# Lists on both axes select a sub-frame: rows A and B, columns W and Y.
df.loc[['A','B'],['W','Y']]
Out[201]:
In [202]:
df
Out[202]:
In [203]:
# Element-wise comparison: yields a boolean DataFrame with the same shape as df.
df>0
Out[203]:
In [204]:
# Indexing with a boolean DataFrame keeps the shape; entries where the
# condition is False are shown as NaN.
df[df>0]
Out[204]:
In [205]:
# Indexing with a boolean Series filters rows: only rows where W > 0 are kept,
# with all of their columns.
df[df['W']>0]
Out[205]:
In [206]:
# Rows where W > 0, column 'Y' only. A single .loc[row_mask, column] call
# replaces the chained df[mask]['Y'] lookup and avoids building the
# intermediate filtered frame.
df.loc[df['W']>0, 'Y']
Out[206]:
In [207]:
# Rows where W > 0, columns Y and X (in that order). A single
# .loc[row_mask, columns] call replaces the chained df[mask][[...]] lookup.
df.loc[df['W']>0, ['Y','X']]
Out[207]:
For two conditions you can use | and & with parentheses:
In [208]:
# Combine row masks element-wise with & (and) / | (or). Each comparison needs
# its own parentheses because & and | bind more tightly than > in Python.
df[(df['W']>0) & (df['Y'] > 1)]
Out[208]:
In [209]:
df
Out[209]:
In [210]:
# Reset to default 0,1...n index. The old row labels move into a regular
# column; a new frame is returned and df itself is left untouched.
df.reset_index()
Out[210]:
In [211]:
# Five state codes, one per row of df.
newind = 'CA NY WY OR CO'.split()
In [212]:
# Attach the codes as a new column; a plain list is assigned positionally,
# so its length must equal the number of rows.
df['States'] = newind
In [213]:
# df now has 'States' as its last column.
df
Out[213]:
In [214]:
# Returns a new frame that uses the 'States' column as its row index
# (replacing the A-E labels); df itself is left untouched.
df.set_index('States')
Out[214]:
In [215]:
# df is unchanged: the set_index() call above only returned a copy.
df
Out[215]:
In [216]:
# Make 'States' the row index of df for good by rebinding df to the frame
# set_index() returns. Rebinding replaces inplace=True: the state change is
# visible in the assignment and the call stays chainable.
df = df.set_index('States')
In [218]:
# df is now indexed by the state codes.
df
Out[218]:
In [253]:
# Index levels: build a two-level (hierarchical) row index by pairing each
# outer group label with an inner number.
outside = ['G1'] * 3 + ['G2'] * 3
inside = [1, 2, 3] * 2
hier_index = pd.MultiIndex.from_tuples(list(zip(outside, inside)))
In [254]:
hier_index
Out[254]:
In [257]:
# 6x2 frame of standard-normal draws on the two-level hier_index.
# NOTE: this rebinds `df`; the States-indexed frame from the earlier cells
# is no longer reachable under that name.
df = pd.DataFrame(
    data=np.random.randn(6, 2),
    index=hier_index,
    columns=list('AB'),
)
df
Out[257]:
Now let's show how to index this! For an index hierarchy on the rows we use df.loc[]; if the hierarchy were on the columns axis, you would just use normal bracket notation df[]. Selecting one label of the outer level returns the sub-DataFrame:
In [260]:
# An outer-level label selects every row in that group; the result is a
# sub-frame indexed by the remaining inner level.
df.loc['G1']
Out[260]:
In [263]:
# Chained lookup: take the G1 sub-frame, then row 1 inside it.
# df.loc[('G1', 1)] reaches the same row in a single step.
df.loc['G1'].loc[1]
Out[263]:
In [265]:
# Names of the index levels; none were given when the index was built,
# so both levels are still unnamed.
df.index.names
Out[265]:
In [266]:
# Name the levels (outer first, then inner) so they can be referred to by name.
df.index.names = ['Group','Num']
In [267]:
# The index levels are now labeled 'Group' and 'Num'.
df
Out[267]:
In [270]:
# xs() takes a cross-section; a bare key is matched against the outermost
# level, so this returns the G1 rows just like df.loc['G1'].
df.xs('G1')
Out[270]:
In [271]:
# Cross-section at outer label 'G1' and inner label 1.
# A multi-level key must be a tuple: list keys were deprecated in pandas 1.x
# and are rejected by pandas 2.0 and later.
df.xs(('G1', 1))
Out[271]:
In [273]:
# level= makes xs() match on an inner level instead: every row whose 'Num'
# is 1, across all groups.
df.xs(1,level='Num')
Out[273]: