In [1]:
import pandas as pd
import numpy as np
In [2]:
# Index levels: outer group labels and inner serial numbers
outside = ['G1', 'G1', 'G1', 'G2', 'G2', 'G2']
inside = [1, 2, 3, 1, 2, 3]
# Pair each outer label with its inner serial -> list of (group, serial) tuples
hier_index = [(group, serial) for group, serial in zip(outside, inside)]
hier_index
Out[2]:
In [3]:
# Build a two-level pandas MultiIndex from the list of (group, serial) tuples
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index
Out[3]:
In [4]:
# Create a (6, 2) DataFrame of standard-normal draws with the two-level index.
# Seed the generator so Restart & Run All reproduces the same values.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((6, 2)), index=hier_index, columns=['A', 'B'])
df
Out[4]:
In [5]:
# Column access works the same as on a single-level index
df['A']
Out[5]:
In [6]:
# Row access: .loc with an outer-level label returns the inner sub-frame
df.loc['G1']
Out[6]:
In [7]:
# Access a single inner-level row: index with an (outer, inner) tuple in one
# .loc call instead of chaining two .loc lookups.
df.loc[('G1', 1)]
Out[7]:
In [14]:
# Access a single cell: one .loc call with an (outer, inner) row key and the
# column label — avoids chained indexing (.loc[...].loc[...][...]).
df.loc[('G2', 3), 'B']
Out[14]:
In [8]:
# Inspect the index level names (not yet assigned at this point)
df.index.names
Out[8]:
In [9]:
# Name the two index levels for readable display and level-based lookups
df.index.names = ['Group', 'Serial']
df
Out[9]:
In [10]:
# Cross-section: all rows with Serial == 1, selected from the inner level
df.xs(1, level='Serial')
Out[10]:
In [11]:
# Get the row with serial 2 inside group G1. xs expects a tuple (not a list)
# when keying into multiple index levels; a list raises in modern pandas.
df.xs(('G1', 2))
Out[11]:
In [12]:
# Small demo frame with missing values in columns 'a' and 'b'
d = {
    'a': [1, 2, np.nan],
    'b': [np.nan, 5, np.nan],
    'c': [6, 7, 8],
}
dfna = pd.DataFrame(data=d)
dfna
Out[12]:
In [13]:
# Drop rows containing one or more NaN values (defaults: axis=0, how='any')
dfna.dropna()
Out[13]:
In [14]:
# Drop columns containing one or more NaN values
dfna.dropna(axis=1)
Out[14]:
In [15]:
# thresh counts required non-NaN values: keep rows with at least 2 non-NaN
# entries (with 3 columns this drops rows holding 2 or more NaNs)
dfna.dropna(axis=0, thresh=2)
Out[15]:
In [16]:
# Replace every NaN with a fixed sentinel value
dfna.fillna(value=999)
Out[16]:
In [17]:
# Fill NaNs per column: dfna.mean() is a Series of column means, so each
# column's NaNs are filled with that column's own mean (not a global mean)
dfna.fillna(value = dfna.mean())
Out[17]:
In [18]:
# Fill NaNs in a single column with that column's mean
dfna['a'].fillna(value = dfna['a'].mean())
Out[18]:
In [19]:
# Toy sales table: two salespeople per company, for groupby demos
comp_data = {
    'Company': ['GOOG', 'GOOG', 'MSFT', 'MSFT', 'FB', 'FB'],
    'Person': ['Sam', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
    'Sales': [200, 120, 340, 124, 243, 350],
}
comp_df = pd.DataFrame(data=comp_data)
comp_df
Out[19]:
In [21]:
# Mean sales by company. numeric_only=True restricts the aggregation to the
# numeric 'Sales' column — pandas >= 2.0 raises a TypeError on the string
# 'Person' column instead of silently dropping it.
comp_df.groupby('Company').mean(numeric_only=True)
Out[21]:
In [22]:
# Standard deviation in sales by company. numeric_only=True is required on
# pandas >= 2.0 because the frame also holds the non-numeric 'Person' column.
comp_df.groupby('Company').std(numeric_only=True)
Out[22]:
In [23]:
# Per-company summary statistics of the numeric columns
comp_df.groupby('Company').describe()
Out[23]:
In [24]:
# Transpose so companies become columns — easier to compare side by side
comp_df.groupby('Company').describe().transpose()
Out[24]:
In [25]:
# The group labels become the index of the describe() result
comp_df.groupby('Company').describe().index
Out[25]:
In [26]:
# Two frames with identical columns (A-D) and disjoint, consecutive integer
# indexes (0-3 and 4-7), built programmatically for the concat demos below.
df1 = pd.DataFrame({col: [f'{col}{i}' for i in range(0, 4)] for col in 'ABCD'},
                   index=[0, 1, 2, 3])
df2 = pd.DataFrame({col: [f'{col}{i}' for i in range(4, 8)] for col in 'ABCD'},
                   index=[4, 5, 6, 7])
In [27]:
df1  # display the first frame
Out[27]:
In [28]:
df2  # display the second frame
Out[28]:
In [29]:
# Extend along rows (axis=0, the default)
pd.concat([df1, df2]) # flows well because the indexes are sequential and the columns match
Out[29]:
In [30]:
# Extend along columns
pd.concat([df1, df2], axis=1) # fills NaN where the indexes don't match
Out[30]:
In [31]:
# Merge operands: both frames share the join key 'key1' and both carry a
# column named 'B' (with different contents) to demonstrate suffixing.
left = pd.DataFrame({
    'key1': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K2', 'K3'],
    'B': ['C0', 'C1', 'C2', 'C3'],
    'C': ['D0', 'D1', 'D2', 'D3'],
})
left
Out[31]:
In [32]:
right  # display the second merge operand
Out[32]:
In [33]:
# Merge on key1; the shared non-key column 'B' gets suffixed B_x / B_y
pd.merge(left, right, how='inner', on='key1')
Out[33]:
When both tables have columns with the same name that are not used for merging (i.e. not listed in `on`), pandas appends `_x` and `_y` to those column names to differentiate them.
In [34]:
# Add a second key column to demonstrate merging on multiple keys
left['key2'] = ['K0', 'K1', 'K0', 'K1']
left
Out[34]:
In [35]:
# right's key2 is all 'K0', so only some (key1, key2) pairs will match left's
right['key2'] = ['K0', 'K0', 'K0', 'K0']
right
Out[35]:
In [36]:
# Inner merge on both keys: keeps only (key1, key2) pairs present in both frames
pd.merge(left, right, how='inner', on=['key1', 'key2'])
Out[36]:
In [37]:
# Outer merge: union of key pairs from both frames, NaN where a side is missing
om = pd.merge(left, right, how='outer', on=['key1', 'key2'])
om
Out[37]:
In [38]:
# Sort the merged result by the key columns for readable output
om.sort_values(by=['key1', 'key2'])
Out[38]:
In [39]:
# Right merge: keep all key pairs from `right`, then sort by key1
pd.merge(left, right, how='right', on=['key1', 'key2']).sort_values(by='key1')
Out[39]:
In [40]:
# Left merge: keep all key pairs from `left`, then sort by key1
pd.merge(left, right, how='left', on=['key1', 'key2']).sort_values(by='key1')
Out[40]:
In [41]:
# Join operands: the frames share some index labels (K0, K2) but not others
# (K1 only in df_a, K3 only in df_b), to show how join handles mismatches.
df_a = pd.DataFrame(
    {'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']},
    index=['K0', 'K1', 'K2'],
)
df_b = pd.DataFrame(
    {'C': ['C0', 'C2', 'C3'], 'D': ['D0', 'D2', 'D3']},
    index=['K0', 'K2', 'K3'],
)
df_a
Out[41]:
In [42]:
df_b  # display the second join operand
Out[42]:
In [43]:
# Join df_b onto df_a; default how='left' keeps all rows of df_a and matches
# df_b rows by index label
df_a.join(df_b)
Out[43]:
Thus the result keeps every row of df_a, joined with the matching rows of df_b. Where df_b does not have a given index label, its columns are filled with NaN.
In [44]:
# Join df_a onto df_b: now all of df_b's index labels are kept instead
df_b.join(df_a)
Out[44]:
In [45]:
# Outer join: union of both indexes, NaN where either side lacks a label
df_b.join(df_a, how='outer')
Out[45]: