In [1]:
import numpy as np
import pandas as pd
In [2]:
labels = ['a','b','c']
my_list = [10,20,30]
arr = np.array([10,20,30])
d = {'a': 10, 'b': 20, 'c': 100}
In [3]:
pd.Series(my_list) # pd.Series(data, index)
Out[3]:
In [7]:
pd.Series(data = my_list, index = labels)
Out[7]:
In [8]:
pd.Series(arr)
Out[8]:
In [9]:
pd.Series(arr, labels)
Out[9]:
In [10]:
pd.Series(d)
Out[10]:
In [11]:
pd.Series(data = labels)
Out[11]:
In [13]:
# Extremely flexible: a Series can even store functions
pd.Series([sum, print, len])
Out[13]:
In [18]:
ser1 = pd.Series([1,2,3,4], index=['USA', 'CHINA', 'INDIA', 'GERMANY'])
ser1
Out[18]:
In [19]:
ser2 = pd.Series([1,2,3,4], index=['UK', 'CHINA', 'INDIA', 'JAPAN'])
ser2
Out[19]:
In [20]:
ser1['USA']
Out[20]:
In [21]:
ser2['JAPAN']
Out[21]:
In [23]:
# ser2['CANADA'] #produces a KeyError
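If a missing label should not raise, the standard Series.get method returns None (or a supplied default) instead; a minimal sketch:
ser2.get('CANADA')     # returns None instead of raising
ser2.get('CANADA', 0)  # returns the default 0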
In [24]:
ser1 + ser2
Out[24]:
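Labels present in only one of the two Series (USA, GERMANY, UK, JAPAN above) come back as NaN. A minimal sketch using the standard Series.add with fill_value to treat missing labels as 0 instead:
ser1.add(ser2, fill_value=0)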
In [28]:
import numpy as np
import pandas as pd
In [35]:
from numpy.random import randn
np.random.seed(101)
In [36]:
df = pd.DataFrame(data = randn(5,4), index = ['a','b','c','d','e'], columns= ['w', 'x', 'y', 'z'])
df
Out[36]:
In [104]:
df['w'] # grabbing a column with bracket notation (the common practice)
Out[104]:
In [45]:
type(df['w'])
Out[45]:
In [46]:
type(df)
Out[46]:
In [49]:
df[['w', 'z']] # grabbing multiple columns
Out[49]:
In [50]:
type(df[['w', 'z']] )
Out[50]:
In [59]:
df['new'] = df['w'] + df['y']
In [60]:
df
Out[60]:
In [61]:
# dropping a column; the default axis is 0 (the row axis), so this raises an error
# df.drop('new') # KeyError: 'new' is not in the row index
In [62]:
df.drop('new', axis = 1)
Out[62]:
In [63]:
df
Out[63]:
In [64]:
# deleting the column from the original DataFrame (inplace=True mutates df)
df.drop('new', axis = 1, inplace = True)
In [65]:
df
Out[65]:
In [66]:
df.drop('e')
Out[66]:
In [67]:
df
Out[67]:
In [68]:
df.shape
Out[68]:
In [69]:
# Selecting rows
df
Out[69]:
In [74]:
df.loc['a'] # loc = location
Out[74]:
In [75]:
type(df.loc['a'])
Out[75]:
In [76]:
df.iloc[0] # accessing a row by integer position, iloc = integer location
Out[76]:
In [77]:
type(df.iloc[0])
Out[77]:
In [87]:
# grabbing multiple rows
df.loc[['a', 'b']]
Out[87]:
In [81]:
# grabbing a cell
df.loc['b', 'y'] # df.loc[row, column]
Out[81]:
In [82]:
df.loc[['a', 'b'], ['w', 'y']] # returns a subset of the DataFrame
Out[82]:
In [89]:
df > 0
Out[89]:
In [90]:
booldf = df > 0
df[booldf]
Out[90]:
In [91]:
df[df>0]
Out[91]:
In [97]:
df
Out[97]:
In [98]:
# conditionally filtering rows based on a column's values
df['w'] > 0
Out[98]:
In [99]:
df[df['w'] > 0] # conditional selection of rows based on values in column w
Out[99]:
In [100]:
df[df['z'] < 0]
Out[100]:
In [101]:
resultdf = df[df['w'] > 0]
resultdf['x']
Out[101]:
In [103]:
# One-liner filter of the DataFrame:
# filter the original df to rows where w is greater than zero,
# then grab only the column x
df[df['w'] > 0]['x']
Out[103]:
In [105]:
df[df['w'] > 0][['y','x']]
Out[105]:
In [106]:
df
Out[106]:
In [109]:
# Note the & (ampersand) operator; Python's plain 'and' will not work here
df[(df['w'] > 0) & (df['y'] > 1)]
Out[109]:
In [111]:
# Note the | (pipe) operator; Python's plain 'or' will not work here
df[(df['w'] > 0) | (df['y'] > 1)]
Out[111]:
In [ ]:
# ~ is the 'not' operator in pandas
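# A minimal sketch negating the earlier filter:
df[~(df['w'] > 0)] # rows where w is NOT greater than 0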
In [113]:
df.reset_index() # returns a copy; inplace defaults to False
Out[113]:
In [114]:
df
Out[114]:
In [117]:
newind = 'CA NY WY OR CO'.split()
newind
Out[117]:
In [118]:
df['states'] = newind
In [119]:
df
Out[119]:
In [123]:
# If you want an existing column to become the row index, use set_index
df.set_index('states') # returns a copy; inplace defaults to False
Out[123]:
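To persist the new index you would reassign or pass inplace=True (both standard); shown commented out here so df stays unchanged for the next cell:
# df = df.set_index('states')
# df.set_index('states', inplace=True)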
In [124]:
df
Out[124]:
In [3]:
import numpy as np
import pandas as pd
# Index Levels
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside,inside))
hier_index
Out[3]:
In [5]:
hier_index = pd.MultiIndex.from_tuples(hier_index)
hier_index
Out[5]:
In [9]:
df = pd.DataFrame(np.random.randn(6,2), index = hier_index, columns = ['A', 'B'])
df
Out[9]:
In [11]:
# grabbing data out of the multi-level index
df.loc['G1']
Out[11]:
In [12]:
df.loc['G1'].loc[1]
Out[12]:
In [13]:
df.index.names
Out[13]:
In [14]:
df.index.names = ['Groups', 'Num']
In [15]:
df
Out[15]:
In [16]:
df.loc['G2'].loc[2]['B']
Out[16]:
In [19]:
df
Out[19]:
In [20]:
df.xs('G1')
Out[20]:
In [22]:
# grabbing a cross-section across both groups
df.xs(1, level='Num')
Out[22]:
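An equivalent selection with .loc and pd.IndexSlice (standard pandas), shown as an alternative sketch; note that xs drops the selected level by default while .loc keeps it:
idx = pd.IndexSlice
df.loc[idx[:, 1], :]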
In [23]:
import numpy as np
import pandas as pd
In [24]:
d = {'A': [1,2,np.nan], 'B': [5, np.nan, np.nan], 'C': [1,2,3]}
df = pd.DataFrame(d)
df
Out[24]:
In [26]:
# dropping rows that contain any NaN (axis defaults to 0)
df.dropna()
Out[26]:
In [27]:
# dropping columns that contain any NaN
df.dropna(axis = 1)
Out[27]:
In [28]:
df
Out[28]:
In [30]:
# thresh (int) sets a drop threshold:
# keep only rows with at least 2 non-NaN values
df.dropna(thresh = 2)
Out[30]:
In [31]:
df
Out[31]:
In [32]:
df.fillna(value = 'FILL VALUE')
Out[32]:
In [34]:
# filling missing values in column A with the column's mean
df['A'].fillna(value = df['A'].mean())
Out[34]:
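fillna returns a new Series; to write the filled values back you would assign it (a standard pattern, not part of the original run):
df['A'] = df['A'].fillna(df['A'].mean())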
In [1]:
import pandas as pd
# Create dataframe
data = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
'Person':['Sam','Charlie','Amy','Vanessa','Carl','Sarah'],
'Sales':[200,120,340,124,243,350]}
In [2]:
df = pd.DataFrame(data)
df
Out[2]:
In [3]:
df.groupby('Company')
Out[3]:
In [10]:
byComp = df.groupby('Company')
# mean of sales; numeric_only skips the non-numeric Person column (required in newer pandas)
byComp.mean(numeric_only=True)
Out[10]:
In [11]:
# sum of sales
byComp.sum(numeric_only=True)
Out[11]:
In [12]:
# standard deviation
byComp.std(numeric_only=True)
Out[12]:
In [13]:
# getting the sales totals for FB (Facebook)
byComp.sum().loc['FB']
Out[13]:
In [14]:
# One-liner
df.groupby('Company').count()
Out[14]:
In [15]:
df.groupby('Company').max()
Out[15]:
In [16]:
df.groupby('Company').min()
Out[16]:
In [17]:
# group by with describe method
df.groupby('Company').describe()
Out[17]:
In [18]:
df.groupby('Company').describe().transpose()
Out[18]:
In [21]:
df.groupby('Company').describe().transpose()['GOOG']
Out[21]:
Note: concatenation stacks DataFrames, merging works like a SQL join on columns, and join joins on the index instead of a column.
In [22]:
import pandas as pd
In [23]:
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
In [24]:
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])
In [25]:
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
'B': ['B8', 'B9', 'B10', 'B11'],
'C': ['C8', 'C9', 'C10', 'C11'],
'D': ['D8', 'D9', 'D10', 'D11']},
index=[8, 9, 10, 11])
In [26]:
df1
Out[26]:
In [27]:
df2
Out[27]:
In [28]:
df3
Out[28]:
In [29]:
pd.concat([df1,df2,df3])
Out[29]:
In [30]:
pd.concat([df1,df2,df3], axis = 1)
Out[30]:
In [39]:
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
In [40]:
left
Out[40]:
In [41]:
right
Out[41]:
In [42]:
# inner join on dataframes
pd.merge(left, right, how='inner', on='key')
Out[42]:
Or to show a more complicated example:
In [43]:
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
In [44]:
left
Out[44]:
In [45]:
right
Out[45]:
In [46]:
# inner join on multiple columns
pd.merge(left, right, on=['key1', 'key2'])
Out[46]:
In [47]:
# outer join on multiple columns
pd.merge(left, right, how='outer', on=['key1', 'key2'])
Out[47]:
In [48]:
# right join on multiple columns
pd.merge(left, right, how='right', on=['key1', 'key2'])
Out[48]:
In [49]:
# left join on multiple columns
pd.merge(left, right, how='left', on=['key1', 'key2'])
Out[49]:
In [50]:
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
In [51]:
left
Out[51]:
In [52]:
right
Out[52]:
In [53]:
# if the keys we want to merge on are in the index rather than in columns, use join
left.join(right)
Out[53]:
In [54]:
left.join(right, how='outer')
Out[54]:
In [55]:
import numpy as np
import pandas as pd
In [56]:
df = pd.DataFrame({'col1':[1,2,3,4],
'col2':[444,555,666,444],
'col3':['abc','def','ghi','xyz']})
df.head()
Out[56]:
In [58]:
# unique values in column
df['col2'].unique()
Out[58]:
In [59]:
# count of unique values in column
df['col2'].nunique()
Out[59]:
In [61]:
# number of times each value has occurred in the column
df['col2'].value_counts()
Out[61]:
In [64]:
# conditional selection reminder
df[(df['col1'] > 2) & (df['col2'] == 444)]
Out[64]:
In [65]:
def times2(x):
    return x * 2
df['col1'].apply(times2)
Out[65]:
In [67]:
df['col3'].apply(len)
Out[67]:
In [69]:
df['col2'].apply(lambda x:x*2)
Out[69]:
In [70]:
# removing columns
df.drop('col1', axis = 1)
Out[70]:
In [72]:
df.columns
Out[72]:
In [73]:
df.index
Out[73]:
In [74]:
df
Out[74]:
In [75]:
df.sort_values('col2')
Out[75]:
In [76]:
df.isnull()
Out[76]:
In [77]:
data = {'A':['foo','foo','foo','bar','bar','bar'],
'B':['one','one','two','two','one','one'],
'C':['x','y','x','y','x','y'],
'D':[1,3,2,5,4,1]}
df = pd.DataFrame(data)
In [78]:
df
Out[78]:
In [80]:
df.pivot_table(values = 'D', index=['A', 'B'], columns='C')
Out[80]:
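pivot_table aggregates with the mean by default and leaves NaN where a combination is missing; aggfunc and fill_value are standard parameters for changing that:
df.pivot_table(values='D', index=['A', 'B'], columns='C', aggfunc='sum', fill_value=0)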
In [82]:
import pandas as pd
pd.read_csv('example.csv')
Out[82]:
In [84]:
df = pd.read_csv('example.csv')
df
Out[84]:
In [87]:
# save to CSV without writing the index
df.to_csv('my_output.csv', index = False)
In [88]:
pd.read_csv('my_output.csv')
Out[88]:
In [89]:
# conda install xlrd (https://pypi.python.org/pypi/xlrd)
In [92]:
df = pd.read_excel('Excel_Sample.xlsx', sheet_name='Sheet1')
In [93]:
df.to_excel('Excel_Sample2.xlsx', sheet_name='NewSheet')
In [94]:
# reading HTML
data = pd.read_html('http://www.fdic.gov/bank/individual/failed/banklist.html')
In [95]:
type(data)
Out[95]:
In [102]:
type(data[0])
Out[102]:
In [103]:
data[0]
Out[103]:
In [104]:
from sqlalchemy import create_engine
engine = create_engine('sqlite:///:memory:')
df.to_sql('my_table', engine)
sqldf = pd.read_sql('my_table', con=engine)
sqldf
Out[104]:
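read_sql also accepts a SQL query string rather than a table name (standard pandas):
pd.read_sql('SELECT * FROM my_table', con=engine)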
In [ ]: