In [2]:
import numpy as np
import pandas as pd
labels = ['a','b','c']
my_data = [10,20,30]
arr = np.array(my_data)
d = {'a':10,'b':20,'c':30}
print ("Labels:", labels)
print("My data:", my_data)
print("Dictionary:", d)
In [3]:
pd.Series(data=my_data) # Output looks very similar to a NumPy array
Out[3]:
In [4]:
pd.Series(data=my_data, index=labels) # Note the extra information about index
Out[4]:
In [5]:
# Inputs are in order of the expected parameters (not explicitly named), NumPy array is used for data
pd.Series(arr, labels)
Out[5]:
In [6]:
pd.Series(d) # Using a pre-defined Dictionary object
Out[6]:
In [7]:
print ("\nHolding numerical data\n",'-'*25, sep='')
print(pd.Series(arr))
print ("\nHolding text labels\n",'-'*20, sep='')
print(pd.Series(labels))
print ("\nHolding functions\n",'-'*20, sep='')
print(pd.Series(data=[sum,print,len]))
print ("\nHolding objects from a dictionary\n",'-'*40, sep='')
print(pd.Series(data=[d.keys, d.items, d.values]))
In [8]:
ser1 = pd.Series([1,2,3,4],['CA', 'OR', 'CO', 'AZ'])
ser2 = pd.Series([1,2,5,4],['CA', 'OR', 'NV', 'AZ'])
print ("\nIndexing by name of the item/object (string identifier)\n",'-'*56, sep='')
print("Value for CA in ser1:", ser1['CA'])
print("Value for AZ in ser1:", ser1['AZ'])
print("Value for NV in ser2:", ser2['NV'])
print ("\nIndexing by number (positional value in the series)\n",'-'*52, sep='')
print("Value for CA in ser1:", ser1[0])
print("Value for AZ in ser1:", ser1[3])
print("Value for NV in ser2:", ser2[2])
print ("\nIndexing by a range\n",'-'*25, sep='')
print ("Value for OR, CO, and AZ in ser1:\n", ser1[1:4], sep='')
In [9]:
ser1 = pd.Series([1,2,3,4],['CA', 'OR', 'CO', 'AZ'])
ser2 = pd.Series([1,2,5,4],['CA', 'OR', 'NV', 'AZ'])
ser3 = ser1+ser2
print ("\nAfter adding the two series, the result looks like this...\n",'-'*59, sep='')
print(ser3)
print("\nPython tries to add values where it finds common index name, and puts NaN where indices are missing\n")
print ("\nThe idea works even for multiplication...\n",'-'*43, sep='')
print (ser1*ser2)
print ("\nOr even for combination of mathematical operations!\n",'-'*53, sep='')
print (np.exp(ser1)+np.log10(ser2))
In [10]:
from numpy.random import randn as rn
In [11]:
np.random.seed(101)
matrix_data = rn(5,4)
row_labels = ['A','B','C','D','E']
column_headings = ['W','X','Y','Z']
df = pd.DataFrame(data=matrix_data, index=row_labels, columns=column_headings)
print("\nThe data frame looks like\n",'-'*45, sep='')
print(df)
In [12]:
print("\nThe 'X' column\n",'-'*25, sep='')
print(df['X'])
print("\nType of the column: ", type(df['X']), sep='')
print("\nThe 'X' and 'Z' columns indexed by passing a list\n",'-'*55, sep='')
print(df[['X','Z']])
print("\nType of the pair of columns: ", type(df[['X','Z']]), sep='')
print ("\nSo, for more than one column, the object turns into a DataFrame")
In [13]:
print("\nThe 'X' column accessed by DOT method (NOT recommended)\n",'-'*55, sep='')
print(df.X)
In [14]:
print("\nA column is created by assigning it in relation to an existing column\n",'-'*75, sep='')
df['New'] = df['X']+df['Z']
df['New (Sum of X and Z)'] = df['X']+df['Z']
print(df)
print("\nA column is dropped by using df.drop() method\n",'-'*55, sep='')
df = df.drop('New', axis=1) # Notice the axis=1 option, axis = 0 is default, so one has to change it to 1
print(df)
df1=df.drop('A')
print("\nA row (index) is dropped by using df.drop() method and axis=0\n",'-'*65, sep='')
print(df1)
print("\nAn in-place change can be done by making inplace=True in the drop method\n",'-'*75, sep='')
df.drop('New (Sum of X and Z)', axis=1, inplace=True)
print(df)
In [15]:
print("\nLabel-based 'loc' method can be used for selecting row(s)\n",'-'*60, sep='')
print("\nSingle row\n")
print(df.loc['C'])
print("\nMultiple rows\n")
print(df.loc[['B','C']])
print("\nIndex position based 'iloc' method can be used for selecting row(s)\n",'-'*70, sep='')
print("\nSingle row\n")
print(df.iloc[2])
print("\nMultiple rows\n")
print(df.iloc[[1,2]])
In [16]:
np.random.seed(101)
matrix_data = rn(5,4)
row_labels = ['A','B','C','D','E']
column_headings = ['W','X','Y','Z']
df = pd.DataFrame(data=matrix_data, index=row_labels, columns=column_headings)
print("\nThe DatFrame\n",'-'*45, sep='')
print(df)
print("\nElement at row 'B' and column 'Y' is\n")
print(df.loc['B','Y'])
print("\nSubset comprising of rows B and D, and columns W and Y, is\n")
df.loc[['B','D'],['W','Y']]
Out[16]:
In [17]:
print("\nThe DataFrame\n",'-'*45, sep='')
print(df)
print("\nBoolean DataFrame(s) where we are checking if the values are greater than 0\n",'-'*75, sep='')
print(df>0)
print("\n")
print(df.loc[['A','B','C']]>0)
booldf = df>0
print("\nDataFrame indexed by boolean dataframe\n",'-'*45, sep='')
print(df[booldf])
In [27]:
matrix_data = np.matrix('22,66,140;42,70,148;30,62,125;35,68,160;25,62,152')
row_labels = ['A','B','C','D','E']
column_headings = ['Age', 'Height', 'Weight']
df = pd.DataFrame(data=matrix_data, index=row_labels, columns=column_headings)
print("\nA new DataFrame\n",'-'*25, sep='')
print(df)
print("\nRows with Height > 65 inch\n",'-'*35, sep='')
print(df[df['Height']>65])
booldf1 = df['Height']>65
booldf2 = df['Weight']>145
print("\nRows with Height > 65 inch and Weight >145 lbs\n",'-'*55, sep='')
print(df[(booldf1) & (booldf2)])
print("\nDataFrame with only Age and Weight columns whose Height > 65 inch\n",'-'*68, sep='')
print(df[booldf1][['Age','Weight']])
In [50]:
matrix_data = np.matrix('22,66,140;42,70,148;30,62,125;35,68,160;25,62,152')
row_labels = ['A','B','C','D','E']
column_headings = ['Age', 'Height', 'Weight']
df = pd.DataFrame(data=matrix_data, index=row_labels, columns=column_headings)
print("\nThe DataFrame\n",'-'*25, sep='')
print(df)
print("\nAfter resetting index\n",'-'*35, sep='')
print(df.reset_index())
print("\nAfter resetting index with 'drop' option TRUE\n",'-'*45, sep='')
print(df.reset_index(drop=True))
print("\nAdding a new column 'Profession'\n",'-'*45, sep='')
df['Profession'] = "Student Teacher Engineer Doctor Nurse".split()
print(df)
print("\nSetting 'Profession' column as index\n",'-'*45, sep='')
print (df.set_index('Profession'))
In [87]:
# Index Levels
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside,inside))
print("\nTuple pairs after the zip and list command\n",'-'*45, sep='')
print(hier_index)
hier_index = pd.MultiIndex.from_tuples(hier_index)
print("\nIndex hierarchy\n",'-'*25, sep='')
print(hier_index)
print("\nIndex hierarchy type\n",'-'*25, sep='')
print(type(hier_index))
print("\nCreating DataFrame with multi-index\n",'-'*37, sep='')
np.random.seed(101)
df1 = pd.DataFrame(data=np.round(rn(6,3),2), index= hier_index, columns= ['A','B','C'])
print(df1)
print("\nSubsetting multi-index DataFrame using two 'loc' methods\n",'-'*60, sep='')
print(df1.loc['G2'].loc[[1,3]][['B','C']])
print("\nNaming the indices by 'index.names' method\n",'-'*45, sep='')
df1.index.names=['Outer', 'Inner']
print(df1)
In [109]:
print("\nGrabbing a cross-section from outer level\n",'-'*45, sep='')
print(df1.xs('G1'))
print("\nGrabbing a cross-section from inner level (for all outer levels)\n",'-'*65, sep='')
print(df1.xs(2,level='Inner'))
In [95]:
df = pd.DataFrame({'A':[1,2,np.nan],'B':[5,np.nan,np.nan],'C':[1,2,3]})
df['States']="CA NV AZ".split()
df.set_index('States',inplace=True)
print(df)
In [108]:
print("\nDropping any rows with a NaN value\n",'-'*35, sep='')
print(df.dropna(axis=0))
print("\nDropping any column with a NaN value\n",'-'*35, sep='')
print(df.dropna(axis=1))
print("\nDropping a row with a minimum 2 NaN value using 'thresh' parameter\n",'-'*68, sep='')
print(df.dropna(axis=0, thresh=2))
In [114]:
print("\nFilling values with a default value\n",'-'*35, sep='')
print(df.fillna(value='FILL VALUE'))
print("\nFilling values with a computed value (mean of column A here)\n",'-'*60, sep='')
print(df.fillna(value=df['A'].mean()))
In [151]:
# Create dataframe
data = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
'Person':['Sam','Charlie','Amy','Vanessa','Carl','Sarah'],
'Sales':[200,120,340,124,243,350]}
df = pd.DataFrame(data)
df
Out[151]:
In [176]:
byComp = df.groupby('Company')
print("\nGrouping by 'Company' column and listing mean sales\n",'-'*55, sep='')
print(byComp.mean())
print("\nGrouping by 'Company' column and listing sum of sales\n",'-'*55, sep='')
print(byComp.sum())
# Note dataframe conversion of the series and transpose
print("\nAll in one line of command (Stats for 'FB')\n",'-'*65, sep='')
print(pd.DataFrame(df.groupby('Company').describe().loc['FB']).transpose())
print("\nSame type of extraction with little different command\n",'-'*68, sep='')
print(df.groupby('Company').describe().loc[['GOOG', 'MSFT']])
In [188]:
# Creating data frames
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']},
index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
'B': ['B4', 'B5', 'B6', 'B7'],
'C': ['C4', 'C5', 'C6', 'C7'],
'D': ['D4', 'D5', 'D6', 'D7']},
index=[4, 5, 6, 7])
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
'B': ['B8', 'B9', 'B10', 'B11'],
'C': ['C8', 'C9', 'C10', 'C11'],
'D': ['D8', 'D9', 'D10', 'D11']},
index=[8,9,10,11])
print("\nThe DataFrame number 1\n",'-'*30, sep='')
print(df1)
print("\nThe DataFrame number 2\n",'-'*30, sep='')
print(df2)
print("\nThe DataFrame number 3\n",'-'*30, sep='')
print(df3)
In [189]:
df_cat1 = pd.concat([df1,df2,df3], axis=0)
print("\nAfter concatenation along row\n",'-'*30, sep='')
print(df_cat1)
df_cat2 = pd.concat([df1,df2,df3], axis=1)
print("\nAfter concatenation along column\n",'-'*60, sep='')
print(df_cat2)
df_cat2.fillna(value=0, inplace=True)
print("\nAfter filling missing values with zero\n",'-'*60, sep='')
print(df_cat2)
In [183]:
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
print("\nThe DataFrame 'left'\n",'-'*30, sep='')
print(left)
print("\nThe DataFrame 'right'\n",'-'*30, sep='')
print(right)
In [185]:
merge1= pd.merge(left,right,how='inner',on='key')
print("\nAfter simple merging with 'inner' method\n",'-'*50, sep='')
print(merge1)
In [190]:
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
'key2': ['K0', 'K1', 'K0', 'K1'],
'A': ['A0', 'A1', 'A2', 'A3'],
'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
'key2': ['K0', 'K0', 'K0', 'K0'],
'C': ['C0', 'C1', 'C2', 'C3'],
'D': ['D0', 'D1', 'D2', 'D3']})
In [193]:
left
Out[193]:
In [194]:
right
Out[194]:
In [191]:
pd.merge(left, right, on=['key1', 'key2'])
Out[191]:
In [192]:
pd.merge(left, right, how='outer',on=['key1', 'key2'])
Out[192]:
In [195]:
pd.merge(left, right, how='left',on=['key1', 'key2'])
Out[195]:
In [196]:
pd.merge(left, right, how='right',on=['key1', 'key2'])
Out[196]:
In [197]:
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
In [198]:
left
Out[198]:
In [199]:
right
Out[199]:
In [200]:
left.join(right)
Out[200]:
In [201]:
left.join(right, how='outer')
Out[201]:
In [232]:
import pandas as pd
df = pd.DataFrame({'col1':[1,2,3,4,5,6,7,8,9,10],
'col2':[444,555,666,444,333,222,666,777,666,555],
'col3':'aaa bb c dd eeee fff gg h iii j'.split()})
df
Out[232]:
In [233]:
print("\nMethod head() is for showing first few entries\n",'-'*50, sep='')
df.head()
Out[233]:
In [239]:
print("\nFinding unique values in 'col2'\n",'-'*40, sep='') # Note 'unique' method applies to pd.series only
print(df['col2'].unique())
In [238]:
print("\nFinding number of unique values in 'col2'\n",'-'*45, sep='')
print(df['col2'].nunique())
In [237]:
print("\nTable of unique values in 'col2'\n",'-'*40, sep='')
t1=df['col2'].value_counts()
print(t1)
In [229]:
# Define a function
def testfunc(x):
if (x> 500):
return (10*np.log10(x))
else:
return (x/10)
In [286]:
df['FuncApplied'] = df['col2'].apply(testfunc)
print(df)
Apply works with built-in function too!
In [287]:
df['col3length']= df['col3'].apply(len)
print(df)
Combine 'apply' with lambda expession for in-line calculations
In [265]:
df['FuncApplied'].apply(lambda x: np.sqrt(x))
Out[265]:
Standard statistical functions directly apply to columns
In [267]:
print("\nSum of the column 'FuncApplied' is: ",df['FuncApplied'].sum())
print("Mean of the column 'FuncApplied' is: ",df['FuncApplied'].mean())
print("Std dev of the column 'FuncApplied' is: ",df['FuncApplied'].std())
print("Min and max of the column 'FuncApplied' are: ",df['FuncApplied'].min(),"and",df['FuncApplied'].max())
Getting the names of the columns
In [278]:
print("\nName of columns\n",'-'*20, sep='')
print(df.columns)
l = list(df.columns)
print("\nColumn names in a list of strings for later manipulation:",l)
Deletion by 'del' command # This affects the dataframe immediately, unlike drop method.
In [289]:
print("\nDeleting last column by 'del' command\n",'-'*50, sep='')
del df['col3length']
print(df)
df['col3length']= df['col3'].apply(len)
Sorting and Ordering a DataFrame
In [290]:
df.sort_values(by='col2') #inplace=False by default
Out[290]:
In [292]:
df.sort_values(by='FuncApplied',ascending=False) #inplace=False by default
Out[292]:
Find Null Values or Check for Null Values
In [294]:
df = pd.DataFrame({'col1':[1,2,3,np.nan],
'col2':[np.nan,555,666,444],
'col3':['abc','def','ghi','xyz']})
df.head()
Out[294]:
In [295]:
df.isnull()
Out[295]:
In [296]:
df.fillna('FILL')
Out[296]:
Pivot Table
In [297]:
data = {'A':['foo','foo','foo','bar','bar','bar'],
'B':['one','one','two','two','one','one'],
'C':['x','y','x','y','x','y'],
'D':[1,3,2,5,4,1]}
df = pd.DataFrame(data)
df
Out[297]:
In [303]:
# Index out of 'A' and 'B', columns from 'C', actual numerical values from 'D'
df.pivot_table(values='D',index=['A', 'B'],columns=['C'])
Out[303]:
In [302]:
# Index out of 'A' and 'B', columns from 'C', actual numerical values from 'D'
df.pivot_table(values='D',index=['A', 'B'],columns=['C'], fill_value='FILLED')
Out[302]:
Import packages
In [304]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
Read in the CSV data file
In [308]:
df1=pd.read_csv('df1.csv', index_col=0)
df1.head()
Out[308]:
In [320]:
df2=pd.read_csv('df2')
df2.head()
Out[320]:
Histogram of a single column
In [310]:
df1['A'].hist()
Out[310]:
Histogram with a different set of arguments (list of columns, bins, figure size, etc)
In [312]:
df1.hist(column=['B','C'],bins=20,figsize=(10,4))
Out[312]:
Histogram with generic plot method of Pandas
In [325]:
df1.plot(kind='hist', bins=30, grid=True, figsize=(12,7))
Out[325]:
Area plot
In [323]:
import seaborn as sns #Plot style will change to Seaborn package style from now on
df2.plot.area(alpha=0.4)
Out[323]:
Bar plot (with and without stacking)
In [326]:
df2.plot.bar()
Out[326]:
In [328]:
df2.plot.bar(stacked=True)
Out[328]:
Lineplot
In [331]:
df1.plot.line(x=df1.index,y=['B','C'],figsize=(12,4),lw=1) # Note matplotlib arguments like 'lw' and 'figsize'
Out[331]:
Scatterplot
In [332]:
df1.plot.scatter(x='A',y='B',figsize=(12,8))
Out[332]:
In [335]:
df1.plot.scatter(x='A',y='B',c='C',cmap='coolwarm',figsize=(12,8)) # Color of the scatter dots set based on column C
Out[335]:
In [344]:
df1.plot.scatter(x='A',y='B',s=10*np.exp(df1['C']),c='C',figsize=(12,8)) # Size of the dots set based on column C
Out[344]:
Boxplot
In [346]:
df2.plot.box()
Out[346]:
Hexagonal bin plot for bivariate data
In [348]:
df=pd.DataFrame(data=np.random.randn(1000,2),columns=['A','B'])
df.head()
Out[348]:
In [350]:
df.plot.hexbin(x='A',y='B',gridsize=30,cmap='coolwarm')
Out[350]:
Kernel density estimation
In [352]:
df2.plot.density(lw=3)
Out[352]:
In [ ]: