In [2]:
    
import numpy as np
import pandas as pd
labels = ['a','b','c']
my_data = [10,20,30]
arr = np.array(my_data)
d = {'a':10,'b':20,'c':30}
print ("Labels:", labels)
print("My data:", my_data)
print("Dictionary:", d)
    
    
In [3]:
    
pd.Series(data=my_data) # Output looks very similar to a NumPy array
    
    Out[3]:
In [4]:
    
pd.Series(data=my_data, index=labels) # Note the extra information about index
    
    Out[4]:
In [5]:
    
# Inputs are in order of the expected parameters (not explicitly named), NumPy array is used for data
pd.Series(arr, labels)
    
    Out[5]:
In [6]:
    
pd.Series(d) # Using a pre-defined Dictionary object
    
    Out[6]:
In [7]:
    
print ("\nHolding numerical data\n",'-'*25, sep='')
print(pd.Series(arr))
print ("\nHolding text labels\n",'-'*20, sep='')
print(pd.Series(labels))
print ("\nHolding functions\n",'-'*20, sep='')
print(pd.Series(data=[sum,print,len]))
print ("\nHolding objects from a dictionary\n",'-'*40, sep='')
print(pd.Series(data=[d.keys, d.items, d.values]))
    
    
In [8]:
    
ser1 = pd.Series([1,2,3,4],['CA', 'OR', 'CO', 'AZ'])
ser2 = pd.Series([1,2,5,4],['CA', 'OR', 'NV', 'AZ'])
print ("\nIndexing by name of the item/object (string identifier)\n",'-'*56, sep='')
print("Value for CA in ser1:", ser1['CA'])
print("Value for AZ in ser1:", ser1['AZ'])
print("Value for NV in ser2:", ser2['NV'])
print ("\nIndexing by number (positional value in the series)\n",'-'*52, sep='')
print("Value for CA in ser1:", ser1[0])
print("Value for AZ in ser1:", ser1[3])
print("Value for NV in ser2:", ser2[2])
print ("\nIndexing by a range\n",'-'*25, sep='')
print ("Value for OR, CO, and AZ in ser1:\n", ser1[1:4], sep='')
    
    
In [9]:
    
ser1 = pd.Series([1,2,3,4],['CA', 'OR', 'CO', 'AZ'])
ser2 = pd.Series([1,2,5,4],['CA', 'OR', 'NV', 'AZ'])
ser3 = ser1+ser2
print ("\nAfter adding the two series, the result looks like this...\n",'-'*59, sep='')
print(ser3)
print("\nPython tries to add values where it finds common index name, and puts NaN where indices are missing\n")
print ("\nThe idea works even for multiplication...\n",'-'*43, sep='')
print (ser1*ser2)
print ("\nOr even for combination of mathematical operations!\n",'-'*53, sep='')
print (np.exp(ser1)+np.log10(ser2))
    
    
In [10]:
    
from numpy.random import randn as rn
    
In [11]:
    
np.random.seed(101)
matrix_data = rn(5,4)
row_labels = ['A','B','C','D','E']
column_headings = ['W','X','Y','Z']
df = pd.DataFrame(data=matrix_data, index=row_labels, columns=column_headings)
print("\nThe data frame looks like\n",'-'*45, sep='')
print(df)
    
    
In [12]:
    
print("\nThe 'X' column\n",'-'*25, sep='')
print(df['X'])
print("\nType of the column: ", type(df['X']), sep='')
print("\nThe 'X' and 'Z' columns indexed by passing a list\n",'-'*55, sep='')
print(df[['X','Z']])
print("\nType of the pair of columns: ", type(df[['X','Z']]), sep='')
print ("\nSo, for more than one column, the object turns into a DataFrame")
    
    
In [13]:
    
print("\nThe 'X' column accessed by DOT method (NOT recommended)\n",'-'*55, sep='')
print(df.X)
    
    
In [14]:
    
print("\nA column is created by assigning it in relation to an existing column\n",'-'*75, sep='')
df['New'] = df['X']+df['Z']
df['New (Sum of X and Z)'] = df['X']+df['Z']
print(df)
print("\nA column is dropped by using df.drop() method\n",'-'*55, sep='')
df = df.drop('New', axis=1) # Notice the axis=1 option, axis = 0 is default, so one has to change it to 1
print(df)
df1=df.drop('A')
print("\nA row (index) is dropped by using df.drop() method and axis=0\n",'-'*65, sep='')
print(df1)
print("\nAn in-place change can be done by making inplace=True in the drop method\n",'-'*75, sep='')
df.drop('New (Sum of X and Z)', axis=1, inplace=True)
print(df)
    
    
In [15]:
    
print("\nLabel-based 'loc' method can be used for selecting row(s)\n",'-'*60, sep='')
print("\nSingle row\n")
print(df.loc['C'])
print("\nMultiple rows\n")
print(df.loc[['B','C']])
print("\nIndex position based 'iloc' method can be used for selecting row(s)\n",'-'*70, sep='')
print("\nSingle row\n")
print(df.iloc[2])
print("\nMultiple rows\n")
print(df.iloc[[1,2]])
    
    
In [16]:
    
np.random.seed(101)
matrix_data = rn(5,4)
row_labels = ['A','B','C','D','E']
column_headings = ['W','X','Y','Z']
df = pd.DataFrame(data=matrix_data, index=row_labels, columns=column_headings)
print("\nThe DatFrame\n",'-'*45, sep='')
print(df)
print("\nElement at row 'B' and column 'Y' is\n") 
print(df.loc['B','Y'])
print("\nSubset comprising of rows B and D, and columns W and Y, is\n")
df.loc[['B','D'],['W','Y']]
    
    
    Out[16]:
In [17]:
    
print("\nThe DataFrame\n",'-'*45, sep='')
print(df)
print("\nBoolean DataFrame(s) where we are checking if the values are greater than 0\n",'-'*75, sep='')
print(df>0)
print("\n")
print(df.loc[['A','B','C']]>0)
booldf = df>0
print("\nDataFrame indexed by boolean dataframe\n",'-'*45, sep='')
print(df[booldf])
    
    
In [27]:
    
matrix_data = np.matrix('22,66,140;42,70,148;30,62,125;35,68,160;25,62,152')
row_labels = ['A','B','C','D','E']
column_headings = ['Age', 'Height', 'Weight']
df = pd.DataFrame(data=matrix_data, index=row_labels, columns=column_headings)
print("\nA new DataFrame\n",'-'*25, sep='')
print(df)
print("\nRows with Height > 65 inch\n",'-'*35, sep='')
print(df[df['Height']>65])
booldf1 = df['Height']>65
booldf2 = df['Weight']>145
print("\nRows with Height > 65 inch and Weight >145 lbs\n",'-'*55, sep='')
print(df[(booldf1) & (booldf2)])
print("\nDataFrame with only Age and Weight columns whose Height > 65 inch\n",'-'*68, sep='')
print(df[booldf1][['Age','Weight']])
    
    
In [50]:
    
matrix_data = np.matrix('22,66,140;42,70,148;30,62,125;35,68,160;25,62,152')
row_labels = ['A','B','C','D','E']
column_headings = ['Age', 'Height', 'Weight']
df = pd.DataFrame(data=matrix_data, index=row_labels, columns=column_headings)
print("\nThe DataFrame\n",'-'*25, sep='')
print(df)
print("\nAfter resetting index\n",'-'*35, sep='')
print(df.reset_index())
print("\nAfter resetting index with 'drop' option TRUE\n",'-'*45, sep='')
print(df.reset_index(drop=True))
print("\nAdding a new column 'Profession'\n",'-'*45, sep='')
df['Profession'] = "Student Teacher Engineer Doctor Nurse".split()
print(df)
print("\nSetting 'Profession' column as index\n",'-'*45, sep='')
print (df.set_index('Profession'))
    
    
In [87]:
    
# Index Levels
outside = ['G1','G1','G1','G2','G2','G2']
inside = [1,2,3,1,2,3]
hier_index = list(zip(outside,inside))
print("\nTuple pairs after the zip and list command\n",'-'*45, sep='')
print(hier_index)
hier_index = pd.MultiIndex.from_tuples(hier_index)
print("\nIndex hierarchy\n",'-'*25, sep='')
print(hier_index)
print("\nIndex hierarchy type\n",'-'*25, sep='')
print(type(hier_index))
print("\nCreating DataFrame with multi-index\n",'-'*37, sep='')
np.random.seed(101)
df1 = pd.DataFrame(data=np.round(rn(6,3),2), index= hier_index, columns= ['A','B','C'])
print(df1)
print("\nSubsetting multi-index DataFrame using two 'loc' methods\n",'-'*60, sep='')
print(df1.loc['G2'].loc[[1,3]][['B','C']])
print("\nNaming the indices by 'index.names' method\n",'-'*45, sep='')
df1.index.names=['Outer', 'Inner']
print(df1)
    
    
In [109]:
    
print("\nGrabbing a cross-section from outer level\n",'-'*45, sep='')
print(df1.xs('G1'))
print("\nGrabbing a cross-section from inner level (for all outer levels)\n",'-'*65, sep='')
print(df1.xs(2,level='Inner'))
    
    
In [95]:
    
df = pd.DataFrame({'A':[1,2,np.nan],'B':[5,np.nan,np.nan],'C':[1,2,3]})
df['States']="CA NV AZ".split()
df.set_index('States',inplace=True)
print(df)
    
    
In [108]:
    
print("\nDropping any rows with a NaN value\n",'-'*35, sep='')
print(df.dropna(axis=0))
print("\nDropping any column with a NaN value\n",'-'*35, sep='')
print(df.dropna(axis=1))
print("\nDropping a row with a minimum 2 NaN value using 'thresh' parameter\n",'-'*68, sep='')
print(df.dropna(axis=0, thresh=2))
    
    
In [114]:
    
print("\nFilling values with a default value\n",'-'*35, sep='')
print(df.fillna(value='FILL VALUE'))
print("\nFilling values with a computed value (mean of column A here)\n",'-'*60, sep='')
print(df.fillna(value=df['A'].mean()))
    
    
In [151]:
    
# Create dataframe
data = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
       'Person':['Sam','Charlie','Amy','Vanessa','Carl','Sarah'],
       'Sales':[200,120,340,124,243,350]}
df = pd.DataFrame(data)
df
    
    Out[151]:
In [176]:
    
byComp = df.groupby('Company')
print("\nGrouping by 'Company' column and listing mean sales\n",'-'*55, sep='')
print(byComp.mean())
print("\nGrouping by 'Company' column and listing sum of sales\n",'-'*55, sep='')
print(byComp.sum())
# Note dataframe conversion of the series and transpose
print("\nAll in one line of command (Stats for 'FB')\n",'-'*65, sep='')
print(pd.DataFrame(df.groupby('Company').describe().loc['FB']).transpose())
print("\nSame type of extraction with little different command\n",'-'*68, sep='')
print(df.groupby('Company').describe().loc[['GOOG', 'MSFT']])
    
    
In [188]:
    
# Creating data frames
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']},
                        index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                        'B': ['B4', 'B5', 'B6', 'B7'],
                        'C': ['C4', 'C5', 'C6', 'C7'],
                        'D': ['D4', 'D5', 'D6', 'D7']},
                         index=[4, 5, 6, 7])
df3 = pd.DataFrame({'A': ['A8', 'A9', 'A10', 'A11'],
                        'B': ['B8', 'B9', 'B10', 'B11'],
                        'C': ['C8', 'C9', 'C10', 'C11'],
                        'D': ['D8', 'D9', 'D10', 'D11']},
                        index=[8,9,10,11])
print("\nThe DataFrame number 1\n",'-'*30, sep='')
print(df1)
print("\nThe DataFrame number 2\n",'-'*30, sep='')
print(df2)
print("\nThe DataFrame number 3\n",'-'*30, sep='')
print(df3)
    
    
In [189]:
    
df_cat1 = pd.concat([df1,df2,df3], axis=0)
print("\nAfter concatenation along row\n",'-'*30, sep='')
print(df_cat1)
df_cat2 = pd.concat([df1,df2,df3], axis=1)
print("\nAfter concatenation along column\n",'-'*60, sep='')
print(df_cat2)
df_cat2.fillna(value=0, inplace=True)
print("\nAfter filling missing values with zero\n",'-'*60, sep='')
print(df_cat2)
    
    
In [183]:
    
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                     'A': ['A0', 'A1', 'A2', 'A3'],
                     'B': ['B0', 'B1', 'B2', 'B3']})
   
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                          'C': ['C0', 'C1', 'C2', 'C3'],
                          'D': ['D0', 'D1', 'D2', 'D3']})
print("\nThe DataFrame 'left'\n",'-'*30, sep='')
print(left)
print("\nThe DataFrame 'right'\n",'-'*30, sep='')
print(right)
    
    
In [185]:
    
merge1= pd.merge(left,right,how='inner',on='key')
print("\nAfter simple merging with 'inner' method\n",'-'*50, sep='')
print(merge1)
    
    
In [190]:
    
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'],
                     'key2': ['K0', 'K1', 'K0', 'K1'],
                        'A': ['A0', 'A1', 'A2', 'A3'],
                        'B': ['B0', 'B1', 'B2', 'B3']})
    
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'],
                               'key2': ['K0', 'K0', 'K0', 'K0'],
                                  'C': ['C0', 'C1', 'C2', 'C3'],
                                  'D': ['D0', 'D1', 'D2', 'D3']})
    
In [193]:
    
left
    
    Out[193]:
In [194]:
    
right
    
    Out[194]:
In [191]:
    
pd.merge(left, right, on=['key1', 'key2'])
    
    Out[191]:
In [192]:
    
pd.merge(left, right, how='outer',on=['key1', 'key2'])
    
    Out[192]:
In [195]:
    
pd.merge(left, right, how='left',on=['key1', 'key2'])
    
    Out[195]:
In [196]:
    
pd.merge(left, right, how='right',on=['key1', 'key2'])
    
    Out[196]:
In [197]:
    
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                      index=['K0', 'K1', 'K2']) 
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                    'D': ['D0', 'D2', 'D3']},
                      index=['K0', 'K2', 'K3'])
    
In [198]:
    
left
    
    Out[198]:
In [199]:
    
right
    
    Out[199]:
In [200]:
    
left.join(right)
    
    Out[200]:
In [201]:
    
left.join(right, how='outer')
    
    Out[201]:
In [232]:
    
import pandas as pd
df = pd.DataFrame({'col1':[1,2,3,4,5,6,7,8,9,10],
                   'col2':[444,555,666,444,333,222,666,777,666,555],
                   'col3':'aaa bb c dd eeee fff gg h iii j'.split()})
df
    
    Out[232]:
In [233]:
    
print("\nMethod head() is for showing first few entries\n",'-'*50, sep='')
df.head()
    
    
    Out[233]:
In [239]:
    
print("\nFinding unique values in 'col2'\n",'-'*40, sep='') # Note 'unique' method applies to pd.series only
print(df['col2'].unique())
    
    
In [238]:
    
print("\nFinding number of unique values in 'col2'\n",'-'*45, sep='')
print(df['col2'].nunique())
    
    
In [237]:
    
print("\nTable of unique values in 'col2'\n",'-'*40, sep='')
t1=df['col2'].value_counts()
print(t1)
    
    
In [229]:
    
# Define a function
def testfunc(x):
    if (x> 500):
        return (10*np.log10(x))
    else:
        return (x/10)
    
In [286]:
    
df['FuncApplied'] = df['col2'].apply(testfunc)
print(df)
    
    
Apply works with built-in function too!
In [287]:
    
df['col3length']= df['col3'].apply(len)
print(df)
    
    
Combine 'apply' with lambda expession for in-line calculations
In [265]:
    
df['FuncApplied'].apply(lambda x: np.sqrt(x))
    
    Out[265]:
Standard statistical functions directly apply to columns
In [267]:
    
print("\nSum of the column 'FuncApplied' is: ",df['FuncApplied'].sum())
print("Mean of the column 'FuncApplied' is: ",df['FuncApplied'].mean())
print("Std dev of the column 'FuncApplied' is: ",df['FuncApplied'].std())
print("Min and max of the column 'FuncApplied' are: ",df['FuncApplied'].min(),"and",df['FuncApplied'].max())
    
    
Getting the names of the columns
In [278]:
    
print("\nName of columns\n",'-'*20, sep='')
print(df.columns)
l = list(df.columns)
print("\nColumn names in a list of strings for later manipulation:",l)
    
    
Deletion by 'del' command # This affects the dataframe immediately, unlike drop method.
In [289]:
    
print("\nDeleting last column by 'del' command\n",'-'*50, sep='')
del df['col3length']
print(df)
df['col3length']= df['col3'].apply(len)
    
    
Sorting and Ordering a DataFrame
In [290]:
    
df.sort_values(by='col2') #inplace=False by default
    
    Out[290]:
In [292]:
    
df.sort_values(by='FuncApplied',ascending=False) #inplace=False by default
    
    Out[292]:
Find Null Values or Check for Null Values
In [294]:
    
df = pd.DataFrame({'col1':[1,2,3,np.nan],
                   'col2':[np.nan,555,666,444],
                   'col3':['abc','def','ghi','xyz']})
df.head()
    
    Out[294]:
In [295]:
    
df.isnull()
    
    Out[295]:
In [296]:
    
df.fillna('FILL')
    
    Out[296]:
Pivot Table
In [297]:
    
data = {'A':['foo','foo','foo','bar','bar','bar'],
     'B':['one','one','two','two','one','one'],
       'C':['x','y','x','y','x','y'],
       'D':[1,3,2,5,4,1]}
df = pd.DataFrame(data)
df
    
    Out[297]:
In [303]:
    
# Index out of 'A' and 'B', columns from 'C', actual numerical values from 'D'
df.pivot_table(values='D',index=['A', 'B'],columns=['C'])
    
    Out[303]:
In [302]:
    
# Index out of 'A' and 'B', columns from 'C', actual numerical values from 'D'
df.pivot_table(values='D',index=['A', 'B'],columns=['C'], fill_value='FILLED')
    
    Out[302]:
Import packages
In [304]:
    
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
    
Read in the CSV data file
In [308]:
    
df1=pd.read_csv('df1.csv', index_col=0)
df1.head()
    
    Out[308]:
In [320]:
    
df2=pd.read_csv('df2')
df2.head()
    
    Out[320]:
Histogram of a single column
In [310]:
    
df1['A'].hist()
    
    Out[310]:
    
Histogram with a different set of arguments (list of columns, bins, figure size, etc)
In [312]:
    
df1.hist(column=['B','C'],bins=20,figsize=(10,4))
    
    Out[312]:
    
Histogram with generic plot method of Pandas
In [325]:
    
df1.plot(kind='hist', bins=30, grid=True, figsize=(12,7))
    
    Out[325]:
    
Area plot
In [323]:
    
import seaborn as sns #Plot style will change to Seaborn package style from now on
df2.plot.area(alpha=0.4)
    
    Out[323]:
    
Bar plot (with and without stacking)
In [326]:
    
df2.plot.bar()
    
    Out[326]:
    
In [328]:
    
df2.plot.bar(stacked=True)
    
    Out[328]:
    
Lineplot
In [331]:
    
df1.plot.line(x=df1.index,y=['B','C'],figsize=(12,4),lw=1) # Note matplotlib arguments like 'lw' and 'figsize'
    
    Out[331]:
    
Scatterplot
In [332]:
    
df1.plot.scatter(x='A',y='B',figsize=(12,8))
    
    Out[332]:
    
In [335]:
    
df1.plot.scatter(x='A',y='B',c='C',cmap='coolwarm',figsize=(12,8)) # Color of the scatter dots set based on column C
    
    Out[335]:
    
In [344]:
    
df1.plot.scatter(x='A',y='B',s=10*np.exp(df1['C']),c='C',figsize=(12,8)) # Size of the dots set based on column C
    
    Out[344]:
    
Boxplot
In [346]:
    
df2.plot.box()
    
    Out[346]:
    
Hexagonal bin plot for bivariate data
In [348]:
    
df=pd.DataFrame(data=np.random.randn(1000,2),columns=['A','B'])
df.head()
    
    Out[348]:
In [350]:
    
df.plot.hexbin(x='A',y='B',gridsize=30,cmap='coolwarm')
    
    Out[350]:
    
Kernel density estimation
In [352]:
    
df2.plot.density(lw=3)
    
    Out[352]:
    
In [ ]: