In [1]:
import pandas as pd
import numpy as np

pdata = sqlContext.read.format('csv').load("[MY SPARK HDFS PATH HERE]/GBM_test.csv", index_col="ID", header =True).toPandas()

In [2]:
# check all column names

list(pdata.columns)

In [3]:
print pdata.dtypes

In [4]:
# convert data type for a certain column

pdata['Loan_Amount_Submitted'] = pdata['Loan_Amount_Submitted'].convert_objects(convert_numeric=True)
print pdata.dtypes

In [5]:
# Boolean Indexing
pdata.loc[(pdata["Gender"]=="Female") & (pdata["Salary_Account"]=="ICICI Bank") & (pdata["Mobile_Verified"]=="Y"), ["Gender", "Salary_Account", "Mobile_Verified"]]

In [6]:
# apply function, similar to R apply()
def get_missing_data(x):
  return sum(x.isnull())

print "find missing data for each column:"
print pdata.apply(get_missing_data, axis = 0)

print "find missing data for each row:"
print pdata.apply(get_missing_data, axis = 1)

In [7]:
# cell 4 - fillna(), updating missing values with the overall mean/mode/median of the column
from scipy.stats import mode

# check the mode
mode(pdata['Gender'])[0][0] 

pdata['Gender'].fillna(mode(pdata['Gender'])[0][0], inplace=True)
pdata.apply(get_missing_data, axis=0)

In [8]:
# cell 5 - create Excel style pivot table

# check data type first
pdata.dtypes

# convert Monthly_Income into numerical data
pdata['Monthly_Income'] = pdata['Monthly_Income'].astype(float)
pdata.dtypes

pivot_t = pdata.pivot_table(values=['Monthly_Income'], index=['Gender', 'Mobile_Verified', 'Device_Type'], aggfunc = np.mean)
print pivot_t

In [9]:
# cell 6 - MUltiple Indexing
## I like this, only iterate rows with Monthly_Income as null
for i, r in pdata.loc[pdata['Monthly_Income'].isnull(),:].iterrows():
  index_list = tuple([r(['Gender']), r(['Mobile_Verified']), r(['Device_Type'])])
  pdata.loc[i, 'Monthly_Income'] = pivot_t.loc[index_list].values[0]   # using multiple index to locate data
  
print pdata.apply(get_missing_data, axis=0)

In [10]:
# cell 7 - cross tab
print pd.crosstab(pdata['Gender'], pdata['Mobile_Verified'], margins=True)
print

def get_percentage(ser):
  return ser/float(ser[-1])

print pd.crosstab(pdata['Gender'], pdata['Mobile_Verified'], margins=True).apply(get_percentage, axis=1)

In [11]:
# cell 8 - data merging
people_rate = pd.DataFrame([200, 400], index=['Mobile', 'Web-browser'], columns=['people_rate'])
people_rate

data_merge = pdata.merge(right=people_rate, how='inner', left_on='Device_Type', right_index=True, sort=False)
data_merge.pivot_table(values=['Monthly_Income'], index=['Device_Type', 'people_rate'], aggfunc = len)

In [12]:
# sorting on multiple columns
# sort() is the method of the old version in pandas, in new version 0.17.0, the function name should be sort_values

sorted_data = pdata.sort(['Loan_Amount_Submitted', 'Interest_Rate'], ascending=[0, 1])
sorted_data[['Loan_Amount_Submitted', 'Interest_Rate']].head(10)

In [13]:
# I think, R is much better than Python pandas in many features...
import matplotlib.pyplot as plt

fig, p = plt.subplots(nrows=1, ncols=2, figsize=(15, 15))
p[0].plot(pdata['Loan_Amount_Submitted'], pdata['Monthly_Income'], 'ro')
p[1].boxplot(pdata['Loan_Amount_Submitted'])
display(fig)

In [14]: