In [175]:
%matplotlib inline
# First we ingest the data from the local CSV file ../datasets/loansData.csv.
# This contains a reduced version of the data set from Lending Club.
import pandas as pd
# Strings that read_csv should treat as missing values
na_values = ['n/a']
# Employment.Length is explicitly read with dtype str
loansData = pd.read_csv('../datasets/loansData.csv', na_values=na_values, dtype={'Employment.Length': str})
# Show the distinct Employment.Length values. Has NA: entries read as
# missing show up in this list.
loansData['Employment.Length'].unique()
Out[175]:
In [176]:
loansData['Interest.Rate'].head()  # preview the first five Interest.Rate values
Out[176]:
In [177]:
def fix_interest(rates):
    """Strip the trailing percent sign from each interest rate string.

    Only a trailing '%' is removed. Values that carry no percent sign
    (for example a column that was already cleaned) come back unchanged,
    so applying the function more than once does not eat digits.

    Parameters
    ----------
    rates : pandas.Series of str
        Interest rates such as '8.90%'.

    Returns
    -------
    pandas.Series of str
        The same values without the trailing '%'. The values are still
        strings; no numeric conversion is performed here.
    """
    return rates.str.rstrip('%')
In [178]:
# Preview of the first five rates as returned by fix_interest (% sign removed)
fix_interest(loansData['Interest.Rate']).head()
Out[178]:
In [179]:
# Making changes in original data: overwrite the Interest.Rate column of
# loansData with the values returned by fix_interest.
loansData['Interest.Rate'] = fix_interest(loansData['Interest.Rate'])
In [180]:
loansData['Loan.Length'].head()  # preview the first five Loan.Length values
Out[180]:
In [181]:
def remove_months(loan):
    """Remove the ' months' suffix from each loan length string.

    The suffix is removed only from values that actually end with it.
    Values without the suffix (for example a column that was already
    cleaned) and missing values are returned unchanged, so the function
    can be applied more than once without truncating the numbers.

    Parameters
    ----------
    loan : pandas.Series of str
        Loan lengths such as '36 months'.

    Returns
    -------
    pandas.Series of str
        The same values with the trailing ' months' removed; still strings.
    """
    suffix = ' months'
    # na=False keeps the mask strictly boolean when the column has NaN
    has_suffix = loan.str.endswith(suffix, na=False)
    return loan.mask(has_suffix, loan.str.slice(0, -len(suffix)))
In [182]:
# Write the output of remove_months back to the Loan.Length column,
# then preview its first five values.
loansData['Loan.Length'] = remove_months(loansData['Loan.Length'])
loansData['Loan.Length'].head()
Out[182]:
In [183]:
loansData['FICO.Range'].head()  # preview the first five FICO.Range values
Out[183]:
In [184]:
# Replace each categorical 'low-high' FICO range with its middle value,
# stored as the string of the truncated integer midpoint
# (e.g. '735-739' -> '737').
# The work is done with vectorised string operations, so the result stays
# aligned with the frame's index without a manual counter or dict.
# Taking the first and last piece of the split also handles a value that
# already holds a single number, so re-running this cell does not fail.
fico_bounds = loansData['FICO.Range'].str.split('-')
fico_mid = (fico_bounds.str[0].astype(int) + fico_bounds.str[-1].astype(int)) // 2
loansData['FICO.Range'] = fico_mid.astype(str)
loansData['FICO.Range'].head()
Out[184]:
In [162]:
type(loansData['Loan.Length'])  # show the type of the object returned for the Loan.Length column
Out[162]:
In [187]:
# Keep only the rows whose Employment.Length is not missing.
loansData = loansData.dropna(subset=['Employment.Length'])
In [188]:
# List the distinct Employment.Length values in the current loansData
loansData['Employment.Length'].unique()
Out[188]:
In [196]:
# Managing outliers: keep only the rows whose Monthly.Income is strictly
# below 100,000; every row failing that comparison is dropped.
loansData = loansData.loc[loansData['Monthly.Income'].lt(100000)]
In [199]:
# Histogram showing the distribution of FICO scores.
import matplotlib.pyplot as plt
# Start a new figure so the histogram is drawn on its own canvas
plt.figure()
# The scores are read from a separate file, loanf.csv, not from loansData
loansmin = pd.read_csv('../datasets/loanf.csv')
fico = loansmin['FICO.Score']
# The return value is bound to p so it is not echoed as the cell output
p = fico.hist()
In [21]:
# BoxPlot of Interest.Rate grouped by FICO.Score (one box per distinct score)
p = loansmin.boxplot('Interest.Rate','FICO.Score')
# Hard-coded x tick labels, assigned to the ticks by position; the empty
# strings leave the ticks in between unlabelled.
# NOTE(review): unlike the other labelled values, '700' and '720' have no
# blank entries between them - confirm that the labels line up with the
# boxes that are actually drawn.
q = p.set_xticklabels(['640','','','','660','','','','680','','','','700',
'720','','','','740','','','','760','','','','780','','','','800','','','','820','','','','840'])
q0 = p.set_xlabel('FICO Score')
q1 = p.set_ylabel('Interest Rate %')
# Set the axes title to a single space, i.e. effectively blank
q2 = p.set_title(' ')
In [20]:
import pandas as pd
# scatter_matrix is no longer available as pd.scatter_matrix in current
# pandas releases. Import it from pandas.plotting, and fall back to the
# legacy pandas.tools.plotting location on installations that predate it.
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    from pandas.tools.plotting import scatter_matrix

loansmin = pd.read_csv('../datasets/loanf.csv')
# Matrix of pairwise scatter plots for loansmin, with a kernel density
# estimate on the diagonal; the very low alpha keeps dense regions readable.
a = scatter_matrix(loansmin, alpha=0.01, figsize=(10,10), diagonal='kde')
In [ ]: