In [175]:
%matplotlib inline
# first we ingest the data from the source on the web
# this contains a reduced version of the data set from Lending Club
import pandas as pd
# list of NA values
na_values = ['n/a']
loansData = pd.read_csv('../datasets/loansData.csv', na_values=na_values, dtype={'Employment.Length': str})
# Has NA.
loansData['Employment.Length'].unique()


Out[175]:
array(['< 1 year', '2 years', '5 years', '9 years', '3 years', '10+ years',
       '8 years', '6 years', '1 year', '7 years', '4 years', nan], dtype=object)

In [176]:
loansData['Interest.Rate'][0:5]  # first five rows of Interest.Rate


Out[176]:
81174     8.90%
99592    12.12%
80059    21.98%
15825     9.99%
33182    11.71%
Name: Interest.Rate, dtype: object

In [177]:
def fix_interest(rates):
    # Truncate the last character from every interest rate
    rates = rates.str.slice(0,-1)
    
    return rates

In [178]:
# First five rows after removing % signs from rates
loansData['Interest.Rate'].str.slice(0,-1)[0:5]


Out[178]:
81174     8.90
99592    12.12
80059    21.98
15825     9.99
33182    11.71
Name: Interest.Rate, dtype: object

In [179]:
# Making changes in original data
loansData['Interest.Rate'] = fix_interest(loansData['Interest.Rate'])

In [180]:
loansData['Loan.Length'][0:5]


Out[180]:
81174    36 months
99592    36 months
80059    60 months
15825    36 months
33182    36 months
Name: Loan.Length, dtype: object

In [181]:
def remove_months(loan):
    # Remove ' months' suffix from each row
    loan = loan.str.slice(0, -7)
    return loan

In [182]:
# Making changes in original data
loansData['Loan.Length'] = remove_months(loansData['Loan.Length'])
loansData['Loan.Length'][0:5]


Out[182]:
81174    36
99592    36
80059    60
15825    36
33182    36
Name: Loan.Length, dtype: object

In [183]:
loansData['FICO.Range'][0:5] # first five rows of FICO.Range


Out[183]:
81174    735-739
99592    715-719
80059    690-694
15825    695-699
33182    695-699
Name: FICO.Range, dtype: object

In [184]:
# Get all index for the series data
indices = loansData['FICO.Range'].index
count = 0
vals = {}
# Replace the categorical data with the middle value.
for x in loansData['FICO.Range']:
    low,high = x.split('-')
    mid = (int(low) + int(high))/2
    vals[indices[count]] = str(int(mid))
    count += 1
# Forming a series object and then assigning.
loansData['FICO.Range'] = pd.Series(vals,indices)
loansData['FICO.Range'][0:5]


Out[184]:
81174    737
99592    717
80059    692
15825    697
33182    697
Name: FICO.Range, dtype: object

In [162]:
type(loansData['Loan.Length'])


Out[162]:
pandas.core.series.Series

In [187]:
# Drop all rows with NAN value for Employment.Length.
loansData = loansData[pd.notnull(loansData['Employment.Length'])]

In [188]:
loansData['Employment.Length'].unique()


Out[188]:
array(['< 1 year', '2 years', '5 years', '9 years', '3 years', '10+ years',
       '8 years', '6 years', '1 year', '7 years', '4 years'], dtype=object)

In [196]:
# managing outliers - remove row with monthly income > 100K
loansData = loansData[loansData['Monthly.Income'] < 100000]

In [199]:
# With the distribution of FICO scores we see the histogram below.
import matplotlib.pyplot as plt
plt.figure()
loansmin = pd.read_csv('../datasets/loanf.csv')
fico = loansmin['FICO.Score']
p = fico.hist()



In [21]:
# BoxPlot
p = loansmin.boxplot('Interest.Rate','FICO.Score')
q = p.set_xticklabels(['640','','','','660','','','','680','','','','700',
  '720','','','','740','','','','760','','','','780','','','','800','','','','820','','','','840'])

q0 = p.set_xlabel('FICO Score')
q1 = p.set_ylabel('Interest Rate %')
q2 = p.set_title('                          ')



In [20]:
import pandas as pd
loansmin = pd.read_csv('../datasets/loanf.csv')
a = pd.scatter_matrix(loansmin,alpha=0.01,figsize=(10,10), diagonal='kde')



In [ ]: