In [238]:
%matplotlib inline
import pandas as pd
import mpld3
mpld3.enable_notebook()
%matplotlib inline
import matplotlib
import seaborn as sns
matplotlib.rcParams['savefig.dpi'] = 2 * matplotlib.rcParams['savefig.dpi']
In [4]:
import pandas as pd
In [7]:
Maps = pd.read_csv('./zip_code_database.csv')
In [333]:
import dill
with open('../LendingClubPrediction/CleanedUpData.pkl','rb') as in_strm:
df = dill.load(in_strm)
In [448]:
df['int_rate'] = [float(df.int_rate.iloc[i][:-1]) for i in xrange(len(df.int_rate))]
In [65]:
county = pd.read_csv('../zcta_county_rel_10.txt')
In [11]:
valide_zip= pd.unique(df.zip_code)
In [12]:
zip_number =[int(i.replace('xx','')) for i in valide_zip]
In [26]:
a=(Maps.zip/100).floordiv(1)
In [41]:
zip_indata = pd.DataFrame({'zip_3':zip_number})
Maps['zip_3'] = a
In [42]:
after_merge=Maps.merge(zip_indata,how='inner',on='zip_3')
In [46]:
after_merge.to_csv('trial_1.csv')
In [47]:
pd.unique(after_merge.zip).shape
Out[47]:
In [ ]:
In [68]:
county_zip = after_merge.merge(county,how='inner',left_on='zip',right_on='ZCTA5')
In [72]:
county_zip.iloc[:500].to_csv('county_1.csv')
In [ ]:
In [ ]:
In [ ]:
In [70]:
county_zip.columns
Out[70]:
In [104]:
from bokeh.sampledata import us_states, us_counties, unemployment
from bokeh.plotting import figure, show, output_file, output_notebook
In [105]:
us_states = us_states.data.copy()
us_counties = us_counties.data.copy()
unemployment = unemployment.data
del us_states["HI"]
del us_states["AK"]
state_xs = [us_states[code]["lons"] for code in us_states]
state_ys = [us_states[code]["lats"] for code in us_states]
county_xs=[us_counties[code]["lons"] for code in us_counties if us_counties[code]["state"] not in ["ak", "hi", "pr", "gu", "vi", "mp", "as"]]
county_ys=[us_counties[code]["lats"] for code in us_counties if us_counties[code]["state"] not in ["ak", "hi", "pr", "gu", "vi", "mp", "as"]]
colors = ["#F1EEF6", "#D4B9DA", "#C994C7", "#DF65B0", "#DD1C77", "#980043"]
county_colors = []
for county_id in us_counties:
if us_counties[county_id]["state"] in ["ak", "hi", "pr", "gu", "vi", "mp", "as"]:
continue
try:
rate = unemployment[county_id]
idx = min(int(rate/2), 5)
county_colors.append(colors[idx])
except KeyError:
county_colors.append("black")
In [112]:
len(county_xs[40])
Out[112]:
1.1 Need to get the map one working.
1.1.1 give options about further filtering based on FICO credit score, and LC (MAYBE, depends on whether I have time or not.)
1.2 Need to bar graphs about FICO score, home owner, other features too. So the KEY is make it PRETTY
2.1 Need to explain why use the ensemble model, Simple answer is that it works better. LOL. Boosting decision tree seems to work better, though logistic regression seems to be enough.
2.2 Another issue is that if I am going to deploy to Heroku, probably I can only use logistic regression.
2.3 Engineer some other features?? Like the difference between low and high score. Maybe some nlp stuff, not sure this will work. Or the employment rate of the states? Libral vs Conservative? This sorts of thing. I should just factor it in use a mean model.
3.1 This is an afterthought.
3.2 involves with getting the API working
In [ ]:
In [699]:
import simplejson
from requests_oauthlib import OAuth1
import requests
with open("./lendingclub_secrets.json.nogit") as fh:
secrets = simplejson.loads(fh.read())
# create an auth object
auth = OAuth1(
secrets["api_key"],
# secrets["api_secret"],
# secrets["access_token"],
# secrets["access_token_secret"]
)
In [700]:
# Query Parameters:
# showAll — A non-required Boolean parameter that defines the contents of the result.
# showAll= False
In [701]:
params={'showAll' : True}
r = requests.get("https://api.lendingclub.com/api/investor/v1/loans/listing", headers={'Authorization':'5Rtb7dWC4Wps2C3VRAn2hBQVERg='},params=params)
# r = requests.get("https://api.lendingclub.com/api/64233077/v1/loans/listing", params=params)
#
print r
In [704]:
loanlist=r.json()
print len(loanlist['loans'])
In [703]:
import dill
with open('loanlist_3.pkl','wb') as out_strm:
dill.dump(loanlist,out_strm)
In [712]:
loanlist['loans'][0]
Out[712]:
In [185]:
len(loanlist['loans'])
Out[185]:
In [187]:
ids = []
for i in range(348):
ids.append(loanlist['loans'][i]['memberId'])
In [189]:
In [ ]:
In [ ]:
In [ ]:
# output_file("choropleth.html", title="choropleth.py example")
# output_notebook()
# TOOLS = ("hover,save")
# p = figure(title="US Unemployment 2009", toolbar_location="left",tools =TOOLS,
# plot_width=1100, plot_height=700)
# p.patches(county_xs, county_ys, fill_color=county_colors, fill_alpha=0.7,
# line_color="white", line_width=0.5)
# p.patches(state_xs, state_ys, fill_alpha=0.0,
# line_color="#884444", line_width=2)
# show(p)
In [196]:
import dill
with open('loanlist_2.pkl','rb') as in_strm:
loanlist_2 = dill.load(in_strm)
with open('loanlist.pkl','rb') as in_strm:
loanlist_1 = dill.load(in_strm)
In [203]:
ids_1 = [loanlist_1['loans'][i]['id'] for i in range(348)]
ids_2 = [loanlist_2['loans'][i]['id'] for i in range(len(loanlist_2['loans']))]
In [209]:
missing =[]
for i in ids_1:
if i not in ids_2:
missing.append(i)
In [210]:
len(missing)
Out[210]:
In [211]:
loanlist_1['loans'][1]
Out[211]:
In [332]:
df.columns
Out[332]:
In [334]:
df['paidoff'] = (df.stat== 1)
In [ ]:
In [257]:
In [335]:
bystate = pd.DataFrame()
bystate['mean_rate'] = df.groupby([u'addr_state']).mean().paidoff
bystate['default_rate'] = 1- df.groupby([u'addr_state']).mean().paidoff
bystate['count'] = df.groupby([u'addr_state']).count().paidoff
bystate['fico_range_high'] = df.groupby([u'addr_state']).mean().fico_range_high
bystate['fico_range_high'] = df.groupby([u'addr_state']).mean().fico_range_high
In [336]:
bystate.to_csv('bystate.csv')
In [ ]:
In [283]:
df[df.addr_state=='IA']
Out[283]:
In [ ]:
In [ ]:
In [339]:
In [ ]:
In [ ]:
from bokeh.plotting import figure, show, output_file
In [ ]:
In [382]:
In [ ]:
In [478]:
from bokeh.models import HoverTool, ColumnDataSource
from collections import OrderedDict
# bygrade = pd.DataFrame()
# bygrade['mean_rate'] = df.groupby([u'sub_grade']).mean().paidoff
# bygrade['default_rate'] = 1-df.groupby([u'sub_grade']).mean().paidoff
# bygrade['int_rate'] = df.groupby([u'sub_grade']).mean().int_rate
# bygrade['counts'] = df.groupby([u'sub_grade']).count().paidoff
bygrade = pd.DataFrame()
bygrade['mean_rate'] = df.groupby([u'grade']).mean().paidoff
bygrade['default_rate'] = 1-df.groupby([u'grade']).mean().paidoff
bygrade['int_rate'] = df.groupby([u'grade']).mean().int_rate
bygrade['counts'] = df.groupby([u'grade']).count().paidoff
xgrades = [i for i in bygrade.index]
pay_grades = bygrade.mean_rate.values*100
dft_grades = bygrade.default_rate.values*100
In [479]:
# xx = df.groupby([u'sub_grade']).mean()
In [ ]:
In [698]:
# TOOLS = "hover,save"
# p = figure(background_fill="#EFE8E2",
# x_range=xgrades,
# x_axis_label='LC grade', y_axis_label=('Pay off rate (%)'),
# y_range = [0, 100],
# title="Loan Outcome by LendingClub Grade",
# tools = TOOLS,
# plot_width=800,
# plot_height=400)
# source1 = ColumnDataSource(
# data=dict(pay_grades=pay_grades, dft_grades=dft_grades,int_rate=bygrade['int_rate'].values)
# )
# source2 = ColumnDataSource(
# data=dict(pay_grades=pay_grades, dft_grades=dft_grades,int_rate=bygrade['int_rate'].values)
# )
# p.rect(xgrades, pay_grades/2, 0.6, pay_grades,
# fill_color="#08c994", source = source1)
# p.rect(xgrades, dft_grades/2 + pay_grades, 0.6,dft_grades,
# fill_color="#ff5a00", source = source2)
# hover = p.select(dict(type=HoverTool))
# hover.tooltips = OrderedDict([
# ('Grade', "$x"),
# ('Payoff rate (%)', '@pay_grades'),
# ('Default rate (%)', '@dft_grades'),
# ('Interest (%)','@int_rate'),
# ])
# show(p)
In [ ]:
In [ ]:
In [497]:
byfico = pd.DataFrame()
byfico['mean_rate'] = df.groupby([u'fico_range_high']).mean().paidoff.iloc[2:]
byfico['default_rate'] = 1- df.groupby([u'fico_range_high']).mean().paidoff[2:]
byfico['count'] = df.groupby([u'fico_range_high']).count().paidoff[2:]
In [ ]:
In [696]:
# TOOLS = "hover,save"
# xfico = [(str(int(byfico.index[i]-4)) + ' - ' +str(int(byfico.index[i]))) for i in xrange(len(byfico.index))]
# p = figure(background_fill="#EFE8E2",
# x_axis_label='Pay off rate (%)', y_axis_label='FICO score',
# x_range = [0, 100] ,
# y_range = xfico[::-1],
# title="Loan Outcome by FICO score",
# tools = TOOLS,
# plot_width=800,
# plot_height=600)
# source1 = ColumnDataSource(
# data=dict(fico_score=xfico, dft_rate=byfico['default_rate'].iloc[::-1]*100,payoff_rate=byfico['mean_rate'].iloc[::-1]*100)
# )
# source2 = ColumnDataSource(
# data=dict(fico_score=xfico, dft_rate=byfico['default_rate'].iloc[::-1]*100,payoff_rate=byfico['mean_rate'].iloc[::-1]*100)
# )
# p.rect(byfico['mean_rate'].iloc[::-1]*100/2,
# xfico[::-1],byfico['mean_rate'].iloc[::-1]*100 ,0.8,
# fill_color="#08c994", source = source1)
# p.rect(byfico['default_rate'].iloc[::-1]*100/2 + byfico['mean_rate'].iloc[::-1]*100,
# xfico[::-1], byfico['default_rate'].iloc[::-1]*100, 0.8,
# fill_color="#ff5a00", source = source2)
# hover = p.select(dict(type=HoverTool))
# hover.tooltips = OrderedDict([
# ('FICO score', "$y"),
# ('Payoff rate (%)', '@payoff_rate'),
# ('Default rate (%)', '@dft_rate'),
# ])
# show(p)
In [513]:
df.columns
Out[513]:
In [533]:
df['dti_bin'] = np.floor(df.dti/5)*5
In [543]:
(pd.unique(df.emp_length))
Out[543]:
In [605]:
In [ ]:
In [ ]:
In [607]:
# N = 100
# x = np.random.random(size=N) * 100
# y = np.random.random(size=N) * 100
# radii = np.random.random(size=N) * 1.5
# colors = ["#%02x%02x%02x" % (r, g, 150) for r, g in zip(np.floor(50+2*x), np.floor(30+2*y))]
# colors
In [680]:
Out[680]:
In [694]:
# data = df.groupby(['home_ownership','emp_length']).mean().paidoff
# from collections import OrderedDict
# import numpy as np
# from bokeh.plotting import ColumnDataSource, figure, show, output_file
# from bokeh.models import HoverTool
# # Read in the data with pandas. Convert the year column to string
# home_ownership = ['RENT', 'OWN', 'MORTGAGE']
# emp_lenght = [ 'n/a', '< 1 year', '1 year', '3 years', '2 years', '4 years', '5 years',
# '6 years', '7 years', '8 years', '9 years','10+ years']
# # colors = [
# # "#08C994", "#26BB81", "#45AD6F", "#649F5C", "#83914A",
# # "#A28337", "#C17525", "#E06712", "#FF5A00"
# # ]
# colors = [
# "#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", "#dfccce",
# "#ddb7b1", "#cc7878", "#933b41", "#550b1d"
# ]
# a=sorted(data.values)[::7]
# home = []
# emp = []
# color = []
# rate = []
# for y in emp_lenght:
# for m in home_ownership:
# home.append(m)
# emp.append(y)
# rate_by_home_emp = data[m][y]
# rate.append(rate_by_home_emp*100)
# for i in xrange(1,9):
# if rate_by_home_emp > a[i-1] and rate_by_home_emp<= a[i]:
# ci = 9-i
# # ci = int((rate_by_home_emp - min(data))/(max(data)-min(data))*8.9)
# color.append(colors[ci])
# output_notebook()
# TOOLS = "hover,save"
# p = figure(
# x_axis_label='Employment length (%)', y_axis_label='House Ownership',
# y_range=home_ownership, x_range=emp_lenght ,
# x_axis_location="above", plot_width=800, plot_height=400,
# toolbar_location="left", tools=TOOLS)
# source = ColumnDataSource(
# data=dict(home=home, emp=emp,color=color, rate=rate)
# )
# p.rect("emp", "home", 1, 1, source=source, color="color",line_color=None)
# p.grid.grid_line_color = None
# p.axis.axis_line_color = None
# p.axis.major_tick_line_color = None
# p.axis.major_label_text_font_size = "12pt"
# p.axis.major_label_standoff = 0
# p.xaxis.major_label_orientation = np.pi/3
# hover = p.select(dict(type=HoverTool))
# hover.tooltips = OrderedDict([
# ('Pay off rate (%)', '@rate'),
# ])
# show(p) # show the plot
In [ ]:
In [590]:
max(np.random.random(size=1000) * 100)
Out[590]:
In [ ]:
In [540]:
354/4/8
Out[540]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: