In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import os
import sys
import sklearn
import sqlite3
import matplotlib

import numpy as np
import pandas as pd
import enchant as en
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.cross_validation import train_test_split, cross_val_score, KFold
from sklearn.linear_model import Ridge

src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)
%aimport data
from data import make_dataset as md

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (16.0, 6.0)
plt.rcParams['legend.markerscale'] = 3
matplotlib.rcParams['font.size'] = 16.0


/usr/local/lib/python2.7/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

Data: Preparing for the model

Importing the raw data


In [2]:
DIR = os.getcwd() + "/../data/"
t = pd.read_csv(DIR + 'raw/lending-club-loan-data/loan.csv', low_memory=False)
t.head(3)


Out[2]:
id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade ... total_bal_il il_util open_rv_12m open_rv_24m max_bal_bc all_util total_rev_hi_lim inq_fi total_cu_tl inq_last_12m
0 1077501 1296599 5000 5000 4975 36 months 10.65 162.87 B B2 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 1077430 1314167 2500 2500 2500 60 months 15.27 59.83 C C4 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 1077175 1313524 2400 2400 2400 36 months 15.96 84.33 C C5 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

3 rows × 74 columns

Cleaning, imputing missing values, feature engineering (some NLP)


In [3]:
t2 = md.clean_data(t)
t3 = md.impute_missing(t2)
df = md.simple_dataset(t3)


Now cleaning data.
Now imputing missing values and encoding categories.
Skipping NLP/geo stuff, and removing cols.
/usr/local/lib/python2.7/site-packages/pandas/core/frame.py:2705: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  **kwargs)

In [4]:
df['issue_d'].hist(bins = 50)
plt.title('Seasonality in lending')
plt.ylabel('Frequbency')
plt.xlabel('Year')
plt.show()


Fitting the model


In [5]:
X = df.drop(['int_rate', 'issue_d', 'earliest_cr_line', 'grade'], 1)
y = df['int_rate']
X.shape, y.shape


Out[5]:
((884766, 79), (884766,))

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape


Out[6]:
((592793, 79), (291973, 79), (592793,), (291973,))

Ridge


In [7]:
scores_ridge = list()
scores_std_ridge = list()

ridge = Ridge()
coefs = []
errors = []

alphas = np.logspace(-6, 6, 20)

# Train the model with different regularisation strengths
for a in alphas:
    ridge.set_params(alpha=a)
    this_scores = cross_val_score(ridge, X_train, y_train, cv=3, n_jobs=1)
    scores_ridge.append(np.mean(this_scores))
    scores_std_ridge.append(np.std(this_scores))

In [8]:
scores_ridge, scores_std_ridge = np.array(scores_ridge), np.array(scores_std_ridge)

plt.figure().set_size_inches(8, 6)
plt.semilogx(alphas, scores_ridge)

# plot error lines showing +/- std. errors of the scores
std_error = scores_std_ridge / np.sqrt(3)

plt.semilogx(alphas, scores_ridge + std_error, 'b--')
plt.semilogx(alphas, scores_ridge - std_error, 'b--')

# alpha=0.2 controls the translucency of the fill color
plt.fill_between(alphas, scores_ridge + scores_std_ridge, scores_ridge - scores_std_ridge, alpha=0.2)

plt.ylabel('CV score +/- std error')
plt.xlabel('alpha')
plt.title('Ridge Regression')
plt.axhline(np.max(scores_ridge), linestyle='--', color='.5')
plt.xlim([alphas[0], alphas[-1]])

plt.show()



In [9]:
ridge = Ridge(alpha=10**0)
ridge.fit(X_train, y_train)
ridge.score(X_test, y_test)
print("CV Accuracy: {:.2f}".format(np.max(scores_ridge)))
print("Test Accuracy: {:.2f}".format(ridge.score(X_test, y_test)))


CV Accuracy: 0.66
Test Accuracy: 0.67

Random Forest Regressor


In [10]:
rfr = RandomForestRegressor(n_estimators = 10, max_features='sqrt')
total_scores = cross_val_score(rfr, X_train, y_train, cv = 3)

print("CV Accuracy: {:.2f} (+/- {:.2f})".format(total_scores.mean(), total_scores.std() * 2))


CV Accuracy: 0.81 (+/- 0.01)

In [11]:
rfr.fit(X_train, y_train)
print("Test Accuracy: {:.2f}".format(rfr.score(X_test, y_test)))


Test Accuracy: 0.82

In [12]:
fi = [{'importance': x, 'feature': y} for (x, y) in \
      sorted(zip(rfr.feature_importances_, X.columns))]
fi = pd.DataFrame(fi)
fi.sort_values(by = 'importance', ascending = False, inplace = True) 
fi.head()


Out[12]:
feature importance
78 total_rec_int 0.142944
77 term_ 60 months 0.081901
76 installment 0.076263
75 revol_util 0.051953
74 total_rec_prncp 0.050695

In [13]:
top5 = fi.head()
top5.plot(kind = 'bar')
plt.xticks(range(5), top5['feature'], fontsize=20,rotation=0)
plt.title('Feature importances (top 5 features)')
plt.ylabel('Relative importance')
plt.show()


Ada Boost Regressor


In [14]:
ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=2),
                          n_estimators=300, random_state=0)
total_scores = cross_val_score(ada, X_train, y_train, cv = 3)
ada.fit(X_train, y_train)
print("CV Accuracy: {:.2f} (+/- {:.2f})".format(total_scores.mean(), total_scores.std() * 2))
print("Test Accuracy: {:.2f}".format(ada.score(X_test, y_test)))


CV Accuracy: 0.29 (+/- 0.02)
Test Accuracy: 0.28

Cross-validating across years

What if we use last year to predict this year?


In [15]:
df['issue_d'].describe()


Out[15]:
count                  884766
unique                    101
top       2015-10-01 00:00:00
freq                    48473
first     2007-08-01 00:00:00
last      2015-12-01 00:00:00
Name: issue_d, dtype: object

In [16]:
rfr = RandomForestRegressor(n_estimators = 10, max_features='sqrt')

In [17]:
for y in range(2008, 2016):
    last_year = df[df['issue_d'] == str(y)]
    last_year_X = last_year.drop(['int_rate', 'issue_d', 'earliest_cr_line', 'grade'], 1)
    last_year_y = last_year['int_rate']
    
    this_year = df[df['issue_d'] == str(y + 1)]
    this_year_X = this_year.drop(['int_rate', 'issue_d', 'earliest_cr_line', 'grade'], 1)
    this_year_y = this_year['int_rate']
    
    rfr.fit(last_year_X, last_year_y)
    if y != 2015:
        print("Predicting year {} using {} data: \
        {:.2f}".format(y + 1, y, rfr.score(this_year_X, this_year_y)))


Predicting year 2009 using 2008 data:         -0.06
Predicting year 2010 using 2009 data:         0.29
Predicting year 2011 using 2010 data:         0.28
Predicting year 2012 using 2011 data:         0.50
Predicting year 2013 using 2012 data:         0.32
Predicting year 2014 using 2013 data:         0.43
Predicting year 2015 using 2014 data:         0.41