In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os
import sys
import sklearn
import sqlite3
import matplotlib
import numpy as np
import pandas as pd
import enchant as en
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.cross_validation import train_test_split, cross_val_score, KFold
from sklearn.linear_model import Ridge
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
sys.path.append(src_dir)
%aimport data
from data import make_dataset as md
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (16.0, 6.0)
plt.rcParams['legend.markerscale'] = 3
matplotlib.rcParams['font.size'] = 16.0
In [2]:
# Data directory, resolved relative to the notebook's working directory.
DIR = os.getcwd() + "/../data/"

# Read the raw loan export. low_memory=False asks pandas to parse the file in
# one pass instead of in chunks, so column dtypes are inferred consistently.
loans_csv_path = DIR + 'raw/lending-club-loan-data/loan.csv'
t = pd.read_csv(loans_csv_path, low_memory=False)

# Peek at the first three rows.
t.head(3)
Out[2]:
In [3]:
# Run the raw frame `t` through three helpers from the project's
# data.make_dataset module (imported as `md`), each consuming the previous
# step's output. The helpers are defined outside this notebook; judging by
# their names they clean, impute missing values, and then reduce the frame --
# confirm the details against src/data/make_dataset.py.
t2 = md.clean_data(t)
t3 = md.impute_missing(t2)
# `df` is the frame the plotting and modelling cells below work from.
df = md.simple_dataset(t3)
In [4]:
# Histogram of loan issue dates (50 bins) to eyeball seasonality in lending.
ax = df['issue_d'].hist(bins=50)
ax.set_title('Seasonality in lending')
ax.set_ylabel('Frequency')  # label was previously misspelled as 'Frequbency'
ax.set_xlabel('Year')
plt.show()
In [5]:
# Feature matrix: every column except the target (`int_rate`) and the
# `issue_d`, `earliest_cr_line` and `grade` columns.
# `axis` is passed by keyword: the positional form `drop(labels, 1)` was
# deprecated in pandas 1.3 and removed in 2.0.
X = df.drop(['int_rate', 'issue_d', 'earliest_cr_line', 'grade'], axis=1)

# Regression target: the loan's interest rate.
y = df['int_rate']

X.shape, y.shape
Out[5]:
In [6]:
# Hold out 33% of the rows for the final test scores; the fixed seed keeps
# the partition identical across runs.
split_parts = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Shapes in the order: X_train, X_test, y_train, y_test.
tuple(part.shape for part in split_parts)
Out[6]:
In [7]:
# Candidate regularisation strengths: 20 values log-spaced from 1e-6 to 1e6.
alphas = np.logspace(-6, 6, 20)

ridge = Ridge()
coefs, errors = [], []
scores_ridge, scores_std_ridge = [], []

# For each strength, record the mean and standard deviation of the 3-fold
# cross-validation score on the training split.
for a in alphas:
    ridge.set_params(alpha=a)
    this_scores = cross_val_score(ridge, X_train, y_train, cv=3, n_jobs=1)
    scores_ridge.append(this_scores.mean())
    scores_std_ridge.append(this_scores.std())
In [8]:
# Arrays allow the element-wise offsets used for the bands below.
scores_ridge = np.array(scores_ridge)
scores_std_ridge = np.array(scores_std_ridge)

fig = plt.figure()
fig.set_size_inches(8, 6)
ax = fig.gca()

# Mean CV score as a function of alpha (log-scaled x axis).
ax.semilogx(alphas, scores_ridge)

# Dashed lines: mean +/- one standard error (std over the 3 CV folds / sqrt(3)).
std_error = scores_std_ridge / np.sqrt(3)
ax.semilogx(alphas, scores_ridge + std_error, 'b--')
ax.semilogx(alphas, scores_ridge - std_error, 'b--')

# Shaded band: mean +/- one standard deviation; alpha=0.2 here is the
# fill's opacity, not the ridge penalty.
band_upper = scores_ridge + scores_std_ridge
band_lower = scores_ridge - scores_std_ridge
ax.fill_between(alphas, band_upper, band_lower, alpha=0.2)

ax.set_ylabel('CV score +/- std error')
ax.set_xlabel('alpha')
ax.set_title('Ridge Regression')

# Horizontal reference line at the best mean CV score.
ax.axhline(np.max(scores_ridge), linestyle='--', color='.5')
ax.set_xlim([alphas[0], alphas[-1]])
plt.show()
In [9]:
# Refit ridge at alpha = 1 on the whole training split and report scores.
# For a scikit-learn regressor `score` returns R^2; the printed "Accuracy"
# labels are kept unchanged.
ridge = Ridge(alpha=10**0)
ridge.fit(X_train, y_train)

# Evaluate on the held-out set once. The original called `score` twice and
# discarded the first result.
ridge_test_score = ridge.score(X_test, y_test)

# NOTE(review): the CV figure is the best mean score over the whole alpha
# sweep, which is not necessarily the score at alpha = 1 -- confirm intended.
print("CV Accuracy: {:.2f}".format(np.max(scores_ridge)))
print("Test Accuracy: {:.2f}".format(ridge_test_score))
In [10]:
# Random forest: 10 trees, each split considering sqrt(n_features) candidates.
# `random_state` is fixed so the CV and test scores are reproducible across
# runs (previously the forest was unseeded and gave different numbers each time).
rfr = RandomForestRegressor(n_estimators=10, max_features='sqrt', random_state=42)

# 3-fold cross-validation on the training split; report mean and 2 * std.
total_scores = cross_val_score(rfr, X_train, y_train, cv=3)
print("CV Accuracy: {:.2f} (+/- {:.2f})".format(total_scores.mean(), total_scores.std() * 2))
In [11]:
# Fit the forest on the full training split, then score it on the held-out rows.
rfr.fit(X_train, y_train)
rfr_test_score = rfr.score(X_test, y_test)
print("Test Accuracy: {:.2f}".format(rfr_test_score))
In [12]:
# Pair each fitted importance with its column name. The comprehension
# variables are deliberately NOT named `x`/`y`: `y` is the notebook's target
# vector, and under Python 2 list-comprehension variables leak into the
# enclosing scope and would overwrite it.
# The ascending pre-sort is kept so the resulting row labels match the
# original output.
fi = pd.DataFrame([
    {'importance': importance, 'feature': feature}
    for importance, feature in sorted(zip(rfr.feature_importances_, X.columns))
])

# Most important features first; reassign rather than mutate in place.
fi = fi.sort_values(by='importance', ascending=False)
fi.head()
Out[12]:
In [13]:
# Bar chart of the five highest-ranked rows of the importance table.
top5 = fi.head()
ax = top5.plot(kind='bar')

# Replace the default index tick labels with the feature names.
ax.set_xticks(range(5))
ax.set_xticklabels(top5['feature'], fontsize=20, rotation=0)

ax.set_title('Feature importances (top 5 features)')
ax.set_ylabel('Relative importance')
plt.show()
In [14]:
# AdaBoost over depth-2 regression trees, 300 boosting rounds, fixed seed.
weak_learner = DecisionTreeRegressor(max_depth=2)
ada = AdaBoostRegressor(weak_learner, n_estimators=300, random_state=0)

# 3-fold CV on the training split, then a fit on the whole training split
# for the held-out evaluation.
total_scores = cross_val_score(ada, X_train, y_train, cv=3)
ada.fit(X_train, y_train)

cv_mean = total_scores.mean()
cv_spread = total_scores.std() * 2
print("CV Accuracy: {:.2f} (+/- {:.2f})".format(cv_mean, cv_spread))
print("Test Accuracy: {:.2f}".format(ada.score(X_test, y_test)))
In [15]:
# Summary statistics of the loan issue-date column.
issue_dates = df['issue_d']
issue_dates.describe()
Out[15]:
In [16]:
# Fresh, unfitted forest for the year-over-year experiment. Seeded so the
# reported scores are reproducible (it was previously unseeded).
rfr = RandomForestRegressor(n_estimators=10, max_features='sqrt', random_state=42)
In [17]:
# Walk-forward evaluation: fit the forest on one year's loans and score it on
# the following year's loans.
#
# The loop variable is `year`, not `y`: `y` is the notebook's global target
# vector, and reusing the name here overwrote it, so re-running any earlier
# modelling cell after this one would fail or silently use an int.
# NOTE(review): rows are selected with `issue_d == str(year)`; this assumes
# the column compares equal to a bare year string -- confirm its dtype.
non_feature_cols = ['int_rate', 'issue_d', 'earliest_cr_line', 'grade']

for year in range(2008, 2016):
    # Training data: loans issued in `year`.
    last_year = df[df['issue_d'] == str(year)]
    last_year_X = last_year.drop(non_feature_cols, axis=1)
    last_year_y = last_year['int_rate']

    # Evaluation data: loans issued in the following year.
    this_year = df[df['issue_d'] == str(year + 1)]
    this_year_X = this_year.drop(non_feature_cols, axis=1)
    this_year_y = this_year['int_rate']

    rfr.fit(last_year_X, last_year_y)

    # 2015 is fitted but not scored (presumably there is no 2016 data to
    # score against). The fit is kept so `rfr` ends in the same state as before.
    if year != 2015:
        print("Predicting year {} using {} data: {:.2f}".format(
            year + 1, year, rfr.score(this_year_X, this_year_y)))