In [4]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os
import sys
import sklearn
import sqlite3
import matplotlib
import numpy as np
import pandas as pd
import enchant as en
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.cross_validation import train_test_split, cross_val_score, KFold
from sklearn.linear_model import Ridge
# Make the project's ``src`` directory importable from this notebook.
# The path is built relative to the current working directory, i.e. ``src``
# is expected to be a sibling of the directory the kernel was started in.
src_dir = os.path.join(os.getcwd(), os.pardir, 'src')
# Guard the append: re-running this cell would otherwise keep adding the
# same entry to sys.path again and again.
if src_dir not in sys.path:
    sys.path.append(src_dir)
%aimport data
from data import make_dataset as md
# Notebook-wide plot appearance.  The style sheet is applied first so that
# the explicit overrides below win over whatever defaults it sets.
plt.style.use('ggplot')
plt.rcParams.update({
    'figure.figsize': (16.0, 6.0),
    'legend.markerscale': 3,
    'font.size': 16.0,
})
In [5]:
# Data directory, located one level above the current working directory.
DIR = "{}/../data/".format(os.getcwd())

# Load the raw loan table.  low_memory=False makes pandas read the file in
# a single pass rather than in chunks.
loan_csv_path = DIR + 'raw/lending-club-loan-data/loan.csv'
t = pd.read_csv(loan_csv_path, low_memory=False)

# Show the first three rows as the cell output.
t.head(3)
Out[5]:
In [6]:
# Run the raw frame through the project's preprocessing helpers from
# ``data.make_dataset`` (imported as ``md``), keeping each intermediate
# result under its own name.
# NOTE(review): the helpers are not visible from this notebook; judging by
# their names they clean the raw columns, fill in missing values and reduce
# the table to a modelling dataset -- confirm against data.make_dataset.
t2 = md.clean_data(t)
t3 = md.impute_missing(t2)
df = md.simple_dataset(t3)
In [19]:
# Summary of the ``issue_d`` column (presumably the loan issue date -- verify),
# which the per-year filtering further down compares against year strings.
df['issue_d'].describe()
Out[19]:
In [20]:
# Random forest used by the year-over-year experiment: 10 trees, with
# max_features='sqrt' limiting how many features each split may consider.
# random_state is pinned so that a fresh "Restart & Run All" reproduces the
# same scores; without it every run trains a different forest.
rfr = RandomForestRegressor(n_estimators=10, max_features='sqrt', random_state=42)
In [24]:
# Year-over-year check: fit the forest on the rows whose ``issue_d`` equals
# year ``y`` and report its score on the rows of year ``y + 1``.
# NOTE(review): the filter compares ``issue_d`` with the year as a string, so
# it assumes that column holds 'YYYY' strings at this point -- confirm.

# Columns removed from the feature matrix: the target itself plus columns
# that are not used as predictors here.
non_feature_cols = ['int_rate', 'issue_d', 'earliest_cr_line', 'grade']

for y in range(2008, 2016):
    # Training set: rows of year y.
    last_year = df[df['issue_d'] == str(y)]
    # axis is passed by keyword: the positional form ``drop(labels, 1)`` is
    # deprecated since pandas 1.3 and raises TypeError from pandas 2.0 on.
    last_year_X = last_year.drop(non_feature_cols, axis=1)
    last_year_y = last_year['int_rate']

    # Evaluation set: rows of the following year.
    this_year = df[df['issue_d'] == str(y + 1)]
    this_year_X = this_year.drop(non_feature_cols, axis=1)
    this_year_y = this_year['int_rate']

    rfr.fit(last_year_X, last_year_y)

    # The 2015 model is still fitted (so ``rfr`` leaves the loop trained on
    # 2015 rows) but it is not scored -- presumably because there is no
    # 2016 data to evaluate on; verify.
    if y != 2015:
        # For a regressor, ``score`` is the R^2 of the predictions.
        print("Predicting year {} using {} data: {:.2f}".format(
            y + 1, y, rfr.score(this_year_X, this_year_y)))