In [1]:
import os

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (16.0, 6.0)
plt.rcParams['legend.markerscale'] = 3
matplotlib.rcParams['font.size'] = 16.0



In [2]:
DIR = os.path.join(os.getcwd(), '..', 'data')
df = pd.read_csv(os.path.join(DIR, 'raw', 'loan.csv'), low_memory=False)
df.head()


Out[2]:
id member_id loan_amnt funded_amnt funded_amnt_inv term int_rate installment grade sub_grade ... total_bal_il il_util open_rv_12m open_rv_24m max_bal_bc all_util total_rev_hi_lim inq_fi total_cu_tl inq_last_12m
0 1077501 1296599 5000.0 5000.0 4975.0 36 months 10.65 162.87 B B2 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 1077430 1314167 2500.0 2500.0 2500.0 60 months 15.27 59.83 C C4 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 1077175 1313524 2400.0 2400.0 2400.0 36 months 15.96 84.33 C C5 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 1076863 1277178 10000.0 10000.0 10000.0 36 months 13.49 339.31 C C1 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 1075358 1311748 3000.0 3000.0 3000.0 60 months 12.69 67.79 B B5 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 74 columns


In [3]:
# Find the columns with no missing values (exploratory; not used below)
# complete_cols = [column for column in df.columns if df[column].notnull().all()]
# complete_cols

In [4]:
categorical_cols = [
    'application_type', 'initial_list_status',
    'purpose', 'pymnt_plan', 'verification_status',
    'emp_length', 'term'
]

continuous_cols = [
    'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'installment',
    'dti', 'revol_bal'
]

y_col = ['int_rate']

In [5]:
df_data = df[categorical_cols + continuous_cols].copy()  # .copy() avoids SettingWithCopyWarning below

In [6]:
# Convert the continuous columns to floating point
df_data[continuous_cols] = df_data[continuous_cols].astype(float)



In [7]:
data = pd.get_dummies(df_data)  # one-hot encode the categorical columns

In [8]:
data.tail(3)


Out[8]:
loan_amnt funded_amnt funded_amnt_inv installment dti revol_bal application_type_INDIVIDUAL application_type_JOINT initial_list_status_f initial_list_status_w ... emp_length_4 years emp_length_5 years emp_length_6 years emp_length_7 years emp_length_8 years emp_length_9 years emp_length_< 1 year emp_length_n/a term_ 36 months term_ 60 months
887376 13000.0 13000.0 13000.0 316.07 30.90 11031.0 1.0 0.0 0.0 1.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
887377 12000.0 12000.0 12000.0 317.86 27.19 8254.0 1.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0
887378 20000.0 20000.0 20000.0 664.20 10.83 33266.0 1.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0

3 rows × 43 columns


In [9]:
x = data.values
y = df[y_col].values.ravel()

In [10]:
# Alternative to pd.get_dummies using scikit-learn's encoders (pre-0.20 API):
# def encode_categorical(array):
#     if not array.dtype == np.dtype('float64'):
#         return LabelEncoder().fit_transform(array)
#     else:
#         return array

# # Categorical columns for use in one-hot encoder
# categorical = (df_data.dtypes.values != np.dtype('float64'))

# # Encode all labels
# data = df_data.apply(encode_categorical)

# # Get numpy array from data
# x = data.values[:, :-1]
# y = data.values[:, -1]

# # Apply one-hot encoding
# encoder = OneHotEncoder(categorical_features=categorical[:-1], sparse=False)  # Last value in mask is y
# x = encoder.fit_transform(x)
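
For reference, a runnable sketch of the same one-hot encoding with the current scikit-learn API (ColumnTransformer replaced the categorical_features mask in 0.20); it reuses the column lists defined above and was not part of the original run:

In [ ]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns; pass the continuous ones through.
# sparse_threshold=0.0 forces a dense array, matching pd.get_dummies above.
encoder = ColumnTransformer(
    [('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough',
    sparse_threshold=0.0,
)
x_alt = encoder.fit_transform(df_data)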

In [11]:
plt.hist(y, bins=10)  # plt.hist passes its arguments to np.histogram
plt.axvline(np.mean(y), color='black', linestyle='-', lw=6, label='Mean Interest rate')
plt.axvline(np.mean(y) - np.std(y), color='black', linestyle='--', lw=2, label='Std')
plt.axvline(np.mean(y) + np.std(y), color='black', linestyle='--', lw=2)
plt.title("Histogram of Interest Rates, Mean of {:0.2f}%".format(np.mean(y)))
plt.legend()
plt.show()



In [12]:
%%bash
say 'Done'  # macOS text-to-speech: audible signal that the cells above finished

In [13]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

Ridge

I tried LASSO as well, but it took too long to train.

Regularization doesn't seem to improve the error.

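For the record, the abandoned LASSO attempt might have looked like the sketch below; alpha and the subsample size are my assumptions, not values from the original run (fitting on a subsample is one way to keep the runtime in check on ~600k rows):

In [ ]:
from sklearn.linear_model import Lasso

# Hypothetical sketch of the LASSO run that proved too slow; alpha=0.1 and the
# 50k-row subsample are illustrative choices, not the original settings.
lasso = Lasso(alpha=0.1)
lasso.fit(X_train[:50000], y_train[:50000])
print(lasso.score(X_test, y_test))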

In [63]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score

scores_ridge = list()
scores_std_ridge = list()

ridge = Ridge()

alphas = np.logspace(-6, 6, 20)

# Cross-validate the model over a range of regularization strengths
for a in alphas:
    ridge.set_params(alpha=a)
    this_scores = cross_val_score(ridge, X_train, y_train, cv=3, n_jobs=1)
    scores_ridge.append(np.mean(this_scores))
    scores_std_ridge.append(np.std(this_scores))

In [24]:
scores_ridge, scores_std_ridge = np.array(scores_ridge), np.array(scores_std_ridge)

plt.figure().set_size_inches(8, 6)
plt.semilogx(alphas, scores_ridge)

# plot error lines showing +/- std. errors of the scores
std_error = scores_std_ridge / np.sqrt(3)

plt.semilogx(alphas, scores_ridge + std_error, 'b--')
plt.semilogx(alphas, scores_ridge - std_error, 'b--')

# alpha=0.2 controls the translucency of the fill color
plt.fill_between(alphas, scores_ridge + scores_std_ridge, scores_ridge - scores_std_ridge, alpha=0.2)

plt.ylabel('CV score +/- std error')
plt.xlabel('alpha')
plt.title('Ridge Regression')
plt.axhline(np.max(scores_ridge), linestyle='--', color='.5')
plt.xlim([alphas[0], alphas[-1]])


Out[24]:
(9.9999999999999995e-07, 1000000.0)
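
As an alternative to the manual loop above, RidgeCV searches an alpha grid internally; a minimal sketch over the same grid (cv=3 to match the loop):

In [ ]:
from sklearn.linear_model import RidgeCV

# Let RidgeCV pick the regularization strength over the same grid
ridge_cv = RidgeCV(alphas=np.logspace(-6, 6, 20), cv=3)
ridge_cv.fit(X_train, y_train)
print('Best alpha: {}'.format(ridge_cv.alpha_))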

Linear Regression


In [59]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

linreg = LinearRegression()

linreg.fit(X_train, y_train)

prediction = linreg.predict(X_test)

print(linreg.score(X_test, y_test))

print(mean_squared_error(prediction, y_test))


0.54677017964
8.72374023156

In [58]:
print('Intercept: {:0.2f}'.format(linreg.intercept_))


Intercept: 17.70

In [57]:
print('Coefficients')
pd.DataFrame.from_dict(dict(zip(data.columns, linreg.coef_)), orient='index').T


Coefficients
Out[57]:
funded_amnt_inv emp_length_n/a purpose_wedding emp_length_7 years purpose_house purpose_medical verification_status_Verified application_type_JOINT revol_bal purpose_other ... purpose_small_business purpose_major_purchase purpose_vacation emp_length_2 years pymnt_plan_y purpose_credit_card emp_length_1 year emp_length_8 years emp_length_10+ years purpose_debt_consolidation
0 0.000042 0.014402 -0.108279 0.05202 0.67065 0.718338 0.657788 0.12279 -0.000006 0.788298 ... 0.619513 -1.061428 1.252798 0.041809 0.00997 -1.628181 0.078514 -0.056309 -0.095276 -0.798157

1 rows × 43 columns


In [67]:
# statsmodels is already imported as sm above. Note that sm.OLS does not add
# an intercept by default; here the one-hot dummy groups sum to a constant
# column, so an intercept is implicit (and is also the source of the
# multicollinearity warning in the summary below).
model = sm.OLS(y_train, X_train)
results = model.fit()
print(results.summary())


                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.550
Model:                            OLS   Adj. R-squared:                  0.550
Method:                 Least Squares   F-statistic:                 2.016e+04
Date:                Sat, 26 Nov 2016   Prob (F-statistic):               0.00
Time:                        23:57:22   Log-Likelihood:            -1.4845e+06
No. Observations:              594543   AIC:                         2.969e+06
Df Residuals:                  594506   BIC:                         2.969e+06
Df Model:                          36                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
x1            -0.0003    1.2e-05    -27.602      0.000        -0.000    -0.000
x2            -0.0013   1.47e-05    -87.921      0.000        -0.001    -0.001
x3          4.193e-05   7.31e-06      5.738      0.000      2.76e-05  5.62e-05
x4             0.0506   8.77e-05    577.087      0.000         0.050     0.051
x5             0.0128      0.000     51.279      0.000         0.012     0.013
x6         -5.843e-06   1.85e-07    -31.519      0.000     -6.21e-06 -5.48e-06
x7             3.4348      0.129     26.647      0.000         3.182     3.687
x8             3.6804      0.148     24.893      0.000         3.391     3.970
x9             3.9347      0.113     34.844      0.000         3.713     4.156
x10            3.1805      0.113     28.160      0.000         2.959     3.402
x11           -1.1885      0.043    -27.491      0.000        -1.273    -1.104
x12           -1.1200      0.026    -42.971      0.000        -1.171    -1.069
x13           -0.2899      0.025    -11.427      0.000        -0.340    -0.240
x14            0.1445      0.165      0.874      0.382        -0.179     0.468
x15           -0.5281      0.029    -18.283      0.000        -0.585    -0.471
x16            1.1789      0.060     19.662      0.000         1.061     1.296
x17           -0.5532      0.036    -15.574      0.000        -0.623    -0.484
x18            1.2266      0.044     28.042      0.000         1.141     1.312
x19            2.2993      0.052     44.559      0.000         2.198     2.400
x20            1.2965      0.030     43.857      0.000         1.239     1.354
x21            1.3603      0.144      9.476      0.000         1.079     1.642
x22            1.1277      0.041     27.316      0.000         1.047     1.209
x23            1.7610      0.055     32.167      0.000         1.654     1.868
x24            0.3999      0.073      5.471      0.000         0.257     0.543
x25            3.5476      0.444      7.989      0.000         2.677     4.418
x26            3.5675      0.667      5.347      0.000         2.260     4.875
x27            1.8489      0.075     24.502      0.000         1.701     1.997
x28            2.2367      0.075     29.647      0.000         2.089     2.385
x29            3.0295      0.075     40.143      0.000         2.882     3.177
x30            0.6714      0.024     28.319      0.000         0.625     0.718
x31            0.4977      0.020     24.541      0.000         0.458     0.537
x32            0.6347      0.023     28.142      0.000         0.591     0.679
x33            0.6021      0.023     26.220      0.000         0.557     0.647
x34            0.5683      0.024     23.623      0.000         0.521     0.616
x35            0.5537      0.024     23.236      0.000         0.507     0.600
x36            0.6305      0.025     25.243      0.000         0.582     0.679
x37            0.6449      0.025     26.031      0.000         0.596     0.694
x38            0.5366      0.025     21.546      0.000         0.488     0.585
x39            0.5361      0.026     20.483      0.000         0.485     0.587
x40            0.6317      0.023     27.596      0.000         0.587     0.677
x41            0.6073      0.025     24.232      0.000         0.558     0.656
x42           -2.5029      0.113    -22.131      0.000        -2.725    -2.281
x43            9.6180      0.113     84.929      0.000         9.396     9.840
==============================================================================
Omnibus:                    78283.091   Durbin-Watson:                   1.999
Prob(Omnibus):                  0.000   Jarque-Bera (JB):           894843.584
Skew:                           0.198   Prob(JB):                         0.00
Kurtosis:                       8.997   Cond. No.                     1.10e+16
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 6.7e-18. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.
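
One way to address that warning (a sketch, not something tried in this run): drop one level per dummy group so the design matrix has full rank, then add an explicit intercept column for statsmodels.

In [ ]:
# Sketch: avoid the dummy-variable trap by dropping one level per categorical,
# then add an explicit intercept column before fitting OLS.
data_fr = pd.get_dummies(df_data, drop_first=True)
X_fr = sm.add_constant(data_fr.values.astype(float))
print(sm.OLS(y, X_fr).fit().summary())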

Baseline


In [62]:
# Baseline: always predict the mean interest rate. A constant predictor has an
# R^2 of approximately 0 by construction; the original r2_score call here had
# (y_true, y_pred) swapped, which reported a meaningless R^2 of ~ -3.8e29.
base_line = [np.mean(y)] * len(y_test)
print('Mean squared error: {}'.format(mean_squared_error(y_test, base_line)))


Mean squared error: 19.24796500083821
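
scikit-learn packages this mean-prediction baseline as DummyRegressor; a minimal equivalent sketch:

In [ ]:
from sklearn.dummy import DummyRegressor

# Always predict the training-set mean
dummy = DummyRegressor(strategy='mean')
dummy.fit(X_train, y_train)
print(dummy.score(X_test, y_test))  # R^2 of a constant predictor, ~0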

AdaBoost


In [77]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor

ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=2),
                        n_estimators=300, random_state=0)

ada.fit(X_train, y_train)


Out[77]:
AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best'),
         learning_rate=1.0, loss='linear', n_estimators=300,
         random_state=0)

In [78]:
ada.score(X_test, y_test)


Out[78]:
0.23232151180906613

In [79]:
print('Feature Importances')
pd.DataFrame.from_dict(dict(zip(data.columns, ada.feature_importances_)), orient='index')


Feature Importances
Out[79]:
0
funded_amnt_inv 0.000000
emp_length_n/a 0.000000
purpose_wedding 0.000000
emp_length_7 years 0.000000
purpose_house 0.000000
purpose_medical 0.000000
verification_status_Verified 0.071393
application_type_JOINT 0.000000
revol_bal 0.023052
purpose_other 0.014412
application_type_INDIVIDUAL 0.000000
emp_length_9 years 0.000000
purpose_car 0.000000
emp_length_6 years 0.000000
emp_length_5 years 0.000000
installment 0.076559
initial_list_status_w 0.000000
purpose_home_improvement 0.000000
initial_list_status_f 0.037111
loan_amnt 0.043715
emp_length_3 years 0.000000
dti 0.019227
verification_status_Not Verified 0.060669
term_ 60 months 0.550924
funded_amnt 0.000000
emp_length_4 years 0.000000
verification_status_Source Verified 0.000000
purpose_moving 0.000000
pymnt_plan_n 0.000000
purpose_educational 0.000000
term_ 36 months 0.000000
emp_length_< 1 year 0.000000
purpose_renewable_energy 0.000000
purpose_small_business 0.009503
purpose_major_purchase 0.000000
purpose_vacation 0.000000
emp_length_2 years 0.000000
pymnt_plan_y 0.000000
purpose_credit_card 0.093433
emp_length_1 year 0.000000
emp_length_8 years 0.000000
emp_length_10+ years 0.000000
purpose_debt_consolidation 0.000000
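
The unsorted listing above is easier to read ranked; a small sketch that sorts the same importances (term_ 60 months dominates at ~0.55):

In [ ]:
# Rank the AdaBoost feature importances, largest first
importances = pd.Series(ada.feature_importances_, index=data.columns)
importances.sort_values(ascending=False).head(10)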

In [ ]: