In [1]:
import os
import sqlite3

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn_pandas import DataFrameMapper, cross_val_score

%matplotlib inline
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (16.0, 6.0)
plt.rcParams['legend.markerscale'] = 3
matplotlib.rcParams['font.size'] = 16.0
In [2]:
DIR = os.getcwd() + "/../data/"
df = pd.read_csv(DIR + 'raw/loan.csv', low_memory=False)
df.head()
Out[2]:
In [3]:
# Columns with no missing values (kept for reference, not used below):
# complete_cols = [column for column in df.columns if df[column].isnull().sum() == 0]
# complete_cols
In [4]:
categorical_cols = [
    'application_type', 'initial_list_status',
    'purpose', 'pymnt_plan', 'verification_status',
    'emp_length', 'term'
]
continuous_cols = [
    'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'installment',
    'dti', 'revol_bal'
]
y_col = ['int_rate']
In [5]:
df_data = df[categorical_cols + continuous_cols].copy()  # copy to avoid SettingWithCopyWarning below
In [6]:
# Convert the continuous columns to floating point
for feature_name in continuous_cols:
    df_data[feature_name] = df_data[feature_name].astype(float)
In [7]:
# One-hot encode the categorical columns; continuous columns pass through unchanged
data = pd.get_dummies(df_data)
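As a quick illustration of what `pd.get_dummies` is doing here, a toy example (the frame below is made up for illustration and is not used elsewhere in the notebook):

In [ ]:
# Each level of a categorical column becomes its own 0/1 indicator column,
# while numeric columns pass through unchanged.
toy = pd.DataFrame({'term': [' 36 months', ' 60 months', ' 36 months'],
                    'loan_amnt': [5000.0, 2400.0, 10000.0]})
pd.get_dummies(toy)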
In [8]:
data.tail(3)
Out[8]:
In [9]:
x = data.values
y = df[y_col].values[:, -1]
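A quick shape check (purely a sanity check, nothing downstream depends on it):

In [ ]:
# Feature matrix and target vector should have the same number of rows
x.shape, y.shape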
In [10]:
# def encode_categorical(array):
#     if not array.dtype == np.dtype('float64'):
#         return LabelEncoder().fit_transform(array)
#     else:
#         return array
# # Categorical columns for use in one-hot encoder
# categorical = (df_data.dtypes.values != np.dtype('float64'))
# # Encode all labels
# data = df_data.apply(encode_categorical)
# # Get numpy array from data
# x = data.values[:, :-1]
# y = data.values[:, -1]
# # Apply one-hot encoding
# encoder = OneHotEncoder(categorical_features=categorical[:-1], sparse=False)  # Last value in mask is y
# x = encoder.fit_transform(x)
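For comparison with the legacy cell above: `categorical_features` was removed from `OneHotEncoder` in later scikit-learn releases, so here is a rough modern equivalent. It assumes scikit-learn >= 0.20 (which provides `ColumnTransformer`) and is not used in the rest of the notebook.

In [ ]:
# Same one-hot encoding via a ColumnTransformer (alternative to pd.get_dummies).
# Note: unlike get_dummies, OneHotEncoder does not silently skip missing values,
# so NaNs in the categorical columns would need imputing first.
from sklearn.compose import ColumnTransformer
encoder = ColumnTransformer(
    [('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough')
x_alt = encoder.fit_transform(df_data)
x_alt.shape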
In [11]:
plt.hist(y, bins=10)  # plt.hist passes its arguments through to np.histogram
plt.axvline(np.mean(y), color='black', linestyle='-', lw=6, label='Mean interest rate')
plt.axvline(np.mean(y) - np.std(y), color='black', linestyle='--', lw=2, label='Std')
plt.axvline(np.mean(y) + np.std(y), color='black', linestyle='--', lw=2)
plt.title("Histogram of Interest Rates, Mean of {:0.2f}%".format(np.mean(y)))
plt.legend()
plt.show()
In [12]:
%%bash
# macOS text-to-speech notification that the long-running cells above are done
say 'Done'
In [13]:
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in 0.20
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)
In [63]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold, cross_val_score  # moved out of sklearn.cross_validation in 0.20
scores_ridge = list()
scores_std_ridge = list()
ridge = Ridge()
alphas = np.logspace(-6, 6, 20)
# Train the model with different regularisation strengths
for a in alphas:
    ridge.set_params(alpha=a)
    this_scores = cross_val_score(ridge, X_train, y_train, cv=3, n_jobs=1)
    scores_ridge.append(np.mean(this_scores))
    scores_std_ridge.append(np.std(this_scores))
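The scan above only records the mean CV score per alpha; a short follow-up sketch (the names `best_alpha` and `ridge_best` are new here) picks the strongest setting and refits it on the full training set:

In [ ]:
# Refit the ridge model at the alpha with the best mean CV score
best_alpha = alphas[int(np.argmax(scores_ridge))]
ridge_best = Ridge(alpha=best_alpha).fit(X_train, y_train)
print('Best alpha: {:0.4g}'.format(best_alpha))
print('Held-out R^2: {:0.4f}'.format(ridge_best.score(X_test, y_test)))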
In [24]:
scores_ridge, scores_std_ridge = np.array(scores_ridge), np.array(scores_std_ridge)
plt.figure().set_size_inches(8, 6)
plt.semilogx(alphas, scores_ridge)
# Plot error lines showing +/- std. errors of the scores
std_error = scores_std_ridge / np.sqrt(3)  # standard error of the mean across the 3 CV folds
plt.semilogx(alphas, scores_ridge + std_error, 'b--')
plt.semilogx(alphas, scores_ridge - std_error, 'b--')
# alpha=0.2 controls the translucency of the fill color
plt.fill_between(alphas, scores_ridge + scores_std_ridge, scores_ridge - scores_std_ridge, alpha=0.2)
plt.ylabel('CV score +/- std error')
plt.xlabel('alpha')
plt.title('Ridge Regression')
plt.axhline(np.max(scores_ridge), linestyle='--', color='.5')
plt.xlim([alphas[0], alphas[-1]])
Out[24]:
In [59]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
linreg = LinearRegression()
linreg.fit(X_train, y_train)
prediction = linreg.predict(X_test)
print('R^2: {:0.4f}'.format(linreg.score(X_test, y_test)))
print('Mean squared error: {:0.4f}'.format(mean_squared_error(y_test, prediction)))
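Since `int_rate` is measured in percentage points, the root of the MSE reads directly as a typical prediction error; a one-line sketch:

In [ ]:
# RMSE is in the same units as the target (percentage points of interest rate)
print('RMSE: {:0.2f} percentage points'.format(np.sqrt(mean_squared_error(y_test, prediction))))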
In [58]:
print('Intercept: {:0.2f}'.format(linreg.intercept_))
In [57]:
print('Coefficients')
pd.DataFrame.from_dict(dict(zip(data.columns, linreg.coef_)), orient='index').T
Out[57]:
In [67]:
# statsmodels' OLS does not add an intercept by default, so add a constant
# column to make the fit comparable with LinearRegression above
model = sm.OLS(y_train, sm.add_constant(X_train))
results = model.fit()
print(results.summary())
In [62]:
# Baseline: always predict the overall mean interest rate (note r2_score expects y_true first)
base_line = [np.mean(y)] * len(y_test)
print('R^2: {}'.format(r2_score(y_test, base_line)))
print('Mean squared error: {}'.format(mean_squared_error(y_test, base_line)))
In [77]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=2),
n_estimators=300, random_state=0)
ada.fit(X_train, y_train)
Out[77]:
In [78]:
ada.score(X_test, y_test)
Out[78]:
In [79]:
print('Feature Importances')
pd.DataFrame.from_dict(dict(zip(data.columns, ada.feature_importances_)), orient='index')
Out[79]:
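The raw importance dict above is unordered; sorting it makes the top drivers easier to read (a convenience sketch, not part of the original analysis):

In [ ]:
# Top ten features by AdaBoost importance, largest first
pd.Series(ada.feature_importances_, index=data.columns).sort_values(ascending=False).head(10)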