Let us build a regression model for predicting the loan amount to be approved.
In [2]:
#Load the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [3]:
#Default Variables
%matplotlib inline
plt.rcParams['figure.figsize'] = (16,9)
plt.rcParams['font.size'] = 18
plt.style.use('fivethirtyeight')
pd.set_option('display.float_format', lambda x: '%.2f' % x)
In [4]:
#Load the dataset
df = pd.read_csv("data/loan_data_clean.csv")
In [5]:
df.head()
Out[5]:
In [8]:
# Select the initial feature set
df_X = df[['age', 'income', 'ownership', 'years', 'grade']]
In [11]:
# Convert the categorical variables into numerical (one-hot) values
df_X = pd.get_dummies(df_X)
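To see what get_dummies produced, we can peek at the encoded columns; assuming ownership and grade are the categorical columns here, each of their categories becomes a separate 0/1 indicator.
In [ ]:
# Inspect the one-hot encoded feature columns
df_X.columns.tolist()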
In [12]:
# Create the feature set X
X = df_X
In [54]:
# Create the target from amount and default: keep the amount for
# non-defaulting loans and zero it out for defaults
df['amount_non_default'] = df['amount'] * (1 - df['default'])
In [55]:
y = df['amount_non_default']
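As a quick sanity check on the target construction (a minimal sketch), the new column should equal the amount for non-defaulting loans and zero for defaults.
In [ ]:
# Sanity check: amount kept for non-defaults, zeroed for defaults
df[['amount', 'default', 'amount_non_default']].head()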
In [125]:
# import the sklearn linear model
from sklearn.linear_model import LinearRegression
In [126]:
# Initiate the Linear Regression model
# (the `normalize` argument was removed in scikit-learn 1.2; for plain OLS
# it does not change the predictions, so it is simply dropped here)
model_ols = LinearRegression()
In [127]:
# Review the parameters in the Linear Regression
model_ols
Out[127]:
In [128]:
# Fit the model on the full feature set
model_ols.fit(X,y)
Out[128]:
In [129]:
# What are the coefficients of the model
model_ols.coef_
Out[129]:
In [130]:
# What is the intercept of the model
model_ols.intercept_
Out[130]:
In [131]:
# predict the y
y_pred_ols = model_ols.predict(X)
In [132]:
# import metrics from sklearn
from sklearn import metrics
In [133]:
# Calculate the mean squared error
metrics.mean_squared_error(y, y_pred_ols)
Out[133]:
In [134]:
# What is the R² score (coefficient of determination) of the model
model_ols.score(X,y)
Out[134]:
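The score reported by LinearRegression is the coefficient of determination R². As a quick cross-check (a minimal sketch), it should match metrics.r2_score computed on the same predictions.
In [ ]:
# R² from the metrics module should agree with model_ols.score(X, y)
metrics.r2_score(y, y_pred_ols)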
In [135]:
# What is the root mean squared error
np.sqrt(metrics.mean_squared_error(y, y_pred_ols))
Out[135]:
In [136]:
# How does the RMSE compare with the standard deviation of the target?
df.amount_non_default.std()
Out[136]:
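A convenient single number is the ratio of RMSE to the target's standard deviation: a ratio near 1 means the model barely improves on always predicting the mean. A minimal sketch:
In [ ]:
# RMSE as a fraction of the target's standard deviation
np.sqrt(metrics.mean_squared_error(y, y_pred_ols)) / df.amount_non_default.std()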
In [137]:
# Get the module for train test split
from sklearn.model_selection import train_test_split
In [138]:
# Split the data into training (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [139]:
#Initiate the model
model_ols_split = LinearRegression()
In [140]:
#Fit the model
model_ols_split.fit(X_train, y_train)
Out[140]:
In [142]:
# Make predictions for test and train
y_pred_split_train = model_ols_split.predict(X_train)
y_pred_split_test = model_ols_split.predict(X_test)
In [145]:
# Find the errors for test and train
error_ols_split_train = metrics.mean_squared_error(y_train, y_pred_split_train)
error_ols_split_test = metrics.mean_squared_error(y_test, y_pred_split_test)
In [148]:
error_ols_split_train, error_ols_split_test
Out[148]:
In [147]:
# Find the generalisation gap (test error minus train error)
generalisation_gap = error_ols_split_test - error_ols_split_train
generalisation_gap
Out[147]:
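A single train/test split can be noisy. For a more stable estimate of the test error, k-fold cross-validation averages over several splits; a minimal sketch using scikit-learn's cross_val_score with 5 folds:
In [ ]:
# 5-fold cross-validated MSE for the plain OLS model
from sklearn.model_selection import cross_val_score
cv_mse = -cross_val_score(LinearRegression(), X, y,
                          scoring='neg_mean_squared_error', cv=5)
cv_mse.mean()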
In [150]:
# Import Polynomial Features
from sklearn.preprocessing import PolynomialFeatures
In [246]:
# Initiate PolynomialFeatures for degree-2 interaction terms
# (interaction_only=True skips the squared terms)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
In [247]:
# Create the polynomial features
X_poly = poly.fit_transform(X)
In [248]:
# Inspect the shape of the expanded feature set
X_poly.shape
Out[248]:
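To see exactly which interaction terms were generated, the fitted transformer can report its output feature names (get_feature_names_out is available from scikit-learn 1.0; here it is passed the dummy-encoded column names).
In [ ]:
# Names of the first few generated interaction features
poly.get_feature_names_out(df_X.columns)[:10]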
In [309]:
# Split the polynomial features into train and test sets
X_poly_train, X_poly_test, y_poly_train, y_poly_test = train_test_split(
X_poly, y, test_size=0.2, random_state=42)
In [324]:
# Initiate the model (again without the removed `normalize` argument)
model_ols_poly = LinearRegression()
In [325]:
# Fit the model
model_ols_poly.fit(X_poly_train, y_poly_train)
Out[325]:
In [326]:
# Make predictions for test and train
y_pred_poly_train = model_ols_poly.predict(X_poly_train)
y_pred_poly_test = model_ols_poly.predict(X_poly_test)
In [327]:
# Find the errors for test and train
error_ols_poly_train = metrics.mean_squared_error(y_poly_train, y_pred_poly_train)
error_ols_poly_test = metrics.mean_squared_error(y_poly_test, y_pred_poly_test)
In [328]:
error_ols_poly_train, error_ols_poly_test
Out[328]:
In [329]:
# Find the generalisation gap (test error minus train error)
generalisation_poly_gap = error_ols_poly_test - error_ols_poly_train
generalisation_poly_gap
Out[329]:
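Before discussing the results, it helps to see both models side by side; a minimal sketch building a small comparison table from the errors computed above:
In [ ]:
# Compare train/test MSE for the plain and polynomial OLS models
pd.DataFrame({'train_mse': [error_ols_split_train, error_ols_poly_train],
              'test_mse': [error_ols_split_test, error_ols_poly_test]},
             index=['ols', 'ols_poly'])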
For Discussion
In [358]:
# Get ridge regression from linear_model, plus helpers for feature scaling
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
In [396]:
# Initiate the model; `normalize=True` was removed in scikit-learn 1.2,
# so the features are standardised explicitly in a pipeline instead
model_ridge = make_pipeline(StandardScaler(), Ridge(alpha=0.1))
In [397]:
# Fit the model
model_ridge.fit(X_poly_train, y_poly_train)
Out[397]:
In [398]:
# Make predictions for test and train
y_pred_ridge_train = model_ridge.predict(X_poly_train)
y_pred_ridge_test = model_ridge.predict(X_poly_test)
In [399]:
# Find the errors for test and train
error_ridge_train = metrics.mean_squared_error(y_poly_train, y_pred_ridge_train)
error_ridge_test = metrics.mean_squared_error(y_poly_test, y_pred_ridge_test)
In [400]:
error_ridge_train, error_ridge_test
Out[400]:
In [401]:
# Find the generalisation gap (test error minus train error)
generalisation_ridge_gap = error_ridge_test - error_ridge_train
generalisation_ridge_gap
Out[401]:
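The choice of alpha above was arbitrary. To see how the penalty strength trades off train and test error, we can sweep a few values (a minimal sketch; the alpha grid is illustrative):
In [ ]:
# Test MSE as a function of the ridge penalty strength
alphas = [0.0001, 0.001, 0.01, 0.1, 1.0]
test_errors = []
for a in alphas:
    m = make_pipeline(StandardScaler(), Ridge(alpha=a))
    m.fit(X_poly_train, y_poly_train)
    test_errors.append(metrics.mean_squared_error(
        y_poly_test, m.predict(X_poly_test)))
plt.plot(alphas, test_errors, marker='o')
plt.xscale('log')
plt.xlabel('alpha')
plt.ylabel('test MSE');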
Finding alpha using Cross Validation
In [402]:
# Get cross-validated ridge regression from linear_model
from sklearn.linear_model import RidgeCV
In [403]:
# Initiate model with alphas = 0.1, 0.001, 0.0001
# (scaling again handled explicitly in a pipeline)
model_ridge_CV = make_pipeline(StandardScaler(),
                               RidgeCV(alphas=[0.1, 0.001, 0.0001]))
In [404]:
# Fit the model
model_ridge_CV.fit(X_poly_train, y_poly_train)
Out[404]:
In [405]:
# Find the chosen alpha (stored on the RidgeCV step of the pipeline)
model_ridge_CV[-1].alpha_
Out[405]:
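With alpha chosen by cross-validation, a natural final step (a minimal sketch) is to evaluate the refitted model on the held-out test set:
In [ ]:
# Test MSE of the cross-validated ridge model
metrics.mean_squared_error(y_poly_test, model_ridge_CV.predict(X_poly_test))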