Let us build a Regression Model for predicting the amount to be approved
In [1]:
#Load the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
#Default Variables
%matplotlib inline
plt.rcParams['figure.figsize'] = (16,9)
plt.rcParams['font.size'] = 18
plt.style.use('fivethirtyeight')
pd.set_option('display.float_format', lambda x: '%.2f' % x)
In [3]:
#Load the dataset
df = pd.read_csv("data/loan_data_clean.csv")
In [4]:
df.head()
Out[4]:
In [10]:
# Columns used as the initial set of predictors
feature_columns = ['age', 'grade', 'years', 'ownership', 'income']
X_raw = df[feature_columns]
In [9]:
# Convert the categorical variables into numerical values
from sklearn.preprocessing import OneHotEncoder
In [11]:
# Build the feature matrix X by one-hot encoding X_raw.
# pd.get_dummies expands object/category columns into indicator columns and
# passes the remaining columns through unchanged.
X = pd.get_dummies(data=X_raw)
In [14]:
# Target: the loan amount multiplied by (1 - default), i.e. the amount is
# zeroed out wherever `default` equals 1.
# (assumes `default` is a 0/1 flag — TODO confirm against the data)
not_defaulted = 1 - df['default']
y = df['amount'] * not_defaulted
In [15]:
y.head()
Out[15]:
In [16]:
# import the sklearn linear model
from sklearn.linear_model import LinearRegression
In [20]:
# Initiate the Linear Regression model.
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so passing it raises a TypeError on current versions. Predictions of an
# unregularised least-squares fit do not depend on feature scaling, so the
# argument is simply dropped.
model_ols = LinearRegression()
In [21]:
# Review the parameters in the Linear Regression
model_ols
Out[21]:
In [22]:
# Fit the model on the full feature set X and target y
model_ols.fit(X,y)
Out[22]:
In [25]:
# What are the coefficients of the fitted model (one per column of X)
model_ols.coef_
Out[25]:
In [26]:
# What is the intercept of the fitted model
model_ols.intercept_
Out[26]:
In [29]:
# predict the y
y_pred = model_ols.predict(X)
In [30]:
# Mean squared error computed by hand:
# sum of squared residuals divided by the number of samples
squared_errors = (y_pred - y) ** 2
np.sum(squared_errors) / y.shape[0]
Out[30]:
In [31]:
from sklearn.metrics import mean_squared_error
In [32]:
# Calculate mean squared error.
# scikit-learn metrics take their arguments as (y_true, y_pred). MSE is
# symmetric, so the value is unchanged, but keeping the documented order
# avoids wrong results if the metric is later swapped for an asymmetric one.
mean_squared_error(y, y_pred)
Out[32]:
In [33]:
# Root mean square error computed by hand: square root of the mean squared residual
residuals = y_pred - y
np.sqrt(np.sum(residuals ** 2) / y.shape[0])
Out[33]:
In [45]:
# Compare the distribution of the true target with that of the predictions.
# Both histograms share one set of bin edges; with independent `bins=50` each
# series gets its own edges and the bar heights are not directly comparable.
fig, ax = plt.subplots()
value_min = min(y.min(), y_pred.min())
value_max = max(y.max(), y_pred.max())
shared_bins = np.linspace(value_min, value_max, 51)  # 51 edges -> 50 bins
ax.hist(y, bins=shared_bins, alpha=0.4, label="true")
ax.hist(y_pred, bins=shared_bins, alpha=0.4, label="pred")
ax.set_xlabel("target value")
ax.set_ylabel("count")
ax.legend()
plt.show()
In [46]:
# Predicted (x) vs. true (y) target.
# Points on the dashed diagonal are perfect predictions.
fig, ax = plt.subplots()
ax.scatter(y_pred, y)
diagonal = [min(y.min(), y_pred.min()), max(y.max(), y_pred.max())]
ax.plot(diagonal, diagonal, linestyle="--", color="black", linewidth=1)
ax.set_xlabel("predicted")
ax.set_ylabel("true")
plt.show()
In [53]:
# What is the score given by the model?
# For LinearRegression, score() returns the coefficient of determination (R^2);
# it is evaluated here on the same X, y the model was fitted on.
model_ols.score(X, y)
Out[53]:
In [56]:
# What is the root mean square error
# (arguments passed in scikit-learn's documented (y_true, y_pred) order)
np.sqrt(mean_squared_error(y, y_pred))
Out[56]:
In [55]:
# How does rmse compare with standard deviation of the target
y.std()
Out[55]:
In [57]:
# Get the module for train test split
from sklearn.model_selection import train_test_split
In [59]:
train_test_split?
In [61]:
# Split the data into training (80%) and test (20%) sets.
# train_test_split shuffles the rows randomly; fixing random_state makes the
# split, and every error computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
In [63]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape
Out[63]:
In [64]:
# Initiate the model.
# `normalize` was removed from LinearRegression in scikit-learn 1.2 (passing it
# raises a TypeError); plain least-squares predictions do not depend on feature
# scaling, so the argument is dropped.
model_general = LinearRegression()
In [65]:
#Fit the model
model_general.fit(X_train, y_train)
Out[65]:
In [66]:
# Make predictions for test and train
y_pred_general_train = model_general.predict(X_train)
y_pred_general_test = model_general.predict(X_test)
In [68]:
# Find the errors for train and test.
# Arguments follow scikit-learn's documented (y_true, y_pred) order.
mse_general_train = mean_squared_error(y_train, y_pred_general_train)
mse_general_test = mean_squared_error(y_test, y_pred_general_test)
In [70]:
# Find the generalisation error: test error minus train error (positive when
# the model does worse on unseen data), shown alongside both errors. This is
# the same sign convention the polynomial model uses further down.
mse_general_test - mse_general_train, mse_general_train, mse_general_test
Out[70]:
In [71]:
# Import Polynomial Features
from sklearn.preprocessing import PolynomialFeatures
In [78]:
# Initiate polynomial features of degree 2.
# interaction_only=True keeps only products of distinct features (no squared
# terms); include_bias=False leaves out the constant column.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
In [79]:
# Create the polynomial (interaction) features from the full feature set X
X_poly = poly.fit_transform(X)
In [80]:
# See the new dataset
X.shape, X_poly.shape
Out[80]:
In [81]:
# Split the polynomial features into training (80%) and test (20%) sets.
# train_test_split shuffles the rows randomly; fixing random_state makes the
# split, and the errors computed from it, reproducible across runs.
X_train_poly, X_test_poly, y_train_poly, y_test_poly = train_test_split(
    X_poly, y, test_size=0.2, random_state=42
)
In [82]:
# Initiate the model.
# `normalize` was removed from LinearRegression in scikit-learn 1.2 (passing it
# raises a TypeError); plain least-squares predictions do not depend on feature
# scaling, so the argument is dropped.
model_poly = LinearRegression()
In [85]:
# Fit the model
model_poly.fit(X_train_poly, y_train_poly)
Out[85]:
In [86]:
# Make predictions for test and train
y_pred_poly_train = model_poly.predict(X_train_poly)
y_pred_poly_test = model_poly.predict(X_test_poly)
In [89]:
# Find the errors for train and test.
# Arguments follow scikit-learn's documented (y_true, y_pred) order.
mse_poly_train = mean_squared_error(y_train_poly, y_pred_poly_train)
mse_poly_test = mean_squared_error(y_test_poly, y_pred_poly_test)
In [90]:
# Find the generalisation error
mse_poly_test - mse_poly_train, mse_poly_train, mse_poly_test
Out[90]:
In [91]:
# Baseline linear model for comparison, using the same sign convention as the
# polynomial model above: generalisation error = test error - train error.
mse_general_test - mse_general_train, mse_general_train, mse_general_test
Out[91]:
For Discussion
In [39]:
# Get ridge regression from sklearn.linear_model
In [40]:
# Initiate model
In [41]:
# Fit the model
In [42]:
# Make predictions for test and train
In [43]:
#Find the errors for test and train
In [ ]:
In [44]:
# Find the generalisation error
Finding alpha using Cross Validation
In [45]:
# Get ridge regression from sklearn.linear_model
In [46]:
# Initiate model with alphas = 0.1, 0.001, 0.0001
In [47]:
# Fit the model
In [48]:
# Find the correct alpha
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: