In [12]:
import numpy as np
import pandas as pd
#Packages for checking assumptions
import matplotlib.pyplot as plt
%matplotlib inline
from pandas.plotting import scatter_matrix
import seaborn as sns
# Global plot styling, applied in a single call:
#   * seaborn's default theme with fonts scaled up by 1.5x
#   * a default figure size of 12x8 inches, passed through `rc` so it is applied
#     as the last step of the theme call.
# NOTE(review): the earlier form assigned plt.rcParams['figure.figsize'] *before*
# calling sns.set(), which only works if the theme leaves that key alone; older
# seaborn releases are believed to have set it - confirm against the installed
# version. Passing the size via `rc` does not depend on that.
sns.set(font_scale=1.5, rc={'figure.figsize': (12, 8)})
#Packages for performing LR
from scipy import stats as stats
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
import statsmodels.api as sm
#Set jupyter notebook preferences
# Setting ast_node_interactivity to "all" asks IPython to display the value of every
# top-level expression in a cell, rather than only the last one, so a cell holding
# several bare expressions shows all of their outputs.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# pandas display preferences, set in one call as (option, value) pairs:
#   - show up to 100 columns so wide frames stay fully visible
#   - allow wide frame reprs to wrap across several lines
pd.set_option(
    'display.max_columns', 100,
    'expand_frame_repr', True,
)
In [13]:
# Location of the input file: a directory prefix (with trailing slash) and a file name.
path = 'data/'
filename = 'loans.csv'
# Read the CSV at <path><filename> into a DataFrame.
df = pd.read_csv(f'{path}{filename}')
In [14]:
# Preview the first five rows of the loaded data.
df.head(n=5)
Out[14]:
In [15]:
# List every column name, to see which variables are available for modelling.
list(df.columns)
Out[15]:
In [16]:
# Disables pandas' SettingWithCopyWarning for the rest of the session.
# Kept so that any later cell sees the same session state as before; this cell
# no longer needs it, because X is built without writing into a selection of df.
pd.options.mode.chained_assignment = None # default='warn'

# Dependent variable: the column the regression tries to explain.
y_column = 'loan_amount'
y = df[y_column]

# Independent variables, plus a constant 'intercept' column of ones. The constant
# column is what lets OLS estimate the constant term of the regression equation.
# .assign() returns a new frame with the extra column appended last, instead of
# assigning into a frame that was selected out of df.
x_columns = ['lender_count']
X = df[x_columns].assign(intercept=1)

# Hold out 20% of the rows as a test set; the fixed random_state keeps the split
# the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
In [17]:
# Ordinary least squares via statsmodels: first argument is the dependent variable
# (endog), second is the design matrix (exog) for the training rows.
model = sm.OLS(y_train, X_train)
# Estimate the coefficients, then print the fitted model's summary.
model_fit = model.fit()
print(model_fit.summary())
In [18]:
# Fitted coefficient of the constant 'intercept' column.
model_fit.params['intercept']
Out[18]:
In [19]:
# Fitted coefficient of 'lender_count'.
model_fit.params['lender_count']
Out[19]:
In [20]:
# Predictions of the fitted model for the held-out test rows.
y_pred_test = model_fit.predict(exog=X_test)
In [21]:
# Predicted (x-axis) versus true (y-axis) values for the test set, as a scatter plot.
plt.figure(figsize=(9, 7))
axes = plt.gca()
axes.scatter(y_pred_test, y_test, alpha=0.5, c='r')
# Title, axis labels and fixed view limits; points outside the limits are not shown.
axes.set(
    title='predicted vs true for test data',
    xlabel='predicted loan amounts',
    ylabel='true loan amounts',
    xlim=(0, 11000),
    ylim=(0, 10000),
)
plt.show();