In [37]:
# Import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [38]:
# Importing the dataset
dataset = pd.read_csv('datasets/50_Startups.csv')
In [39]:
dataset.head()
Out[39]:
In [40]:
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 4].values
In [41]:
X
Out[41]:
In [42]:
Y
Out[42]:
In [43]:
# There is a categorical independent variable, State (column index 3),
# so we one-hot encode it.
# Note: OneHotEncoder's categorical_features argument was removed in
# scikit-learn 0.22; ColumnTransformer is the current way to encode
# a single column while passing the rest through.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer([('state', OneHotEncoder(), [3])], remainder = 'passthrough')
X = np.array(ct.fit_transform(X), dtype = float)
# The State dummy columns now come first, followed by the remaining features
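An equivalent alternative, for reference, is to one-hot encode directly on the DataFrame with pandas (the pipeline below continues with the X built above):
In [ ]:
# pandas route: one-hot encode the State column of the feature frame
X_df = pd.get_dummies(dataset.iloc[:, :-1], columns = ['State'], dtype = float)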
In [44]:
X
Out[44]:
In [45]:
# Avoiding the dummy variable trap: drop the first dummy column (California)
X = X[:, 1:]
# Keeping all three State dummies alongside an intercept would make the
# columns linearly dependent (perfect multicollinearity)
X
Out[45]:
In [46]:
# Split the dataset into train and test
# sklearn.cross_validation was removed; train_test_split now lives in model_selection
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
In [47]:
# We don't need feature scaling for multiple linear regression;
# the library takes care of that.
Fitting multiple linear regression to the training set
In [48]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,Y_train)
Out[48]:
In [49]:
# Predicting the test set results
Y_pred = regressor.predict(X_test)
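As a quick sanity check (an added cell, not part of the original notebook), the test-set predictions can be scored with sklearn's r2_score:
In [ ]:
# Coefficient of determination on the held-out test set
from sklearn.metrics import r2_score
print(r2_score(Y_test, Y_pred))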
Previously, to build the multiple regression model, we used all the independent variables. Some of these are highly statistically significant and some are not; backward elimination lets us keep only the significant ones.
In [50]:
# In current statsmodels, OLS is exposed by statsmodels.api (not formula.api)
import statsmodels.api as sm
In [51]:
# The multiple linear regression equation is
# y = b0 + b1*x1 + b2*x2 + ... + bn*xn
# statsmodels' OLS does not add the intercept b0 automatically, so we treat
# b0 as b0*x0 with x0 = 1 and prepend a column vector of 1's to X
X = np.append(arr = np.ones((50, 1)).astype(int), values = X, axis = 1)
# np.append concatenates along the given axis; axis=1 adds columns
# Passing np.ones as the first argument keeps the column of 1's first
# X has 50 rows, so the ones column is shaped (50, 1)
# astype(int) casts the 1.0's to int; without a cast you may hit a type error
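Note: statsmodels ships a helper that does the same thing. A minimal equivalent (shown on a copy so it does not double-add the intercept created by np.append above):
In [ ]:
# sm.add_constant prepends a column of 1.0's as the intercept term;
# X[:, 1:] strips the ones column added above, so this reproduces X
X_with_const = sm.add_constant(X[:, 1:].astype(float))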
In [52]:
X[1]
Out[52]:
In [55]:
# Backward elimination steps
# Step 1: Select a significance level to stay in the model (e.g. SL = 0.05)
X_opt = X[:, [0,1,2,3,4,5]]
# Step 2: Fit the full model with all possible predictors
regressor_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
In [56]:
# Step 3: Consider the predictor with the highest p-value.
# If P > SL, go to Step 4; otherwise finish (the model is final)
regressor_OLS.summary()
Out[56]:
In [58]:
# The p-value of x2 is the highest, so we remove x2
# Step 4: Remove the predictor whose p-value is the highest and exceeds SL = 0.05
# Here that means removing the column at index 2
# Step 5: Fit the model without this variable
X_opt = X[:, [0,1,3,4,5]]
regressor_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary()
Out[58]:
In [59]:
# Here x1 has the highest p-value, so we remove the column at index 1
X_opt = X[:, [0,3,4,5]]
regressor_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary()
Out[59]:
In [60]:
# Here x2 has the highest p-value, so we remove the column at index 4
X_opt = X[:, [0,3,5]]
regressor_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary()
Out[60]:
In [61]:
# Here x2 has the highest p-value, so we remove the column at index 5
X_opt = X[:, [0,3]]
regressor_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary()
Out[61]:
In [62]:
X[1]
Out[62]:
In [63]:
# Conclusion: the R&D Spend column has the strongest impact on profit
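The manual elimination loop above can be automated. A minimal sketch (assuming the X with the intercept column and the Y vector from above; the helper name is ours, not from the original notebook):
In [ ]:
# Automated backward elimination on p-values with SL = 0.05.
# Note: this simple version could also drop the intercept column
# if its p-value ever happened to be the highest.
def backward_elimination(X, y, sl = 0.05):
    X_opt = X.astype(float)
    while True:
        model = sm.OLS(endog = y, exog = X_opt).fit()
        if model.pvalues.max() <= sl:
            return X_opt, model
        # Drop the predictor with the highest p-value and refit
        X_opt = np.delete(X_opt, int(np.argmax(model.pvalues)), axis = 1)

X_opt, final_model = backward_elimination(X, Y)
final_model.summary()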