Multiple linear regression


In [37]:
# Import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [38]:
# Import the dataset
dataset = pd.read_csv('datasets/50_Startups.csv')

In [39]:
dataset.head()


Out[39]:
   R&D Spend  Administration  Marketing Spend       State     Profit
0  165349.20       136897.80        471784.10    New York  192261.83
1  162597.70       151377.59        443898.53  California  191792.06
2  153441.51       101145.55        407934.54     Florida  191050.39
3  144372.41       118671.85        383199.62    New York  182901.99
4  142107.34        91391.77        366168.42     Florida  166187.94

In [40]:
X = dataset.iloc[:, :-1].values  # independent variables: all columns except Profit
Y = dataset.iloc[:, 4].values    # dependent variable: Profit (column index 4)

In [41]:
X


Out[41]:
array([[165349.2, 136897.8, 471784.1, 'New York'],
       [162597.7, 151377.59, 443898.53, 'California'],
       [153441.51, 101145.55, 407934.54, 'Florida'],
       [144372.41, 118671.85, 383199.62, 'New York'],
       [142107.34, 91391.77, 366168.42, 'Florida'],
       [131876.9, 99814.71, 362861.36, 'New York'],
       [134615.46, 147198.87, 127716.82, 'California'],
       [130298.13, 145530.06, 323876.68, 'Florida'],
       [120542.52, 148718.95, 311613.29, 'New York'],
       [123334.88, 108679.17, 304981.62, 'California'],
       [101913.08, 110594.11, 229160.95, 'Florida'],
       [100671.96, 91790.61, 249744.55, 'California'],
       [93863.75, 127320.38, 249839.44, 'Florida'],
       [91992.39, 135495.07, 252664.93, 'California'],
       [119943.24, 156547.42, 256512.92, 'Florida'],
       [114523.61, 122616.84, 261776.23, 'New York'],
       [78013.11, 121597.55, 264346.06, 'California'],
       [94657.16, 145077.58, 282574.31, 'New York'],
       [91749.16, 114175.79, 294919.57, 'Florida'],
       [86419.7, 153514.11, 0.0, 'New York'],
       [76253.86, 113867.3, 298664.47, 'California'],
       [78389.47, 153773.43, 299737.29, 'New York'],
       [73994.56, 122782.75, 303319.26, 'Florida'],
       [67532.53, 105751.03, 304768.73, 'Florida'],
       [77044.01, 99281.34, 140574.81, 'New York'],
       [64664.71, 139553.16, 137962.62, 'California'],
       [75328.87, 144135.98, 134050.07, 'Florida'],
       [72107.6, 127864.55, 353183.81, 'New York'],
       [66051.52, 182645.56, 118148.2, 'Florida'],
       [65605.48, 153032.06, 107138.38, 'New York'],
       [61994.48, 115641.28, 91131.24, 'Florida'],
       [61136.38, 152701.92, 88218.23, 'New York'],
       [63408.86, 129219.61, 46085.25, 'California'],
       [55493.95, 103057.49, 214634.81, 'Florida'],
       [46426.07, 157693.92, 210797.67, 'California'],
       [46014.02, 85047.44, 205517.64, 'New York'],
       [28663.76, 127056.21, 201126.82, 'Florida'],
       [44069.95, 51283.14, 197029.42, 'California'],
       [20229.59, 65947.93, 185265.1, 'New York'],
       [38558.51, 82982.09, 174999.3, 'California'],
       [28754.33, 118546.05, 172795.67, 'California'],
       [27892.92, 84710.77, 164470.71, 'Florida'],
       [23640.93, 96189.63, 148001.11, 'California'],
       [15505.73, 127382.3, 35534.17, 'New York'],
       [22177.74, 154806.14, 28334.72, 'California'],
       [1000.23, 124153.04, 1903.93, 'New York'],
       [1315.46, 115816.21, 297114.46, 'Florida'],
       [0.0, 135426.92, 0.0, 'California'],
       [542.05, 51743.15, 0.0, 'New York'],
       [0.0, 116983.8, 45173.06, 'California']], dtype=object)

In [42]:
Y


Out[42]:
array([ 192261.83,  191792.06,  191050.39,  182901.99,  166187.94,
        156991.12,  156122.51,  155752.6 ,  152211.77,  149759.96,
        146121.95,  144259.4 ,  141585.52,  134307.35,  132602.65,
        129917.04,  126992.93,  125370.37,  124266.9 ,  122776.86,
        118474.03,  111313.02,  110352.25,  108733.99,  108552.04,
        107404.34,  105733.54,  105008.31,  103282.38,  101004.64,
         99937.59,   97483.56,   97427.84,   96778.92,   96712.8 ,
         96479.51,   90708.19,   89949.14,   81229.06,   81005.76,
         78239.91,   77798.83,   71498.49,   69758.98,   65200.33,
         64926.08,   49490.75,   42559.73,   35673.41,   14681.4 ])

In [43]:
# State is a categorical independent variable, so we one-hot encode it.
# (Note: categorical_features was removed in scikit-learn 0.22; see the
# ColumnTransformer sketch below for newer versions.)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
labelencoder_X = LabelEncoder()
X[:, 3] = labelencoder_X.fit_transform(X[:, 3])  # State strings -> integer labels
onehotencoder = OneHotEncoder(categorical_features = [3])
X = onehotencoder.fit_transform(X).toarray()
# Column index 3 holds the categorical variable
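
For newer scikit-learn releases (>= 0.20), where OneHotEncoder accepts strings directly and the LabelEncoder step is unnecessary, a minimal sketch of the same encoding looks like this:

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(
    [('state', OneHotEncoder(), [3])],  # one-hot encode column 3 (State)
    remainder='passthrough')            # pass the numeric columns through unchanged
X_encoded = ct.fit_transform(dataset.iloc[:, :-1].values)
# As with the old API, the dummy columns come first, then the numeric columns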

In [44]:
X


Out[44]:
array([[  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          1.65349200e+05,   1.36897800e+05,   4.71784100e+05],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.62597700e+05,   1.51377590e+05,   4.43898530e+05],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          1.53441510e+05,   1.01145550e+05,   4.07934540e+05],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          1.44372410e+05,   1.18671850e+05,   3.83199620e+05],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          1.42107340e+05,   9.13917700e+04,   3.66168420e+05],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          1.31876900e+05,   9.98147100e+04,   3.62861360e+05],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.34615460e+05,   1.47198870e+05,   1.27716820e+05],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          1.30298130e+05,   1.45530060e+05,   3.23876680e+05],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          1.20542520e+05,   1.48718950e+05,   3.11613290e+05],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.23334880e+05,   1.08679170e+05,   3.04981620e+05],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          1.01913080e+05,   1.10594110e+05,   2.29160950e+05],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.00671960e+05,   9.17906100e+04,   2.49744550e+05],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          9.38637500e+04,   1.27320380e+05,   2.49839440e+05],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          9.19923900e+04,   1.35495070e+05,   2.52664930e+05],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          1.19943240e+05,   1.56547420e+05,   2.56512920e+05],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          1.14523610e+05,   1.22616840e+05,   2.61776230e+05],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          7.80131100e+04,   1.21597550e+05,   2.64346060e+05],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          9.46571600e+04,   1.45077580e+05,   2.82574310e+05],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          9.17491600e+04,   1.14175790e+05,   2.94919570e+05],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          8.64197000e+04,   1.53514110e+05,   0.00000000e+00],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          7.62538600e+04,   1.13867300e+05,   2.98664470e+05],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          7.83894700e+04,   1.53773430e+05,   2.99737290e+05],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          7.39945600e+04,   1.22782750e+05,   3.03319260e+05],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          6.75325300e+04,   1.05751030e+05,   3.04768730e+05],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          7.70440100e+04,   9.92813400e+04,   1.40574810e+05],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          6.46647100e+04,   1.39553160e+05,   1.37962620e+05],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          7.53288700e+04,   1.44135980e+05,   1.34050070e+05],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          7.21076000e+04,   1.27864550e+05,   3.53183810e+05],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          6.60515200e+04,   1.82645560e+05,   1.18148200e+05],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          6.56054800e+04,   1.53032060e+05,   1.07138380e+05],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          6.19944800e+04,   1.15641280e+05,   9.11312400e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          6.11363800e+04,   1.52701920e+05,   8.82182300e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          6.34088600e+04,   1.29219610e+05,   4.60852500e+04],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          5.54939500e+04,   1.03057490e+05,   2.14634810e+05],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.64260700e+04,   1.57693920e+05,   2.10797670e+05],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          4.60140200e+04,   8.50474400e+04,   2.05517640e+05],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          2.86637600e+04,   1.27056210e+05,   2.01126820e+05],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.40699500e+04,   5.12831400e+04,   1.97029420e+05],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          2.02295900e+04,   6.59479300e+04,   1.85265100e+05],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          3.85585100e+04,   8.29820900e+04,   1.74999300e+05],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          2.87543300e+04,   1.18546050e+05,   1.72795670e+05],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          2.78929200e+04,   8.47107700e+04,   1.64470710e+05],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          2.36409300e+04,   9.61896300e+04,   1.48001110e+05],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          1.55057300e+04,   1.27382300e+05,   3.55341700e+04],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          2.21777400e+04,   1.54806140e+05,   2.83347200e+04],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          1.00023000e+03,   1.24153040e+05,   1.90393000e+03],
       [  0.00000000e+00,   1.00000000e+00,   0.00000000e+00,
          1.31546000e+03,   1.15816210e+05,   2.97114460e+05],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   1.35426920e+05,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   1.00000000e+00,
          5.42050000e+02,   5.17431500e+04,   0.00000000e+00],
       [  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   1.16983800e+05,   4.51730600e+04]])

In [45]:
# Avoiding the dummy variable trap
X = X[:, 1:]
# Drop the first dummy column (California); with an intercept in the model,
# keeping all three state dummies would cause perfect multicollinearity
X


Out[45]:
array([[  0.00000000e+00,   1.00000000e+00,   1.65349200e+05,
          1.36897800e+05,   4.71784100e+05],
       [  0.00000000e+00,   0.00000000e+00,   1.62597700e+05,
          1.51377590e+05,   4.43898530e+05],
       [  1.00000000e+00,   0.00000000e+00,   1.53441510e+05,
          1.01145550e+05,   4.07934540e+05],
       [  0.00000000e+00,   1.00000000e+00,   1.44372410e+05,
          1.18671850e+05,   3.83199620e+05],
       [  1.00000000e+00,   0.00000000e+00,   1.42107340e+05,
          9.13917700e+04,   3.66168420e+05],
       [  0.00000000e+00,   1.00000000e+00,   1.31876900e+05,
          9.98147100e+04,   3.62861360e+05],
       [  0.00000000e+00,   0.00000000e+00,   1.34615460e+05,
          1.47198870e+05,   1.27716820e+05],
       [  1.00000000e+00,   0.00000000e+00,   1.30298130e+05,
          1.45530060e+05,   3.23876680e+05],
       [  0.00000000e+00,   1.00000000e+00,   1.20542520e+05,
          1.48718950e+05,   3.11613290e+05],
       [  0.00000000e+00,   0.00000000e+00,   1.23334880e+05,
          1.08679170e+05,   3.04981620e+05],
       [  1.00000000e+00,   0.00000000e+00,   1.01913080e+05,
          1.10594110e+05,   2.29160950e+05],
       [  0.00000000e+00,   0.00000000e+00,   1.00671960e+05,
          9.17906100e+04,   2.49744550e+05],
       [  1.00000000e+00,   0.00000000e+00,   9.38637500e+04,
          1.27320380e+05,   2.49839440e+05],
       [  0.00000000e+00,   0.00000000e+00,   9.19923900e+04,
          1.35495070e+05,   2.52664930e+05],
       [  1.00000000e+00,   0.00000000e+00,   1.19943240e+05,
          1.56547420e+05,   2.56512920e+05],
       [  0.00000000e+00,   1.00000000e+00,   1.14523610e+05,
          1.22616840e+05,   2.61776230e+05],
       [  0.00000000e+00,   0.00000000e+00,   7.80131100e+04,
          1.21597550e+05,   2.64346060e+05],
       [  0.00000000e+00,   1.00000000e+00,   9.46571600e+04,
          1.45077580e+05,   2.82574310e+05],
       [  1.00000000e+00,   0.00000000e+00,   9.17491600e+04,
          1.14175790e+05,   2.94919570e+05],
       [  0.00000000e+00,   1.00000000e+00,   8.64197000e+04,
          1.53514110e+05,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   7.62538600e+04,
          1.13867300e+05,   2.98664470e+05],
       [  0.00000000e+00,   1.00000000e+00,   7.83894700e+04,
          1.53773430e+05,   2.99737290e+05],
       [  1.00000000e+00,   0.00000000e+00,   7.39945600e+04,
          1.22782750e+05,   3.03319260e+05],
       [  1.00000000e+00,   0.00000000e+00,   6.75325300e+04,
          1.05751030e+05,   3.04768730e+05],
       [  0.00000000e+00,   1.00000000e+00,   7.70440100e+04,
          9.92813400e+04,   1.40574810e+05],
       [  0.00000000e+00,   0.00000000e+00,   6.46647100e+04,
          1.39553160e+05,   1.37962620e+05],
       [  1.00000000e+00,   0.00000000e+00,   7.53288700e+04,
          1.44135980e+05,   1.34050070e+05],
       [  0.00000000e+00,   1.00000000e+00,   7.21076000e+04,
          1.27864550e+05,   3.53183810e+05],
       [  1.00000000e+00,   0.00000000e+00,   6.60515200e+04,
          1.82645560e+05,   1.18148200e+05],
       [  0.00000000e+00,   1.00000000e+00,   6.56054800e+04,
          1.53032060e+05,   1.07138380e+05],
       [  1.00000000e+00,   0.00000000e+00,   6.19944800e+04,
          1.15641280e+05,   9.11312400e+04],
       [  0.00000000e+00,   1.00000000e+00,   6.11363800e+04,
          1.52701920e+05,   8.82182300e+04],
       [  0.00000000e+00,   0.00000000e+00,   6.34088600e+04,
          1.29219610e+05,   4.60852500e+04],
       [  1.00000000e+00,   0.00000000e+00,   5.54939500e+04,
          1.03057490e+05,   2.14634810e+05],
       [  0.00000000e+00,   0.00000000e+00,   4.64260700e+04,
          1.57693920e+05,   2.10797670e+05],
       [  0.00000000e+00,   1.00000000e+00,   4.60140200e+04,
          8.50474400e+04,   2.05517640e+05],
       [  1.00000000e+00,   0.00000000e+00,   2.86637600e+04,
          1.27056210e+05,   2.01126820e+05],
       [  0.00000000e+00,   0.00000000e+00,   4.40699500e+04,
          5.12831400e+04,   1.97029420e+05],
       [  0.00000000e+00,   1.00000000e+00,   2.02295900e+04,
          6.59479300e+04,   1.85265100e+05],
       [  0.00000000e+00,   0.00000000e+00,   3.85585100e+04,
          8.29820900e+04,   1.74999300e+05],
       [  0.00000000e+00,   0.00000000e+00,   2.87543300e+04,
          1.18546050e+05,   1.72795670e+05],
       [  1.00000000e+00,   0.00000000e+00,   2.78929200e+04,
          8.47107700e+04,   1.64470710e+05],
       [  0.00000000e+00,   0.00000000e+00,   2.36409300e+04,
          9.61896300e+04,   1.48001110e+05],
       [  0.00000000e+00,   1.00000000e+00,   1.55057300e+04,
          1.27382300e+05,   3.55341700e+04],
       [  0.00000000e+00,   0.00000000e+00,   2.21777400e+04,
          1.54806140e+05,   2.83347200e+04],
       [  0.00000000e+00,   1.00000000e+00,   1.00023000e+03,
          1.24153040e+05,   1.90393000e+03],
       [  1.00000000e+00,   0.00000000e+00,   1.31546000e+03,
          1.15816210e+05,   2.97114460e+05],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.35426920e+05,   0.00000000e+00],
       [  0.00000000e+00,   1.00000000e+00,   5.42050000e+02,
          5.17431500e+04,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.16983800e+05,   4.51730600e+04]])
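
With scikit-learn >= 0.21 the trap can also be avoided at encoding time; a sketch reusing the ColumnTransformer setup suggested above:

ct = ColumnTransformer(
    [('state', OneHotEncoder(drop='first'), [3])],  # drop one dummy per feature
    remainder='passthrough')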

In [46]:
# Split the dataset into training and test sets
from sklearn.model_selection import train_test_split  # was sklearn.cross_validation in older releases
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

In [47]:
# We don't need feature scaling for multiple linear regression;
# the library takes care of that.

Fitting multiple linear regression to the training set


In [48]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,Y_train)


Out[48]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
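
The fitted intercept and coefficients can be inspected directly, for example:

print(regressor.intercept_)  # b0
print(regressor.coef_)       # one coefficient per column of X_train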

In [49]:
# Predicting the test set results
Y_pred = regressor.predict(X_test)
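
A quick check of fit quality on the held-out set (a sketch, not part of the original run):

from sklearn.metrics import r2_score
print(r2_score(Y_test, Y_pred))  # coefficient of determination on the test set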

Building the optimal model using backward elimination

Previously, we built the multiple regression model using all the independent variables. Some of these variables are highly statistically significant and some are not, so we now remove the insignificant ones one at a time.


In [50]:
import statsmodels.api as sm  # newer releases no longer expose OLS via statsmodels.formula.api

In [51]:
# The multiple linear regression equation is
# y = b0 + b1*x1 + b2*x2 + ... + bn*xn
# statsmodels' OLS does not add the intercept b0 automatically: it treats
# b0 as b0*x0 with x0 = 1, so we prepend a column of 1's to X
X = np.append(arr = np.ones((50, 1)).astype(int), values = X, axis = 1)
# np.append with axis=1 concatenates columns
# np.ones((50, 1)) is passed first so the column of 1's becomes the first
# column, with X (50 rows) appended after it
# astype(int) converts the 1's to int; without it you can get a type error
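
Instead of the np.append call, statsmodels' own helper achieves the same thing:

# X = sm.add_constant(X)  # equivalent: prepends a column of 1's for the intercept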

In [52]:
X[1]


Out[52]:
array([  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.62597700e+05,   1.51377590e+05,   4.43898530e+05])

In [55]:
# Backward elimination steps
# Step 1: Select a significance level to stay in the model (e.g. SL = 0.05)
X_opt = X[:, [0,1,2,3,4,5]]

# Step 2: Fit the full model with all possible predictors
regressor_OLS = sm.OLS(endog = Y, exog = X_opt).fit()

In [56]:
# Step 3: Consider the predictor with the highest p-value.
# If P > SL, go to Step 4; otherwise finish (the model is final)
regressor_OLS.summary()


Out[56]:
OLS Regression Results
Dep. Variable: y R-squared: 0.951
Model: OLS Adj. R-squared: 0.945
Method: Least Squares F-statistic: 169.9
Date: Sat, 24 Dec 2016 Prob (F-statistic): 1.34e-27
Time: 04:19:23 Log-Likelihood: -525.38
No. Observations: 50 AIC: 1063.
Df Residuals: 44 BIC: 1074.
Df Model: 5
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 5.013e+04 6884.820 7.281 0.000 3.62e+04 6.4e+04
x1 198.7888 3371.007 0.059 0.953 -6595.030 6992.607
x2 -41.8870 3256.039 -0.013 0.990 -6604.003 6520.229
x3 0.8060 0.046 17.369 0.000 0.712 0.900
x4 -0.0270 0.052 -0.517 0.608 -0.132 0.078
x5 0.0270 0.017 1.574 0.123 -0.008 0.062
Omnibus: 14.782 Durbin-Watson: 1.283
Prob(Omnibus): 0.001 Jarque-Bera (JB): 21.266
Skew: -0.948 Prob(JB): 2.41e-05
Kurtosis: 5.572 Cond. No. 1.45e+06

In [58]:
# x2 has the highest p-value (0.990 > SL), so we remove it
# Step 4: Remove the predictor whose p-value is highest and above SL = 0.05
# Here that means removing column index 2
# Step 5: Fit the model without this variable
X_opt = X[:, [0,1,3,4,5]]
regressor_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary()


Out[58]:
OLS Regression Results
Dep. Variable: y R-squared: 0.951
Model: OLS Adj. R-squared: 0.946
Method: Least Squares F-statistic: 217.2
Date: Sat, 24 Dec 2016 Prob (F-statistic): 8.49e-29
Time: 04:29:35 Log-Likelihood: -525.38
No. Observations: 50 AIC: 1061.
Df Residuals: 45 BIC: 1070.
Df Model: 4
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 5.011e+04 6647.870 7.537 0.000 3.67e+04 6.35e+04
x1 220.1585 2900.536 0.076 0.940 -5621.821 6062.138
x2 0.8060 0.046 17.606 0.000 0.714 0.898
x3 -0.0270 0.052 -0.523 0.604 -0.131 0.077
x4 0.0270 0.017 1.592 0.118 -0.007 0.061
Omnibus: 14.758 Durbin-Watson: 1.282
Prob(Omnibus): 0.001 Jarque-Bera (JB): 21.172
Skew: -0.948 Prob(JB): 2.53e-05
Kurtosis: 5.563 Cond. No. 1.40e+06

In [59]:
# Now x1 has the highest p-value (0.940), so we remove column index 1
X_opt = X[:, [0,3,4,5]]
regressor_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary()


Out[59]:
OLS Regression Results
Dep. Variable: y R-squared: 0.951
Model: OLS Adj. R-squared: 0.948
Method: Least Squares F-statistic: 296.0
Date: Sat, 24 Dec 2016 Prob (F-statistic): 4.53e-30
Time: 04:31:50 Log-Likelihood: -525.39
No. Observations: 50 AIC: 1059.
Df Residuals: 46 BIC: 1066.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 5.012e+04 6572.353 7.626 0.000 3.69e+04 6.34e+04
x1 0.8057 0.045 17.846 0.000 0.715 0.897
x2 -0.0268 0.051 -0.526 0.602 -0.130 0.076
x3 0.0272 0.016 1.655 0.105 -0.006 0.060
Omnibus: 14.838 Durbin-Watson: 1.282
Prob(Omnibus): 0.001 Jarque-Bera (JB): 21.442
Skew: -0.949 Prob(JB): 2.21e-05
Kurtosis: 5.586 Cond. No. 1.40e+06

In [60]:
# Now x2 has the highest p-value (0.602), so we remove column index 4
X_opt = X[:, [0,3,5]]
regressor_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary()


Out[60]:
OLS Regression Results
Dep. Variable: y R-squared: 0.950
Model: OLS Adj. R-squared: 0.948
Method: Least Squares F-statistic: 450.8
Date: Sat, 24 Dec 2016 Prob (F-statistic): 2.16e-31
Time: 04:33:22 Log-Likelihood: -525.54
No. Observations: 50 AIC: 1057.
Df Residuals: 47 BIC: 1063.
Df Model: 2
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 4.698e+04 2689.933 17.464 0.000 4.16e+04 5.24e+04
x1 0.7966 0.041 19.266 0.000 0.713 0.880
x2 0.0299 0.016 1.927 0.060 -0.001 0.061
Omnibus: 14.677 Durbin-Watson: 1.257
Prob(Omnibus): 0.001 Jarque-Bera (JB): 21.161
Skew: -0.939 Prob(JB): 2.54e-05
Kurtosis: 5.575 Cond. No. 5.32e+05

In [61]:
# Now x2 has the highest p-value (0.060 > SL), so we remove column index 5
X_opt = X[:, [0,3]]
regressor_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary()


Out[61]:
OLS Regression Results
Dep. Variable: y R-squared: 0.947
Model: OLS Adj. R-squared: 0.945
Method: Least Squares F-statistic: 849.8
Date: Sat, 24 Dec 2016 Prob (F-statistic): 3.50e-32
Time: 04:43:23 Log-Likelihood: -527.44
No. Observations: 50 AIC: 1059.
Df Residuals: 48 BIC: 1063.
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [95.0% Conf. Int.]
const 4.903e+04 2537.897 19.320 0.000 4.39e+04 5.41e+04
x1 0.8543 0.029 29.151 0.000 0.795 0.913
Omnibus: 13.727 Durbin-Watson: 1.116
Prob(Omnibus): 0.001 Jarque-Bera (JB): 18.536
Skew: -0.911 Prob(JB): 9.44e-05
Kurtosis: 5.361 Cond. No. 1.65e+05

In [62]:
X[1]


Out[62]:
array([  1.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         1.62597700e+05,   1.51377590e+05,   4.43898530e+05])

In [63]:
# So R&D Spend has the greatest impact on Profit; from Out[61] the final
# fitted model is Profit = 4.903e+04 + 0.8543 * (R&D Spend)
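
The manual elimination loop above can be automated. A minimal sketch, assuming X (with the leading column of 1's), Y, np, and sm as defined earlier:

def backward_elimination(X, y, sl = 0.05):
    # Repeatedly drop the predictor with the highest p-value until every
    # remaining p-value is at or below the significance level
    cols = list(range(X.shape[1]))
    while len(cols) > 1:
        model = sm.OLS(endog = y, exog = X[:, cols]).fit()
        pvals = model.pvalues[1:]            # skip the intercept column
        if pvals.max() <= sl:
            return cols, model
        del cols[int(np.argmax(pvals)) + 1]  # drop the worst predictor
    return cols, sm.OLS(endog = y, exog = X[:, cols]).fit()

cols, final_model = backward_elimination(X, Y)
# cols -> [0, 3]: the intercept plus R&D Spend, matching the manual steps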