In [37]:
# Import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [38]:
# Importing the dataset
dataset = pd.read_csv('datasets/50_Startups.csv')
In [39]:
dataset.head()
Out[39]:
In [40]:
X = dataset.iloc[:, :-1].values
Y = dataset.iloc[:, 4].values
In [41]:
X
Out[41]:
In [42]:
Y
Out[42]:
In [43]:
# There is a categorical independent variable, State (column index 3),
# so we one-hot encode it.
# Note: OneHotEncoder's categorical_features argument was removed in
# scikit-learn 0.22; ColumnTransformer is the current way to encode
# a single column while passing the rest through.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer([('state', OneHotEncoder(), [3])], remainder = 'passthrough')
X = np.array(ct.fit_transform(X), dtype = float)
# The State dummy columns now come first, followed by the remaining features
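An equivalent alternative, for reference, is to one-hot encode directly on the DataFrame with pandas (the pipeline below continues with the X built above):
In [ ]:
# pandas route: one-hot encode the State column of the feature frame
X_df = pd.get_dummies(dataset.iloc[:, :-1], columns = ['State'], dtype = float)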
In [44]:
X
Out[44]:
In [45]:
# Avoiding the dummy variable trap: drop the first dummy column (California)
X = X[:, 1:]
# Keeping all three State dummies alongside an intercept would make the
# columns linearly dependent (perfect multicollinearity)
X
Out[45]:
In [46]:
# Split the dataset into train and test
# sklearn.cross_validation was removed; train_test_split now lives in model_selection
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
In [47]:
# We don't need feature scaling for multiple linear regression;
# the library takes care of that.
Fitting multiple linear regression to the training set
In [48]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,Y_train)
Out[48]:
In [49]:
# Predicting the test set results
Y_pred = regressor.predict(X_test)
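As a quick sanity check (an added cell, not part of the original notebook), the test-set predictions can be scored with sklearn's r2_score:
In [ ]:
# Coefficient of determination on the held-out test set
from sklearn.metrics import r2_score
print(r2_score(Y_test, Y_pred))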
Previously, to build the multiple regression model, we used all the independent variables. Some of these are highly statistically significant and some are not; backward elimination lets us keep only the significant ones.
In [50]:
# In current statsmodels, OLS is exposed by statsmodels.api (not formula.api)
import statsmodels.api as sm
In [51]:
# The multiple linear regression equation is
# y = b0 + b1*x1 + b2*x2 + ... + bn*xn
# statsmodels' OLS does not add the intercept b0 automatically, so we treat
# b0 as b0*x0 with x0 = 1 and prepend a column vector of 1's to X
X = np.append(arr = np.ones((50, 1)).astype(int), values = X, axis = 1)
# np.append concatenates along the given axis; axis=1 adds columns
# Passing np.ones as the first argument keeps the column of 1's first
# X has 50 rows, so the ones column is shaped (50, 1)
# astype(int) casts the 1.0's to int; without a cast you may hit a type error
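Note: statsmodels ships a helper that does the same thing. A minimal equivalent (shown on a copy so it does not double-add the intercept created by np.append above):
In [ ]:
# sm.add_constant prepends a column of 1.0's as the intercept term;
# X[:, 1:] strips the ones column added above, so this reproduces X
X_with_const = sm.add_constant(X[:, 1:].astype(float))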
In [52]:
X[1]
Out[52]:
In [55]:
# Backward elimination steps
# Step 1: Select a significance level to stay in the model (e.g. SL = 0.05)
X_opt = X[:, [0,1,2,3,4,5]]
# Step 2: Fit the full model with all possible predictors
regressor_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
In [56]:
# Step 3: Consider the predictor with the highest p-value.
# If P > SL, go to Step 4; otherwise finish (the model is final)
regressor_OLS.summary()
Out[56]:
In [58]:
# The p-value of x2 is the highest, so we remove x2
# Step 4: Remove the predictor whose p-value is the highest and exceeds SL = 0.05
# Here that means removing the column at index 2
# Step 5: Fit the model without this variable
X_opt = X[:, [0,1,3,4,5]]
regressor_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary()
Out[58]:
In [59]:
# Here x1 has the highest p-value, so we remove the column at index 1
X_opt = X[:, [0,3,4,5]]
regressor_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary()
Out[59]:
In [60]:
# Here x2 has the highest p-value, so we remove the column at index 4
X_opt = X[:, [0,3,5]]
regressor_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary()
Out[60]:
In [61]:
# Here x2 has the highest p-value, so we remove the column at index 5
X_opt = X[:, [0,3]]
regressor_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
regressor_OLS.summary()
Out[61]:
In [62]:
X[1]
Out[62]:
In [63]:
# Conclusion: the R&D Spend column has the strongest impact on profit
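The manual elimination loop above can be automated. A minimal sketch (assuming the X with the intercept column and the Y vector from above; the helper name is ours, not from the original notebook):
In [ ]:
# Automated backward elimination on p-values with SL = 0.05.
# Note: this simple version could also drop the intercept column
# if its p-value ever happened to be the highest.
def backward_elimination(X, y, sl = 0.05):
    X_opt = X.astype(float)
    while True:
        model = sm.OLS(endog = y, exog = X_opt).fit()
        if model.pvalues.max() <= sl:
            return X_opt, model
        # Drop the predictor with the highest p-value and refit
        X_opt = np.delete(X_opt, int(np.argmax(model.pvalues)), axis = 1)

X_opt, final_model = backward_elimination(X, Y)
final_model.summary()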