In [155]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [156]:
np.set_printoptions(precision=3, suppress=True)

Importing a dataset


In [157]:
dataset = pd.read_csv('50_Startups.csv')

In [158]:
# Features: every column except the last; target: the last column
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

Encoding categorical data


In [159]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [160]:
# Encode the categorical column at index 3 as integer labels
labelencoder = LabelEncoder()
X[:, 3] = labelencoder.fit_transform(X[:, 3])

In [161]:
# One-hot encode column 3 into dummy variables
# (categorical_features is the pre-0.20 scikit-learn API; see the note below for newer versions)
onehotencoder = OneHotEncoder(categorical_features=[3])
X = onehotencoder.fit_transform(X).toarray()
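
Note: the categorical_features argument was deprecated in scikit-learn 0.20 and removed in 0.22, so the two cells above fail on a current install. A rough equivalent using ColumnTransformer, assuming the categorical column is still at index 3, is sketched below; it also removes the need for the LabelEncoder step, since OneHotEncoder handles string categories directly.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 3 and pass the remaining numeric columns through unchanged;
# the dummy columns come first in the output, matching the ordering used below
ct = ColumnTransformer([('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = ct.fit_transform(X)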

Avoiding the Dummy Variable Trap


In [162]:
# Drop the first dummy column to avoid the dummy variable trap
X = X[:, 1:]
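
Because the dummy columns produced by the one-hot encoder always sum to one, keeping all of them makes the design matrix perfectly collinear with the intercept; dropping any one column removes the redundancy. On newer scikit-learn versions the same effect can be obtained directly by constructing the encoder with OneHotEncoder(drop='first'), which makes this manual slice unnecessary.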

Splitting the dataset into a Training set and a Test set


In [163]:
from sklearn.model_selection import train_test_split

In [164]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Fitting Multiple Linear Regression to the Training set


In [165]:
from sklearn.linear_model import LinearRegression

In [166]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)


Out[166]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

Predicting the Test set results


In [167]:
# Predict on the test set and stack predictions (row 0) above the actual values (row 1)
y_pred = regressor.predict(X_test)
pd.DataFrame([y_pred, y_test])


Out[167]:
0 1 2 3 4 5 6 7 8 9
0 103015.201598 132582.277608 132447.738452 71976.098513 178537.482211 116161.242302 67851.692097 98791.733747 113969.43533 167921.065695
1 103282.380000 144259.400000 146121.950000 77798.830000 191050.390000 105008.310000 81229.060000 97483.560000 110352.25000 166187.940000
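
The predictions track the actual profits closely. As a quick, optional check of fit quality (not part of the original run), scikit-learn's r2_score can summarize the agreement on the held-out test set:

from sklearn.metrics import r2_score

# Coefficient of determination on the test set
print(r2_score(y_test, y_pred))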

Building an optimal model using Backward Elimination


In [168]:
import statsmodels.api as sm  # array-based OLS lives in statsmodels.api, not statsmodels.formula.api

In [169]:
# Prepend a column of ones so the OLS model includes an intercept term
X = np.append(arr=np.ones((50, 1)).astype(int), values=X, axis=1)
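
statsmodels provides a helper that does the same thing without hard-coding the number of rows; the following one-liner (an equivalent alternative, not used in the original run) prepends the intercept column:

X = sm.add_constant(X)  # adds a leading column of ones to act as the intercept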

In [173]:
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()


Out[173]:
OLS Regression Results
Dep. Variable: y R-squared: 0.951
Model: OLS Adj. R-squared: 0.945
Method: Least Squares F-statistic: 169.9
Date: Sat, 06 May 2017 Prob (F-statistic): 1.34e-27
Time: 04:19:45 Log-Likelihood: -525.38
No. Observations: 50 AIC: 1063.
Df Residuals: 44 BIC: 1074.
Df Model: 5
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 5.013e+04 6884.820 7.281 0.000 3.62e+04 6.4e+04
x1 198.7888 3371.007 0.059 0.953 -6595.030 6992.607
x2 -41.8870 3256.039 -0.013 0.990 -6604.003 6520.229
x3 0.8060 0.046 17.369 0.000 0.712 0.900
x4 -0.0270 0.052 -0.517 0.608 -0.132 0.078
x5 0.0270 0.017 1.574 0.123 -0.008 0.062
Omnibus: 14.782 Durbin-Watson: 1.283
Prob(Omnibus): 0.001 Jarque-Bera (JB): 21.266
Skew: -0.948 Prob(JB): 2.41e-05
Kurtosis: 5.572 Cond. No. 1.45e+06

In [174]:
X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()


Out[174]:
OLS Regression Results
Dep. Variable: y R-squared: 0.951
Model: OLS Adj. R-squared: 0.946
Method: Least Squares F-statistic: 217.2
Date: Sat, 06 May 2017 Prob (F-statistic): 8.49e-29
Time: 04:19:55 Log-Likelihood: -525.38
No. Observations: 50 AIC: 1061.
Df Residuals: 45 BIC: 1070.
Df Model: 4
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 5.011e+04 6647.870 7.537 0.000 3.67e+04 6.35e+04
x1 220.1585 2900.536 0.076 0.940 -5621.821 6062.138
x2 0.8060 0.046 17.606 0.000 0.714 0.898
x3 -0.0270 0.052 -0.523 0.604 -0.131 0.077
x4 0.0270 0.017 1.592 0.118 -0.007 0.061
Omnibus: 14.758 Durbin-Watson: 1.282
Prob(Omnibus): 0.001 Jarque-Bera (JB): 21.172
Skew: -0.948 Prob(JB): 2.53e-05
Kurtosis: 5.563 Cond. No. 1.40e+06

In [177]:
X_opt = X[:, [0, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()


Out[177]:
OLS Regression Results
Dep. Variable: y R-squared: 0.951
Model: OLS Adj. R-squared: 0.948
Method: Least Squares F-statistic: 296.0
Date: Sat, 06 May 2017 Prob (F-statistic): 4.53e-30
Time: 04:21:09 Log-Likelihood: -525.39
No. Observations: 50 AIC: 1059.
Df Residuals: 46 BIC: 1066.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 5.012e+04 6572.353 7.626 0.000 3.69e+04 6.34e+04
x1 0.8057 0.045 17.846 0.000 0.715 0.897
x2 -0.0268 0.051 -0.526 0.602 -0.130 0.076
x3 0.0272 0.016 1.655 0.105 -0.006 0.060
Omnibus: 14.838 Durbin-Watson: 1.282
Prob(Omnibus): 0.001 Jarque-Bera (JB): 21.442
Skew: -0.949 Prob(JB): 2.21e-05
Kurtosis: 5.586 Cond. No. 1.40e+06

In [179]:
X_opt = X[:, [0, 3, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()


Out[179]:
OLS Regression Results
Dep. Variable: y R-squared: 0.950
Model: OLS Adj. R-squared: 0.948
Method: Least Squares F-statistic: 450.8
Date: Sat, 06 May 2017 Prob (F-statistic): 2.16e-31
Time: 04:21:47 Log-Likelihood: -525.54
No. Observations: 50 AIC: 1057.
Df Residuals: 47 BIC: 1063.
Df Model: 2
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 4.698e+04 2689.933 17.464 0.000 4.16e+04 5.24e+04
x1 0.7966 0.041 19.266 0.000 0.713 0.880
x2 0.0299 0.016 1.927 0.060 -0.001 0.061
Omnibus: 14.677 Durbin-Watson: 1.257
Prob(Omnibus): 0.001 Jarque-Bera (JB): 21.161
Skew: -0.939 Prob(JB): 2.54e-05
Kurtosis: 5.575 Cond. No. 5.32e+05

In [180]:
X_opt = X[:, [0, 3]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()


Out[180]:
OLS Regression Results
Dep. Variable: y R-squared: 0.947
Model: OLS Adj. R-squared: 0.945
Method: Least Squares F-statistic: 849.8
Date: Sat, 06 May 2017 Prob (F-statistic): 3.50e-32
Time: 04:21:53 Log-Likelihood: -527.44
No. Observations: 50 AIC: 1059.
Df Residuals: 48 BIC: 1063.
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 4.903e+04 2537.897 19.320 0.000 4.39e+04 5.41e+04
x1 0.8543 0.029 29.151 0.000 0.795 0.913
Omnibus: 13.727 Durbin-Watson: 1.116
Prob(Omnibus): 0.001 Jarque-Bera (JB): 18.536
Skew: -0.911 Prob(JB): 9.44e-05
Kurtosis: 5.361 Cond. No. 1.65e+05
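
The elimination above was done by hand: at each step the predictor with the highest p-value is removed and the model refit, until every remaining p-value is below the chosen significance level (0.05 here). A small helper that automates the same loop might look like the sketch below; the function name, the 0.05 threshold, and the use of p-values as the sole selection criterion are assumptions carried over from the manual procedure, not part of the original notebook.

import numpy as np
import statsmodels.api as sm

def backward_elimination(X, y, significance_level=0.05):
    # Start with every column (including the intercept column of ones)
    cols = list(range(X.shape[1]))
    model = sm.OLS(endog=y, exog=X[:, cols]).fit()
    while len(cols) > 1:
        pvalues = np.asarray(model.pvalues)
        worst = int(np.argmax(pvalues))
        if pvalues[worst] <= significance_level:
            break  # every remaining predictor is significant
        del cols[worst]  # drop the least significant column and refit
        model = sm.OLS(endog=y, exog=X[:, cols]).fit()
    return cols, model

# Usage (X already contains the intercept column added earlier):
# kept_columns, final_model = backward_elimination(X, y)
# print(final_model.summary())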