In [155]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [156]:
np.set_printoptions(precision=3, suppress=True)

Importing a dataset


In [157]:
dataset = pd.read_csv('50_Startups.csv')

In [158]:
# Features: every column except the last; target: the last column
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

Encoding categorical data


In [159]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [160]:
# Encode the categorical column at index 3 as integer labels
labelencoder = LabelEncoder()
X[:, 3] = labelencoder.fit_transform(X[:, 3])

In [161]:
# One-hot encode column 3 into dummy variables
# (categorical_features is the pre-0.20 scikit-learn API; see the note below for newer versions)
onehotencoder = OneHotEncoder(categorical_features=[3])
X = onehotencoder.fit_transform(X).toarray()
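
Note: the categorical_features argument was deprecated in scikit-learn 0.20 and removed in 0.22, so the two cells above fail on a current install. A rough equivalent using ColumnTransformer, assuming the categorical column is still at index 3, is sketched below; it also removes the need for the LabelEncoder step, since OneHotEncoder handles string categories directly.

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode column 3 and pass the remaining numeric columns through unchanged;
# the dummy columns come first in the output, matching the ordering used below
ct = ColumnTransformer([('encoder', OneHotEncoder(), [3])], remainder='passthrough')
X = ct.fit_transform(X)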

Avoiding the Dummy Variable Trap


In [162]:
# Drop the first dummy column to avoid the dummy variable trap
X = X[:, 1:]
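
Because the dummy columns produced by the one-hot encoder always sum to one, keeping all of them makes the design matrix perfectly collinear with the intercept; dropping any one column removes the redundancy. On newer scikit-learn versions the same effect can be obtained directly by constructing the encoder with OneHotEncoder(drop='first'), which makes this manual slice unnecessary.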

Splitting the dataset into a Training set and a Test set


In [163]:
from sklearn.model_selection import train_test_split

In [164]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

Fitting Multiple Linear Regression to the Training set


In [165]:
from sklearn.linear_model import LinearRegression

In [166]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)


Out[166]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

Predicting the Test set results


In [167]:
# Predict on the test set and stack predictions (row 0) above the actual values (row 1)
y_pred = regressor.predict(X_test)
pd.DataFrame([y_pred, y_test])


Out[167]:
0 1 2 3 4 5 6 7 8 9
0 103015.201598 132582.277608 132447.738452 71976.098513 178537.482211 116161.242302 67851.692097 98791.733747 113969.43533 167921.065695
1 103282.380000 144259.400000 146121.950000 77798.830000 191050.390000 105008.310000 81229.060000 97483.560000 110352.25000 166187.940000
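
The predictions track the actual profits closely. As a quick, optional check of fit quality (not part of the original run), scikit-learn's r2_score can summarize the agreement on the held-out test set:

from sklearn.metrics import r2_score

# Coefficient of determination on the test set
print(r2_score(y_test, y_pred))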

Building an optimal model using Backward Elimination


In [168]:
import statsmodels.api as sm  # array-based OLS lives in statsmodels.api, not statsmodels.formula.api

In [169]:
# Prepend a column of ones so the OLS model includes an intercept term
X = np.append(arr=np.ones((50, 1)).astype(int), values=X, axis=1)
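
statsmodels provides a helper that does the same thing without hard-coding the number of rows; the following one-liner (an equivalent alternative, not used in the original run) prepends the intercept column:

X = sm.add_constant(X)  # adds a leading column of ones to act as the intercept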

In [173]:
X_opt = X[:, [0, 1, 2, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()


Out[173]:
OLS Regression Results
Dep. Variable: y R-squared: 0.951
Model: OLS Adj. R-squared: 0.945
Method: Least Squares F-statistic: 169.9
Date: Sat, 06 May 2017 Prob (F-statistic): 1.34e-27
Time: 04:19:45 Log-Likelihood: -525.38
No. Observations: 50 AIC: 1063.
Df Residuals: 44 BIC: 1074.
Df Model: 5
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 5.013e+04 6884.820 7.281 0.000 3.62e+04 6.4e+04
x1 198.7888 3371.007 0.059 0.953 -6595.030 6992.607
x2 -41.8870 3256.039 -0.013 0.990 -6604.003 6520.229
x3 0.8060 0.046 17.369 0.000 0.712 0.900
x4 -0.0270 0.052 -0.517 0.608 -0.132 0.078
x5 0.0270 0.017 1.574 0.123 -0.008 0.062
Omnibus: 14.782 Durbin-Watson: 1.283
Prob(Omnibus): 0.001 Jarque-Bera (JB): 21.266
Skew: -0.948 Prob(JB): 2.41e-05
Kurtosis: 5.572 Cond. No. 1.45e+06

In [174]:
X_opt = X[:, [0, 1, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()


Out[174]:
OLS Regression Results
Dep. Variable: y R-squared: 0.951
Model: OLS Adj. R-squared: 0.946
Method: Least Squares F-statistic: 217.2
Date: Sat, 06 May 2017 Prob (F-statistic): 8.49e-29
Time: 04:19:55 Log-Likelihood: -525.38
No. Observations: 50 AIC: 1061.
Df Residuals: 45 BIC: 1070.
Df Model: 4
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 5.011e+04 6647.870 7.537 0.000 3.67e+04 6.35e+04
x1 220.1585 2900.536 0.076 0.940 -5621.821 6062.138
x2 0.8060 0.046 17.606 0.000 0.714 0.898
x3 -0.0270 0.052 -0.523 0.604 -0.131 0.077
x4 0.0270 0.017 1.592 0.118 -0.007 0.061
Omnibus: 14.758 Durbin-Watson: 1.282
Prob(Omnibus): 0.001 Jarque-Bera (JB): 21.172
Skew: -0.948 Prob(JB): 2.53e-05
Kurtosis: 5.563 Cond. No. 1.40e+06

In [177]:
X_opt = X[:, [0, 3, 4, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()


Out[177]:
OLS Regression Results
Dep. Variable: y R-squared: 0.951
Model: OLS Adj. R-squared: 0.948
Method: Least Squares F-statistic: 296.0
Date: Sat, 06 May 2017 Prob (F-statistic): 4.53e-30
Time: 04:21:09 Log-Likelihood: -525.39
No. Observations: 50 AIC: 1059.
Df Residuals: 46 BIC: 1066.
Df Model: 3
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 5.012e+04 6572.353 7.626 0.000 3.69e+04 6.34e+04
x1 0.8057 0.045 17.846 0.000 0.715 0.897
x2 -0.0268 0.051 -0.526 0.602 -0.130 0.076
x3 0.0272 0.016 1.655 0.105 -0.006 0.060
Omnibus: 14.838 Durbin-Watson: 1.282
Prob(Omnibus): 0.001 Jarque-Bera (JB): 21.442
Skew: -0.949 Prob(JB): 2.21e-05
Kurtosis: 5.586 Cond. No. 1.40e+06

In [179]:
X_opt = X[:, [0, 3, 5]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()


Out[179]:
OLS Regression Results
Dep. Variable: y R-squared: 0.950
Model: OLS Adj. R-squared: 0.948
Method: Least Squares F-statistic: 450.8
Date: Sat, 06 May 2017 Prob (F-statistic): 2.16e-31
Time: 04:21:47 Log-Likelihood: -525.54
No. Observations: 50 AIC: 1057.
Df Residuals: 47 BIC: 1063.
Df Model: 2
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 4.698e+04 2689.933 17.464 0.000 4.16e+04 5.24e+04
x1 0.7966 0.041 19.266 0.000 0.713 0.880
x2 0.0299 0.016 1.927 0.060 -0.001 0.061
Omnibus: 14.677 Durbin-Watson: 1.257
Prob(Omnibus): 0.001 Jarque-Bera (JB): 21.161
Skew: -0.939 Prob(JB): 2.54e-05
Kurtosis: 5.575 Cond. No. 5.32e+05

In [180]:
X_opt = X[:, [0, 3]]
regressor_OLS = sm.OLS(endog=y, exog=X_opt).fit()
regressor_OLS.summary()


Out[180]:
OLS Regression Results
Dep. Variable: y R-squared: 0.947
Model: OLS Adj. R-squared: 0.945
Method: Least Squares F-statistic: 849.8
Date: Sat, 06 May 2017 Prob (F-statistic): 3.50e-32
Time: 04:21:53 Log-Likelihood: -527.44
No. Observations: 50 AIC: 1059.
Df Residuals: 48 BIC: 1063.
Df Model: 1
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
const 4.903e+04 2537.897 19.320 0.000 4.39e+04 5.41e+04
x1 0.8543 0.029 29.151 0.000 0.795 0.913
Omnibus: 13.727 Durbin-Watson: 1.116
Prob(Omnibus): 0.001 Jarque-Bera (JB): 18.536
Skew: -0.911 Prob(JB): 9.44e-05
Kurtosis: 5.361 Cond. No. 1.65e+05
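
The elimination above was done by hand: at each step the predictor with the highest p-value is removed and the model refit, until every remaining p-value is below the chosen significance level (0.05 here). A small helper that automates the same loop might look like the sketch below; the function name, the 0.05 threshold, and the use of p-values as the sole selection criterion are assumptions carried over from the manual procedure, not part of the original notebook.

import numpy as np
import statsmodels.api as sm

def backward_elimination(X, y, significance_level=0.05):
    # Start with every column (including the intercept column of ones)
    cols = list(range(X.shape[1]))
    model = sm.OLS(endog=y, exog=X[:, cols]).fit()
    while len(cols) > 1:
        pvalues = np.asarray(model.pvalues)
        worst = int(np.argmax(pvalues))
        if pvalues[worst] <= significance_level:
            break  # every remaining predictor is significant
        del cols[worst]  # drop the least significant column and refit
        model = sm.OLS(endog=y, exog=X[:, cols]).fit()
    return cols, model

# Usage (X already contains the intercept column added earlier):
# kept_columns, final_model = backward_elimination(X, y)
# print(final_model.summary())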