In [1]:
import numpy as np
import pandas as pd
# relative path to the Advertising dataset CSV
data_path = "data/Advertising.csv"
# alternatively, the dataset can be loaded directly from this link:
# data_link = "http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv"
# index_col=0 -> the first CSV column becomes the DataFrame index
data = pd.read_csv(filepath_or_buffer=data_path, index_col=0)
In [2]:
# preview the first 5 rows of the loaded DataFrame
data.head(n=5)
Out[2]:
In [3]:
import seaborn as sns
# allow plots to appear within the notebook
%matplotlib inline
In [4]:
# Visualize the relationship between each feature and the response:
# one scatterplot per feature, each with a fitted regression line (kind='reg').
# `height` replaces the old `size` keyword, which seaborn renamed in v0.9;
# `size` is deprecated and no longer accepted by later seaborn releases.
sns.pairplot(
    data,
    x_vars=['TV', 'Radio', 'Newspaper'],
    y_vars='Sales',
    height=7,
    aspect=0.7,
    kind='reg',
)
Out[4]:
In [5]:
# names of the DataFrame columns used as model features
feature_names = [
    'TV',
    'Radio',
    'Newspaper',
]
In [6]:
# name of the response (target) column
label = 'Sales'
# feature matrix: one row per training instance, one column per feature
X = data[feature_names]
# target vector: the response value for every training instance
y = data[label]
In [7]:
# show the first rows of the features and of the target, separated by a rule
print(X.head(), '-------------------------', y.head(), sep='\n')
In [8]:
# How many training examples do we have?
# (rows, cols) -> (number of training examples, number of variables/features)
(len(X.index), len(X.columns))
Out[8]:
In [9]:
# Split X and y into training and testing sets.
from sklearn.model_selection import train_test_split

# The split is random, so random_state=1 is fixed to make the results of this
# notebook reproducible on your machine; without it you would get different
# coefficients on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)
In [10]:
# By default 75% of the rows go to training and 25% to testing;
# print the shape of each piece, one per line, to confirm.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')
In [11]:
# Linear Regression Model
# estimator class
from sklearn.linear_model import LinearRegression
# module used to calculate model performance metrics
from sklearn import metrics

# create the (not yet fitted) estimator
linreg = LinearRegression()
# learn the intercept and coefficients from the training data
linreg.fit(X=X_train, y=y_train)
Out[11]:
In [12]:
# show the fitted intercept, then the fitted coefficients, one per line
print(linreg.intercept_, linreg.coef_, sep='\n')
In [13]:
# match every feature name with its fitted coefficient
[(name, coef) for name, coef in zip(feature_names, linreg.coef_)]
Out[13]:
In [14]:
# predict the response for every row of the testing set
y_pred = linreg.predict(X=X_test)
Root Mean Squared Error (RMSE) is the square root of the mean of the squared errors:
$$\sqrt{\frac 1n\sum_{i=1}^n(y_i-\hat{y}_i)^2}$$
In [15]:
# Compute the RMSE from the true test responses (y_test) and our predictions (y_pred):
# the square root of the mean squared error.
print(
    np.sqrt(
        metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
    )
)
In [16]:
# Refit the model on TV and Radio only (Newspaper left out) and report the test RMSE.
# NOTE(review): this cell rebinds X, y and the train/test variables and refits
# the existing `linreg` object, so any earlier cell re-run after this one will
# see the two-feature data and model.
feature_cols = ['TV', 'Radio']
# feature subset of the original DataFrame
X = data[feature_cols]
# response column as a Series
y = data['Sales']
# split into training and testing sets with the same random_state=1 as the earlier split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)
# learn the coefficients on the training set (fit returns the estimator itself),
# then predict on the testing set
y_pred = linreg.fit(X_train, y_train).predict(X_test)
# RMSE of the predictions
print(
    np.sqrt(
        metrics.mean_squared_error(y_true=y_test, y_pred=y_pred)
    )
)
First we got an RMSE of 1.40, but after removing Newspaper we got 1.38. What does it mean?
In case of doubt, or if you want to share your answer with me, tweet me at https://twitter.com/fumodavi or email me at coder.davidfumo@gmail.com