In [1]:
# conventional way to import pandas
import pandas as pd
In [2]:
# location of the Advertising CSV; the first CSV column is used as the row index
advertising_url = 'http://www-bcf.usc.edu/~gareth/ISL/Advertising.csv'
data = pd.read_csv(advertising_url, index_col=0)
# preview the first 5 rows
data.head(5)
Out[2]:
Primary object types:
In [3]:
# preview the final 5 rows of the table
data.tail(n=5)
Out[3]:
In [4]:
# check the shape of the DataFrame as a (rows, columns) tuple;
# the bare expression is the cell's displayed output
data.shape
Out[4]:
What are the features?
In [6]:
# conventional way to import seaborn
import seaborn as sns
# allow plots to appear within the notebook
%matplotlib inlineb
In [7]:
# visualize the relationship between each feature and the response using scatterplots;
# kind='reg' asks seaborn for the regression variant of each panel
# (`height` is the current name of the per-panel size argument, formerly `size`)
sns.pairplot(data, x_vars=['TV','Radio','Newspaper'], y_vars='Sales', height=7, aspect=0.7, kind='reg')
Out[7]:
In [8]:
# create a Python list of feature names (reused later to label the coefficients)
feature_cols = ['TV', 'Radio', 'Newspaper']
# use the list to select a subset of the original DataFrame
X = data[feature_cols]
# equivalent command to do this in one line: the outer brackets index the
# DataFrame, the inner brackets are the list of column names
X = data[['TV', 'Radio', 'Newspaper']]
# print the first 5 rows
X.head()
Out[8]:
In [9]:
# check the type and shape of X
# (parenthesized print works both as a Python 2 statement and as the Python 3 function)
print(type(X))
print(X.shape)
In [10]:
# select a Series from the DataFrame (a single column name, not a list)
y = data['Sales']
# equivalent command that works if there are no spaces in the column name
# (attribute-style access to the same column)
y = data.Sales
# print the first 5 values
y.head()
Out[10]:
In [11]:
# check the type and shape of y
# (parenthesized print works both as a Python 2 statement and as the Python 3 function)
print(type(y))
print(y.shape)
In [13]:
# train_test_split lives in sklearn.model_selection in current scikit-learn;
# fall back to the legacy sklearn.cross_validation module on old installs
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# split features and response into training and testing portions;
# random_state is fixed so the split can be reproduced
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
In [14]:
# default split is 75% for training and 25% for testing
# (parenthesized print works both as a Python 2 statement and as the Python 3 function)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)
In [15]:
# bring in the linear regression estimator
from sklearn.linear_model import LinearRegression
# build an estimator with its default settings
linreg = LinearRegression()
# learn the intercept and coefficients from the training portion only
linreg.fit(X=X_train, y=y_train)
Out[15]:
In [16]:
# print the intercept and coefficients learned by the fitted model
# (parenthesized print works both as a Python 2 statement and as the Python 3 function)
print(linreg.intercept_)
print(linreg.coef_)
In [17]:
# pair the feature names with the coefficients;
# list(...) materializes the pairs so they are displayed even where zip returns a lazy iterator
list(zip(feature_cols, linreg.coef_))
Out[17]:
Important notes:
In [18]:
# ask the fitted model for its predicted response on the held-out testing features
y_pred = linreg.predict(X=X_test)
We need an evaluation metric in order to compare our predictions with the actual values!
In [19]:
# toy example: four actual response values and the four values predicted for them
true, pred = [100, 50, 30, 20], [90, 50, 50, 30]
Mean Absolute Error (MAE) is the mean of the absolute values of the errors: $$\frac{1}{n}\sum_{i=1}^n|y_i-\hat{y}_i|$$
In [20]:
# calculate MAE by hand: absolute errors of the four toy predictions, averaged
# (the whole expression is parenthesized so the division happens before printing
# under both the Python 2 print statement and the Python 3 print function)
print((10 + 0 + 20 + 10)/4.)
# calculate MAE using scikit-learn
from sklearn import metrics
print(metrics.mean_absolute_error(true, pred))
Mean Squared Error (MSE) is the mean of the squared errors: $$\frac{1}{n}\sum_{i=1}^n(y_i-\hat{y}_i)^2$$
In [21]:
# calculate MSE by hand: squared errors of the four toy predictions, averaged
# (the whole expression is parenthesized so the division happens before printing
# under both the Python 2 print statement and the Python 3 print function)
print((10**2 + 0**2 + 20**2 + 10**2)/4.)
# calculate MSE using scikit-learn
print(metrics.mean_squared_error(true, pred))
Root Mean Squared Error (RMSE) is the square root of the mean of the squared errors: $$\sqrt{\frac{1}{n}\sum_{i=1}^n(y_i-\hat{y}_i)^2}$$
In [22]:
# calculate RMSE by hand: square root of the hand-computed MSE
# (parenthesized print works both as a Python 2 statement and as the Python 3 function)
import numpy as np
print(np.sqrt((10**2 + 0**2 + 20**2 + 10**2)/4.))
# calculate RMSE using scikit-learn (square root of its MSE)
print(np.sqrt(metrics.mean_squared_error(true, pred)))
In [25]:
# compute the RMSE of the model's predictions on the testing set
# (parenthesized print works both as a Python 2 statement and as the Python 3 function)
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
In [26]:
# create a Python list of feature names (Newspaper is left out this time)
feature_cols = ['TV', 'Radio']
# use the list to select a subset of the original DataFrame
X = data[feature_cols]
# select a Series from the DataFrame
y = data.Sales
# split into training and testing sets (same random_state as the earlier split)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
# fit the model to the training data (learn the coefficients);
# this refits the existing linreg estimator on the reduced feature set
linreg.fit(X_train, y_train)
# make predictions on the testing set
y_pred = linreg.predict(X_test)
# compute the RMSE of our predictions
# (parenthesized print works both as a Python 2 statement and as the Python 3 function)
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
The RMSE decreased when we removed Newspaper from the model. (Error is something we want to minimize, so a lower number for RMSE is better.) Thus, it is unlikely that this feature is useful for predicting Sales, and it should be removed from the model.
In [27]:
from IPython.core.display import HTML


def css_styling():
    """Load the notebook's custom stylesheet and wrap it for display.

    Reads ``styles/custom.css`` (a path relative to the current working
    directory) and returns its text wrapped in an ``HTML`` object.
    """
    # the context manager closes the file handle even if read() raises
    with open("styles/custom.css", "r") as css_file:
        styles = css_file.read()
    return HTML(styles)


# the returned HTML object is the cell's last expression, so the notebook renders it
css_styling()
Out[27]: