Linear Regression


In [2]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [90]:
# Import libraries
from __future__ import absolute_import, division, print_function

import os
import sys
import warnings

# Ignore warnings. This is set before the third-party imports below so that
# anything they emit at import time is hidden as well.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by
# later cells; consider narrowing it once the notebook is stable.
warnings.filterwarnings('ignore')

# Directory that holds the local `tools` helper module. The default is the
# original author's checkout; set the TOOLS_DIR environment variable to run
# this notebook on another machine without editing the cell.
TOOLS_DIR = os.environ.get('TOOLS_DIR', '/Users/omojumiller/mycode/tools')
if TOOLS_DIR not in sys.path:  # keep re-runs of this cell from stacking duplicates
    sys.path.append(TOOLS_DIR)

import numpy as np
import pandas as pd
import scipy.stats as st
from tools import plot_features_by_target

# Graphing Libraries
import matplotlib as mpl
import matplotlib.pyplot as pyplt
import seaborn as sns
from IPython.display import display

# Configure for presentation
sns.set_style("whitegrid")
np.set_printoptions(threshold=50, linewidth=50)
mpl.rc('font', size=16)

In [91]:
# Read the advertising dataset from the notebook-relative data directory.
df = pd.read_csv(filepath_or_buffer='data/Advertising.csv')
# Show (n_rows, n_columns) as the cell result.
df.shape


Out[91]:
(200, 5)

In [92]:
# Preview the first five rows to check how the columns were parsed.
df.head(n=5)


Out[92]:
Unnamed: 0 TV Radio Newspaper Sales
0 1 230.1 37.8 69.2 22.1
1 2 44.5 39.3 45.1 10.4
2 3 17.2 45.9 69.3 9.3
3 4 151.5 41.3 58.5 18.5
4 5 180.8 10.8 58.4 12.9

In [93]:
# Column names passed to the plotting helper: the three advertising channels
# as features, and Sales as the (single-element list) target.
x_vars, y_vars = ['TV', 'Radio', 'Newspaper'], ['Sales']

In [94]:
# Plot the feature columns listed in x_vars against the target listed in
# y_vars, using the helper imported from the local `tools` module.
# NOTE(review): the helper's implementation is not visible here — presumably
# it draws one panel per feature; confirm against tools.plot_features_by_target.
plot_features_by_target(df, x_vars, y_vars)



In [95]:
# Pairwise view of columns 1-4 (TV, Radio, Newspaper, Sales per the preview
# above): KDE contours below the diagonal, scatter plots above it, and 1-D
# density estimates on the diagonal.
# Columns are picked by position with .iloc; the DataFrame.ix indexer used
# previously was removed in pandas 1.0 and performed the same positional
# lookup here because the column labels are strings.
g = sns.PairGrid(df.iloc[:, [1, 2, 3, 4]], diag_sharey=False)
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_upper(pyplt.scatter)
g.map_diag(sns.kdeplot, lw=3);



In [96]:
# Split the frame into features and target by column position: column 0 is
# the CSV's unnamed row counter, 1-3 are TV/Radio/Newspaper, 4 is Sales.
# .iloc replaces the DataFrame.ix indexer, which was removed in pandas 1.0.
X = df.iloc[:, [1, 2, 3]]
# Selecting with a list keeps y a one-column DataFrame rather than a Series,
# so the fitted coef_/intercept_ keep the same array shapes as before.
y = df.iloc[:, [4]]

In [97]:
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20. Fall back to
# the old location only for installs that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn import linear_model

# Hold out half of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42)

# Fit a linear regression on the training half, then predict the held-out half.
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
y_preds = reg.predict(X_test)

In [102]:
# Report the fitted coefficients, the intercept, and R^2 on both halves.
print("slope of regression is", reg.coef_)
# With a 2-D target, intercept_ comes back as a length-1 array. Extract the
# scalar explicitly: NumPy >= 1.25 deprecates implicitly converting a 1-d
# array to float, which the bare "%.2f" % array formatting relied on.
# np.ravel also accepts a plain scalar, so this works for a 1-D target too.
print("intercepts of regression is %.2f" % float(np.ravel(reg.intercept_)[0]))

print("\n ********stats on dataset********\n")
print("r-squared score on testing data: ", reg.score(X_test, y_test))
print("r-squared score on training data: ", reg.score(X_train, y_train))


slope of regression is [[ 0.04359498  0.19927632  0.0146631 ]]
intercepts of regression is 2.58

 ********stats on dataset********

r-squared score on testing data:  0.872100481605
r-squared score on training data:  0.904261364891

In [110]:
# List entries in the working directory whose names contain an underscore,
# a single digit, and then a dot followed by anything.
import glob
list(glob.iglob('*_[0-9].*'))


Out[110]:
[]

In [104]:
ls


data/                                   data_ML_LinearRegression.ipynb
dataSampling_2_ABTesting.ipynb          data_ML_LogisticRegression.ipynb
dataSampling_NormalApproximation.ipynb  images/

In [ ]: