Linear Regression



In [2]:

    
# Render matplotlib figures inline in the notebook.
# `%matplotlib inline` is used instead of `%pylab inline`: the latter also
# star-imports numpy and matplotlib.pylab into the interactive namespace, which
# hides where names come from. This notebook imports numpy and pyplot
# explicitly (as `np` and `pyplt`) in its imports cell.
%matplotlib inline









    



Populating the interactive namespace from numpy and matplotlib



In [90]:

    
# Imports and notebook-wide configuration.
# The __future__ import gives Python 3 semantics for imports, division and
# print when the notebook runs under Python 2; it must stay first.
from __future__ import absolute_import, division, print_function

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used below.
import warnings
warnings.filterwarnings('ignore')

# Extend the import path with an absolute, machine-specific directory.
# NOTE(review): presumably this is where the `tools` module imported below
# lives -- confirm, and prefer a relative/configurable path.
import sys
sys.path.append('/Users/omojumiller/mycode/tools')

import numpy as np
import pandas as pd
import scipy.stats as st
# Project-local plotting helper (not part of any installed package).
from tools import plot_features_by_target



# Plotting libraries; seaborn is switched to its "whitegrid" style.
import matplotlib.pyplot as pyplt
import seaborn as sns
sns.set_style("whitegrid")  

# Presentation settings: numpy summarises arrays with more than 50 elements
# and wraps printed lines at 50 characters; matplotlib's default font size is
# set to 16.
np.set_printoptions(threshold=50, linewidth=50)
import matplotlib as mpl
mpl.rc('font', size=16)

from IPython.display import display



In [91]:

    
# Read the advertising dataset from the notebook-relative data directory, then
# show its (rows, columns) dimensions as the cell output.
csv_path = 'data/Advertising.csv'
df = pd.read_csv(csv_path)
df.shape









    Out[91]:





(200, 5)



In [92]:

    
# Preview the first five rows of the dataset.
df.head(n=5)



In [93]:

    
# Column names passed to the feature-vs-target plot: the predictor columns
# and the single response column.
x_vars = [
    'TV',
    'Radio',
    'Newspaper',
]
y_vars = ['Sales']



In [94]:

    
# Call the project-local helper imported from `tools` with the dataset and the
# predictor / response column-name lists.
# NOTE(review): the helper's implementation is not visible here -- presumably
# it plots each column in x_vars against y_vars; confirm against tools.
plot_features_by_target(df, x_vars, y_vars)



In [95]:

    
# Pairwise view of the columns at positions 1-4 of `df` (TV, Radio, Newspaper
# and Sales according to the df.head() output); position 0 is skipped.
# `.iloc` replaces the `.ix` indexer, which was deprecated in pandas 0.20 and
# removed in pandas 1.0; with string column labels `.ix` selected by position,
# so the selection is unchanged.
# Lower triangle: 2-D KDE contours; upper triangle: scatter plots; diagonal:
# 1-D KDE curves. The trailing ';' suppresses the cell's text output.
g = sns.PairGrid(df.iloc[:, [1, 2, 3, 4]], diag_sharey=False)
g.map_lower(sns.kdeplot, cmap="Blues_d")
g.map_upper(pyplt.scatter)
g.map_diag(sns.kdeplot, lw=3);



In [96]:

    
# Feature matrix: columns at positions 1-3; target: the column at position 4.
# `.iloc` replaces the `.ix` indexer (deprecated in pandas 0.20, removed in
# 1.0); the positional selection is the same.
# Indexing with a list ([4]) keeps `y` a one-column DataFrame, not a Series.
X = df.iloc[:, [1, 2, 3]]
y = df.iloc[:, [4]]



In [97]:

    
# `train_test_split` is imported from `sklearn.model_selection`; the old
# `sklearn.cross_validation` module was deprecated in scikit-learn 0.18 and
# removed in 0.20.
from sklearn.model_selection import train_test_split
from sklearn import linear_model

# Hold out half of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)



# Fit ordinary least squares on the training half and predict the held-out half.
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
y_preds = reg.predict(X_test)



In [102]:

    
# Report the fitted coefficients and intercept, then the R^2 score returned by
# `reg.score` on the held-out half and on the training half.
coefficients = reg.coef_
print("slope of regression is", coefficients)
print("intercepts of regression is %.2f" % reg.intercept_)

print("\n ********stats on dataset********\n")
for label, features, target in (
    ("r-squared score on testing data: ", X_test, y_test),
    ("r-squared score on training data: ", X_train, y_train),
):
    print(label, reg.score(features, target))









    



slope of regression is [[ 0.04359498  0.19927632  0.0146631 ]]
intercepts of regression is 2.58

 ********stats on dataset********

r-squared score on testing data:  0.872100481605
r-squared score on training data:  0.904261364891



In [110]:

    
import glob

# List entries in the working directory whose names contain an underscore
# followed by a single digit and then a dot (e.g. "name_2.ext").
numbered_file_pattern = '*_[0-9].*'
glob.glob(numbered_file_pattern)









    Out[110]:





[]



In [104]:

    
# List the working directory via the explicit `%ls` line magic. The bare `ls`
# form relies on IPython's automagic, which stops working when automagic is
# disabled or when a variable named `ls` exists in the namespace.
%ls









    



data/                                   data_ML_LinearRegression.ipynb
dataSampling_2_ABTesting.ipynb          data_ML_LogisticRegression.ipynb
dataSampling_NormalApproximation.ipynb  images/



In [ ]:

	Unnamed: 0	TV	Radio	Newspaper	Sales
0	1	230.1	37.8	69.2	22.1
1	2	44.5	39.3	45.1	10.4
2	3	17.2	45.9	69.3	9.3
3	4	151.5	41.3	58.5	18.5
4	5	180.8	10.8	58.4	12.9