In [1]:
%matplotlib inline

import pandas
import statistics
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoLarsCV
from sklearn import preprocessing

In [2]:
# set float display format to avoid runtime formatting errors
pandas.set_option('display.float_format', lambda x:'%.2f'%x)

#load the data
data = pandas.read_csv('../separatedData.csv')

# convert to numeric format
data["breastCancer100th"] = pandas.to_numeric(data["breastCancer100th"], errors='coerce')
data["meanSugarPerson"]   = pandas.to_numeric(data["meanSugarPerson"], errors='coerce')
data["meanFoodPerson"]   = pandas.to_numeric(data["meanFoodPerson"], errors='coerce')
data["meanCholesterol"]   = pandas.to_numeric(data["meanCholesterol"], errors='coerce')

# listwise deletion of missing values
sub1 = data[['breastCancer100th', 'meanFoodPerson', 'meanCholesterol', 'meanSugarPerson']].dropna()

# select predictor variables and target; a hold-out split sketch follows below
predvar = sub1[['meanSugarPerson', 'meanFoodPerson', 'meanCholesterol']]
targets = sub1['breastCancer100th']
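
The cells below fit and score the lasso on the full sample rather than on an explicit hold-out set. A minimal sketch of such a split, with illustrative variable names and an illustrative random_state, would be:

from sklearn.model_selection import train_test_split

# hypothetical hold-out split; pred_train/pred_test/tar_train/tar_test are illustrative names
pred_train, pred_test, tar_train, tar_test = train_test_split(
    predvar, targets, test_size=.3, random_state=123)
print(pred_train.shape, pred_test.shape)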

In [3]:
# standardize predictors to have mean=0 and sd=1
predictors = predvar.copy()
predictors['meanSugarPerson']=preprocessing.scale(predictors['meanSugarPerson'].astype('float64'))
predictors['meanFoodPerson']=preprocessing.scale(predictors['meanFoodPerson'].astype('float64'))
predictors['meanCholesterol']=preprocessing.scale(predictors['meanCholesterol'].astype('float64'))
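
An equivalent way to standardize all three predictors in one step is sklearn's StandardScaler; this is a sketch, and predictors_alt is an illustrative name (the cells below continue to use predictors):

from sklearn.preprocessing import StandardScaler

# standardize every column at once; wrap the result back into a DataFrame to keep column names
scaler = StandardScaler()
predictors_alt = pandas.DataFrame(scaler.fit_transform(predvar),
                                  columns=predvar.columns, index=predvar.index)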

In [4]:
# specify the lasso regression model
model=LassoLarsCV(cv=10, precompute=False).fit(predictors,targets)

# print variable names and regression coefficients
dict(zip(predictors.columns, model.coef_))


Out[4]:
{'meanCholesterol': 14.454522951763355,
 'meanFoodPerson': 5.6180418717114913,
 'meanSugarPerson': 1.1077912357101849}
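
It can also help to report the penalty selected by cross-validation alongside the coefficients; alpha_ and intercept_ are standard LassoLarsCV attributes:

# penalty value chosen by 10-fold cross-validation and the fitted intercept
print('alpha chosen by CV:', model.alpha_)
print('intercept:', model.intercept_)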

In [5]:
# plot coefficient progression
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca()
plt.plot(m_log_alphas, model.coef_path_.T)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')


Out[5]:
<matplotlib.text.Text at 0xbfdd935208>

In [6]:
# plot mean square error for each fold
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
plt.plot(m_log_alphascv, model.mse_path_, ':')
plt.plot(m_log_alphascv, model.mse_path_.mean(axis=-1), 'k',
         label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
            label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')


Out[6]:
<matplotlib.text.Text at 0xbfdde19160>

In [7]:
# MSE on the full data used to fit the model
from sklearn.metrics import mean_squared_error
data_error = mean_squared_error(targets, model.predict(predictors))
print('MSE')
print(data_error)


MSE
176.079857367

In [8]:
# R-square on the full data used to fit the model
rsquared = model.score(predictors, targets)
print('R-square')
print(rsquared)


R-square
0.700067190794
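
Because both the MSE and the R-square above are computed on the same observations used to fit the model, they are likely optimistic. A sketch of scoring on the hypothetical hold-out split from earlier (all names illustrative), with standardization learned on the training portion only to avoid leakage, might look like:

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# scale using statistics from the training portion only
scaler = StandardScaler().fit(pred_train)
model_cv = LassoLarsCV(cv=10, precompute=False).fit(scaler.transform(pred_train), tar_train)

# evaluate on the held-out portion
test_pred = model_cv.predict(scaler.transform(pred_test))
print('test MSE:', mean_squared_error(tar_test, test_pred))
print('test R-square:', model_cv.score(scaler.transform(pred_test), tar_test))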
