In [1]:
%matplotlib inline
import pandas
import statistics
import numpy as np
import matplotlib.pylab as plt
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LassoLarsCV
from sklearn import preprocessing
In [2]:
# bug fix for display formats to avoid run time errors
pandas.set_option('display.float_format', lambda x:'%.2f'%x)
#load the data
data = pandas.read_csv('..\separatedData.csv')
# convert to numeric format
data["breastCancer100th"] = pandas.to_numeric(data["breastCancer100th"], errors='coerce')
data["meanSugarPerson"] = pandas.to_numeric(data["meanSugarPerson"], errors='coerce')
data["meanFoodPerson"] = pandas.to_numeric(data["meanFoodPerson"], errors='coerce')
data["meanCholesterol"] = pandas.to_numeric(data["meanCholesterol"], errors='coerce')
# listwise deletion of missing values
sub1 = data[['breastCancer100th', 'meanFoodPerson', 'meanCholesterol', 'meanSugarPerson']].dropna()
#Split into training and testing sets
predvar = sub1[[ 'meanSugarPerson', 'meanFoodPerson', 'meanCholesterol']]
targets = sub1['breastCancer100th']
In [3]:
# standardize predictors to have mean=0 and sd=1
predictors = predvar.copy()
predictors['meanSugarPerson']=preprocessing.scale(predictors['meanSugarPerson'].astype('float64'))
predictors['meanFoodPerson']=preprocessing.scale(predictors['meanFoodPerson'].astype('float64'))
predictors['meanCholesterol']=preprocessing.scale(predictors['meanCholesterol'].astype('float64'))
In [4]:
# specify the lasso regression model
model=LassoLarsCV(cv=10, precompute=False).fit(predictors,targets)
# print variable names and regression coefficients
dict(zip(predictors.columns, model.coef_))
Out[4]:
In [5]:
# plot coefficient progression
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca()
plt.plot(m_log_alphas, model.coef_path_.T)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
label='alpha CV')
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')
Out[5]:
In [6]:
# plot mean square error for each fold
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k',
label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
Out[6]:
In [7]:
# MSE from training and test data
from sklearn.metrics import mean_squared_error
data_error = mean_squared_error(targets, model.predict(predictors))
print ('MSE')
print(data_error)
In [8]:
# R-square from training and test data
rsquared=model.score(predictors,targets)
print ('R-square')
print(rsquared)
In [ ]: