In [46]:
# Import libraries needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LassoLarsCV
from sklearn import preprocessing
%matplotlib inline
In [21]:
# Make results reproducible
np.random.seed(12345)
In [28]:
#Load the data
df = pd.read_csv('gapminder.csv')
In [59]:
#select the independent variables or explanatory variables
variables = ['incomeperperson', 'alcconsumption', 'co2emissions', 'femaleemployrate',
'internetuserate', 'lifeexpectancy','polityscore','employrate',
'urbanrate','breastcancerper100th']
In [30]:
# convert to numeric format
for variable in variables:
df[variable] = pd.to_numeric(df[variable], errors='coerce')
In [31]:
# listwise deletion of missing values
subset = df[variables].dropna()
In [32]:
# Print the rows and columns of the data frame
print('Size of study data')
print(subset.shape)
print("\n")
In [37]:
# Data Management
# Remove the last variable from the list since the target is derived from it
variables.pop(9)
Out[37]:
In [38]:
# Center and scale data
for variable in variables:
subset[variable]=preprocessing.scale(subset[variable].astype('float64'))
In [39]:
# Identify contries with a high breast cancer case using the MAD (mean absolute deviation) method
subset['absolute_deviations'] = np.absolute(subset['breastcancerper100th'] - np.median(subset['breastcancerper100th']))
MAD = np.mean(subset['absolute_deviations'])
In [40]:
# This function converts the breast cancer per 100th absolute deviations to a high breast cancer flag
def high_breastcancer_flag(absolute_deviations):
threshold = 3
if (absolute_deviations/MAD) > threshold:
return 1
else:
return 0
subset['High BreastCancer'] = subset['absolute_deviations'].apply(high_breastcancer_flag)
In [41]:
predictors = subset[variables]
targets = subset['High BreastCancer']
In [42]:
#Split into training and testing sets
training_data, test_data, training_target, test_target = train_test_split(predictors, targets, test_size=.3)
In [43]:
model=LassoLarsCV(cv=10, precompute=False).fit(training_data, training_target)
In [53]:
# Show the regression coeff of the predictors
dict(zip(predictors.columns,model.coef_))
Out[53]:
Note: In the above output, the variables with coefecient value 0.0 are insignificant in the model and thus have been removed from the final model. So the results show that 9 variables 5 were selected by the final model. Also, as we scaled the data, we can use the size of the regression coeffecients to determine which predictors are the strongest predictors of breastcancerper100th. Here, incomeperperson has the highest regression coeff followed by urbanrate, femaleemployrate, co2emission and polityscore. These five predictor variables are positively associated to breast cancer.
In [49]:
feature_name = list(predictors.columns.values)
feature_coefficient = list(model.coef_)
features = pd.DataFrame({'Variable':feature_name, 'Regression Coefficients':feature_coefficient}).sort_values(by='Regression Coefficients', ascending=False)
print(features.head(len(feature_name)))
In [56]:
m_log_alphas = -np.log10(model.alphas_)
ax = plt.gca() # set the plot axis
plt.plot(m_log_alphas, model.coef_path_.T) # applying the -log10 transformation to the alpha values simply to make the values easier to read.
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV') # A black vertical line will be plotted at the negative log10 transformed alpha value for the selected model
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')
Out[56]:
In [52]:
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure()
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k', label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k', label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
Out[52]:
In [57]:
from sklearn.metrics import mean_squared_error
train_error = mean_squared_error(training_target, model.predict(training_data))
test_error = mean_squared_error(test_target, model.predict(test_data))
print ('training data MSE')
print(train_error)
print ('test data MSE')
print(test_error)
The selected model has predicted better in the test data as compared to the train data. Also the test Mean Square Error (MSE) is close to the training MSE this suggests that prediction accuracy scores is pretty stable across the two datasets.
In [58]:
rsquared_train=model.score(training_data, training_target)
rsquared_test=model.score(test_data, test_target)
print ('training data R-square')
print(rsquared_train)
print ('test data R-square')
print(rsquared_test)
The R-squared values of 0.41 and 0.44 indicate that the selected model explained 41 to 44 percent variance for breast cancer in women for training and test dataset respectively.
In [ ]: