In [55]:
%matplotlib inline
import pandas as pd
import statistics
import numpy as np
import seaborn as sb
import matplotlib.pylab as plt
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.cluster import KMeans
import scipy
import warnings
warnings.filterwarnings('ignore')
In [4]:
# bug fix for display formats to avoid run time errors
pd.set_option('display.float_format', lambda x:'%.2f'%x)
#load the data
data = pd.read_csv('../QOG_dataset.csv')
# convert to numeric format
data["epi_ehair"] = pd.to_numeric(data["epi_ehair"], errors='coerce')
data["epi_ehwater"] = pd.to_numeric(data["epi_ehwater"], errors='coerce')
data["imf_gdpppps"] = pd.to_numeric(data["imf_gdpppps"], errors='coerce')
data["wdi_hepcusd11"] = pd.to_numeric(data["wdi_hepcusd11"], errors='coerce')
data["wdi_smokfem"] = pd.to_numeric(data["wdi_smokfem"], errors='coerce')
data["wdi_smokmal"] = pd.to_numeric(data["wdi_smokmal"], errors='coerce')
data["wdi_tubcdr"] = pd.to_numeric(data["wdi_tubcdr"], errors='coerce')
data["wdi_tubinc"] = pd.to_numeric(data["wdi_tubinc"], errors='coerce')
data["wdi_tubtsr"] = pd.to_numeric(data["wdi_tubtsr"], errors='coerce')
data["wef_bit"] = pd.to_numeric(data["wef_bit"], errors='coerce')
# listwise deletion of missing values
sub1 = data[['epi_ehair', 'epi_ehwater', 'imf_gdpppps', 'wdi_hepcusd11', 'wdi_smokfem',
'wdi_smokmal', 'wdi_tubcdr', 'wdi_tubinc', 'wdi_tubtsr', 'wef_bit']].dropna()
print(len(sub1))
In [5]:
# Univariate histogram
sb.distplot(sub1["epi_ehair"], #kde=False,
kde_kws={"color":"g","lw":3,"label":"KDE Estim","alpha":0.5},
hist_kws={"alpha":0.4,"label":"Freq"})
plt.xlabel('Air Quality.')
plt.ylabel('Countries.')
#plt.title('')
print('Describing the variable:\n')
print(sub1["epi_ehair"].describe())
In [6]:
sb.distplot(sub1["epi_ehwater"], #kde=False,
kde_kws={"color":"g","lw":3,"label":"KDE Estim","alpha":0.5},
hist_kws={"alpha":0.4,"label":"Freq"})
plt.xlabel('Water and Sanitation.')
plt.ylabel('Countries.')
#plt.title('')
print('Describing the variable:\n')
print(sub1["epi_ehwater"].describe())
In [7]:
sb.distplot(sub1["imf_gdpppps"], #kde=False,
kde_kws={"color":"g","lw":3,"label":"KDE Estim","alpha":0.5},
hist_kws={"alpha":0.4,"label":"Freq"})
plt.xlabel('GDP (PPP) (share of world total) (%).')
plt.ylabel('Countries.')
#plt.title('')
print('Describing the variable:\n')
print(sub1["imf_gdpppps"].describe())
In [8]:
sb.distplot(sub1["wdi_hepcusd11"], #kde=False,
kde_kws={"color":"g","lw":3,"label":"KDE Estim","alpha":0.5},
hist_kws={"alpha":0.4,"label":"Freq"})
plt.xlabel('Health expenditure per capita, PPP.')
plt.ylabel('Countries.')
#plt.title('')
print('Describing the variable:\n')
print(sub1["wdi_hepcusd11"].describe())
In [9]:
sb.distplot(sub1["wdi_smokfem"], #kde=False,
kde_kws={"color":"g","lw":3,"label":"KDE Estim","alpha":0.5},
hist_kws={"alpha":0.4,"label":"Freq"})
plt.xlabel('Smoking prevalence, females (% of adults).')
plt.ylabel('Countries.')
#plt.title('')
print('Describing the variable:\n')
print(sub1["wdi_smokfem"].describe())
In [10]:
sb.distplot(sub1["wdi_smokmal"], #kde=False,
kde_kws={"color":"g","lw":3,"label":"KDE Estim","alpha":0.5},
hist_kws={"alpha":0.4,"label":"Freq"})
plt.xlabel('Smoking prevalence, males (% of adults).')
plt.ylabel('Countries.')
#plt.title('')
print('Describing the variable:\n')
print(sub1["wdi_smokmal"].describe())
In [11]:
sb.distplot(sub1["wdi_tubcdr"], #kde=False,
kde_kws={"color":"g","lw":3,"label":"KDE Estim","alpha":0.5},
hist_kws={"alpha":0.4,"label":"Freq"})
plt.xlabel('Tuberculosis case detection rate (%, all forms).')
plt.ylabel('Countries.')
#plt.title('')
print('Describing the variable:\n')
print(sub1["wdi_tubcdr"].describe())
In [12]:
sb.distplot(sub1["wdi_tubinc"], #kde=False,
kde_kws={"color":"g","lw":3,"label":"KDE Estim","alpha":0.5},
hist_kws={"alpha":0.4,"label":"Freq"})
plt.xlabel('Incidence of tuberculosis (per 100,000 people).')
plt.ylabel('Countries.')
#plt.title('')
print('Describing the variable:\n')
print(sub1["wdi_tubinc"].describe())
In [13]:
sb.distplot(sub1["wdi_tubtsr"], #kde=False,
kde_kws={"color":"g","lw":3,"label":"KDE Estim","alpha":0.5},
hist_kws={"alpha":0.4,"label":"Freq"})
plt.xlabel('Tuberculosis treatment success rate (% of new cases).')
plt.ylabel('Countries.')
#plt.title('')
print('Describing the variable:\n')
print(sub1["wdi_tubtsr"].describe())
In [14]:
sb.distplot(sub1["wef_bit"], #kde=False,
kde_kws={"color":"g","lw":3,"label":"KDE Estim","alpha":0.5},
hist_kws={"alpha":0.4,"label":"Freq"})
plt.xlabel('Business impact of tuberculosis.')
plt.ylabel('Countries.')
#plt.title('')
print('Describing the variable:\n')
print(sub1["wef_bit"].describe())
In [15]:
g = sb.pairplot(sub1, kind='reg', diag_kind="kde", diag_kws=dict(shade=True))
In [49]:
def changeLabel(replacements, g, number):
for j in range(number):
xlabel = g.axes[0][j].get_xlabel()
ylabel = g.axes[0][j].get_ylabel()
if xlabel in replacements.keys():
g.axes[0][j].set_xlabel(replacements[xlabel])
if ylabel in replacements.keys():
g.axes[0][j].set_ylabel(replacements[ylabel])
In [53]:
replacements = {'epi_ehair' : 'Air Quality',
'epi_ehwater': 'Water and Sanitation access',
'imf_gdpppps': 'GDP PPP share of world total',
'wdi_hepcusd11': 'Health expenditure per capita',
'wdi_smokfem' : 'Smoking prevalence females',
'wdi_smokmal' : 'Smoking prevalence males',
'wdi_tubcdr' : 'TB case detection rate',
'wdi_tubinc' : 'Incidence of TB',
'wdi_tubtsr' : 'TB treatment success rate',
'wef_bit' : 'Business impact of TB'}
g = sb.pairplot(sub1, kind='reg', diag_kind="kde", diag_kws=dict(shade=True), y_vars=["wdi_tubtsr"],
x_vars=['epi_ehair', 'epi_ehwater', 'imf_gdpppps'], size=3.5)
changeLabel(replacements, g, 3)
g = sb.pairplot(sub1, kind='reg', diag_kind="kde", diag_kws=dict(shade=True), y_vars=["wdi_tubtsr"],
x_vars=['wdi_hepcusd11', 'wdi_smokfem', 'wdi_smokmal'], size=3.5)
changeLabel(replacements, g, 3)
g = sb.pairplot(sub1, kind='reg', diag_kind="kde", diag_kws=dict(shade=True), y_vars=["wdi_tubtsr"],
x_vars=['wdi_tubcdr', 'wdi_tubinc', 'wef_bit'], size=3.5)
changeLabel(replacements, g, 3)
In [58]:
print(scipy.stats.pearsonr(sub1['epi_ehair'], sub1['wdi_tubtsr']))
print(scipy.stats.pearsonr(sub1['epi_ehwater'], sub1['wdi_tubtsr']))
print(scipy.stats.pearsonr(sub1['imf_gdpppps'], sub1['wdi_tubtsr']))
print(scipy.stats.pearsonr(sub1['wdi_hepcusd11'], sub1['wdi_tubtsr']))
print(scipy.stats.pearsonr(sub1['wdi_smokfem'], sub1['wdi_tubtsr']))
print(scipy.stats.pearsonr(sub1['wdi_smokmal'], sub1['wdi_tubtsr']))
print(scipy.stats.pearsonr(sub1['wdi_tubcdr'], sub1['wdi_tubtsr']))
print(scipy.stats.pearsonr(sub1['wdi_tubinc'], sub1['wdi_tubtsr']))
print(scipy.stats.pearsonr(sub1['wef_bit'], sub1['wdi_tubtsr']))
In [115]:
from sklearn.linear_model import LassoLarsCV
# standardize predictors to have mean=0 and sd=1
targets = sub1['wdi_tubtsr']
predictors = sub1[['epi_ehair', 'epi_ehwater', 'imf_gdpppps', 'wdi_hepcusd11', 'wdi_smokfem',
'wdi_smokmal', 'wdi_tubcdr', 'wdi_tubinc', 'wef_bit']].copy()
for i in predictors:
predictors[i] = preprocessing.scale(predictors[i].astype('float64'))
# split data into train and test sets - Train = 70%, Test = 30%
#pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, targets,
# test_size=.3, random_state=123)
# This will not be done because the dataset have low samples
# specify the lasso regression model
model=LassoLarsCV(cv=10, precompute=False, fit_intercept=True).fit(predictors, targets)
print('Intercept: \f', model.intercept_)
# print variable names and regression coefficients
dict(zip(predictors.columns, model.coef_))
Out[115]:
In [118]:
len(model.cv_mse_path_)
Out[118]:
In [132]:
# plot coefficient progression
m_log_alphas = -np.log10(model.alphas_)
#ax = plt.gca()
plt.figure(figsize=(10,6))
plt.plot(m_log_alphas, model.coef_path_.T)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
label='alpha CV')
plt.ylabel('Regression Coefficients')
plt.xlabel('-log(alpha)')
plt.title('Regression Coefficients Progression for Lasso Paths')
Out[132]:
In [133]:
# plot mean square error for each fold
m_log_alphascv = -np.log10(model.cv_alphas_)
plt.figure(figsize=(10,6))
plt.plot(m_log_alphascv, model.cv_mse_path_, ':')
plt.plot(m_log_alphascv, model.cv_mse_path_.mean(axis=-1), 'k',
label='Average across the folds', linewidth=2)
plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
label='alpha CV')
plt.legend()
plt.xlabel('-log(alpha)')
plt.ylabel('Mean squared error')
plt.title('Mean squared error on each fold')
Out[133]:
In [85]:
# MSE from training and test data
from sklearn.metrics import mean_squared_error
analysis_error = mean_squared_error(targets, model.predict(predictors))
print ('Data MSE')
print(analysis_error)
In [83]:
# R-square from training and test data
rsquared=model.score(predictors,targets)
print ('Data R-square')
print(rsquared)
In [140]:
import statsmodels.formula.api as smf
reg1 = smf.ols('wdi_tubtsr ~ epi_ehair + '+
'epi_ehwater + '+
'imf_gdpppps + '+
'wdi_hepcusd11 + '+
'wdi_smokfem + '+
'wdi_smokmal + '+
'wdi_tubcdr + '+
'wdi_tubinc + '+
'wef_bit', data=sub1).fit()
print (reg1.summary())
In [ ]: