In [1]:
%matplotlib inline
import pickle
import pandas as pd
import numpy as np
import copy
import matplotlib.pyplot as plt
from patsy import dmatrices
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc, classification_report
In [2]:
with open('un_explore.pkl', 'rb') as picklefile:
    undata = pickle.load(picklefile)
In [3]:
for col in sorted(undata.columns):
    print(undata[col].count(), col)
In [4]:
contr_use = ['condom_use_at_last_highrisk_sex_1524_years_old_men_percentage',
'condom_use_at_last_highrisk_sex_1524_years_old_women_percentage',
'condom_use_population_ages_1524_female__of_females_ages_1524',
'condom_use_population_ages_1524_male__of_males_ages_1524',
'condom_use_to_overall_contraceptive_use_among_currently_married_women_1549_years_old_percentage',
'condom_use_with_non_regular_partner__adults1549_female',
'condom_use_with_non_regular_partner__adults1549_male',
'contraceptive_prevalence__of_women_ages_1549']
In [5]:
for col in contr_use:
    print(undata[col].count(), col)
In [6]:
def model_lr_data(dep, ind, data):
    # Build a patsy formula from the feature names, fit OLS, and print the summary.
    formula = dep + '~' + '+'.join(ind)
    y, X = dmatrices(formula, data=data, return_type='dataframe')
    print(X.shape)
    print('\n****PREDICTING\n' + dep)
    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary())
In [7]:
hiv = 'hiv_incidence_rate_1549_years_old_percentage_midpoint'
pop = 'population_total'
oda = 'net_oda_mill'
gni = 'gni_per_capita_atlas_method_current_us'
prev = 'prevalence_of_hiv_total__of_population_ages_1549'
contr = 'condom_use_at_last_highrisk_sex_1524_years_old_women_percentage'
pvty = 'population_below_national_poverty_line_total_percentage'
abr = 'adolescent_birth_rate_per_1000_women'
In [8]:
features1 = [
oda,
pop,
gni,
prev,
contr,
pvty,
abr,
'year',
'mdgregions',
'islldc',
'isldc2014',
'isdeveloped'
]
In [9]:
model_lr_data(hiv, features1, undata)
The first model has a high R-squared but uses only 15 observations, since dmatrices drops every row with a missing value in any of the selected features.
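To see where the observations go, the complete-case count can be checked directly (a sketch; dropping rows with any missing value in the response or features mirrors patsy's listwise deletion):
In [ ]:
# Rows with no missing values in the response or any selected feature.
undata[[hiv] + features1].dropna().shape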
In [10]:
features2 = [
oda,
gni,
prev,
abr,
'year',
'mdgregions',
'isdeveloped'
]
In [11]:
model_lr_data(hiv, features2, undata)
The contraceptive data are too sparse to support accurate predictions. Prevalence of HIV and GNI appear significant, adolescent birth rate is close to significant, and ODA is not significant.
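The trade-off is visible in the complete-case count for the slimmed-down feature set (a sketch, parallel to the check above):
In [ ]:
# Dropping the sparse contraceptive column should leave many more complete rows.
undata[[hiv] + features2].dropna().shape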
In [12]:
ssafrica = undata[undata['mdgregions'] == 'Sub-Saharan Africa']
ssafrica.shape
Out[12]:
In [13]:
features3 = [
oda,
gni,
prev,
'year'
]
In [14]:
model_lr_data(hiv, features3, ssafrica)
ODA received and GNI do not appear to be significant. Prevalence of HIV and year are the most significant.
In [15]:
keys = ['countryname', 'iso3code', 'year', 'mdgregions', 'islldc', 'isldc2014', 'ismdgcountry']
In [16]:
df1 = undata[keys + [hiv]]
df1 = df1.set_index(keys)
In [17]:
df1 = df1.sort_index()
df2 = df1.pct_change()
df2 = df2.reset_index()
df2.columns = keys + ['hiv_pctchange']
df2.tail()
Out[17]:
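Note that pct_change runs down the entire sorted frame, so each country's first year is differenced against the previous country's final year. A grouped variant (a sketch; df2_grouped is a hypothetical name) would keep changes within each country:
In [ ]:
# Restrict percent change to within each country.
df2_grouped = df1.groupby(level='countryname').pct_change()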
In [18]:
undata.shape
Out[18]:
In [19]:
undata2 = pd.merge(undata, df2, how='outer', on=keys)
undata2.shape
Out[19]:
In [20]:
ssafrica2 = undata2[undata2['mdgregions'] == 'Sub-Saharan Africa']
In [21]:
features4 = [
oda,
gni,
prev,
abr,
contr,
'year',
'isdeveloped'
]
In [22]:
model_lr_data('hiv_pctchange', features4, undata2)
R-squared is very low at 0.17, no features appear to be significant, and the model is trained on very few observations.
In [23]:
undata3 = copy.deepcopy(undata2[np.isfinite(undata2['hiv_pctchange'])])
undata3.shape
Out[23]:
Create a new variable, isdecrease, flagging observations where HIV incidence fell from the previous year.
In [24]:
isdecrease = []
for change in undata3['hiv_pctchange']:
    if change >= 0:
        isdecrease.append(0)
    else:
        isdecrease.append(1)
len(isdecrease)
Out[24]:
In [25]:
undata3['isdecrease'] = isdecrease
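An equivalent vectorized construction (a sketch with the same semantics as the loop above):
In [ ]:
# Flag decreases without an explicit loop.
undata3['isdecrease'] = (undata3['hiv_pctchange'] < 0).astype(int)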
In [26]:
rs = 6
models = {'logistic': LogisticRegression(),
          'naive bayes': GaussianNB(),
          'SVM': SVC(random_state=rs, probability=True),
          'gradient boosting': GradientBoostingClassifier(n_estimators=250, random_state=rs),
          'decision tree': DecisionTreeClassifier(random_state=rs),
          'random forest': RandomForestClassifier(n_estimators=250, random_state=rs),
          'extra trees': ExtraTreesClassifier(n_estimators=250, random_state=rs)
          }
In [27]:
def process_data(dep, ind, data):
    # Build the design matrices, drop the patsy intercept column,
    # and return a train/test split.
    formula = dep + '~' + '+'.join(ind)
    y, X = dmatrices(formula, data=data, return_type='dataframe')
    X = X.iloc[:, 1:]
    y = y.iloc[:, 0]
    X_tr, X_ts, y_tr, y_ts = train_test_split(X, y, test_size=0.25, random_state=rs)
    return X_tr, X_ts, y_tr, y_ts
In [28]:
def get_scores(model_dict):
    # Fit each model on the global train split and report held-out test metrics;
    # predictions and probabilities are stashed in the global all_preds/all_proba.
    for mname, m in model_dict.items():
        print("*** %s" % mname)
        m.fit(X_tr, y_tr)
        preds = m.predict(X_ts)
        proba = m.predict_proba(X_ts)
        print('accuracy: %f' % accuracy_score(y_ts, preds))
        print('precision: %f' % precision_score(y_ts, preds))
        print('recall: %f' % recall_score(y_ts, preds))
        print('f1 score: %f' % f1_score(y_ts, preds))
        print('\n')
        all_preds[mname] = preds
        all_proba[mname] = proba
In [29]:
def get_crossval_scores(X, y, model_dict):
    # Report mean cross-validation scores for each model and metric.
    print('CROSS VALIDATION SCORES')
    for mname, m in model_dict.items():
        print('\n*** %s' % mname)
        acc = np.mean(cross_val_score(m, X, y, scoring='accuracy'))
        pre = np.mean(cross_val_score(m, X, y, scoring='precision'))
        rec = np.mean(cross_val_score(m, X, y, scoring='recall'))
        f1 = np.mean(cross_val_score(m, X, y, scoring='f1'))
        print('cv score: %f' % np.mean(cross_val_score(m, X, y)))
        print('accuracy: %f' % acc)
        print('precision: %f' % pre)
        print('recall: %f' % rec)
        print('f1 score: %f' % f1)
In [30]:
ssafrica3 = undata3[undata3['mdgregions'] == 'Sub-Saharan Africa']
In [31]:
features5 = [
oda,
gni,
prev,
'isdeveloped'
]
In [32]:
X_tr, X_ts, y_tr, y_ts = process_data('isdecrease', features5, ssafrica3)
X_tr.shape, X_ts.shape
Out[32]:
In [33]:
all_preds = {}
all_proba = {}
get_scores(models)
In [34]:
get_crossval_scores(X_tr, y_tr, models)
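For a per-class breakdown, the imported cross_val_predict and classification_report can be combined (a sketch using the gradient boosting model as an example):
In [ ]:
# Cross-validated predictions feed a per-class precision/recall/F1 report.
cv_preds = cross_val_predict(models['gradient boosting'], X_tr, y_tr)
print(classification_report(y_tr, cv_preds))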
In [35]:
trees = ExtraTreesClassifier(random_state=rs)
trees.fit(X_tr, y_tr)
importances = trees.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
# Importances are indexed by the columns of X_tr, not by features5,
# so rank every design-matrix column by name.
for f in range(X_tr.shape[1]):
    print("%d. feature: %s (%f)" % (f + 1, X_tr.columns[indices[f]], importances[indices[f]]))
In [36]:
forest = RandomForestClassifier(random_state=rs)
forest.fit(X_tr, y_tr)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(X_tr.shape[1]):
    print("%d. feature: %s (%f)" % (f + 1, X_tr.columns[indices[f]], importances[indices[f]]))
In [37]:
gradient = GradientBoostingClassifier(random_state=rs)
gradient.fit(X_tr, y_tr)
importances = gradient.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(X_tr.shape[1]):
    print("%d. feature: %s (%f)" % (f + 1, X_tr.columns[indices[f]], importances[indices[f]]))
The Extra Trees, Random Forest, and Gradient Boosting classifiers all perform reasonably well. The Gradient Boosting model is the only one that ranks ODA as the most important feature, which warrants further investigation.
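The imported roc_curve and auc can make that comparison visual, using the probabilities stored in all_proba (a sketch against the held-out test set):
In [ ]:
# Plot one ROC curve per model from the stored test-set probabilities.
for mname, proba in all_proba.items():
    fpr, tpr, _ = roc_curve(y_ts, proba[:, 1])
    plt.plot(fpr, tpr, label='%s (AUC %.2f)' % (mname, auc(fpr, tpr)))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.show()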