In [1]:
    
%matplotlib inline
import pickle
import pandas as pd
import numpy as np
import copy
import matplotlib.pyplot as plt
from patsy import dmatrices
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc, classification_report
    
In [2]:
    
with open('un_explore.pkl', 'rb') as picklefile:
    undata = pickle.load(picklefile)
    
In [3]:
    
for col in sorted(undata.columns):
    print(undata[col].count(), col)
    
    
In [4]:
    
contr_use = ['condom_use_at_last_highrisk_sex_1524_years_old_men_percentage',
    'condom_use_at_last_highrisk_sex_1524_years_old_women_percentage',
    'condom_use_population_ages_1524_female__of_females_ages_1524',
    'condom_use_population_ages_1524_male__of_males_ages_1524',
    'condom_use_to_overall_contraceptive_use_among_currently_married_women_1549_years_old_percentage',
    'condom_use_with_non_regular_partner__adults1549_female',
    'condom_use_with_non_regular_partner__adults1549_male',
    'contraceptive_prevalence__of_women_ages_1549']
    
In [5]:
    
for col in contr_use:
    print(undata[col].count(), col)
    
    
In [6]:
    
def model_lr_data(dep, ind, data):
    # Build a patsy formula from the feature list, fit OLS, and print
    # the summary; dmatrices drops rows with missing values in any term.
    formula = dep + '~' + '+'.join(ind)
    y, X = dmatrices(formula, data=data, return_type='dataframe')
    print(X.shape)
    print('\n****PREDICTING\n' + dep)
    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary())
    
In [7]:
    
hiv = 'hiv_incidence_rate_1549_years_old_percentage_midpoint'
pop = 'population_total'
oda = 'net_oda_mill'
gni = 'gni_per_capita_atlas_method_current_us'
prev = 'prevalence_of_hiv_total__of_population_ages_1549'
contr = 'condom_use_at_last_highrisk_sex_1524_years_old_women_percentage'
pvty = 'population_below_national_poverty_line_total_percentage'
abr = 'adolescent_birth_rate_per_1000_women'
    
In [8]:
    
features1 = [
            oda,
            pop,
            gni,
            prev,
            contr,
            pvty,
            abr,
            'year', 
            'mdgregions',
            'islldc',
            'isldc2014',
            'isdeveloped'
           ]
    
In [9]:
    
model_lr_data(hiv, features1, undata)
    
    
    
The first model has a high R-squared but uses only 15 observations: dmatrices drops every row with a missing value in any selected feature, and few rows are complete across all of them.
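A quick way to verify how much data survives this listwise deletion is to count the rows that are complete across the target and every selected feature; a minimal sketch:
In [ ]:

# Rows that dmatrices would actually keep for the first model:
# complete cases across the target and all of features1.
undata[[hiv] + features1].dropna().shape[0]
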
In [10]:
    
features2 = [
            oda,
            gni,
            prev,
            abr,
            'year', 
            'mdgregions',
            'isdeveloped'
           ]
    
In [11]:
    
model_lr_data(hiv, features2, undata)
    
    
Contraceptive-use data is too sparse to support accurate predictions. Prevalence of HIV and GNI appear significant, adolescent birth rate is close to significant, and ODA is not significant.
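The significance calls above can be checked against the individual coefficient p-values; a minimal sketch that refits the second model and sorts them:
In [ ]:

# Coefficient p-values for the features2 model; values below 0.05
# are conventionally read as significant.
y2, X2 = dmatrices(hiv + '~' + '+'.join(features2), data=undata, return_type='dataframe')
sm.OLS(y2, X2).fit().pvalues.sort_values()
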
In [12]:
    
ssafrica = undata[undata['mdgregions'] == 'Sub-Saharan Africa']
ssafrica.shape
    
    Out[12]:
In [13]:
    
features3 = [
            oda,
            gni,
            prev,
            'year'
           ]
    
In [14]:
    
model_lr_data(hiv, features3, ssafrica)
    
    
ODA received and GNI do not appear to be significant. Prevalence of HIV and year are the most significant.
In [15]:
    
keys = ['countryname', 'iso3code', 'year', 'mdgregions', 'islldc', 'isldc2014', 'ismdgcountry']
    
In [16]:
    
df1 = undata[keys + [hiv]]
df1 = df1.set_index(keys)
    
In [17]:
    
df1 = df1.sort_index()
df2 = df1.pct_change()
df2 = df2.reset_index()
df2.columns = keys + ['hiv_pctchange']
df2.tail()
    
    Out[17]:
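One caveat: pct_change runs down the sorted frame as a whole, so the first year of each country is differenced against the last year of the preceding country. A per-country alternative, sketched here assuming a pandas version that supports GroupBy.pct_change:
In [ ]:

# Sketch: percent change computed within each country, so the first
# year of every country becomes NaN instead of a cross-country artifact.
df1.groupby(level='countryname').pct_change().tail()
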
In [18]:
    
undata.shape
    
    Out[18]:
In [19]:
    
undata2 = pd.merge(undata, df2, how='outer', on=keys)
undata2.shape
    
    Out[19]:
In [20]:
    
ssafrica2 = undata2[undata2['mdgregions'] == 'Sub-Saharan Africa']
    
In [21]:
    
features4 = [
            oda,
            gni,
            prev,
            abr,
            contr,
            'year', 
            'isdeveloped'
           ]
    
In [22]:
    
model_lr_data('hiv_pctchange', features4, undata2)
    
    
R-squared is very low at 0.17, no features appear to be significant, and the model is trained on very few observations.
In [23]:
    
undata3 = copy.deepcopy(undata2[np.isfinite(undata2['hiv_pctchange'])])
undata3.shape
    
    Out[23]:
Create a new binary variable, isdecrease, flagging country-years where HIV incidence fell from the previous year.
In [24]:
    
# 1 if HIV incidence fell from the previous year, 0 otherwise
isdecrease = []
for change in undata3['hiv_pctchange']:
    if change < 0:
        isdecrease.append(1)
    else:
        isdecrease.append(0)

len(isdecrease)
    
    Out[24]:
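The same flag can be built in one vectorized step, which is the more idiomatic pandas form:
In [ ]:

# Vectorized equivalent of the loop above.
(undata3['hiv_pctchange'] < 0).astype(int).value_counts()
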
In [25]:
    
undata3['isdecrease'] = isdecrease
    
In [26]:
    
rs = 6
models = {'logistic': LogisticRegression(),
          'naive bayes': GaussianNB(),
          'SVM': SVC(random_state=rs, probability=True),
          'gradient boosting': GradientBoostingClassifier(n_estimators=250, random_state=rs),
          'decision tree': DecisionTreeClassifier(random_state=rs),
          'random forest': RandomForestClassifier(n_estimators=250, random_state=rs),
          'extra trees': ExtraTreesClassifier(n_estimators=250, random_state=rs)
         }
    
In [27]:
    
def process_data(dep, ind, data):
    # Build the design matrix, drop patsy's intercept column (the
    # classifiers fit their own), and hold out 25% as a test set.
    formula = dep + '~' + '+'.join(ind)
    y, X = dmatrices(formula, data=data, return_type='dataframe')
    X = X.iloc[:, 1:]
    y = y.iloc[:, 0]
    X_tr, X_ts, y_tr, y_ts = train_test_split(X, y, test_size=0.25, random_state=rs)
    return X_tr, X_ts, y_tr, y_ts
    
In [28]:
    
def get_scores(model_dict):
    # Fit each model on the training split and report test-set metrics.
    # Relies on the global X_tr/X_ts/y_tr/y_ts and all_preds/all_proba.
    for mname, m in model_dict.items():
        print("*** %s" % mname)
        m.fit(X_tr, y_tr)
        preds = m.predict(X_ts)
        proba = m.predict_proba(X_ts)
        print('accuracy: %f' % accuracy_score(y_ts, preds))
        print('precision: %f' % precision_score(y_ts, preds))
        print('recall: %f' % recall_score(y_ts, preds))
        print('f1 score: %f' % f1_score(y_ts, preds))
        print('\n')
        all_preds[mname] = preds
        all_proba[mname] = proba
    
In [29]:
    
def get_crossval_scores(X, y, model_dict):
    # Report mean cross-validation scores for each model and metric.
    print('CROSS VALIDATION SCORES')
    for mname, m in model_dict.items():
        print('\n*** %s' % mname)
        acc = np.mean(cross_val_score(m, X, y, scoring='accuracy'))
        pre = np.mean(cross_val_score(m, X, y, scoring='precision'))
        rec = np.mean(cross_val_score(m, X, y, scoring='recall'))
        f1 = np.mean(cross_val_score(m, X, y, scoring='f1'))
        print('cv score: %f' % np.mean(cross_val_score(m, X, y)))
        print('accuracy: %f' % acc)
        print('precision: %f' % pre)
        print('recall: %f' % rec)
        print('f1 score: %f' % f1)
    
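Each cross_val_score call above refits the model from scratch, so every model is cross-validated five separate times. A single cross_validate call (scikit-learn 0.19+) computes all four metrics from one set of fits; a minimal sketch, with get_crossval_scores_fast as a hypothetical replacement name:
In [ ]:

from sklearn.model_selection import cross_validate

def get_crossval_scores_fast(X, y, model_dict):
    # One cross_validate call per model scores all four metrics
    # without refitting for each metric.
    for mname, m in model_dict.items():
        scores = cross_validate(m, X, y, scoring=['accuracy', 'precision', 'recall', 'f1'])
        print(mname, {k: np.mean(v) for k, v in scores.items() if k.startswith('test_')})
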
In [30]:
    
ssafrica3 = undata3[undata3['mdgregions'] == 'Sub-Saharan Africa']
    
In [31]:
    
features5 = [
            oda,
            gni,
            prev,
            'isdeveloped'
           ]
    
In [32]:
    
X_tr, X_ts, y_tr, y_ts = process_data('isdecrease', features5, ssafrica3)
X_tr.shape, X_ts.shape
    
    Out[32]:
In [33]:
    
all_preds = {}
all_proba = {}
get_scores(models)
    
    
In [34]:
    
get_crossval_scores(X_tr, y_tr, models)
    
    
In [35]:
    
trees = ExtraTreesClassifier(random_state=rs)
trees.fit(X_tr, y_tr)
importances = trees.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
# Rank by the design-matrix columns, which can differ in order (and
# count) from features5 once patsy expands the terms.
for f in range(X_tr.shape[1]):
    print("%d. feature: %s (%f)" % (f + 1, X_tr.columns[indices[f]], importances[indices[f]]))
    
    
In [36]:
    
forest = RandomForestClassifier(random_state=rs)
forest.fit(X_tr, y_tr)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(X_tr.shape[1]):
    print("%d. feature: %s (%f)" % (f + 1, X_tr.columns[indices[f]], importances[indices[f]]))
    
    
In [37]:
    
gradient = GradientBoostingClassifier(random_state=rs)
gradient.fit(X_tr, y_tr)
importances = gradient.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(X_tr.shape[1]):
    print("%d. feature: %s (%f)" % (f + 1, X_tr.columns[indices[f]], importances[indices[f]]))
    
    
The Extra Trees, Random Forest, and Gradient Boosting classifiers perform reasonably well. The Gradient Boosting model is the only one that ranks ODA as the most important feature, which warrants further investigation.
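Since roc_curve and auc are already imported, the probabilities saved in all_proba allow one more comparison across models; a minimal sketch using the held-out test set:
In [ ]:

# Test-set ROC AUC per model, from the positive-class probabilities
# stored by get_scores.
for mname, proba in all_proba.items():
    fpr, tpr, _ = roc_curve(y_ts, proba[:, 1])
    print('%s AUC: %.3f' % (mname, auc(fpr, tpr)))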
In [ ]: