In [1]:
%matplotlib inline
import pickle
import pandas as pd
import numpy as np
import copy
import matplotlib.pyplot as plt
from patsy import dmatrices
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_curve, auc, classification_report
In [2]:
with open('un_explore.pkl', 'rb') as picklefile:
    undata = pickle.load(picklefile)
In [3]:
for col in sorted(undata.columns):
    print(undata[col].count(), col)
In [4]:
contr_use = ['condom_use_at_last_highrisk_sex_1524_years_old_men_percentage',
'condom_use_at_last_highrisk_sex_1524_years_old_women_percentage',
'condom_use_population_ages_1524_female__of_females_ages_1524',
'condom_use_population_ages_1524_male__of_males_ages_1524',
'condom_use_to_overall_contraceptive_use_among_currently_married_women_1549_years_old_percentage',
'condom_use_with_non_regular_partner__adults1549_female',
'condom_use_with_non_regular_partner__adults1549_male',
'contraceptive_prevalence__of_women_ages_1549']
In [5]:
for col in contr_use:
    print(undata[col].count(), col)
In [6]:
def model_lr_data(dep, ind, data):
    # Build a patsy formula from the feature names, fit OLS, and print the summary.
    formula = dep + '~' + '+'.join(ind)
    y, X = dmatrices(formula, data=data, return_type='dataframe')
    print(X.shape)
    print('\n****PREDICTING\n' + dep)
    model = sm.OLS(y, X)
    results = model.fit()
    print(results.summary())
In [7]:
hiv = 'hiv_incidence_rate_1549_years_old_percentage_midpoint'
pop = 'population_total'
oda = 'net_oda_mill'
gni = 'gni_per_capita_atlas_method_current_us'
prev = 'prevalence_of_hiv_total__of_population_ages_1549'
contr = 'condom_use_at_last_highrisk_sex_1524_years_old_women_percentage'
pvty = 'population_below_national_poverty_line_total_percentage'
abr = 'adolescent_birth_rate_per_1000_women'
In [8]:
features1 = [
oda,
pop,
gni,
prev,
contr,
pvty,
abr,
'year',
'mdgregions',
'islldc',
'isldc2014',
'isdeveloped'
]
In [9]:
model_lr_data(hiv, features1, undata)
The first model has a high R-squared but uses only 15 observations, since dmatrices drops every row with a missing value in any of the selected features.
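To see where the observations go, the complete-case count can be checked directly (a sketch; dropping rows with any missing value in the response or features mirrors patsy's listwise deletion):
In [ ]:
# Rows with no missing values in the response or any selected feature.
undata[[hiv] + features1].dropna().shape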
In [10]:
features2 = [
oda,
gni,
prev,
abr,
'year',
'mdgregions',
'isdeveloped'
]
In [11]:
model_lr_data(hiv, features2, undata)
The contraceptive data are too sparse to support accurate predictions. Prevalence of HIV and GNI appear significant, adolescent birth rate is close to significant, and ODA is not significant.
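The trade-off is visible in the complete-case count for the slimmed-down feature set (a sketch, parallel to the check above):
In [ ]:
# Dropping the sparse contraceptive column should leave many more complete rows.
undata[[hiv] + features2].dropna().shape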
In [12]:
ssafrica = undata[undata['mdgregions'] == 'Sub-Saharan Africa']
ssafrica.shape
Out[12]:
In [13]:
features3 = [
oda,
gni,
prev,
'year'
]
In [14]:
model_lr_data(hiv, features3, ssafrica)
ODA received and GNI do not appear to be significant. Prevalence of HIV and year are the most significant.
In [15]:
keys = ['countryname', 'iso3code', 'year', 'mdgregions', 'islldc', 'isldc2014', 'ismdgcountry']
In [16]:
df1 = undata[keys + [hiv]]
df1 = df1.set_index(keys)
In [17]:
df1 = df1.sort_index()
df2 = df1.pct_change()
df2 = df2.reset_index()
df2.columns = keys + ['hiv_pctchange']
df2.tail()
Out[17]:
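Note that pct_change runs down the entire sorted frame, so each country's first year is differenced against the previous country's final year. A grouped variant (a sketch; df2_grouped is a hypothetical name) would keep changes within each country:
In [ ]:
# Restrict percent change to within each country.
df2_grouped = df1.groupby(level='countryname').pct_change()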
In [18]:
undata.shape
Out[18]:
In [19]:
undata2 = pd.merge(undata, df2, how='outer', on=keys)
undata2.shape
Out[19]:
In [20]:
ssafrica2 = undata2[undata2['mdgregions'] == 'Sub-Saharan Africa']
In [21]:
features4 = [
oda,
gni,
prev,
abr,
contr,
'year',
'isdeveloped'
]
In [22]:
model_lr_data('hiv_pctchange', features4, undata2)
R-squared is very low at 0.17, no features appear to be significant, and the model is trained on very few observations.
In [23]:
undata3 = copy.deepcopy(undata2[np.isfinite(undata2['hiv_pctchange'])])
undata3.shape
Out[23]:
Create a new variable, isdecrease, flagging observations where HIV incidence fell from the previous year.
In [24]:
isdecrease = []
for change in undata3['hiv_pctchange']:
    if change >= 0:
        isdecrease.append(0)
    else:
        isdecrease.append(1)
len(isdecrease)
Out[24]:
In [25]:
undata3['isdecrease'] = isdecrease
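An equivalent vectorized construction (a sketch with the same semantics as the loop above):
In [ ]:
# Flag decreases without an explicit loop.
undata3['isdecrease'] = (undata3['hiv_pctchange'] < 0).astype(int)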
In [26]:
rs = 6
models = {'logistic': LogisticRegression(),
          'naive bayes': GaussianNB(),
          'SVM': SVC(random_state=rs, probability=True),
          'gradient boosting': GradientBoostingClassifier(n_estimators=250, random_state=rs),
          'decision tree': DecisionTreeClassifier(random_state=rs),
          'random forest': RandomForestClassifier(n_estimators=250, random_state=rs),
          'extra trees': ExtraTreesClassifier(n_estimators=250, random_state=rs)
          }
In [27]:
def process_data(dep, ind, data):
    # Build the design matrices, drop the patsy intercept column,
    # and return a train/test split.
    formula = dep + '~' + '+'.join(ind)
    y, X = dmatrices(formula, data=data, return_type='dataframe')
    X = X.iloc[:, 1:]
    y = y.iloc[:, 0]
    X_tr, X_ts, y_tr, y_ts = train_test_split(X, y, test_size=0.25, random_state=rs)
    return X_tr, X_ts, y_tr, y_ts
In [28]:
def get_scores(model_dict):
    # Fit each model on the global train split and report held-out test metrics;
    # predictions and probabilities are stashed in the global all_preds/all_proba.
    for mname, m in model_dict.items():
        print("*** %s" % mname)
        m.fit(X_tr, y_tr)
        preds = m.predict(X_ts)
        proba = m.predict_proba(X_ts)
        print('accuracy: %f' % accuracy_score(y_ts, preds))
        print('precision: %f' % precision_score(y_ts, preds))
        print('recall: %f' % recall_score(y_ts, preds))
        print('f1 score: %f' % f1_score(y_ts, preds))
        print('\n')
        all_preds[mname] = preds
        all_proba[mname] = proba
In [29]:
def get_crossval_scores(X, y, model_dict):
    # Report mean cross-validation scores for each model and metric.
    print('CROSS VALIDATION SCORES')
    for mname, m in model_dict.items():
        print('\n*** %s' % mname)
        acc = np.mean(cross_val_score(m, X, y, scoring='accuracy'))
        pre = np.mean(cross_val_score(m, X, y, scoring='precision'))
        rec = np.mean(cross_val_score(m, X, y, scoring='recall'))
        f1 = np.mean(cross_val_score(m, X, y, scoring='f1'))
        print('cv score: %f' % np.mean(cross_val_score(m, X, y)))
        print('accuracy: %f' % acc)
        print('precision: %f' % pre)
        print('recall: %f' % rec)
        print('f1 score: %f' % f1)
In [30]:
ssafrica3 = undata3[undata3['mdgregions'] == 'Sub-Saharan Africa']
In [31]:
features5 = [
oda,
gni,
prev,
'isdeveloped'
]
In [32]:
X_tr, X_ts, y_tr, y_ts = process_data('isdecrease', features5, ssafrica3)
X_tr.shape, X_ts.shape
Out[32]:
In [33]:
all_preds = {}
all_proba = {}
get_scores(models)
In [34]:
get_crossval_scores(X_tr, y_tr, models)
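For a per-class breakdown, the imported cross_val_predict and classification_report can be combined (a sketch using the gradient boosting model as an example):
In [ ]:
# Cross-validated predictions feed a per-class precision/recall/F1 report.
cv_preds = cross_val_predict(models['gradient boosting'], X_tr, y_tr)
print(classification_report(y_tr, cv_preds))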
In [35]:
trees = ExtraTreesClassifier(random_state=rs)
trees.fit(X_tr, y_tr)
importances = trees.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
# Importances are indexed by the columns of X_tr, not by features5,
# so rank every design-matrix column by name.
for f in range(X_tr.shape[1]):
    print("%d. feature: %s (%f)" % (f + 1, X_tr.columns[indices[f]], importances[indices[f]]))
In [36]:
forest = RandomForestClassifier(random_state=rs)
forest.fit(X_tr, y_tr)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(X_tr.shape[1]):
    print("%d. feature: %s (%f)" % (f + 1, X_tr.columns[indices[f]], importances[indices[f]]))
In [37]:
gradient = GradientBoostingClassifier(random_state=rs)
gradient.fit(X_tr, y_tr)
importances = gradient.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")
for f in range(X_tr.shape[1]):
    print("%d. feature: %s (%f)" % (f + 1, X_tr.columns[indices[f]], importances[indices[f]]))
The Extra Trees, Random Forest, and Gradient Boosting classifiers all perform reasonably well. The Gradient Boosting model is the only one that ranks ODA as the most important feature, which warrants further investigation.
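The imported roc_curve and auc can make that comparison visual, using the probabilities stored in all_proba (a sketch against the held-out test set):
In [ ]:
# Plot one ROC curve per model from the stored test-set probabilities.
for mname, proba in all_proba.items():
    fpr, tpr, _ = roc_curve(y_ts, proba[:, 1])
    plt.plot(fpr, tpr, label='%s (AUC %.2f)' % (mname, auc(fpr, tpr)))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.show()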