In [1]:

    
from IPython.display import display, display_markdown
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import scipy.stats as stats
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score

Reading data



In [2]:

    
training_data = pd.read_csv('training_set_values.csv', index_col=0)
training_label = pd.read_csv('training_set_labels.csv', index_col=0)

test_data = pd.read_csv('test_set_values.csv', index_col=0)

# Merge test data and training data to apply same data management operations on them
data = training_data.append(test_data).sort_index()
data.head()









    Out[2]:






  
    
      
      amount_tsh
      date_recorded
      funder
      gps_height
      installer
      longitude
      latitude
      wpt_name
      num_private
      basin
      ...
      payment_type
      water_quality
      quality_group
      quantity
      quantity_group
      source
      source_type
      source_class
      waterpoint_type
      waterpoint_type_group
    
    
      id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      0
      0.0
      2012-11-13
      Tasaf
      0
      TASAF
      33.125828
      -5.118154
      Mratibu
      0
      Lake Tanganyika
      ...
      unknown
      milky
      milky
      enough
      enough
      shallow well
      shallow well
      groundwater
      hand pump
      hand pump
    
    
      1
      0.0
      2011-03-05
      Shipo
      1978
      SHIPO
      34.770717
      -9.395642
      none
      0
      Rufiji
      ...
      never pay
      soft
      good
      enough
      enough
      shallow well
      shallow well
      groundwater
      hand pump
      hand pump
    
    
      2
      0.0
      2011-03-27
      Lvia
      0
      LVIA
      36.115056
      -6.279268
      Bombani
      0
      Wami / Ruvu
      ...
      per bucket
      soft
      good
      insufficient
      insufficient
      machine dbh
      borehole
      groundwater
      communal standpipe multiple
      communal standpipe
    
    
      3
      10.0
      2013-06-03
      Germany Republi
      1639
      CES
      37.147432
      -3.187555
      Area 7 Namba 5
      0
      Pangani
      ...
      per bucket
      soft
      good
      enough
      enough
      spring
      spring
      groundwater
      communal standpipe
      communal standpipe
    
    
      4
      0.0
      2011-03-22
      Cmsr
      0
      CMSR
      36.164893
      -6.099289
      Ezeleda
      0
      Wami / Ruvu
      ...
      unknown
      soft
      good
      dry
      dry
      shallow well
      shallow well
      groundwater
      hand pump
      hand pump
    
  

5 rows × 39 columns

Data management



In [3]:

    
# As lots of waterpoints are missing a value for amount_tsh. For that field the missing
# data will be replaced by the mean data to drop less data for the model fit
imp = preprocessing.Imputer(missing_values=0, strategy='mean')
imp.fit(data['amount_tsh'].values.reshape(-1, 1))
data['water_amount'] = imp.transform(data['amount_tsh'].values.reshape(-1, 1)).ravel()

imp = preprocessing.Imputer(missing_values=0, strategy='median')
imp.fit(data['construction_year'].values.reshape(-1, 1))
data['construction_year'] = imp.transform(data['construction_year'].values.reshape(-1, 1)).ravel()

imp = preprocessing.Imputer(missing_values=0, strategy='mean')
imp.fit(data['gps_height'].values.reshape(-1, 1))
data['height'] = imp.transform(data['gps_height'].values.reshape(-1, 1)).ravel()

# Recode missing data as NaN
for field in ('longitude', 'latitude'):
    data[field] = data[field].map(lambda x: x if x else pd.np.nan)

def group_installer(data):
    def gather_installer(x):
        installer_map = {
            'organisation' : ('bank', 'msf', 'wwf', 'unicef', 'unisef', 'oxfam', 'oxfarm', 'club', 'care', 'without', 'faim', 'rain', 'red', 'angels', 'fundat', 'foundation'),
            'church' : ('church', 'churc', 'rcchurch', 'roman', 'missionsry', 'lutheran', 'islamic', 'islam', 'vision'),
            'private' : ('consulting', 'engineer', 'private', 'ltd', 'co.ltd', 'contractor', 'enterp', 'enterpr', 'company', 'contract'),
            'community' : ('village', 'community', 'communit', 'district', 'council', 'commu', 'villigers', 'villagers'),
            'government' : ('government', 'gov', 'govt', 'gover', 'gove', 'governme', 'ministry'),
            'other' : ('0', 'nan', 'known', 'other', 'unknown'), # Group 'unknown' data with 'other' as finally this means the same for interpretation
            'danida' : ('danida', 'danid'),
            'foreign government' : ('netherlands', 'germany', 'european')
        }

        for substr in x.split():
            for subsubstr in substr.split('/'):
                for key in installer_map:
                    if subsubstr in installer_map[key]:
                        return key

        return x
    
    lower_data = data.map(lambda x: str(x).lower())
    tmp_data = lower_data.map(gather_installer)
    top10 = list(tmp_data.value_counts()[:10].index)
    
    return tmp_data.map(lambda x: x if x in top10 else 'other')

data['installer'] = group_installer(data.installer)
data['funder'] = group_installer(data.funder)

clean_data = (data.iloc[training_data.index]
                  .join(training_label['status_group'])
                  .dropna())
# Create two columns one collapsing 'functional' and 'functional needs repair'
# and the other one collapsing 'non functional' and 'functional needs repair'
clean_data['functional'] = clean_data['status_group'].map({'functional' : 1, 
                                                           'functional needs repair' : 1,
                                                           'non functional' : 0})
clean_data['no_repairs'] = clean_data['status_group'].map({'functional' : 1, 
                                                           'functional needs repair' : 0,
                                                           'non functional' : 0})



In [6]:

    
# Extract predictors and convert categorical variables in dichotomic variables
predictors_name = ['water_amount', 'height', 'longitude', 'latitude',
# predictors_name = ['amount_tsh', 'gps_height', 'longitude', 'latitude',
                   'basin', 'region', 'population', 'public_meeting', 'management_group',
                   'permit', 'construction_year', 'extraction_type_class', 'payment_type',
                   'quality_group', 'quantity_group', 'source_type', 'waterpoint_type_group',
                   'installer', 'funder']

categorical_predictors = ('basin', 'region', 'management_group', 'extraction_type_class', 
                          'payment_type', 'quality_group', 'quantity_group', 
                          'source_type', 'waterpoint_type_group', 'installer', 'funder')

process_data = pd.DataFrame()
for name in predictors_name:
    if name in categorical_predictors:
        classes = data[name].unique()
        deployed_categories = preprocessing.label_binarize(data[name], classes=classes)
        # Avoid class name collision
        classe_names = list()
        for c in classes:
            if c in process_data.columns or c == 'other':
                classe_names.append('_'.join((c, name)))
            else:
                classe_names.append(c)
        
        tmp_df = pd.DataFrame(deployed_categories, 
                              columns=classe_names, 
                              index=data.index)
        process_data = process_data.join(tmp_df)
    else:
        process_data[name] = data[name]

predictors_columns = process_data.columns
        
deployed_data = (process_data.iloc[training_data.index]
                             .join(training_label['status_group'])
                             .dropna())
# Create two columns one collapsing 'functional' and 'functional needs repair'
# and the other one collapsing 'non functional' and 'functional needs repair'
deployed_data['functional'] = deployed_data['status_group'].map({'functional' : 1, 
                                                                 'functional needs repair' : 1,
                                                                 'non functional' : 0})
deployed_data['no_repairs'] = deployed_data['status_group'].map({'functional' : 1, 
                                                                 'functional needs repair' : 0,
                                                                 'non functional' : 0})

predictors = deployed_data[predictors_columns]



In [7]:

    
# fit an Extra Trees model to the data and look at the first 15 important fields
model = ExtraTreesClassifier()
model.fit(predictors, deployed_data['status_group'])
# display the relative importance of each attribute
cm = sns.light_palette("yellow", as_cmap=True)
(pd.Series(model.feature_importances_, index=predictors.columns, name='importance')
   .sort_values(ascending=False)
   .to_frame()
   .iloc[:15])









    Out[7]:






  
    
      
      importance
    
  
  
    
      longitude
      0.143399
    
    
      latitude
      0.138313
    
    
      dry
      0.078339
    
    
      gps_height
      0.071213
    
    
      population
      0.047767
    
    
      construction_year
      0.043016
    
    
      other_waterpoint_type_group
      0.034288
    
    
      other_extraction_type_class
      0.030801
    
    
      enough
      0.025929
    
    
      never pay
      0.016663
    
    
      amount_tsh
      0.013986
    
    
      handpump
      0.013617
    
    
      insufficient
      0.013088
    
    
      permit
      0.012641
    
    
      government_funder
      0.010637



In [8]:

    
data.shape









    Out[8]:





(74250, 41)



In [9]:

    
# Let's test the random forest test varying the number of important categories from 10 to 60
importance = (pd.Series(model.feature_importances_, index=predictors.columns, name='importance')
   .sort_values(ascending=False))

pd.np.random.seed(12345)
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors, 
                                                              deployed_data['status_group'], 
                                                              test_size=.4)



In [21]:

    
features=range(10, 61)
accuracy=pd.np.zeros(len(features))

for counter, idx in enumerate(features):
    classifier=RandomForestClassifier(n_estimators=25)
    classifier=classifier.fit(pred_train[importance.index[:idx]],tar_train)
    predictions=classifier.predict(pred_test[importance.index[:idx]])
    accuracy[counter]=accuracy_score(tar_test, predictions)

plt.plot(features, accuracy)
plt.xlabel("Number of features")
plt.ylabel("Accuracy score")
plt.show();



In [10]:

    
# After optimizing for the number of categories (taking 50), let's optimize the number of tree
trees=range(1, 31)
accuracy=pd.np.zeros(len(trees))

for idx in trees:
    classifier=RandomForestClassifier(n_estimators=idx)
    classifier=classifier.fit(pred_train[importance.index[:50]],tar_train)
    predictions=classifier.predict(pred_test[importance.index[:50]])
    accuracy[idx-1]=accuracy_score(tar_test, predictions)

plt.plot(trees, accuracy)
plt.xlabel("Number of trees")
plt.ylabel("Accuracy score")
plt.show();



In [11]:

    
# Let save this optimization approach
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=28)
model = model.fit(predictors[importance.index[:50]], deployed_data['status_group'])



In [12]:

    
clean_test_data = process_data.iloc[test_data.index].dropna()
predictions = model.predict(process_data.iloc[test_data.index][importance.index[:50]].dropna())



In [13]:

    
pred = pd.Series(predictions, index=clean_test_data.index, name='status_group')

missing_index = list()
for i in test_data.index:
    if i not in clean_test_data.index:
        missing_index.append(i)

data_list = list()
pd.np.random.seed(12345)
for rnd in pd.np.random.rand(len(missing_index)):
    if rnd < 0.072677:
        data_list.append('functional needs repair')
    elif rnd < 0.384242 + 0.072677:
        data_list.append('non functional')
    else:
        data_list.append('functional')
fill = pd.Series(data_list, index=missing_index)

pred = pred.append(fill)
to_file = pred[test_data.index]
to_file.to_csv('optimizeRndForest.csv', index_label='id', header=('status_group',))

With the previous model I got a score of 0.7593 on DrivenData. Can do better...

Let's focus on the most important categories



In [26]:

    
# Extract predictors and convert categorical variables in dichotomic variables
# predictors_name = ['water_amount', 'height', 'longitude', 'latitude',
#                    'basin', 'region', 'population', 'public_meeting', 'management_group',
#                    'permit', 'construction_year', 'extraction_type_class', 'payment_type',
#                    'quality_group', 'quantity_group', 'source_type', 'waterpoint_type_group',
#                    'installer', 'funder']

# categorical_predictors = ('basin', 'region', 'management_group', 'extraction_type_class', 
#                           'payment_type', 'quality_group', 'quantity_group', 
#                           'source_type', 'waterpoint_type_group', 'installer', 'funder')

predictors_name = ['height', 'longitude', 'latitude', 'population',
                   'permit', 'construction_year', 'extraction_type_class', 'payment_type',
                   'quantity_group', 'waterpoint_type_group']

categorical_predictors = ('extraction_type_class', 'payment_type', 'quantity_group', 
                          'waterpoint_type_group')

process_data = pd.DataFrame()
for name in predictors_name:
    if name in categorical_predictors:
        classes = data[name].unique()
        deployed_categories = preprocessing.label_binarize(data[name], classes=classes)
        # Avoid class name collision
        classe_names = list()
        for c in classes:
            if c in process_data.columns:
                classe_names.append('_'.join((c, name)))
            else:
                classe_names.append(c)
        
        tmp_df = pd.DataFrame(deployed_categories, 
                              columns=classe_names, 
                              index=data.index)
        process_data = process_data.join(tmp_df)
    else:
        process_data[name] = data[name]

predictors_columns = process_data.columns
        
deployed_data = (process_data.iloc[training_data.index]
                             .join(training_label['status_group'])
                             .dropna())
# Create two columns one collapsing 'functional' and 'functional needs repair'
# and the other one collapsing 'non functional' and 'functional needs repair'
deployed_data['functional'] = deployed_data['status_group'].map({'functional' : 1, 
                                                                 'functional needs repair' : 1,
                                                                 'non functional' : 0})
deployed_data['no_repairs'] = deployed_data['status_group'].map({'functional' : 1, 
                                                                 'functional needs repair' : 0,
                                                                 'non functional' : 0})

predictors = deployed_data[predictors_columns]

Explanatory variables selection

As there are a lot of potential explanatory variable, so let's focus of those having a real influence by using a Random Forest test.



In [24]:

    
# fit an Extra Trees model to the data and look at the first 20 important fields
model = ExtraTreesClassifier()
model.fit(predictors, deployed_data['status_group'])
# display the relative importance of each attribute
cm = sns.light_palette("yellow", as_cmap=True)
(pd.Series(model.feature_importances_, index=predictors.columns, name='importance')
   .sort_values(ascending=False)
   .to_frame()
   .iloc[:20]
   .style.background_gradient(cmap=cm))









    Out[24]:





        

        
        

        
            
            
                
                
                
                importance
                
            
            
        
        
            
            
                
                
                    longitude
                
                
                    0.215398
                
            
            
            
                
                
                    latitude
                
                
                    0.213067
                
            
            
            
                
                
                    height
                
                
                    0.103801
                
            
            
            
                
                
                    construction_year
                
                
                    0.0806856
                
            
            
            
                
                
                    dry
                
                
                    0.0793701
                
            
            
            
                
                
                    population
                
                
                    0.0702775
                
            
            
            
                
                
                    other
                
                
                    0.0392548
                
            
            
            
                
                
                    other_waterpoint_type_group
                
                
                    0.0384225
                
            
            
            
                
                
                    enough
                
                
                    0.030773
                
            
            
            
                
                
                    permit
                
                
                    0.0155521
                
            
            
            
                
                
                    never pay
                
                
                    0.0153311
                
            
            
            
                
                
                    insufficient
                
                
                    0.0136371
                
            
            
            
                
                
                    gravity
                
                
                    0.010479
                
            
            
            
                
                
                    unknown
                
                
                    0.00782398
                
            
            
            
                
                
                    communal standpipe
                
                
                    0.00722429
                
            
            
            
                
                
                    per bucket
                
                
                    0.0071673
                
            
            
            
                
                
                    submersible
                
                
                    0.00711073
                
            
            
            
                
                
                    seasonal
                
                
                    0.00693082
                
            
            
            
                
                
                    motorpump
                
                
                    0.00529115
                
            
            
            
                
                
                    annually
                
                
                    0.00502709

Univariate distribution

In this section the frequencies tables for all categorical variables and the distribution for all quantitative variables will be presented. The missing data will be dropped.

Categorical variables distribution



In [25]:

    
# Visualize status only for the training set
field = 'status_group'
display_markdown("### {} distribution".format(' '.join(field.split('_')).capitalize()), 
                 raw=True)
tmp_df = training_label[field].value_counts(dropna=False)
tmp_df.index.name = tmp_df.name
tmp_df.name = 'unnormalized'
tmp_df2 = training_label[field].value_counts(dropna=False, normalize=True)
tmp_df2.name = 'normalized'
tmp_df = tmp_df.to_frame().join(tmp_df2)
display(tmp_df.T)
fig = plt.figure()
ax = fig.add_subplot(111)
ax = sns.countplot(field, data=training_label, ax=ax)
lbls = ax.get_xticklabels()
if len(lbls) > 7:
    ax.set_xticklabels(lbls, rotation=90)
plt.show()









    




Status group distribution






    






  
    
      status_group
      functional
      non functional
      functional needs repair
    
  
  
    
      unnormalized
      32259.000000
      22824.000000
      4317.000000
    
    
      normalized
      0.543081
      0.384242
      0.072677



In [26]:

    
pd.set_option('display.max_columns', 25)
list_fields = list(categorical_predictors)
list_fields.extend(('public_meeting', 'permit'))
for field in list_fields:
    field_name = ' '.join(field.split('_'))
    display_markdown("### {} distribution".format(field_name.capitalize()), 
                     raw=True)
    tmp_df = data[field].value_counts(dropna=False)
    tmp_df.index.name = tmp_df.name
    tmp_df.name = 'unnormalized'
    tmp_df2 = data[field].value_counts(dropna=False, normalize=True)
    tmp_df2.name = 'normalized'
    tmp_df = tmp_df.to_frame().join(tmp_df2)
#     display(data[field].value_counts(dropna=False).to_frame().T)
    display(tmp_df.T)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax = sns.countplot(field, data=data, ax=ax)
    lbls = ax.get_xticklabels()
    if len(lbls) > 7:
        ax.set_xticklabels(lbls, rotation=90)
    ax.set_xlabel(field_name)
    plt.show()









    




Extraction type class distribution






    






  
    
      extraction_type_class
      gravity
      handpump
      other
      submersible
      motorpump
      rope pump
      wind-powered
    
  
  
    
      unnormalized
      33263.000000
      20612.000000
      8102.000000
      7772.000000
      3777.000000
      572.000000
      152.000000
    
    
      normalized
      0.447987
      0.277603
      0.109118
      0.104673
      0.050869
      0.007704
      0.002047
    
  








    












    




Payment type distribution






    






  
    
      payment_type
      never pay
      per bucket
      monthly
      unknown
      on failure
      annually
      other
    
  
  
    
      unnormalized
      31712.000000
      11266.000000
      10397.000000
      10149.000000
      4842.000000
      4570.000000
      1314.000000
    
    
      normalized
      0.427098
      0.151731
      0.140027
      0.136687
      0.065212
      0.061549
      0.017697
    
  








    












    




Quantity group distribution






    






  
    
      quantity_group
      enough
      insufficient
      dry
      seasonal
      unknown
    
  
  
    
      unnormalized
      41522.000000
      18896.000000
      7782.000000
      5075.00000
      975.000000
    
    
      normalized
      0.559219
      0.254492
      0.104808
      0.06835
      0.013131
    
  








    












    




Waterpoint type group distribution






    






  
    
      waterpoint_type_group
      communal standpipe
      hand pump
      other
      improved spring
      cattle trough
      dam
    
  
  
    
      unnormalized
      43239.000000
      21884.000000
      8010.000000
      959.000000
      150.00000
      8.000000
    
    
      normalized
      0.582343
      0.294734
      0.107879
      0.012916
      0.00202
      0.000108
    
  








    












    




Public meeting distribution






    






  
    
      public_meeting
      True
      False
      nan
    
  
  
    
      unnormalized
      63749.000000
      6346.000000
      4155.00000
    
    
      normalized
      0.858572
      0.085468
      0.05596
    
  








    












    




Permit distribution






    






  
    
      permit
      True
      False
      nan
    
  
  
    
      unnormalized
      48606.000000
      21851.00000
      3793.000000
    
    
      normalized
      0.654626
      0.29429
      0.051084

Quantitative variable distribution



In [27]:

    
# for field in ('amount_tsh', 'gps_height', 'latitude', 'longitude', 'construction_year', 'population'):
for field in ('gps_height', 'latitude', 'longitude', 'construction_year', 'population'):
    field_name = ' '.join(field.split('_'))
    display_markdown("### {} distribution".format(field_name.capitalize()), 
                     raw=True)
    clean_field = training_data[field].map(lambda x: x if abs(x)>1e-8 else pd.np.nan)
    clean_field = clean_field.dropna()
    display(clean_field.describe().to_frame().T)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax = sns.distplot(clean_field)
    ax.set_xlabel(field_name)
    plt.show()









    




Gps height distribution






    






  
    
      
      count
      mean
      std
      min
      25%
      50%
      75%
      max
    
  
  
    
      gps_height
      38962.0
      1018.860839
      612.566092
      -90.0
      393.0
      1167.0
      1498.0
      2770.0
    
  








    



C:\Anaconda3\lib\site-packages\statsmodels\nonparametric\kdetools.py:20: VisibleDeprecationWarning: using a non-integer number instead of an integer will result in an error in the future
  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j






    












    




Latitude distribution






    






  
    
      
      count
      mean
      std
      min
      25%
      50%
      75%
      max
    
  
  
    
      latitude
      59400.0
      -5.706033
      2.946019
      -11.64944
      -8.540621
      -5.021597
      -3.326156
      -2.000000e-08
    
  








    












    




Longitude distribution






    






  
    
      
      count
      mean
      std
      min
      25%
      50%
      75%
      max
    
  
  
    
      longitude
      57588.0
      35.149669
      2.607428
      29.607122
      33.2851
      35.005943
      37.233712
      40.345193
    
  








    












    




Construction year distribution






    






  
    
      
      count
      mean
      std
      min
      25%
      50%
      75%
      max
    
  
  
    
      construction_year
      38691.0
      1996.814686
      12.472045
      1960.0
      1987.0
      2000.0
      2008.0
      2013.0
    
  








    












    




Population distribution






    






  
    
      
      count
      mean
      std
      min
      25%
      50%
      75%
      max
    
  
  
    
      population
      38019.0
      281.087167
      564.68766
      1.0
      40.0
      150.0
      324.0
      30500.0

Bivariate distribution

If the response variable is categorical with more than two categories, it is adviced to collapse the categories into two categories. So for presenting the bivariate distribution, the functional and functional needs repair categories will be collapsed. Then the distribution for the data falling in that new category will depicted.

The distribution will be presented using bar chart. For quantitative explanatory variables, the data will be collapsed in two categories defined by the median.

Along with the visualization, hypothesis testing will be carried out. As the response variable is categorical, the method will be the chi-square test.



In [43]:

    
list_fields = list(categorical_predictors)
# list_fields.extend(('public_meeting', 'permit'))
list_fields.extend(('permit', ))
for field in list_fields:
    field_name = ' '.join(field.split('_'))
    display_markdown("### {}".format(field_name.capitalize()), 
                     raw=True)
    var_analysis = clean_data[['status_group', 'functional', 'no_repairs', field]]
        
    if len(var_analysis[field].unique()) > 2:
        # Chi-square test
        display_markdown("Contingency table of observed counts", raw=True)
        ct = pd.crosstab(var_analysis['status_group'], var_analysis[field])
        display(ct)
        ct_norm = ct/ct.sum(axis=0)
        display(ct_norm)
        
        
        ct = pd.crosstab(var_analysis['functional'], var_analysis[field])
        chisqr = stats.chi2_contingency(ct)
        
        if chisqr[1] > 0.05:
            display_markdown("There is *no* significant relationship between {} and the functional"
                             " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                             raw=True)
        else:
            display_markdown("There is a significant relationship between {} and the functional"
                             " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                             raw=True)
            
        ct = pd.crosstab(var_analysis['no_repairs'], var_analysis[field])
        chisqr = stats.chi2_contingency(ct)
        
        if chisqr[1] > 0.05:
            display_markdown("There is *no* significant relationship between {} and the no repairs"
                             " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                             raw=True)
        else:
            display_markdown("There is a significant relationship between {} and the no repairs"
                             " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                             raw=True)
        
        fig = plt.figure(figsize=[1.5*l for l in plt.rcParams['figure.figsize']])
        ax = fig.add_subplot(121)
        ax = sns.barplot(x=field, y='functional', data=var_analysis, ci=None, ax=ax)
        ax.set_xlabel(field_name)
        lbls = ax.get_xticklabels()
        if len(lbls) > 5:
            ax.set_xticklabels(lbls, rotation=90)
        ax = fig.add_subplot(122)
        ax = sns.barplot(x=field, 
                         y='no_repairs', 
                         data=var_analysis.where(var_analysis['functional'] == 1),
                         ci=None,
                         ax=ax,
                         order=[l.get_text() for l in lbls])
        ax.set_xlabel(field_name)
        lbls = ax.get_xticklabels()
        if len(lbls) > 5:
            ax.set_xticklabels(lbls, rotation=90)
        plt.show()
        
        display_markdown("Test again *functional* status", raw=True)
        categories = var_analysis[field].unique()
        list_field = list(categories)
        p_values = dict()
        for i in range(len(list_field)):
            for j in range(i+1, len(list_field)):
                cat1 = list_field[i]
                cat2 = list_field[j]
                explanatory = var_analysis[field].map(dict(((cat1, cat1),
                                                            (cat2, cat2))))
                comparison = pd.crosstab(var_analysis['functional'], explanatory)
#                 display(Markdown("Crosstable to compare {} and {}".format(cat1, cat2)))
#                 display(comparison)
#                 display(comparison/comparison.sum(axis=0))

                chi_square, p, _, expected_counts = stats.chi2_contingency(comparison)
                p_values[(cat1, cat2)] = p
        df = pd.DataFrame(p_values, index=['p-value', ])

        display(df.stack(level=[0, 1])['p-value']
                  .rename('p-value')
                  .to_frame()
                  .assign(Ha=lambda x: x['p-value'] < 0.05 / len(p_values)))
        
        display_markdown("Test again *no repairs* status", raw=True)
        categories = var_analysis[field].unique()
        list_field = list(categories)
        p_values = dict()
        for i in range(len(list_field)):
            for j in range(i+1, len(list_field)):
                cat1 = list_field[i]
                cat2 = list_field[j]
                explanatory = var_analysis[field].map(dict(((cat1, cat1),
                                                            (cat2, cat2))))
                comparison = pd.crosstab(var_analysis['no_repairs'], explanatory)
#                 display(Markdown("Crosstable to compare {} and {}".format(cat1, cat2)))
#                 display(comparison)
#                 display(comparison/comparison.sum(axis=0))

                chi_square, p, _, expected_counts = stats.chi2_contingency(comparison)
                p_values[(cat1, cat2)] = p
        df = pd.DataFrame(p_values, index=['p-value', ])

        display(df.stack(level=[0, 1])['p-value']
                  .rename('p-value')
                  .to_frame()
                  .assign(Ha=lambda x: x['p-value'] < 0.05 / len(p_values)))
        
    else:
        # Chi-square test
        display_markdown("Contingency table of observed counts", raw=True)
        ct = pd.crosstab(var_analysis['status_group'], var_analysis[field])
        display(ct)
        ct_norm = ct/ct.sum(axis=0)
        display(ct_norm)
        
        chisqr = stats.chi2_contingency(ct)
        
        if chisqr[1] > 0.05:
            display_markdown("There is *no* significant relationship between {} and the functional"
                             " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                             raw=True)
        else:
            display_markdown("There is a significant relationship between {} and the functional"
                             " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                             raw=True)
        
        fig = plt.figure(figsize=[1.5*l for l in plt.rcParams['figure.figsize']])
        ax = fig.add_subplot(121)
        ax = sns.barplot(x=field, y='functional', data=var_analysis, ci=None, ax=ax,
                         order=[True, False])
        ax.set_xlabel(field_name)
        lbls = ax.get_xticklabels()
        if len(lbls) > 5:
            ax.set_xticklabels(lbls, rotation=90)
        ax = fig.add_subplot(122)
        ax = sns.barplot(x=field, 
                         y='no_repairs', 
                         data=var_analysis.where(var_analysis['functional'] == 1),
                         ci=None,
                         ax=ax,
                         order=[True, False])
        ax.set_xlabel(field_name)
        lbls = ax.get_xticklabels()
        if len(lbls) > 5:
            ax.set_xticklabels(lbls, rotation=90)
        plt.show()









    




Extraction type class






    




Contingency table of observed counts







    






  
    
      extraction_type_class
      gravity
      handpump
      motorpump
      other
      rope pump
      submersible
      wind-powered
    
    
      status_group
      
      
      
      
      
      
      
    
  
  
    
      functional
      12094
      804
      752
      185
      7
      2061
      30
    
    
      functional needs repair
      1709
      21
      85
      12
      0
      174
      5
    
    
      non functional
      5786
      422
      1031
      637
      20
      1651
      35
    
  








    






  
    
      extraction_type_class
      gravity
      handpump
      motorpump
      other
      rope pump
      submersible
      wind-powered
    
    
      status_group
      
      
      
      
      
      
      
    
  
  
    
      functional
      0.617387
      0.644747
      0.402570
      0.221823
      0.259259
      0.530365
      0.428571
    
    
      functional needs repair
      0.087243
      0.016840
      0.045503
      0.014388
      0.000000
      0.044776
      0.071429
    
    
      non functional
      0.295370
      0.338412
      0.551927
      0.763789
      0.740741
      0.424858
      0.500000
    
  








    




There is a significant relationship between extraction type class and the functional status of the waterpoint (p-value = 3.5e-287).







    




There is a significant relationship between extraction type class and the no repairs status of the waterpoint (p-value = 6.8e-185).







    












    




Test again functional status







    






  
    
      
      
      p-value
      Ha
    
  
  
    
      gravity
      handpump
      1.420607e-03
      True
    
    
      motorpump
      2.262516e-114
      True
    
    
      other
      1.346001e-178
      True
    
    
      rope pump
      1.202484e-06
      True
    
    
      submersible
      1.910445e-56
      True
    
    
      wind-powered
      3.034969e-04
      True
    
    
      handpump
      motorpump
      1.873751e-31
      True
    
    
      other
      2.637283e-80
      True
    
    
      rope pump
      3.460225e-05
      True
    
    
      wind-powered
      8.423967e-03
      False
    
    
      motorpump
      other
      1.930282e-25
      True
    
    
      rope pump
      7.757985e-02
      False
    
    
      other
      rope pump
      9.622983e-01
      False
    
    
      submersible
      handpump
      7.446052e-08
      True
    
    
      motorpump
      1.895132e-19
      True
    
    
      other
      2.349516e-70
      True
    
    
      rope pump
      1.860833e-03
      True
    
    
      wind-powered
      2.550959e-01
      False
    
    
      wind-powered
      motorpump
      4.623263e-01
      False
    
    
      other
      2.465892e-06
      True
    
    
      rope pump
      5.535722e-02
      False
    
  








    




Test again no repairs status







    






  
    
      
      
      p-value
      Ha
    
  
  
    
      gravity
      handpump
      5.756493e-02
      False
    
    
      motorpump
      5.142137e-73
      True
    
    
      other
      3.419231e-115
      True
    
    
      rope pump
      2.865684e-04
      True
    
    
      submersible
      4.982951e-24
      True
    
    
      wind-powered
      1.806478e-03
      True
    
    
      handpump
      motorpump
      7.827683e-40
      True
    
    
      other
      1.441784e-79
      True
    
    
      rope pump
      8.932081e-05
      True
    
    
      wind-powered
      4.242535e-04
      True
    
    
      motorpump
      other
      1.135053e-19
      True
    
    
      rope pump
      1.898417e-01
      False
    
    
      other
      rope pump
      8.219331e-01
      False
    
    
      submersible
      handpump
      1.866009e-12
      True
    
    
      motorpump
      1.399579e-19
      True
    
    
      other
      1.123694e-58
      True
    
    
      rope pump
      8.823464e-03
      False
    
    
      wind-powered
      1.163752e-01
      False
    
    
      wind-powered
      motorpump
      7.555927e-01
      False
    
    
      other
      1.724771e-04
      True
    
    
      rope pump
      1.917519e-01
      False
    
  








    




Payment type






    




Contingency table of observed counts







    






  
    
      payment_type
      annually
      monthly
      never pay
      on failure
      other
      per bucket
      unknown
    
    
      status_group
      
      
      
      
      
      
      
    
  
  
    
      functional
      1811
      3871
      4247
      839
      132
      4188
      845
    
    
      functional needs repair
      143
      725
      647
      85
      4
      263
      139
    
    
      non functional
      303
      1276
      4983
      279
      70
      1672
      999
    
  








    






  
    
      payment_type
      annually
      monthly
      never pay
      on failure
      other
      per bucket
      unknown
    
    
      status_group
      
      
      
      
      
      
      
    
  
  
    
      functional
      0.802393
      0.659230
      0.429989
      0.697423
      0.640777
      0.683978
      0.426122
    
    
      functional needs repair
      0.063358
      0.123467
      0.065506
      0.070657
      0.019417
      0.042953
      0.070096
    
    
      non functional
      0.134249
      0.217302
      0.504505
      0.231920
      0.339806
      0.273069
      0.503782
    
  








    




There is a significant relationship between payment type and the functional status of the waterpoint (p-value = 0).







    




There is a significant relationship between payment type and the no repairs status of the waterpoint (p-value = 0).







    












    




Test again functional status







    






  
    
      
      
      p-value
      Ha
    
  
  
    
      annually
      monthly
      3.031849e-17
      True
    
    
      never pay
      1.884291e-224
      True
    
    
      on failure
      3.676184e-13
      True
    
    
      other
      7.435575e-15
      True
    
    
      per bucket
      4.310055e-40
      True
    
    
      unknown
      5.772155e-149
      True
    
    
      monthly
      never pay
      1.485108e-277
      True
    
    
      on failure
      2.813915e-01
      False
    
    
      other
      4.567542e-05
      True
    
    
      unknown
      2.589152e-130
      True
    
    
      never pay
      on failure
      3.172208e-71
      True
    
    
      other
      4.049234e-06
      True
    
    
      unknown
      9.727352e-01
      False
    
    
      on failure
      other
      1.250186e-03
      True
    
    
      unknown
      8.742551e-52
      True
    
    
      per bucket
      monthly
      1.547396e-12
      True
    
    
      never pay
      4.792608e-183
      True
    
    
      on failure
      3.544198e-03
      False
    
    
      other
      4.234480e-02
      False
    
    
      unknown
      3.052523e-80
      True
    
    
      unknown
      other
      1.043067e-05
      True
    
  








    




Test again no repairs status







    






  
    
      
      
      p-value
      Ha
    
  
  
    
      annually
      monthly
      2.898930e-36
      True
    
    
      never pay
      2.592424e-223
      True
    
    
      on failure
      5.118962e-12
      True
    
    
      other
      8.707763e-08
      True
    
    
      per bucket
      2.046482e-26
      True
    
    
      unknown
      1.496984e-140
      True
    
    
      monthly
      never pay
      2.505928e-170
      True
    
    
      on failure
      1.157905e-02
      False
    
    
      other
      6.353067e-01
      False
    
    
      unknown
      8.785655e-75
      True
    
    
      never pay
      on failure
      6.614117e-69
      True
    
    
      other
      2.372608e-09
      True
    
    
      unknown
      7.698215e-01
      False
    
    
      on failure
      other
      1.231580e-01
      False
    
    
      unknown
      8.679468e-50
      True
    
    
      per bucket
      monthly
      4.153706e-03
      False
    
    
      never pay
      1.602440e-214
      True
    
    
      on failure
      3.763071e-01
      False
    
    
      other
      2.171448e-01
      False
    
    
      unknown
      8.859334e-94
      True
    
    
      unknown
      other
      5.707291e-09
      True
    
  








    




Quantity group






    




Contingency table of observed counts







    






  
    
      quantity_group
      dry
      enough
      insufficient
      seasonal
      unknown
    
    
      status_group
      
      
      
      
      
    
  
  
    
      functional
      67
      11722
      3514
      599
      31
    
    
      functional needs repair
      5
      1289
      629
      83
      0
    
    
      non functional
      2701
      3850
      2729
      207
      95
    
  








    






  
    
      quantity_group
      dry
      enough
      insufficient
      seasonal
      unknown
    
    
      status_group
      
      
      
      
      
    
  
  
    
      functional
      0.024162
      0.695214
      0.511350
      0.673791
      0.246032
    
    
      functional needs repair
      0.001803
      0.076449
      0.091531
      0.093363
      0.000000
    
    
      non functional
      0.974035
      0.228338
      0.397119
      0.232846
      0.753968
    
  








    




There is a significant relationship between quantity group and the functional status of the waterpoint (p-value = 0).







    




There is a significant relationship between quantity group and the no repairs status of the waterpoint (p-value = 0).







    












    




Test again functional status







    






  
    
      
      
      p-value
      Ha
    
  
  
    
      dry
      insufficient
      0.000000e+00
      True
    
    
      seasonal
      0.000000e+00
      True
    
    
      unknown
      1.532922e-37
      True
    
    
      enough
      dry
      0.000000e+00
      True
    
    
      insufficient
      8.609560e-153
      True
    
    
      seasonal
      7.863726e-01
      False
    
    
      unknown
      2.066209e-43
      True
    
    
      insufficient
      seasonal
      2.882668e-21
      True
    
    
      unknown
      1.252727e-15
      True
    
    
      seasonal
      unknown
      1.684947e-32
      True
    
  








    




Test again no repairs status







    






  
    
      
      
      p-value
      Ha
    
  
  
    
      dry
      insufficient
      0.000000e+00
      True
    
    
      seasonal
      0.000000e+00
      True
    
    
      unknown
      6.241413e-40
      True
    
    
      enough
      dry
      0.000000e+00
      True
    
    
      insufficient
      5.223492e-158
      True
    
    
      seasonal
      1.889251e-01
      False
    
    
      unknown
      4.148094e-27
      True
    
    
      insufficient
      seasonal
      9.396423e-20
      True
    
    
      unknown
      6.131721e-09
      True
    
    
      seasonal
      unknown
      5.041900e-20
      True
    
  








    




Waterpoint type group






    




Contingency table of observed counts







    






  
    
      waterpoint_type_group
      cattle trough
      communal standpipe
      dam
      hand pump
      improved spring
      other
    
    
      status_group
      
      
      
      
      
      
    
  
  
    
      functional
      54
      14804
      5
      828
      57
      185
    
    
      functional needs repair
      2
      1872
      0
      26
      8
      98
    
    
      non functional
      8
      8241
      0
      425
      12
      896
    
  








    






  
    
      waterpoint_type_group
      cattle trough
      communal standpipe
      dam
      hand pump
      improved spring
      other
    
    
      status_group
      
      
      
      
      
      
    
  
  
    
      functional
      0.84375
      0.594133
      1.0
      0.647381
      0.740260
      0.156913
    
    
      functional needs repair
      0.03125
      0.075129
      0.0
      0.020328
      0.103896
      0.083121
    
    
      non functional
      0.12500
      0.330738
      0.0
      0.332291
      0.155844
      0.759966
    
  








    




There is a significant relationship between waterpoint type group and the functional status of the waterpoint (p-value = 4.8e-202).







    




There is a significant relationship between waterpoint type group and the no repairs status of the waterpoint (p-value = 6.3e-201).







    












    




Test again functional status







    






  
    
      
      
      p-value
      Ha
    
  
  
    
      communal standpipe
      cattle trough
      7.731873e-04
      True
    
    
      dam
      2.728643e-01
      False
    
    
      hand pump
      9.325450e-01
      False
    
    
      improved spring
      1.707226e-03
      True
    
    
      other
      7.982696e-200
      True
    
    
      dam
      cattle trough
      9.079602e-01
      False
    
    
      hand pump
      cattle trough
      8.829528e-04
      True
    
    
      dam
      2.714208e-01
      False
    
    
      improved spring
      1.988047e-03
      True
    
    
      other
      8.597211e-100
      True
    
    
      improved spring
      cattle trough
      7.792963e-01
      False
    
    
      dam
      7.622363e-01
      False
    
    
      other
      7.892635e-30
      True
    
    
      other
      cattle trough
      5.677582e-28
      True
    
    
      dam
      6.033031e-04
      True
    
  








    




Test again no repairs status







    






  
    
      
      
      p-value
      Ha
    
  
  
    
      communal standpipe
      cattle trough
      8.322043e-05
      True
    
    
      dam
      1.637409e-01
      False
    
    
      hand pump
      1.720922e-04
      True
    
    
      improved spring
      1.272357e-02
      False
    
    
      other
      4.682013e-193
      True
    
    
      dam
      cattle trough
      7.669887e-01
      False
    
    
      hand pump
      cattle trough
      1.974827e-03
      True
    
    
      dam
      2.383203e-01
      False
    
    
      improved spring
      1.237518e-01
      False
    
    
      other
      4.761713e-134
      True
    
    
      improved spring
      cattle trough
      1.976478e-01
      False
    
    
      dam
      4.393768e-01
      False
    
    
      other
      1.898473e-35
      True
    
    
      other
      cattle trough
      4.857037e-41
      True
    
    
      dam
      6.337283e-06
      True
    
  








    




Permit






    




Contingency table of observed counts







    






  
    
      permit
      False
      True
    
    
      status_group
      
      
    
  
  
    
      functional
      3114
      12819
    
    
      functional needs repair
      413
      1593
    
    
      non functional
      2426
      7156
    
  








    






  
    
      permit
      False
      True
    
    
      status_group
      
      
    
  
  
    
      functional
      0.523098
      0.594353
    
    
      functional needs repair
      0.069377
      0.073859
    
    
      non functional
      0.407526
      0.331788
    
  








    




There is a significant relationship between permit and the functional status of the waterpoint (p-value = 1.4e-26).



In [45]:

    
# for field in ('amount_tsh', 'gps_height', 'latitude', 'longitude', 'construction_year', 'population'):
for field in ('gps_height', 'latitude', 'longitude', 'construction_year', 'population'):
    field_name = ' '.join(field.split('_'))
    display_markdown("### {}".format(field_name.capitalize()), 
                     raw=True)
    var_analysis = clean_data[['status_group', 'functional', 'no_repairs']]
    clean_field = clean_data[field].map(lambda x: x if abs(x)>1e-8 else pd.np.nan)
    var_analysis = var_analysis.join(clean_field).dropna()
    
    var_analysis[field+'grp4'] = pd.qcut(var_analysis[field], 
                                         2,
                                         labels=["1=50th%tile", 
                                                 "2=100th%tile"])
#                                          4,
#                                          labels=["1=25th%tile", "2=50th%tile", 
#                                                  "3=75th%tile", "4=100th%tile"])
    
    # Chi-square test
    display_markdown("Contingency table of observed counts", raw=True)
    ct = pd.crosstab(var_analysis['status_group'], var_analysis[field+'grp4'])
    display(ct)
    ct_norm = ct/ct.sum(axis=0)
    display(ct_norm)

    ct = pd.crosstab(var_analysis['functional'], var_analysis[field+'grp4'])
    chisqr = stats.chi2_contingency(ct)

    if chisqr[1] > 0.05:
        display_markdown("There is *no* significant relationship between {} and the functional"
                         " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                         raw=True)
    else:
        display_markdown("There is a significant relationship between {} and the functional"
                         " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                         raw=True)

    ct = pd.crosstab(var_analysis['no_repairs'], var_analysis[field+'grp4'])
    chisqr = stats.chi2_contingency(ct)

    if chisqr[1] > 0.05:
        display_markdown("There is *no* significant relationship between {} and the no repair"
                         " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                         raw=True)
    else:
        display_markdown("There is a significant relationship between {} and the no repair"
                         " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                         raw=True)
        
    fig = plt.figure(figsize=[1.5*l for l in plt.rcParams['figure.figsize']])
    ax = fig.add_subplot(121)
    ax = sns.barplot(x=field+'grp4', y='functional', data=var_analysis, ci=None, ax=ax)
    ax.set_xlabel(field_name)
    ax = fig.add_subplot(122)
    ax = sns.barplot(x=field+'grp4', 
                     y='no_repairs', 
#                      data=var_analysis.where(var_analysis['functional'] == 1),
                     data=var_analysis,
                     ci=None,
                     ax=ax)
    ax.set_xlabel(field_name)
    plt.show()









    




Gps height






    




Contingency table of observed counts







    






  
    
      gps_heightgrp4
      1=50th%tile
      2=100th%tile
    
    
      status_group
      
      
    
  
  
    
      functional
      5958
      7397
    
    
      functional needs repair
      981
      633
    
    
      non functional
      4013
      2909
    
  








    






  
    
      gps_heightgrp4
      1=50th%tile
      2=100th%tile
    
    
      status_group
      
      
    
  
  
    
      functional
      0.544010
      0.676204
    
    
      functional needs repair
      0.089573
      0.057866
    
    
      non functional
      0.366417
      0.265929
    
  








    




There is a significant relationship between gps height and the functional status of the waterpoint (p-value = 2e-57).







    




There is a significant relationship between gps height and the no repair status of the waterpoint (p-value = 2.6e-89).







    












    




Latitude






    




Contingency table of observed counts







    






  
    
      latitudegrp4
      1=50th%tile
      2=100th%tile
    
    
      status_group
      
      
    
  
  
    
      functional
      8137
      7796
    
    
      functional needs repair
      728
      1278
    
    
      non functional
      4896
      4686
    
  








    






  
    
      latitudegrp4
      1=50th%tile
      2=100th%tile
    
    
      status_group
      
      
    
  
  
    
      functional
      0.591309
      0.566570
    
    
      functional needs repair
      0.052903
      0.092878
    
    
      non functional
      0.355788
      0.340552
    
  








    




There is a significant relationship between latitude and the functional status of the waterpoint (p-value = 0.0083).







    




There is a significant relationship between latitude and the no repair status of the waterpoint (p-value = 3.4e-05).







    












    




Longitude






    




Contingency table of observed counts







    






  
    
      longitudegrp4
      1=50th%tile
      2=100th%tile
    
    
      status_group
      
      
    
  
  
    
      functional
      8007
      7926
    
    
      functional needs repair
      1079
      927
    
    
      non functional
      4675
      4907
    
  








    






  
    
      longitudegrp4
      1=50th%tile
      2=100th%tile
    
    
      status_group
      
      
    
  
  
    
      functional
      0.581862
      0.576017
    
    
      functional needs repair
      0.078410
      0.067369
    
    
      non functional
      0.339728
      0.356613
    
  








    




There is a significant relationship between longitude and the functional status of the waterpoint (p-value = 0.0034).







    




There is no significant relationship between longitude and the no repair status of the waterpoint (p-value = 0.33).







    












    




Construction year






    




Contingency table of observed counts







    






  
    
      construction_yeargrp4
      1=50th%tile
      2=100th%tile
    
    
      status_group
      
      
    
  
  
    
      functional
      8259
      7674
    
    
      functional needs repair
      1326
      680
    
    
      non functional
      7328
      2254
    
  








    






  
    
      construction_yeargrp4
      1=50th%tile
      2=100th%tile
    
    
      status_group
      
      
    
  
  
    
      functional
      0.488323
      0.723416
    
    
      functional needs repair
      0.078401
      0.064103
    
    
      non functional
      0.433276
      0.212481
    
  








    




There is a significant relationship between construction year and the functional status of the waterpoint (p-value = 2.8e-306).







    




There is a significant relationship between construction year and the no repair status of the waterpoint (p-value = 0).







    












    




Population






    




Contingency table of observed counts







    






  
    
      populationgrp4
      1=50th%tile
      2=100th%tile
    
    
      status_group
      
      
    
  
  
    
      functional
      7002
      5700
    
    
      functional needs repair
      584
      1019
    
    
      non functional
      3241
      3563
    
  








    






  
    
      populationgrp4
      1=50th%tile
      2=100th%tile
    
    
      status_group
      
      
    
  
  
    
      functional
      0.646717
      0.554367
    
    
      functional needs repair
      0.053939
      0.099105
    
    
      non functional
      0.299344
      0.346528
    
  








    




There is a significant relationship between population and the functional status of the waterpoint (p-value = 2.5e-13).







    




There is a significant relationship between population and the no repair status of the waterpoint (p-value = 1.2e-42).



In [30]:

    
# fit an Extra Trees model to the data and look at the first 20 important fields
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier()
model.fit(predictors, deployed_data['status_group'])
# display the relative importance of each attribute
cm = sns.light_palette("yellow", as_cmap=True)
(pd.Series(model.feature_importances_, index=predictors.columns, name='importance')
   .sort_values(ascending=False)
   .to_frame()
   .iloc[:20]
   .style.background_gradient(cmap=cm))









    Out[30]:





        

        
        

        
            
            
                
                
                
                importance
                
            
            
        
        
            
            
                
                
                    latitude
                
                
                    0.214629
                
            
            
            
                
                
                    longitude
                
                
                    0.213989
                
            
            
            
                
                
                    dry
                
                
                    0.108744
                
            
            
            
                
                
                    height
                
                
                    0.1045
                
            
            
            
                
                
                    construction_year
                
                
                    0.0826137
                
            
            
            
                
                
                    population
                
                
                    0.0705313
                
            
            
            
                
                
                    other_waterpoint_type_group
                
                
                    0.0402349
                
            
            
            
                
                
                    other
                
                
                    0.0239651
                
            
            
            
                
                
                    never pay
                
                
                    0.0212987
                
            
            
            
                
                
                    enough
                
                
                    0.0176261
                
            
            
            
                
                
                    permit
                
                
                    0.014617
                
            
            
            
                
                
                    gravity
                
                
                    0.0128419
                
            
            
            
                
                
                    communal standpipe
                
                
                    0.00780412
                
            
            
            
                
                
                    unknown
                
                
                    0.00755825
                
            
            
            
                
                
                    hand pump
                
                
                    0.00750208
                
            
            
            
                
                
                    insufficient
                
                
                    0.00643511
                
            
            
            
                
                
                    submersible
                
                
                    0.00589803
                
            
            
            
                
                
                    per bucket
                
                
                    0.00529142
                
            
            
            
                
                
                    motorpump
                
                
                    0.00474726
                
            
            
            
                
                
                    handpump
                
                
                    0.00436401



In [33]:

    
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=25)
model = model.fit(predictors, deployed_data['status_group'])

clean_test_data = process_data.iloc[test_data.index].dropna()
predictions = model.predict(process_data.iloc[test_data.index].dropna())

pred = pd.Series(predictions, index=clean_test_data.index, name='status_group')

missing_index = list()
for i in test_data.index:
    if i not in clean_test_data.index:
        missing_index.append(i)

data_list = list()
pd.np.random.seed(12345)
for rnd in pd.np.random.rand(len(missing_index)):
    if rnd < 0.072677:
        data_list.append('functional needs repair')
    elif rnd < 0.384242 + 0.072677:
        data_list.append('non functional')
    else:
        data_list.append('functional')
fill = pd.Series(data_list, index=missing_index)

pred = pred.append(fill)
to_file = pred[test_data.index]
to_file.to_csv('randomForest.csv', index_label='id', header=('status_group',))

len(missing_index)









    Out[33]:





1194



In [65]:

    
# Logistic regression trial
import statsmodels.formula.api as smf

# Rename columns to be pythonic
logit_expl = predictors.copy()
logit_expl.columns = ['_'.join(name.split()) for name in logit_expl.columns]
logit_expl.columns = ['_'.join(name.split('-')) for name in logit_expl.columns]

# Need to center the quantitative variable
for field in ('height', 'latitude', 'longitude', 'construction_year', 'population'):
    logit_expl[field] = logit_expl[field] - logit_expl[field].mean()
logit_expl['permit'] = pd.to_numeric(logit_expl['permit'])
    
formula = 'functional ~ '
formula = formula + ' + '.join(logit_expl.columns)

centered_data = logit_expl.join(deployed_data['functional']).dropna()

model = smf.logit(formula=formula, data=centered_data).fit()
model.summary()









    



Warning: Maximum number of iterations has been exceeded.
         Current function value: 0.481327
         Iterations: 35






    



C:\Anaconda3\lib\site-packages\statsmodels\base\model.py:466: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals
  "Check mle_retvals", ConvergenceWarning)






    Out[65]:





Logit Regression Results

  Dep. Variable:     functional       No. Observations:     54532 


  Model:                Logit         Df Residuals:         54504 


  Method:                MLE          Df Model:                27 


  Date:           Thu, 15 Sep 2016    Pseudo R-squ.:       0.2795 


  Time:               09:40:44        Log-Likelihood:      -26248.


  converged:            False         LL-Null:             -36431.


                                    LLR p-value:          0.000 




                                 coef      std err       z       P>|z|  [95.0% Conf. Int.] 


  Intercept                        0.8783   2.74e+05    3.2e-06   1.000  -5.38e+05  5.38e+05


  permit[T.True]                   0.2299      0.024      9.551   0.000      0.183     0.277


  height                           0.0003   2.74e-05     10.193   0.000      0.000     0.000


  longitude                        0.0056      0.005      1.100   0.271     -0.004     0.016


  latitude                        -0.0196      0.005     -4.145   0.000     -0.029    -0.010


  population                    8.455e-05   2.38e-05      3.550   0.000   3.79e-05     0.000


  construction_year                0.0399      0.001     37.447   0.000      0.038     0.042


  handpump                         0.8367        nan        nan     nan        nan       nan


  rope_pump                        0.8430        nan        nan     nan        nan       nan


  motorpump                       -0.4373        nan        nan     nan        nan       nan


  gravity                          0.7696        nan        nan     nan        nan       nan


  submersible                     -0.2363        nan        nan     nan        nan       nan


  other                           -0.7131        nan        nan     nan        nan       nan


  wind_powered                    -0.1843        nan        nan     nan        nan       nan


  unknown                         -0.3515        nan        nan     nan        nan       nan


  never_pay                       -0.4334        nan        nan     nan        nan       nan


  per_bucket                       0.5641        nan        nan     nan        nan       nan


  on_failure                       0.0896        nan        nan     nan        nan       nan


  monthly                          0.3100        nan        nan     nan        nan       nan


  annually                         0.6159        nan        nan     nan        nan       nan


  other_payment_type               0.0836        nan        nan     nan        nan       nan


  enough                           1.4535   2.67e+05   5.45e-06   1.000  -5.23e+05  5.23e+05


  insufficient                     0.9941   2.67e+05   3.73e-06   1.000  -5.23e+05  5.23e+05


  dry                             -2.9548   2.67e+05  -1.11e-05   1.000  -5.23e+05  5.23e+05


  seasonal                         1.3568   2.67e+05   5.08e-06   1.000  -5.23e+05  5.23e+05


  unknown_quantity_group           0.0287   2.67e+05   1.07e-07   1.000  -5.23e+05  5.23e+05


  hand_pump                       -2.0620        nan        nan     nan        nan       nan


  communal_standpipe              -1.7910        nan        nan     nan        nan       nan


  other_waterpoint_type_group     -2.8429        nan        nan     nan        nan       nan


  improved_spring                 -0.7378        nan        nan     nan        nan       nan


  cattle_trough                   -1.2462        nan        nan     nan        nan       nan


  dam                              9.5583        nan        nan     nan        nan       nan



In [ ]:

	amount_tsh	date_recorded	funder	gps_height	installer	longitude	latitude	wpt_name	num_private	basin	...	payment_type	water_quality	quality_group	quantity	quantity_group	source	source_type	source_class	waterpoint_type	waterpoint_type_group
id
0	0.0	2012-11-13	Tasaf	0	TASAF	33.125828	-5.118154	Mratibu	0	Lake Tanganyika	...	unknown	milky	milky	enough	enough	shallow well	shallow well	groundwater	hand pump	hand pump
1	0.0	2011-03-05	Shipo	1978	SHIPO	34.770717	-9.395642	none	0	Rufiji	...	never pay	soft	good	enough	enough	shallow well	shallow well	groundwater	hand pump	hand pump
2	0.0	2011-03-27	Lvia	0	LVIA	36.115056	-6.279268	Bombani	0	Wami / Ruvu	...	per bucket	soft	good	insufficient	insufficient	machine dbh	borehole	groundwater	communal standpipe multiple	communal standpipe
3	10.0	2013-06-03	Germany Republi	1639	CES	37.147432	-3.187555	Area 7 Namba 5	0	Pangani	...	per bucket	soft	good	enough	enough	spring	spring	groundwater	communal standpipe	communal standpipe
4	0.0	2011-03-22	Cmsr	0	CMSR	36.164893	-6.099289	Ezeleda	0	Wami / Ruvu	...	unknown	soft	good	dry	dry	shallow well	shallow well	groundwater	hand pump	hand pump

	importance
longitude	0.143399
latitude	0.138313
dry	0.078339
gps_height	0.071213
population	0.047767
construction_year	0.043016
other_waterpoint_type_group	0.034288
other_extraction_type_class	0.030801
enough	0.025929
never pay	0.016663
amount_tsh	0.013986
handpump	0.013617
insufficient	0.013088
permit	0.012641
government_funder	0.010637

	importance
longitude	0.215398
latitude	0.213067
height	0.103801
construction_year	0.0806856
dry	0.0793701
population	0.0702775
other	0.0392548
other_waterpoint_type_group	0.0384225
enough	0.030773
permit	0.0155521
never pay	0.0153311
insufficient	0.0136371
gravity	0.010479
unknown	0.00782398
communal standpipe	0.00722429
per bucket	0.0071673
submersible	0.00711073
seasonal	0.00693082
motorpump	0.00529115
annually	0.00502709

status_group	functional	non functional	functional needs repair
unnormalized	32259.000000	22824.000000	4317.000000
normalized	0.543081	0.384242	0.072677

extraction_type_class	gravity	handpump	other	submersible	motorpump	rope pump	wind-powered
unnormalized	33263.000000	20612.000000	8102.000000	7772.000000	3777.000000	572.000000	152.000000
normalized	0.447987	0.277603	0.109118	0.104673	0.050869	0.007704	0.002047

payment_type	never pay	per bucket	monthly	unknown	on failure	annually	other
unnormalized	31712.000000	11266.000000	10397.000000	10149.000000	4842.000000	4570.000000	1314.000000
normalized	0.427098	0.151731	0.140027	0.136687	0.065212	0.061549	0.017697

quantity_group	enough	insufficient	dry	seasonal	unknown
unnormalized	41522.000000	18896.000000	7782.000000	5075.00000	975.000000
normalized	0.559219	0.254492	0.104808	0.06835	0.013131

waterpoint_type_group	communal standpipe	hand pump	other	improved spring	cattle trough	dam
unnormalized	43239.000000	21884.000000	8010.000000	959.000000	150.00000	8.000000
normalized	0.582343	0.294734	0.107879	0.012916	0.00202	0.000108

public_meeting	True	False	nan
unnormalized	63749.000000	6346.000000	4155.00000
normalized	0.858572	0.085468	0.05596

permit	True	False	nan
unnormalized	48606.000000	21851.00000	3793.000000
normalized	0.654626	0.29429	0.051084

extraction_type_class	gravity	handpump	motorpump	other	rope pump	submersible	wind-powered
status_group
functional	12094	804	752	185	7	2061	30
functional needs repair	1709	21	85	12	0	174	5
non functional	5786	422	1031	637	20	1651	35

extraction_type_class	gravity	handpump	motorpump	other	rope pump	submersible	wind-powered
status_group
functional	0.617387	0.644747	0.402570	0.221823	0.259259	0.530365	0.428571
functional needs repair	0.087243	0.016840	0.045503	0.014388	0.000000	0.044776	0.071429
non functional	0.295370	0.338412	0.551927	0.763789	0.740741	0.424858	0.500000

		p-value	Ha
gravity	handpump	1.420607e-03	True
	motorpump	2.262516e-114	True
	other	1.346001e-178	True
	rope pump	1.202484e-06	True
	submersible	1.910445e-56	True
	wind-powered	3.034969e-04	True
handpump	motorpump	1.873751e-31	True
	other	2.637283e-80	True
	rope pump	3.460225e-05	True
	wind-powered	8.423967e-03	False
motorpump	other	1.930282e-25	True
motorpump	rope pump	7.757985e-02	False
other	rope pump	9.622983e-01	False
submersible	handpump	7.446052e-08	True
	motorpump	1.895132e-19	True
	other	2.349516e-70	True
	rope pump	1.860833e-03	True
	wind-powered	2.550959e-01	False
wind-powered	motorpump	4.623263e-01	False
	other	2.465892e-06	True
	rope pump	5.535722e-02	False

		p-value	Ha
gravity	handpump	5.756493e-02	False
	motorpump	5.142137e-73	True
	other	3.419231e-115	True
	rope pump	2.865684e-04	True
	submersible	4.982951e-24	True
	wind-powered	1.806478e-03	True
handpump	motorpump	7.827683e-40	True
	other	1.441784e-79	True
	rope pump	8.932081e-05	True
	wind-powered	4.242535e-04	True
motorpump	other	1.135053e-19	True
motorpump	rope pump	1.898417e-01	False
other	rope pump	8.219331e-01	False
submersible	handpump	1.866009e-12	True
	motorpump	1.399579e-19	True
	other	1.123694e-58	True
	rope pump	8.823464e-03	False
	wind-powered	1.163752e-01	False
wind-powered	motorpump	7.555927e-01	False
	other	1.724771e-04	True
	rope pump	1.917519e-01	False

payment_type	annually	monthly	never pay	on failure	other	per bucket	unknown
status_group
functional	1811	3871	4247	839	132	4188	845
functional needs repair	143	725	647	85	4	263	139
non functional	303	1276	4983	279	70	1672	999

payment_type	annually	monthly	never pay	on failure	other	per bucket	unknown
status_group
functional	0.802393	0.659230	0.429989	0.697423	0.640777	0.683978	0.426122
functional needs repair	0.063358	0.123467	0.065506	0.070657	0.019417	0.042953	0.070096
non functional	0.134249	0.217302	0.504505	0.231920	0.339806	0.273069	0.503782

		p-value	Ha
annually	monthly	3.031849e-17	True
	never pay	1.884291e-224	True
	on failure	3.676184e-13	True
	other	7.435575e-15	True
	per bucket	4.310055e-40	True
	unknown	5.772155e-149	True
monthly	never pay	1.485108e-277	True
	on failure	2.813915e-01	False
	other	4.567542e-05	True
	unknown	2.589152e-130	True
never pay	on failure	3.172208e-71	True
	other	4.049234e-06	True
	unknown	9.727352e-01	False
on failure	other	1.250186e-03	True
on failure	unknown	8.742551e-52	True
per bucket	monthly	1.547396e-12	True
	never pay	4.792608e-183	True
	on failure	3.544198e-03	False
	other	4.234480e-02	False
	unknown	3.052523e-80	True
unknown	other	1.043067e-05	True

		p-value	Ha
annually	monthly	2.898930e-36	True
	never pay	2.592424e-223	True
	on failure	5.118962e-12	True
	other	8.707763e-08	True
	per bucket	2.046482e-26	True
	unknown	1.496984e-140	True
monthly	never pay	2.505928e-170	True
	on failure	1.157905e-02	False
	other	6.353067e-01	False
	unknown	8.785655e-75	True
never pay	on failure	6.614117e-69	True
	other	2.372608e-09	True
	unknown	7.698215e-01	False
on failure	other	1.231580e-01	False
on failure	unknown	8.679468e-50	True
per bucket	monthly	4.153706e-03	False
	never pay	1.602440e-214	True
	on failure	3.763071e-01	False
	other	2.171448e-01	False
	unknown	8.859334e-94	True
unknown	other	5.707291e-09	True

quantity_group	dry	enough	insufficient	seasonal	unknown
status_group
functional	67	11722	3514	599	31
functional needs repair	5	1289	629	83	0
non functional	2701	3850	2729	207	95

quantity_group	dry	enough	insufficient	seasonal	unknown
status_group
functional	0.024162	0.695214	0.511350	0.673791	0.246032
functional needs repair	0.001803	0.076449	0.091531	0.093363	0.000000
non functional	0.974035	0.228338	0.397119	0.232846	0.753968

		p-value	Ha
dry	insufficient	0.000000e+00	True
	seasonal	0.000000e+00	True
	unknown	1.532922e-37	True
enough	dry	0.000000e+00	True
	insufficient	8.609560e-153	True
	seasonal	7.863726e-01	False
	unknown	2.066209e-43	True
insufficient	seasonal	2.882668e-21	True
insufficient	unknown	1.252727e-15	True
seasonal	unknown	1.684947e-32	True

waterpoint_type_group	cattle trough	communal standpipe	dam	hand pump	improved spring	other
status_group
functional	54	14804	5	828	57	185
functional needs repair	2	1872	0	26	8	98
non functional	8	8241	0	425	12	896

waterpoint_type_group	cattle trough	communal standpipe	dam	hand pump	improved spring	other
status_group
functional	0.84375	0.594133	1.0	0.647381	0.740260	0.156913
functional needs repair	0.03125	0.075129	0.0	0.020328	0.103896	0.083121
non functional	0.12500	0.330738	0.0	0.332291	0.155844	0.759966

		p-value	Ha
communal standpipe	cattle trough	7.731873e-04	True
	dam	2.728643e-01	False
	hand pump	9.325450e-01	False
	improved spring	1.707226e-03	True
	other	7.982696e-200	True
dam	cattle trough	9.079602e-01	False
hand pump	cattle trough	8.829528e-04	True
	dam	2.714208e-01	False
	improved spring	1.988047e-03	True
	other	8.597211e-100	True
improved spring	cattle trough	7.792963e-01	False
	dam	7.622363e-01	False
	other	7.892635e-30	True
other	cattle trough	5.677582e-28	True
other	dam	6.033031e-04	True

		p-value	Ha
communal standpipe	cattle trough	8.322043e-05	True
	dam	1.637409e-01	False
	hand pump	1.720922e-04	True
	improved spring	1.272357e-02	False
	other	4.682013e-193	True
dam	cattle trough	7.669887e-01	False
hand pump	cattle trough	1.974827e-03	True
	dam	2.383203e-01	False
	improved spring	1.237518e-01	False
	other	4.761713e-134	True
improved spring	cattle trough	1.976478e-01	False
	dam	4.393768e-01	False
	other	1.898473e-35	True
other	cattle trough	4.857037e-41	True
other	dam	6.337283e-06	True

permit	False	True
status_group
functional	3114	12819
functional needs repair	413	1593
non functional	2426	7156

permit	False	True
status_group
functional	0.523098	0.594353
functional needs repair	0.069377	0.073859
non functional	0.407526	0.331788

gps_heightgrp4	1=50th%tile	2=100th%tile
status_group
functional	5958	7397
functional needs repair	981	633
non functional	4013	2909

gps_heightgrp4	1=50th%tile	2=100th%tile
status_group
functional	0.544010	0.676204
functional needs repair	0.089573	0.057866
non functional	0.366417	0.265929

latitudegrp4	1=50th%tile	2=100th%tile
status_group
functional	8137	7796
functional needs repair	728	1278
non functional	4896	4686