PUMP IT UP

Introduction: Using the data gathered from Taarifa and the Tanzanian Ministry of Water, can we predict which pumps are functional, which need some repairs, and which don't work at all? Predicting one of these three classes based and a smart understanding of which waterpoints will fail, can improve the maintenance operations and ensure that clean, potable water is available to communities across Tanzania.

This is also an intermediate-level competition by DataDriven! All code & support scripts are in Github Repo

Imports



In [63]:

    
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline
# %load_ext writeandexecute

plt.style.use('ggplot')
sns.set(color_codes=True)

# seed
np.random.seed(69572)



In [64]:

    
# import sys
# sys.path = sys.path + ['/Users/sampathkumarm/Desktop/devbox/Sam-DS/Kaggle/datadriven']

import scripts

import imp
imp.reload(scripts)

from scripts.sam_value_counts import sam_dataframe_cols_value_count_analysis, sam_dataframe_markup_value_counts
from scripts.sam_confusion_matrix import sam_plot_confusion_matrix, sam_confusion_maxtrix



In [65]:

    
import sys

from __future__ import absolute_import
from IPython.core.getipython import get_ipython
from IPython.core.magic import (Magics, magics_class,  cell_magic)

try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO

from markdown import markdown
from IPython.core.display import HTML
from IPython.display import display

@magics_class
class MarkdownMagics(Magics):
 
    @cell_magic
    def asmarkdown(self, line, cell):
        buffer = StringIO()
        stdout = sys.stdout
        sys.stdout = buffer
        try:
            exec(cell, locals(), self.shell.user_ns)
        except:
            sys.stdout = stdout
            raise
        sys.stdout = stdout
        return HTML("<p>{}</p>".format(markdown(buffer.getvalue(), extensions=['markdown.extensions.extra'])))
        return buffer.getvalue() + 'test'
    
    def timer_message(self, start_time):
#         print self
        time_diff = (now() - start_time).total_seconds()
        if time_diff < 0.001:
            time_diff = 0
            print('\n<pre>In', time_diff, 'Secs</pre>')
        else:
            print('\n<pre>In', time_diff, 'Secs</pre>')

    @cell_magic
    def timer(self, line, cell):
        import datetime
        now = datetime.datetime.now
        start_time = now()
        buffer = StringIO()
        stdout = sys.stdout
        sys.stdout = buffer
        try:
            exec(cell, locals(), self.shell.user_ns)
            self.timer_message(start_time)
        except:
            sys.stdout = stdout
            raise
        sys.stdout = stdout
        return HTML("<p>{}</p>".format(markdown(buffer.getvalue(), extensions=['markdown.extensions.extra'])))
        return buffer.getvalue() + 'test'
 
get_ipython().register_magics(MarkdownMagics)

Data Analysis



In [116]:

    
RAW_X = pd.read_csv('data/traning_set_values.csv', index_col='id')
RAW_y = pd.read_csv('data/training_set_labels.csv', index_col='id')
RAW_TEST_X = pd.read_csv('data/test_set_values.csv', index_col='id')



In [117]:

    
# proportion of labels  available
RAW_y.status_group.value_counts() / RAW_y.size









    Out[117]:





functional                 0.543081
non functional             0.384242
functional needs repair    0.072677
Name: status_group, dtype: float64



In [118]:

    
print('Shape of RAW_X', RAW_X.shape)
print('Shape of RAW_y', RAW_y.shape)
print('Shape of RAW_TEST_X', RAW_TEST_X.shape)

# ('Shape of RAW_X', (59400, 39))
# ('Shape of RAW_y', (59400, 1))
# ('Shape of RAW_TEST_X', (14850, 39))









    



Shape of RAW_X (59400, 39)
Shape of RAW_y (59400, 1)
Shape of RAW_TEST_X (14850, 39)



In [69]:

    
for i, col in enumerate(RAW_X.columns):
    print('|%d|%s|%d|' % (i, col, len(RAW_X[col].value_counts())))









    



|0|amount_tsh|98|
|1|date_recorded|356|
|2|funder|1897|
|3|gps_height|2428|
|4|installer|2145|
|5|longitude|57516|
|6|latitude|57517|
|7|wpt_name|37400|
|8|num_private|65|
|9|basin|9|
|10|subvillage|19287|
|11|region|21|
|12|region_code|27|
|13|district_code|20|
|14|lga|125|
|15|ward|2092|
|16|population|1049|
|17|public_meeting|2|
|18|recorded_by|1|
|19|scheme_management|12|
|20|scheme_name|2696|
|21|permit|2|
|22|construction_year|55|
|23|extraction_type|18|
|24|extraction_type_group|13|
|25|extraction_type_class|7|
|26|management|12|
|27|management_group|5|
|28|payment|7|
|29|payment_type|7|
|30|water_quality|8|
|31|quality_group|6|
|32|quantity|5|
|33|quantity_group|5|
|34|source|10|
|35|source_type|7|
|36|source_class|3|
|37|waterpoint_type|7|
|38|waterpoint_type_group|6|



In [70]:

    
# integer colums
cols_ints = '''amount_tsh
gps_height
longitude
latitude
num_private
region_code
district_code
population
construction_year'''.splitlines()

# bool
cols_bool = 'public_meeting permit'.split()

# date
cols_date = ['date_recorded']

print('INT COlS: ', len(cols_ints))
print('BOOL COLS:', len(cols_bool))
print('Date COLS:', len(cols_date))









    



INT COlS:  9
BOOL COLS: 2
Date COLS: 1



In [71]:

    
len(RAW_X.columns)









    Out[71]:





39



In [119]:

    
def show_object_dtypes(df,others=True):
    dtype = object
    if others:
        return df.dtypes[df.dtypes == dtype]
    else:
        return df.dtypes[df.dtypes != dtype]



In [120]:

    
show_object_dtypes(RAW_TEST_X, True)









    Out[120]:





date_recorded            object
funder                   object
installer                object
wpt_name                 object
basin                    object
subvillage               object
region                   object
lga                      object
ward                     object
public_meeting           object
recorded_by              object
scheme_management        object
scheme_name              object
permit                   object
extraction_type          object
extraction_type_group    object
extraction_type_class    object
management               object
management_group         object
payment                  object
payment_type             object
water_quality            object
quality_group            object
quantity                 object
quantity_group           object
source                   object
source_type              object
source_class             object
waterpoint_type          object
waterpoint_type_group    object
dtype: object



In [121]:

    
show_object_dtypes(RAW_TEST_X, False)









    Out[121]:





amount_tsh           float64
gps_height             int64
longitude            float64
latitude             float64
num_private            int64
region_code            int64
district_code          int64
population             int64
construction_year      int64
dtype: object

cols_values_counts_dataframe

As we can see in above describe output, we seem to have lots of categorical values so let start exploring them a bit.

Lets start taking into believe everything is a Categorical Columns and check their data



In [122]:

    
columns = RAW_X.columns
values_counts_bag = [len(RAW_X[column].value_counts()) for column in columns]



In [123]:

    
_ = sns.distplot(values_counts_bag, hist=True, kde=False,)

Example of how np-log transforms data

>>> np.log([0.001, 0.01, 0.1, 1, 10, 100, 1000])

array([-6.90775528, -4.60517019, -2.30258509,  0.        ,  2.30258509,
        4.60517019,  6.90775528])

As you can see in np-log example, we can learn that when a list of values vary significantly(exponentially) then their logarithms moves linearly. As we(I) feel comfortable in studying linear plot and linear information, we did a np.log to values counts.



In [77]:

    
cols_values_counts_dataframe = pd.DataFrame(np.log(values_counts_bag), index=columns, columns=['Value Counts'])



In [78]:

    
print('Values Counts:', values_counts_bag)

print('\nLog of Values Counts:', cols_values_counts_dataframe.T.values)

_ = sns.distplot(cols_values_counts_dataframe.T.values, hist=True, kde=False,)

plt.title('Historgram of  Object Feature`s (log2 of) Unique Values counts')
plt.xlabel('Features')









    



Values Counts: [98, 356, 1897, 2428, 2145, 57516, 57517, 37400, 65, 9, 19287, 21, 27, 20, 125, 2092, 1049, 2, 1, 12, 2696, 2, 55, 18, 13, 7, 12, 5, 7, 7, 8, 6, 5, 5, 10, 7, 3, 7, 6]

Log of Values Counts: [[  4.58496748   5.87493073   7.54802897   7.79482315   7.67089483
   10.95981845  10.95983584  10.52942598   4.17438727   2.19722458
    9.86718657   3.04452244   3.29583687   2.99573227   4.82831374
    7.64587583   6.95559261   0.69314718   0.           2.48490665
    7.89952447   0.69314718   4.00733319   2.89037176   2.56494936
    1.94591015   2.48490665   1.60943791   1.94591015   1.94591015
    2.07944154   1.79175947   1.60943791   1.60943791   2.30258509
    1.94591015   1.09861229   1.94591015   1.79175947]]






    Out[78]:





<matplotlib.text.Text at 0x1193e1550>



In [79]:

    
cols_values_counts_dataframe.plot(kind='barh', figsize=(12, 12))
_ = plt.plot((2, 2), (0, 38))
_ = plt.plot((4, 4), (0, 38), '-g')
_ = plt.plot((6, 6), (0, 38), '-r')
_ = plt.plot((8, 8), (0, 38), '-y')
print('We seem to have some special categories where value counts are high.')

plt.title('Features Values Counts for comparision')
plt.xlabel ('Log of Unique Values')









    



We seem to have some special categories where value counts are high.






    Out[79]:





<matplotlib.text.Text at 0x119477a20>



In [80]:

    
sam_dataframe_cols_value_count_analysis(RAW_X)









    



(1, 'waterpoint_type_group', 24)
(2, 'basin', 24)
(3, 'region', 24)
(4, 'region_code', 24)
(5, 'district_code', 24)
(6, 'public_meeting', 24)
(7, 'recorded_by', 24)
(8, 'scheme_management', 24)
(9, 'permit', 24)
(10, 'extraction_type', 24)
(11, 'extraction_type_group', 24)
(12, 'extraction_type_class', 24)
(13, 'management', 24)
(14, 'management_group', 24)
(15, 'payment', 24)
(16, 'payment_type', 24)
(17, 'water_quality', 24)
(18, 'quality_group', 24)
(19, 'quantity', 24)
(20, 'quantity_group', 24)
(21, 'source', 24)
(22, 'source_type', 24)
(23, 'source_class', 24)
(24, 'waterpoint_type', 24)
('Showing Plot for Columns:\n', ['basin', 'region', 'region_code', 'district_code', 'public_meeting', 'recorded_by', 'scheme_management', 'permit', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group'])

Checking rest of the columns



In [81]:

    
cols_value_count_limit_fraction = 0.01
cols_value_count_limit_log_value = np.log(RAW_X.shape[0] * cols_value_count_limit_fraction)


print('Total Number of Records:', RAW_X.shape[0], '- Log val is:', np.log(RAW_X.shape[0]))
print('%s percent of Number of Records:' % (cols_value_count_limit_fraction * 100),\
      RAW_X.shape[0] * cols_value_count_limit_fraction,\
      ' - Log val is:',  cols_value_count_limit_log_value)









    



Total Number of Records: 59400 - Log val is: 10.9920495054
1.0 percent of Number of Records: 594.0  - Log val is: 6.38687931936

cols_categorical_check

Here in this project, cols_categorical_check refers to list of columns for which caution check is considered. Reason for this check is, we would need more data to explain other columns & target cols with respect to it.

Lets consider these columns with more 5% of values as non categorical values and since our problem statement is choosing which category, we will try to minimise the category and see how our performance changes(improves or not)

To begin we will consider that those categories with more than cols_value_count_limit_fraction percentage as the upper limit allowed. Any column with other data will pruged to become some to other information



In [82]:

    
show_object_dtypes(RAW_X, True)









    Out[82]:





date_recorded            object
funder                   object
installer                object
wpt_name                 object
basin                    object
subvillage               object
region                   object
lga                      object
ward                     object
public_meeting           object
recorded_by              object
scheme_management        object
scheme_name              object
permit                   object
extraction_type          object
extraction_type_group    object
extraction_type_class    object
management               object
management_group         object
payment                  object
payment_type             object
water_quality            object
quality_group            object
quantity                 object
quantity_group           object
source                   object
source_type              object
source_class             object
waterpoint_type          object
waterpoint_type_group    object
dtype: object



In [83]:

    
show_object_dtypes(RAW_X, False)









    Out[83]:





amount_tsh           float64
gps_height             int64
longitude            float64
latitude             float64
num_private            int64
region_code            int64
district_code          int64
population             int64
construction_year      int64
dtype: object



In [84]:

    
cols_non_categorical = show_object_dtypes(RAW_X, True).index.tolist()

cols_date_numerics = show_object_dtypes(RAW_X, True).index.tolist()



In [85]:

    
list(cols_date_numerics)









    Out[85]:





['date_recorded',
 'funder',
 'installer',
 'wpt_name',
 'basin',
 'subvillage',
 'region',
 'lga',
 'ward',
 'public_meeting',
 'recorded_by',
 'scheme_management',
 'scheme_name',
 'permit',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quality_group',
 'quantity',
 'quantity_group',
 'source',
 'source_type',
 'source_class',
 'waterpoint_type',
 'waterpoint_type_group']



In [86]:

    
cols_categorical_check = []

for col, vc in cols_values_counts_dataframe.iterrows():
    if col in cols_non_categorical:
        if float(vc) > cols_value_count_limit_log_value:
            cols_categorical_check.append(col)

print('Columns we need to moderate are:', cols_categorical_check)









    



Columns we need to moderate are: ['funder', 'installer', 'wpt_name', 'subvillage', 'ward', 'scheme_name']

All cols_date_numerics, are date & other numeric data which can be made into buckets or reducing precision. Thus we can bound number of categories in data as the more variety of data we have, we need more information specific to each category which all might end with curse of dimensionality.

During pre-processing states we shall do following TODO

limiting check experiments on our cols_date_numerics & cols_categorical_check to be under cols_value_count_limit_fraction



In [87]:

    
print('Log limit for categories:', cols_value_count_limit_log_value)
print('Actual limit for categories:', cols_value_count_limit_fraction * RAW_X.shape[0])

RAW_X[cols_categorical_check].head()









    



Log limit for categories: 6.38687931936
Actual limit for categories: 594.0






    Out[87]:






  
    
      
      funder
      installer
      wpt_name
      subvillage
      ward
      scheme_name
    
    
      id
      
      
      
      
      
      
    
  
  
    
      69572
      Roman
      Roman
      none
      Mnyusi B
      Mundindi
      Roman
    
    
      8776
      Grumeti
      GRUMETI
      Zahanati
      Nyamara
      Natta
      NaN
    
    
      34310
      Lottery Club
      World vision
      Kwa Mahundi
      Majengo
      Ngorika
      Nyumba ya mungu pipe scheme
    
    
      67743
      Unicef
      UNICEF
      Zahanati Ya Nanyumbu
      Mahakamani
      Nanyumbu
      NaN
    
    
      19728
      Action In A
      Artisan
      Shuleni
      Kyanyamisa
      Nyakasimbi
      NaN



In [88]:

    
RAW_X[cols_categorical_check].head(15)









    Out[88]:






  
    
      
      funder
      installer
      wpt_name
      subvillage
      ward
      scheme_name
    
    
      id
      
      
      
      
      
      
    
  
  
    
      69572
      Roman
      Roman
      none
      Mnyusi B
      Mundindi
      Roman
    
    
      8776
      Grumeti
      GRUMETI
      Zahanati
      Nyamara
      Natta
      NaN
    
    
      34310
      Lottery Club
      World vision
      Kwa Mahundi
      Majengo
      Ngorika
      Nyumba ya mungu pipe scheme
    
    
      67743
      Unicef
      UNICEF
      Zahanati Ya Nanyumbu
      Mahakamani
      Nanyumbu
      NaN
    
    
      19728
      Action In A
      Artisan
      Shuleni
      Kyanyamisa
      Nyakasimbi
      NaN
    
    
      9944
      Mkinga Distric Coun
      DWE
      Tajiri
      Moa/Mwereme
      Moa
      Zingibali
    
    
      19816
      Dwsp
      DWSP
      Kwa Ngomho
      Ishinabulandi
      Samuye
      NaN
    
    
      54551
      Rwssp
      DWE
      Tushirikiane
      Nyawishi Center
      Chambo
      NaN
    
    
      53934
      Wateraid
      Water Aid
      Kwa Ramadhan Musa
      Imalauduki
      Itetemia
      NaN
    
    
      46144
      Isingiro Ho
      Artisan
      Kwapeto
      Mkonomre
      Kaisho
      NaN
    
    
      49056
      Private
      Private
      Mzee Hokororo
      Mizugo
      Tambani
      NaN
    
    
      50409
      Danida
      DANIDA
      Kwa Alid Nchimbi
      Ngondombwito
      Msindo
      NaN
    
    
      36957
      World Vision
      World vision
      Pamba
      Nkilifa
      Busilili
      NaN
    
    
      50495
      Lawatefuka Water Supply
      Lawatefuka water sup
      Kwa John Izack Mmari
      Omarini
      Siha Kaskazini
      BL Bondeni
    
    
      53752
      Biore
      WEDECO
      Mwabasabi
      Mwabasabi
      Nkoma
      None



In [89]:

    
_ = sns.distplot(RAW_X.gps_height, hist=True, kde=False, rug=False)



In [90]:

    
_ = sns.distplot(RAW_X.population, hist=True, kde=False, rug=False)



In [91]:

    
_ = sns.jointplot(x='longitude', y='latitude', data=RAW_X)
plt.xlabel('longitude')
plt.ylabel('latitude')









    Out[91]:





<matplotlib.text.Text at 0x116c289b0>



In [92]:

    
%%asmarkdown

# To generate a Markup Table
tmp = sam_dataframe_markup_value_counts(dataframe=RAW_X, max_print_value_counts=10, show_plots=False, figsize=(9, 2))

for each in tmp:
    print(each)









    Out[92]:







Col ID
Col Name
UniqCount
Col Values
UniqValCount




1
amount_tsh
98




2
date_recorded
356




3
funder
1897




4
gps_height
2428




5
installer
2145




6
longitude
57516




7
latitude
57517




8
wpt_name
37400




9
num_private
65




10
basin
9
Lake Nyasa
5085



-
-
Pangani
8940



-
-
Internal
7785



-
-
Lake Rukwa
2454



-
-
Lake Tanganyika
6432



-
-
Wami / Ruvu
5987



-
-
Ruvuma / Southern Coast
4493



-
-
Lake Victoria
10248



-
-
Rufiji
7976


11
subvillage
19287




12
region
21




13
region_code
27




14
district_code
20




15
lga
125




16
ward
2092




17
population
1049




18
public_meeting
2
False
5055



-
-
True
51011


19
recorded_by
1




20
scheme_management
12




21
scheme_name
2696




22
permit
2
False
17492



-
-
True
38852


23
construction_year
55




24
extraction_type
18




25
extraction_type_group
13




26
extraction_type_class
7
other
6430



-
-
submersible
6179



-
-
handpump
16456



-
-
gravity
26780



-
-
wind-powered
117



-
-
rope pump
451



-
-
motorpump
2987


27
management
12




28
management_group
5
other
943



-
-
unknown
561



-
-
user-group
52490



-
-
parastatal
1768



-
-
commercial
3638


29
payment
7
other
1054



-
-
pay monthly
8300



-
-
pay annually
3642



-
-
pay when scheme fails
3914



-
-
pay per bucket
8985



-
-
unknown
8157



-
-
never pay
25348


30
payment_type
7
other
1054



-
-
monthly
8300



-
-
per bucket
8985



-
-
annually
3642



-
-
unknown
8157



-
-
on failure
3914



-
-
never pay
25348


31
water_quality
8
coloured
490



-
-
fluoride
200



-
-
milky
804



-
-
fluoride abandoned
17



-
-
soft
50818



-
-
salty abandoned
339



-
-
salty
4856



-
-
unknown
1876


32
quality_group
6
good
50818



-
-
fluoride
217



-
-
milky
804



-
-
unknown
1876



-
-
colored
490



-
-
salty
5195


33
quantity
5
seasonal
4050



-
-
unknown
789



-
-
enough
33186



-
-
dry
6246



-
-
insufficient
15129


34
quantity_group
5
seasonal
4050



-
-
unknown
789



-
-
enough
33186



-
-
dry
6246



-
-
insufficient
15129


35
source
10




36
source_type
7
rainwater harvesting
2295



-
-
dam
656



-
-
shallow well
16824



-
-
other
278



-
-
river/lake
10377



-
-
spring
17021



-
-
borehole
11949


37
source_class
3
surface
13328



-
-
unknown
278



-
-
groundwater
45794


38
waterpoint_type
7
other
6380



-
-
communal standpipe
28522



-
-
communal standpipe multiple
6103



-
-
dam
7



-
-
hand pump
17488



-
-
cattle trough
116



-
-
improved spring
784


39
waterpoint_type_group
6
other
6380



-
-
communal standpipe
34625



-
-
hand pump
17488



-
-
dam
7



-
-
cattle trough
116



-
-
improved spring
784

Observations & TODO

Most of the data seems categorical
Need to check cols_date_numerics(TODO1)
- we shall convert date -> day, month, year, weekday, total_no_of_day_from_reference_point. These splits for two reasons.
  - Reason1: It might be possible that in some location all specific set of complaints are registered on a start/mid/at end of the month. It might also be possible that they are registered on every Monday or so.
  - Reason2: Taking as much information as possible.
Need to check cols_categorical_check(TODO2)
- longitutude & latitude seem to hold (0,0) instead of NULL which is acting as outlier for now
Following pairs looks closesly related - cleanup (TODO3)
- quantity & quantity_group
- quality_group & water_quality
- extraction_type, extraction_type_class & extraction_type_group
Other - cleanup (TODO4)
- recorded_by, seems to hold only a single value
- population & amount_tsh, values are for some given as zero

Data Processing

Generic Transformations

Num/Bool Tranformations

date_recorded to Int
public_meeting to Int
permit to Int
longitude to Float(less precision)
latitude to Float(less precision)

Precision Description of Longititude and Latitude is available here at below link



In [124]:

    
# Reloading the data

RAW_X = pd.read_csv('data/traning_set_values.csv', index_col='id')
RAW_y = pd.read_csv('data/training_set_labels.csv', index_col='id')
RAW_TEST_X = pd.read_csv('data/test_set_values.csv', index_col='id')

Int Transformations



In [125]:

    
import datetime

strptime = datetime.datetime.strptime

DATE_FORMAT = "%Y-%m-%d"
REFERENCE_DATE_POINT = strptime('2000-01-01', DATE_FORMAT)

if RAW_X.date_recorded.dtype == 'O':

    # convert it to datetime format
    f = lambda x: strptime(str(x), DATE_FORMAT)
    RAW_X.date_recorded = RAW_X.date_recorded.apply(f)
    RAW_TEST_X.date_recorded = RAW_TEST_X.date_recorded.apply(f)

    # week day
    f = lambda x: x.weekday()
    RAW_X['date_recorded_weekday'] = RAW_X.date_recorded.apply(f)
    RAW_TEST_X['date_recorded_weekday'] = RAW_TEST_X.date_recorded.apply(f)

    # date
    f = lambda x: x.day
    RAW_X['date_recorded_date'] = RAW_X.date_recorded.apply(f)
    RAW_TEST_X['date_recorded_date'] = RAW_TEST_X.date_recorded.apply(f)

    # month
    f = lambda x: x.month
    RAW_X['date_recorded_month'] = RAW_X.date_recorded.apply(f)
    RAW_TEST_X['date_recorded_month'] = RAW_TEST_X.date_recorded.apply(f)

    # year
    f = lambda x: x.year
    RAW_X['date_recorded_year'] = RAW_X.date_recorded.apply(f)
    RAW_TEST_X['date_recorded_year'] = RAW_TEST_X.date_recorded.apply(f)

    # total days
    f = lambda x: (x - REFERENCE_DATE_POINT).days
    RAW_X.date_recorded = RAW_X.date_recorded.apply(f)
    RAW_TEST_X.date_recorded = RAW_TEST_X.date_recorded.apply(f)



In [126]:

    
# Longitude & Latitude -- zero values fix

# Filling Missing/OUTLIAR Values
_ = np.mean(RAW_X[u'latitude'][RAW_X.latitude < -1.0].values)

if not RAW_X.loc[RAW_X.latitude >= -1.0, u'latitude'].empty:
    RAW_X.loc[RAW_X.latitude >= -1.0, u'latitude'] = _
    RAW_TEST_X.loc[RAW_TEST_X.latitude >= -1.0, u'latitude'] = _


# Filling Missing/OUTLIAR Values
_ = np.mean(RAW_X[u'longitude'][RAW_X[u'longitude'] > 1.0].values)

if not RAW_X.loc[RAW_X[u'longitude'] <= 1.0, u'longitude'].empty:
    RAW_X.loc[RAW_X[u'longitude'] <= 1.0, u'longitude'] = _
    RAW_TEST_X.loc[RAW_TEST_X[u'longitude'] <= 1.0, u'longitude'] = _



In [127]:

    
def f(x):
    if x is True:
        return 1
    elif x is False:
        return 2
    else:
        return 3


if (RAW_X.public_meeting.dtype != 'bool') and (RAW_X.permit.dtype != 'bool'):

    # public_meeting
    RAW_X.public_meeting = RAW_X.public_meeting.apply(f)
    RAW_TEST_X.public_meeting = RAW_TEST_X.public_meeting.apply(f)

    # permit
    RAW_X.permit = RAW_X.permit.apply(f)
    RAW_TEST_X.permit = RAW_TEST_X.permit.apply(f)

print('Dtype of public_meetings & permit:',RAW_X.public_meeting.dtype, RAW_X.permit.dtype)
print('')
# checking
if list(RAW_TEST_X.dtypes[RAW_TEST_X.dtypes != RAW_X.dtypes]):
    raise Exception('RAW_X.dtypes and RAW_TEST_X.dtypes are not in Sync')
else:
    print('All in Good Shape')









    



Dtype of public_meetings & permit: int64 int64

All in Good Shape



In [128]:

    
show_object_dtypes(RAW_X, True)









    Out[128]:





funder                   object
installer                object
wpt_name                 object
basin                    object
subvillage               object
region                   object
lga                      object
ward                     object
recorded_by              object
scheme_management        object
scheme_name              object
extraction_type          object
extraction_type_group    object
extraction_type_class    object
management               object
management_group         object
payment                  object
payment_type             object
water_quality            object
quality_group            object
quantity                 object
quantity_group           object
source                   object
source_type              object
source_class             object
waterpoint_type          object
waterpoint_type_group    object
dtype: object



In [129]:

    
show_object_dtypes(RAW_X, False)









    Out[129]:





amount_tsh               float64
date_recorded              int64
gps_height                 int64
longitude                float64
latitude                 float64
num_private                int64
region_code                int64
district_code              int64
population                 int64
public_meeting             int64
permit                     int64
construction_year          int64
date_recorded_weekday      int64
date_recorded_date         int64
date_recorded_month        int64
date_recorded_year         int64
dtype: object



In [130]:

    
# Reducing geo location precision to 11 meters
LONG_LAT_PRECISION = 0.001

# Reducing Precision of Lat.
if RAW_X.longitude.mean() < 50:
    RAW_X.longitude = RAW_X.longitude // LONG_LAT_PRECISION
    RAW_X.latitude = RAW_X.latitude // LONG_LAT_PRECISION
    RAW_TEST_X.longitude = RAW_TEST_X.longitude // LONG_LAT_PRECISION
    RAW_TEST_X.latitude = RAW_TEST_X.latitude // LONG_LAT_PRECISION



In [131]:

    
_ = sns.jointplot(x='longitude', y='latitude', data=RAW_X)

Text Data Tranformations

For cols_categorical_check, we are going to basic clean action like, lower and upper case issue. Clearning of non ascii values.



In [132]:

    
def text_transformation(name):
    """Cleanup basic text issue in name(input).
    
    Removes text capitalisation, case, space and other non text ascii charecters
        except space.
    """
    if name:
        name = name.lower().strip()
        name = ''.join([i if 96 < ord(i) < 128 else ' ' for i in name])
        if 'and' in name:
            name = name.replace('and', ' ')

        # clear double space
        while '  ' in name:
            name = name.replace('  ', ' ')
        return name.strip()
    return ''



In [133]:

    
ord(' ')









    Out[133]:





32



In [134]:

    
%%asmarkdown

print('''
|Column|Prev.|Current|
|------|-----|-------|''')
for col in cols_categorical_check:
    aa = len(RAW_X[col].unique())
    RAW_X[col] = RAW_X[col].fillna('').apply(text_transformation)
    RAW_TEST_X[col] = RAW_TEST_X[col].fillna('').apply(text_transformation)
    bb = len(RAW_X[col].unique())
    if aa != bb:
        print('|%s|%i|%i|' % (col, aa, bb))









    Out[134]:







Column
Prev.
Current




funder
1898
1880


installer
2146
1866


wpt_name
37400
36717


subvillage
19288
19175


scheme_name
2697
2485



In [104]:

    
# saving transformed data
pickle.dump(obj=RAW_X, file=open('tmp\clean_X.pkl', 'wb'))
pickle.dump(RAW_TEST_X, open('tmp\clean_TEST_X.pkl', 'wb'))
# pickle.dump(y, open('tmp\y.pkl', 'wb'))

TEST_X, X = RAW_TEST_X, RAW_X

Custom Labeler

Loading Custom Labeler is for the the purpose of reducing categories varieties by ignoring groups with lower frequencies and covering 80%(default) of the original data.



In [136]:

    
from collections import defaultdict
from __future__ import print_function
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets

from scripts import sam_custom_labeler

CUST_CATEGORY_LABELER = sam_custom_labeler.CUST_CATEGORY_LABELER



In [137]:

    
help(CUST_CATEGORY_LABELER)









    



Help on class CUST_CATEGORY_LABELER in module scripts.sam_custom_labeler:

class CUST_CATEGORY_LABELER(builtins.object)
 |  Custom Mapper Function.
 |  
 |  Based on pd.Series.values_counts, a labler is prepared
 |   to cover one of following details
 |      1. cover top 80% of groups(DEFAULT) (or)
 |      2. top 500 groups
 |  
 |  A special `transform_analysis` function is provided to
 |   understand how value_counts are spread out
 |  
 |  Example:
 |      >>> # Test Data
 |      >>> ss = pd.Series(np.arange(5000) // 5)
 |      >>> ss = ss.map(lambda x: str(x))
 |      >>>
 |      >>> # creating labler
 |      >>> labler = CUST_CATEGORY_LABELER()
 |      >>> labler.fit(funder)
 |      >>>
 |      >>> # testing
 |      >>> _ =  labler.check_group_coverage(90)
 |      90 percentage of GROUPS coverage mean, 1691(in number) groups
 |      >>>
 |      >>> _ =  labler.check_data_coverage(90)
 |      90 percentage of DATA coverage mean, 666 (in number) groups
 |  
 |  Methods defined here:
 |  
 |  __init__(self)
 |      Defaults.
 |  
 |  check_data_coverage(self, data_coverage=None)
 |      Check the data coverage.
 |      
 |      Args:
 |          check_data_coverage(float): Range is (0.0, 100.0)
 |  
 |  check_group_coverage(self, groups_coverage=80)
 |      param: groups_coverage - can be provided as fraction/int.
 |      
 |      To convert fraction into proper count for inter checks.
 |      
 |      Args:
 |          * data_coverage(int): Range between (0 - 100)
 |              percentage(%) of the groups to be covered.
 |  
 |  fit(self, col_data)
 |      Fit the data to class.
 |      
 |      Args:
 |          data(ndarray)
 |  
 |  fit_transform(self, col_data)
 |      Fit data and then transform.
 |  
 |  transform(self, groups_coverage=None)
 |      Default transformation is based on coverage.
 |      
 |      If cumulative sum of groups frequencies then
 |       label is to only cover upto top 80% of groups.
 |  
 |  transform_analysis(self, data_coverage=None, groups_coverage=None)
 |      Post transform data view.
 |      
 |      Args:
 |          * data_coverage(int): Range between (0 - 100)
 |              percentage(%) of the amount data to be covered.
 |      
 |          * groups_coverage(int/float):
 |              Limit the amount groups(variety) coverage. All input can be
 |               provided as fraction or a specific count with in limit.
 |      
 |      Example:
 |          >>> labler = CUST_CATEGORY_LABELER()
 |          >>> labler.fit(RAW_X.funder)
 |          >>>
 |          >>> # to checking report for covering 85.50% data
 |          >>> labler.transform_analysis(data_coverage=85.50)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)



In [138]:

    
labler = CUST_CATEGORY_LABELER()

def select_col(col):
    global labler
    labler = CUST_CATEGORY_LABELER()
    labler.fit(RAW_TEST_X[col])
    print('Selected', col)

ii = interact(select_col, col=['funder', 'installer', 'wpt_name', 'subvillage', 'ward', 'scheme_name'])

# To check data coverage
def f1(data=80):
    labler.check_data_coverage(data_coverage=data)

ii1 = interact(f1, data=(70, 100, .5))

# To check groups coverage
def f2(groups=80):
    labler.check_group_coverage(groups)
    
ii2 = interact(f2, groups=(50, 100., .5))

_ = '''
Please select one of these slider to chose among the
 data coverage or groups coverage
'''









    



80.0 percentage of GROUPS coverage mean, 777.6(in number) groups

funder:
- 100.0 percentage of DATA coverage mean, 1881 (in number) groups
- 97.0 percentage of DATA coverage mean, 592 (in number) groups ##
- 90.5 percentage of DATA coverage mean, 237 (in number) groups
installer:
- 100.0 percentage of DATA coverage mean, 1867 (in number) groups
- 97.0 percentage of DATA coverage mean, 599 (in number) groups ##
wpt_name:
- 80.0 percentage of DATA coverage mean, 24838 (in number) groups ##
subvillage:
- 80.5 percentage of DATA coverage mean, 8715 (in number) groups ##
- 83.0 percentage of DATA coverage mean, 9458 (in number) groups
ward:
- 80.0 percentage of DATA coverage mean, 998 (in number) groups ##
- 91.5 percentage of DATA coverage mean, 1397 (in number) groups
- 100.0 percentage of DATA coverage mean, 2093 (in number) groups
scheme_name:
- 100.0 percentage of DATA coverage mean, 2486 (in number) groups
- 91.5 percentage of DATA coverage mean, 870 (in number) groups
- 80.5 percentage of DATA coverage mean, 363 (in number) groups
- 85.0 percentage of DATA coverage mean, 524 (in number) groups ##
  NOTE : Marked with double hashes are the selected values for coverage



In [139]:

    
##################################
######### TESTING ################
#################################

labler = CUST_CATEGORY_LABELER()
labler.fit(X.installer)

# default data coverage is 80
tmp = labler.transform()

print('data coveraged', labler.DATA_COVERAGE_LIMIT)
print('grous coveraged', len(tmp.value_counts()))

print('---------------------')
labler.DATA_COVERAGE_LIMIT = 90
tmp = labler.transform()

print('data coveraged', labler.DATA_COVERAGE_LIMIT)
print('grous coveraged', len(tmp.value_counts()))









    



80 percentage of DATA coverage mean, 81 (in number) groups
data coveraged 80
grous coveraged 82
---------------------
90 percentage of DATA coverage mean, 203 (in number) groups
data coveraged 90
grous coveraged 204



In [140]:

    
##################################
######### IMPLEMENT ##############
#################################

if 'custom_labler' not in dir():
    custom_labler = defaultdict(CUST_CATEGORY_LABELER)
    tmp = { 'funder': 97,
      'installer': 97,
      'wpt_name': 80,
      'subvillage': 80,
      'ward': 80,
      'scheme_name': 85
      }

    for col, limit  in tmp.items():
        labler = custom_labler[col]
        labler.DATA_COVERAGE_LIMIT = limit
        labler.fit(X[col])
        print('')
        print('-' * 15, col.upper())

    #     custom_labler[col].check_data_coverage(limit)
        RAW_X[col] = labler.transform()
else:
    print('"custom_labler" seems is already defined, please check')
    
print(RAW_X.shape, RAW_TEST_X.shape, all(RAW_X.columns == RAW_TEST_X.columns))









    



"custom_labler" seems is already defined, please check
(59400, 43) (14850, 43) True

drop_cols = ['wpt_name',] RAW_X.drop(drop_cols, axis=1, inplace=True) RAW_TEST_X.drop(drop_cols, axis=1, inplace=True) print('Removed Cols:', drop_cols)

Label Encoder

Label Encoder with DefaultDict for quick data transformation http://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn



In [141]:

    
from collections import defaultdict
from sklearn import preprocessing



In [142]:

    
print(RAW_X.shape, RAW_TEST_X.shape)









    



(59400, 43) (14850, 43)



In [143]:

    
d = defaultdict(preprocessing.LabelEncoder)

RAW_X.scheme_management = RAW_X.scheme_management.fillna('Other')
RAW_TEST_X.scheme_management = RAW_TEST_X.scheme_management.fillna('Other')

# Labels Fit
sam = pd.concat([RAW_X, RAW_TEST_X]).apply(lambda x: d[x.name].fit(x))

# Labels Transform - Training Data
X = RAW_X.apply(lambda x: d[x.name].transform(x))
TEST_X = RAW_TEST_X.apply(lambda x: d[x.name].transform(x))

le = preprocessing.LabelEncoder().fit(RAW_y[u'status_group'])
y = le.transform(RAW_y[u'status_group'])



In [144]:

    
show_object_dtypes(RAW_X, True)









    Out[144]:





funder                   object
installer                object
wpt_name                 object
basin                    object
subvillage               object
region                   object
lga                      object
ward                     object
recorded_by              object
scheme_management        object
scheme_name              object
extraction_type          object
extraction_type_group    object
extraction_type_class    object
management               object
management_group         object
payment                  object
payment_type             object
water_quality            object
quality_group            object
quantity                 object
quantity_group           object
source                   object
source_type              object
source_class             object
waterpoint_type          object
waterpoint_type_group    object
dtype: object



In [145]:

    
show_object_dtypes(X, True)









    Out[145]:





Series([], dtype: object)



In [59]:

    
sam_dataframe_cols_value_count_analysis(X)









    



(1, 'date_recorded_year', 28)
(2, 'basin', 28)
(3, 'region', 28)
(4, 'region_code', 28)
(5, 'district_code', 28)
(6, 'public_meeting', 28)
(7, 'recorded_by', 28)
(8, 'scheme_management', 28)
(9, 'permit', 28)
(10, 'extraction_type', 28)
(11, 'extraction_type_group', 28)
(12, 'extraction_type_class', 28)
(13, 'management', 28)
(14, 'management_group', 28)
(15, 'payment', 28)
(16, 'payment_type', 28)
(17, 'water_quality', 28)
(18, 'quality_group', 28)
(19, 'quantity', 28)
(20, 'quantity_group', 28)
(21, 'source', 28)
(22, 'source_type', 28)
(23, 'source_class', 28)
(24, 'waterpoint_type', 28)
(25, 'waterpoint_type_group', 28)
('Showing Plot for Columns:\n', ['basin', 'region', 'region_code', 'district_code', 'public_meeting', 'recorded_by', 'scheme_management', 'permit', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group', 'date_recorded_weekday', 'date_recorded_date', 'date_recorded_month', 'date_recorded_year'])

Pickle

Pickle Save



In [62]:

    
# saving transformed data
pickle.dump(X, open('tmp\processed_X.pkl', 'wb'))
pickle.dump(TEST_X, open('tmp\processed_TEST_X.pkl', 'wb'))
pickle.dump(y, open('tmp\processed_y.pkl', 'wb'))

# saving label transformers
pickle.dump(d, open('tmp\d.pkl', 'wb'))
pickle.dump(le, open('tmp\le.pkl', 'wb'))

Feature Selection



In [ ]:

    
X = pickle.load(open('tmp\processed_X.pkl', 'rb'))
TEST_X = pickle.load(open('tmp\processed_TEST_X.pkl', 'rb'))
y = pickle.load(open('tmp\processed_y.pkl', 'rb'))

# # Load this when you are about to do text transformation and submission
# d = pickle.load(open('tmp\d.pkl'))
# le = pickle.load(open('tmp\le.pkl'))

print(X.shape, y.shape, y[:5])

Correlation Threshold

To remove all feature with correlaiton more than 80%



In [182]:

    
if list(X.dtypes[X.dtypes == 'O']):
    print('Please check there are still some OBJECT COLUMNS PRESENT')
else:
    ss = X.corr().fillna(0)

    # postive or negitive - both good
    ss = ss.applymap(lambda x: x if x and x > 0 else -1 * x)
    
    # wish to know only strong corr
    plt.figure(figsize=(15, 15))
    sns.heatmap(ss)



In [189]:

    
# wish to know only strong corr
plt.figure(figsize=(18, 18))
sns.heatmap(ss.applymap(lambda x: x if x > 0.90 else 0))









    Out[189]:





<matplotlib.axes._subplots.AxesSubplot at 0x11b3e9470>



In [248]:

    
len(X[_col_].value_counts()), len(X[_row_].value_counts())









    Out[248]:





5



In [273]:

    
np.set_printoptions(precision=2)

bag = []

for _col_ in ss.index:
    for _row_ in ss.columns:
        if _col_ not in bag:
            if (ss[_col_][_row_] > 0.8 and (ss[_col_][_row_] < 1.0)):
                try:
                    print((_col_, len(X[_col_].value_counts()),
                           _row_, len(X[_row_].value_counts()),
                           ss[_col_][_row_]))
                except KeyError:
                    # few extra cols are added
                    pass
#         bag.append(_row_)
#         bag.append(_col_)

del _col_, _row_, bag









    



('date_recorded', 356, 'date_recorded_year', 5, 0.95920911743658788)
('extraction_type', 18, 'extraction_type_group', 13, 0.94952351098756882)
('extraction_type_group', 13, 'extraction_type', 18, 0.94952351098756882)
('source', 10, 'source_type', 7, 0.94381787586073784)
('source_type', 7, 'source', 10, 0.94381787586073784)
('waterpoint_type', 7, 'waterpoint_type_group', 6, 0.98215380609123037)
('waterpoint_type_group', 6, 'waterpoint_type', 7, 0.98215380609123037)
('date_recorded_year', 5, 'date_recorded', 356, 0.95920911743658788)



In [276]:

    
%%asmarkdown

print ('''
|Column Name|VCount|Column Name|VCount|Corr|
|-----------|------|-----------|------|----|''')

tmp = '''
('date_recorded', 356, 'date_recorded_year', 5, 0.95920911743658788)
('extraction_type', 18, 'extraction_type_group', 13, 0.94952351098756882)
('extraction_type_group', 13, 'extraction_type', 18, 0.94952351098756882)
('source', 10, 'source_type', 7, 0.94381787586073784)
('source_type', 7, 'source', 10, 0.94381787586073784)
('waterpoint_type', 7, 'waterpoint_type_group', 6, 0.98215380609123037)
('waterpoint_type_group', 6, 'waterpoint_type', 7, 0.98215380609123037)
('date_recorded_year', 5, 'date_recorded', 356, 0.95920911743658788)
'''

while ' ' in tmp:
    tmp = tmp.replace(' ', '')

tmp = tmp.strip().replace('\'', '')
print(tmp.replace(",", '|').replace('(', '|').replace(')', '|'))

del tmp









    Out[276]:







Column Name
VCount
Column Name
VCount
Corr




date_recorded
356
date_recorded_year
5
0.95920911743658788


extraction_type
18
extraction_type_group
13
0.94952351098756882


extraction_type_group
13
extraction_type
18
0.94952351098756882


source
10
source_type
7
0.94381787586073784


source_type
7
source
10
0.94381787586073784


waterpoint_type
7
waterpoint_type_group
6
0.98215380609123037


waterpoint_type_group
6
waterpoint_type
7
0.98215380609123037


date_recorded_year
5
date_recorded
356
0.95920911743658788



In [285]:

    
from sklearn.feature_selection import chi2

X['date_recorded'].shape









    Out[285]:





(59400,)

Variance Threshold

To remove all features that are either one or zero (on or off) in more than 80% of the samples.

http://scikit-learn.org/stable/modules/feature_selection.html#removing-features-with-low-variance

http://stackoverflow.com/questions/29298973/removing-features-with-low-variance-scikit-learn/34850639#34850639



In [288]:

    
X.dtypes









    Out[288]:





amount_tsh               int64
date_recorded            int64
funder                   int64
gps_height               int64
installer                int64
longitude                int64
latitude                 int64
wpt_name                 int64
num_private              int64
basin                    int64
subvillage               int64
region                   int64
region_code              int64
district_code            int64
lga                      int64
ward                     int64
population               int64
public_meeting           int64
recorded_by              int64
scheme_management        int64
scheme_name              int64
permit                   int64
construction_year        int64
extraction_type          int64
extraction_type_group    int64
extraction_type_class    int64
management               int64
management_group         int64
payment                  int64
payment_type             int64
water_quality            int64
quality_group            int64
quantity                 int64
quantity_group           int64
source                   int64
source_type              int64
source_class             int64
waterpoint_type          int64
waterpoint_type_group    int64
date_recorded_weekday    int64
date_recorded_date       int64
date_recorded_month      int64
date_recorded_year       int64
dtype: object



In [289]:

    
from scripts.sam_variance_check import get_low_variance_columns



In [290]:

    
X, removed_features, ranking_variance_thresholds = get_low_variance_columns(dframe=X,
                                                                            threshold=(0.85 * (1 - 0.85)),
                                                                            autoremove=True)

print('\nLow Variance Columns', removed_features)
print('Shape of X is', X.shape)









    



Finding low-variance features.
Found 1 low-variance columns.
                
Removing low-variance features.
Reassembling the dataframe (with low-variance features removed).
Succesfully removed low-variance columns.

Low Variance Columns ['recorded_by']
Shape of X is (59400, 42)



In [291]:

    
if removed_features:
    TEST_X.drop(removed_features, axis=1, inplace=True)
    print('cleanup completed!')









    



cleanup completed!



In [292]:

    
print('Shape of X is', X.shape)
print('Shape of TEST_X is', TEST_X.shape)









    



Shape of X is (59400, 42)
Shape of TEST_X is (14850, 42)

Select K Best

For regression: f_regression, mutual_info_regression
For classification: chi2, f_classif, mutual_info_classif

Random Forest Classifier score: RandomForestClassifier(n_estimators=150, criterion='entropy', class_weight="balanced_subsample", n_jobs=-1)

chi2 0.81225589225589223
f_classic 0.81138047138047142
mutual_info_classif 0.81037037037037041



In [293]:

    
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif



In [294]:

    
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def game(X, y):
#     print(X.shape, y.shape[0])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
    
    clf_rf = RandomForestClassifier(n_jobs=-1, random_state=192)
    clf_rf = clf_rf.fit(X_train, y_train)
    
    train_score = clf_rf.score(X_train, y_train)
    test_score = clf_rf.score(X_test, y_test)
#     print('Train Score', train_score)
#     print('Test  Score', test_score)
    return train_score, test_score



In [295]:

    
X.shape, y.shape









    Out[295]:





((59400, 42), (59400,))

ranking_selectkbest = dict(zip(cols_names, fit.scores_)) kbest_selected_cols = [_ for _ in cols_names[:kbest_cols]] % pprint ranking_selectkbest print('Removed Columns:\n\t', ','.join([ _ for _ in X.columns if _ not in kbest_selected_cols ])) print('\nSelected Columns:\n\t', ','.join(kbest_selected_cols))



In [296]:

    
kbest_cols = 26

for fns in [chi2, f_classif, mutual_info_classif]:
    print((fns,game(SelectKBest(score_func=fns, k=kbest_cols).fit(X, y).transform(X), y)))









    



(<function chi2 at 0x11d9afea0>, (0.98410774410774415, 0.79912457912457913))
(<function f_classif at 0x11d9af598>, (0.97263748597081934, 0.79050505050505049))
(<function mutual_info_classif at 0x119229ae8>, (0.98356902356902354, 0.79569023569023567))



In [297]:

    
print('''
(chi2, 0.98428731762065091, 0.79966329966329963)
(f_classif, 0.97432098765432096, 0.79286195286195282)
(mutual_info_classif, 0.98410774410774415, 0.79447811447811445)
'''.replace('(', '|').replace(')', '|').replace(', ', '|'))









    



|chi2|0.98428731762065091|0.79966329966329963|
|f_classif|0.97432098765432096|0.79286195286195282|
|mutual_info_classif|0.98410774410774415|0.79447811447811445|

bag = [] kbest_cols = 40 # for k in range(1, 40, 4): # for k in range(23, 33, 2): for k in range(26, 29): kbest_cols = k fit = SelectKBest(score_func=chi2, k=kbest_cols).fit(X, y) cols_names = X.columns kbest_selected_cols = [_ for _ in cols_names[:kbest_cols]] kbest_X = pd.DataFrame(fit.transform(X)) kbest_TEST_X = pd.DataFrame(fit.transform(TEST_X)) # kbest_X.columns = kbest_selected_cols # kbest_TEST_X.columns = kbest_selected_cols # print('Before KBest', X.shape, TEST_X.shape, len(y)) # print('After KBest', kbest_X.shape, kbest_TEST_X.shape, len(y)) train_score, test_score = game(kbest_X, y) bag.append({'cols': kbest_cols, 'train': train_score, 'test': test_score}) print(', '.join(kbest_selected_cols).upper()) bag

kbest conclusion :

Best selected columns

AMOUNT_TSH, DATE_RECORDED, FUNDER, GPS_HEIGHT, INSTALLER, LONGITUDE, LATITUDE, NUM_PRIVATE, BASIN, SUBVILLAGE, REGION, REGION_CODE, DISTRICT_CODE, LGA, WARD, POPULATION, PUBLIC_MEETING, SCHEME_MANAGEMENT, SCHEME_NAME, PERMIT, CONSTRUCTION_YEAR, EXTRACTION_TYPE, EXTRACTION_TYPE_GROUP, EXTRACTION_TYPE_CLASS, MANAGEMENT, MANAGEMENT_GROUP, PAYMENT, PAYMENT_TYPE

# results of previous runs
[{'cols': 1, 'test': 0.52659932659932662, 'train': 0.57483726150392822},
 {'cols': 5, 'test': 0.68962962962962959, 'train': 0.94240179573512906},
 {'cols': 9, 'test': 0.7211447811447812, 'train': 0.97638608305274976},
 {'cols': 13, 'test': 0.75380471380471381, 'train': 0.97955106621773291},
 {'cols': 17, 'test': 0.76134680134680133, 'train': 0.98071829405162736},
 {'cols': 21, 'test': 0.76511784511784509, 'train': 0.98076318742985413},
 {'cols': 25, 'test': 0.80033670033670035, 'train': 0.98316498316498313},
 {'cols': 29, 'test': 0.80053872053872055, 'train': 0.98379349046015707},
 {'cols': 33, 'test': 0.80040404040404045, 'train': 0.98390572390572395},
 {'cols': 37, 'test': 0.79993265993265994, 'train': 0.98341189674523011}]

[{'cols': 23, 'test': 0.7976430976430976, 'train': 0.9836812570145903},
 {'cols': 25, 'test': 0.80033670033670035, 'train': 0.98316498316498313},
 {'cols': 27, 'test': 0.80101010101010106, 'train': 0.9829405162738496},
 {'cols': 29, 'test': 0.80053872053872055, 'train': 0.98379349046015707},
 {'cols': 31, 'test': 0.80000000000000004, 'train': 0.98381593714927051}]

[{'cols': 26, 'test': 0.80309764309764309, 'train': 0.98359147025813698},
 {'cols': 27, 'test': 0.80101010101010106, 'train': 0.9829405162738496},
 {'cols': 28, 'test': 0.80222222222222217, 'train': 0.98334455667789}]

As per Okham Razor's rules, we are going to select the simplest and well performing. Luckily, we have got kbest_selected_cols at 26 which is comparitively top performer among other K-selections and also lower than actualy number of columns



In [298]:

    
kbest_cols = 26

fit = SelectKBest(score_func=chi2, k=kbest_cols).fit(X, y)
cols_names = X.columns
kbest_selected_cols =  [_ for _ in cols_names[:kbest_cols]]

kbest_X = pd.DataFrame(fit.transform(X))
kbest_TEST_X = pd.DataFrame(fit.transform(TEST_X))



In [299]:

    
kbest_X.shape, kbest_TEST_X.shape, y.shape









    Out[299]:





((59400, 26), (14850, 26), (59400,))



In [300]:

    
pickle.dump(kbest_X, open('tmp\kbest_X.pkl', 'wb'))
pickle.dump(kbest_TEST_X, open('tmp\kbest_TEST_X.pkl', 'wb'))
pickle.dump(y, open('tmp\kbest_y.pkl', 'wb'))

PCA



In [301]:

    
load = 2

if load ==1:
    # this will load kbest
    print('Loading KBest Processed Data')
    X = pickle.load(open('tmp\kbest_X.pkl', 'rb'))
    TEST_X = pickle.load(open('tmp\kbest_TEST_X.pkl', 'rb'))
    y = pickle.load(open('tmp\kbest_y.pkl', 'rb'))
elif load ==2:
    # this will load processed data
    print('Loading normal Processed Data')
    X = pickle.load(open('tmp\processed_X.pkl', 'rb'))
    TEST_X = pickle.load(open('tmp\processed_TEST_X.pkl', 'rb'))

# # y = pickle.load(open('tmp\processed_y.pkl'))









    



Loading normal Processed Data

PCA



In [302]:

    
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis



In [303]:

    
X.shape









    Out[303]:





(59400, 43)



In [304]:

    
# feature extraction
pca = PCA(n_components=30)
fit = pca.fit(X)

plt.figure(figsize=(12, 3))

_ = plt.scatter (range(len(fit.explained_variance_ratio_)), fit.explained_variance_ratio_.cumsum())

_ = plt.xlabel('cumilative sum of explained variance')
_ = plt.ylabel('score')


print(fit.explained_variance_ratio_.cumsum())
print()
print(('Score', game(pca.transform(X), y)))


# (0.97580246913580249, 0.60511784511784517) # KBest dataset
# (0.97564534231200895, 0.60552188552188557) # Normal Dataset









    



[ 0.8   0.9   0.96  0.99  0.99  1.    1.    1.    1.    1.    1.    1.    1.
  1.    1.    1.    1.    1.    1.    1.    1.    1.    1.    1.    1.    1.
  1.    1.    1.    1.  ]

('Score', (0.98552188552188558, 0.7781818181818182))



In [305]:

    
ss = pd.DataFrame(fit.components_)
ss = ss.applymap(lambda x: x if x > 0 else -1 * x)
display(ss.describe().T)

ss.plot(kind='bar', figsize=(125, 10))









    






  
    
      
      count
      mean
      std
      min
      25%
      50%
      75%
      max
    
  
  
    
      0
      30.0
      5.264466e-02
      1.776436e-01
      3.066936e-05
      0.002936
      5.765839e-03
      1.470519e-02
      8.611451e-01
    
    
      1
      30.0
      4.268263e-02
      1.805436e-01
      6.822979e-05
      0.002029
      3.609913e-03
      1.266789e-02
      9.957240e-01
    
    
      2
      30.0
      5.619575e-02
      1.766802e-01
      2.795439e-05
      0.000310
      8.516267e-04
      4.805287e-03
      7.605836e-01
    
    
      3
      30.0
      6.128836e-02
      1.749199e-01
      1.156348e-05
      0.000272
      1.745133e-03
      1.154194e-02
      8.095676e-01
    
    
      4
      30.0
      5.542420e-02
      1.769322e-01
      2.312065e-05
      0.000199
      5.290925e-04
      8.663409e-03
      7.468425e-01
    
    
      5
      30.0
      5.051806e-02
      1.784452e-01
      1.680544e-06
      0.000161
      3.803534e-04
      3.975645e-03
      7.725976e-01
    
    
      6
      30.0
      5.121032e-02
      1.782409e-01
      1.508064e-05
      0.000078
      3.240449e-04
      1.022508e-02
      7.723962e-01
    
    
      7
      30.0
      3.472833e-02
      1.823050e-01
      3.419095e-07
      0.000004
      9.423929e-06
      5.997757e-04
      9.997252e-01
    
    
      8
      30.0
      6.470633e-02
      1.736035e-01
      6.221466e-07
      0.000163
      2.913288e-03
      3.655878e-02
      8.314784e-01
    
    
      9
      30.0
      7.921026e-02
      1.663410e-01
      1.727048e-06
      0.000400
      9.006896e-03
      4.520668e-02
      6.911450e-01
    
    
      10
      30.0
      3.740845e-02
      1.817556e-01
      6.543990e-07
      0.000010
      2.491947e-05
      1.717698e-03
      9.982215e-01
    
    
      11
      30.0
      7.840541e-02
      1.676441e-01
      1.100873e-05
      0.001062
      9.910642e-03
      6.561904e-02
      6.748640e-01
    
    
      12
      30.0
      6.984921e-02
      1.715041e-01
      1.098525e-05
      0.001527
      1.052479e-02
      4.993550e-02
      8.420199e-01
    
    
      13
      30.0
      8.629159e-02
      1.634623e-01
      8.256655e-07
      0.000558
      1.480295e-02
      1.052436e-01
      7.931873e-01
    
    
      14
      30.0
      4.144004e-02
      1.808467e-01
      1.565410e-05
      0.001588
      3.719466e-03
      1.827063e-02
      9.975896e-01
    
    
      15
      30.0
      5.158451e-02
      1.781293e-01
      1.741198e-05
      0.000082
      2.249986e-04
      8.506731e-03
      8.282855e-01
    
    
      16
      30.0
      5.239932e-02
      1.778831e-01
      1.409580e-04
      0.000339
      1.777523e-03
      1.663163e-02
      9.591436e-01
    
    
      17
      30.0
      7.186709e-03
      1.165815e-02
      1.827096e-07
      0.000047
      2.581867e-03
      7.245337e-03
      5.017586e-02
    
    
      18
      30.0
      3.336819e-17
      4.963148e-17
      -0.000000e+00
      0.000000
      5.505714e-20
      5.551115e-17
      1.804112e-16
    
    
      19
      30.0
      9.037559e-02
      1.603736e-01
      1.010811e-05
      0.000330
      9.664426e-03
      9.996771e-02
      6.265809e-01
    
    
      20
      30.0
      6.211051e-02
      1.746196e-01
      3.191952e-05
      0.000256
      7.286919e-04
      8.350287e-03
      6.770724e-01
    
    
      21
      30.0
      1.119623e-02
      1.680378e-02
      5.902775e-08
      0.000084
      2.381170e-03
      1.431135e-02
      5.904086e-02
    
    
      22
      30.0
      5.453542e-02
      1.772159e-01
      2.157749e-05
      0.002852
      5.227284e-03
      1.833292e-02
      8.602716e-01
    
    
      23
      30.0
      8.423729e-02
      1.432741e-01
      1.544605e-06
      0.000559
      1.370984e-02
      7.302268e-02
      5.287920e-01
    
    
      24
      30.0
      6.512927e-02
      1.143630e-01
      2.154448e-06
      0.000751
      1.080049e-02
      7.076019e-02
      4.278898e-01
    
    
      25
      30.0
      5.975461e-02
      1.349504e-01
      2.600068e-06
      0.000360
      6.078542e-03
      2.426885e-02
      5.540015e-01
    
    
      26
      30.0
      8.589454e-02
      1.474672e-01
      6.368114e-06
      0.000185
      6.729705e-03
      1.241829e-01
      6.021727e-01
    
    
      27
      30.0
      3.566714e-02
      6.451389e-02
      1.418986e-07
      0.000082
      4.677488e-03
      5.579973e-02
      2.994893e-01
    
    
      28
      30.0
      7.898384e-02
      1.321893e-01
      1.552095e-07
      0.000261
      1.441977e-02
      9.583476e-02
      5.094964e-01
    
    
      29
      30.0
      6.045190e-02
      9.003434e-02
      7.602828e-06
      0.000296
      1.820820e-02
      8.656660e-02
      3.262628e-01
    
    
      30
      30.0
      1.105546e-02
      2.038700e-02
      5.019569e-07
      0.000140
      1.066348e-03
      7.806508e-03
      6.510965e-02
    
    
      31
      30.0
      1.126054e-02
      1.717433e-02
      1.447720e-06
      0.000109
      2.192364e-03
      1.602182e-02
      7.455432e-02
    
    
      32
      30.0
      4.383445e-02
      1.229576e-01
      1.493985e-06
      0.000068
      3.036652e-03
      1.744089e-02
      4.999902e-01
    
    
      33
      30.0
      4.383445e-02
      1.229576e-01
      1.493985e-06
      0.000068
      3.036652e-03
      1.744089e-02
      4.999902e-01
    
    
      34
      30.0
      6.732498e-02
      1.050369e-01
      9.335828e-07
      0.000380
      2.000375e-02
      8.911797e-02
      4.649121e-01
    
    
      35
      30.0
      7.507680e-02
      1.130867e-01
      1.813865e-07
      0.000428
      1.733277e-02
      1.185510e-01
      4.958592e-01
    
    
      36
      30.0
      9.107952e-03
      1.541552e-02
      7.065533e-07
      0.000058
      1.870569e-03
      1.079122e-02
      6.098258e-02
    
    
      37
      30.0
      6.488749e-02
      1.287179e-01
      3.217442e-06
      0.000593
      1.069135e-02
      6.687406e-02
      5.972969e-01
    
    
      38
      30.0
      5.189630e-02
      1.009729e-01
      2.145346e-06
      0.000354
      1.177189e-02
      5.937794e-02
      4.666592e-01
    
    
      39
      30.0
      5.473117e-02
      1.771367e-01
      9.233950e-08
      0.000071
      1.147458e-03
      1.327330e-02
      9.078710e-01
    
    
      40
      30.0
      4.618602e-02
      1.796512e-01
      4.938538e-06
      0.000577
      3.669242e-03
      1.390761e-02
      9.883904e-01
    
    
      41
      30.0
      7.356021e-02
      1.691821e-01
      3.337361e-06
      0.000808
      1.592026e-02
      6.103160e-02
      8.853109e-01
    
    
      42
      30.0
      5.776790e-03
      1.280072e-02
      4.976312e-06
      0.000332
      1.674403e-03
      5.824429e-03
      6.893185e-02
    
  








    Out[305]:





<matplotlib.axes._subplots.AxesSubplot at 0x11a807550>



In [306]:

    
# feature extraction
lda = LinearDiscriminantAnalysis(n_components=16)
fit = lda.fit(X, y)

plt.figure(figsize=(12, 3))

_ = plt.scatter (range(len(fit.explained_variance_ratio_)), fit.explained_variance_ratio_.cumsum())

_ = plt.xlabel('cumilative sum of explained variance')
_ = plt.ylabel('score')


print(fit.explained_variance_ratio_.cumsum())

print(('\nScore', game(lda.transform(X), y)))


# (0.97580246913580249, 0.60511784511784517) # KBest dataset
# (0.97564534231200895, 0.60552188552188557) # Normal Dataset









    



/Users/sampathm/miniconda3/lib/python3.5/site-packages/sklearn/discriminant_analysis.py:387: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")






    



[ 0.84  1.  ]
('\nScore', (0.97340067340067338, 0.64296296296296296))



In [307]:

    
ss = pd.DataFrame(fit.coef_)
ss = ss.applymap(lambda x: x if x > 0 else -1 * x)
display(ss.describe().T)

ss.plot(kind='bar', figsize=(125, 10))









    






  
    
      
      count
      mean
      std
      min
      25%
      50%
      75%
      max
    
  
  
    
      0
      3.0
      8.337468e-03
      6.804167e-03
      8.299133e-04
      5.457564e-03
      1.008521e-02
      1.209124e-02
      1.409727e-02
    
    
      1
      3.0
      1.495772e-03
      1.480684e-03
      3.053713e-04
      6.667497e-04
      1.028128e-03
      2.090972e-03
      3.153815e-03
    
    
      2
      3.0
      1.396777e-04
      7.433377e-05
      6.421568e-05
      1.031019e-04
      1.419882e-04
      1.774088e-04
      2.128293e-04
    
    
      3
      3.0
      1.505486e-04
      1.182363e-04
      4.900360e-05
      8.564571e-05
      1.222878e-04
      2.013211e-04
      2.803545e-04
    
    
      4
      3.0
      7.289322e-05
      2.376144e-05
      4.844370e-05
      6.138937e-05
      7.433504e-05
      8.511798e-05
      9.590092e-05
    
    
      5
      3.0
      4.974313e-05
      5.133707e-05
      1.519399e-05
      2.024762e-05
      2.530125e-05
      6.701770e-05
      1.087341e-04
    
    
      6
      3.0
      2.195375e-05
      1.729867e-05
      3.061823e-06
      1.442160e-05
      2.578138e-05
      3.139971e-05
      3.701803e-05
    
    
      7
      3.0
      5.517552e-07
      7.558838e-07
      9.259986e-08
      1.155466e-07
      1.384934e-07
      7.813328e-07
      1.424172e-06
    
    
      8
      3.0
      1.860713e-03
      2.580830e-03
      3.218733e-04
      3.709395e-04
      4.200058e-04
      2.630133e-03
      4.840260e-03
    
    
      9
      3.0
      4.772289e-02
      1.273324e-02
      3.841346e-02
      4.046765e-02
      4.252183e-02
      5.237760e-02
      6.223338e-02
    
    
      10
      3.0
      2.555641e-06
      7.267233e-07
      2.112506e-06
      2.136292e-06
      2.160077e-06
      2.777208e-06
      3.394339e-06
    
    
      11
      3.0
      6.599947e-03
      2.308804e-03
      5.093635e-03
      5.270892e-03
      5.448149e-03
      7.353103e-03
      9.258056e-03
    
    
      12
      3.0
      1.067965e-02
      6.871940e-03
      4.397211e-03
      7.010128e-03
      9.623045e-03
      1.382087e-02
      1.801870e-02
    
    
      13
      3.0
      2.067925e-02
      2.309495e-02
      2.684797e-03
      7.658224e-03
      1.263165e-02
      2.967648e-02
      4.672132e-02
    
    
      14
      3.0
      2.783953e-03
      2.387193e-03
      7.862720e-04
      1.462090e-03
      2.137909e-03
      3.782793e-03
      5.427677e-03
    
    
      15
      3.0
      1.669431e-05
      2.360449e-05
      1.203457e-06
      3.110820e-06
      5.018182e-06
      2.443974e-05
      4.386130e-05
    
    
      16
      3.0
      9.747306e-05
      8.501960e-05
      3.647203e-06
      6.150795e-05
      1.193687e-04
      1.443860e-04
      1.694033e-04
    
    
      17
      3.0
      5.389127e-02
      3.736855e-02
      1.376729e-02
      3.698717e-02
      6.020705e-02
      7.395326e-02
      8.769947e-02
    
    
      18
      3.0
      1.338180e-16
      1.872716e-16
      1.657216e-17
      2.582890e-17
      3.508564e-17
      1.924410e-16
      3.497963e-16
    
    
      19
      3.0
      1.782513e-02
      2.404402e-02
      1.237305e-03
      4.037789e-03
      6.838272e-03
      2.611904e-02
      4.539980e-02
    
    
      20
      3.0
      1.197332e-04
      7.240755e-05
      4.387354e-05
      8.554620e-05
      1.272189e-04
      1.576630e-04
      1.881071e-04
    
    
      21
      3.0
      1.474554e-01
      1.454648e-01
      3.039995e-02
      6.602955e-02
      1.016591e-01
      2.059832e-01
      3.103072e-01
    
    
      22
      3.0
      7.850430e-03
      2.252974e-03
      5.581121e-03
      6.732293e-03
      7.883464e-03
      8.985085e-03
      1.008671e-02
    
    
      23
      3.0
      2.558823e-02
      2.935264e-02
      5.589832e-03
      8.739308e-03
      1.188878e-02
      3.558742e-02
      5.928606e-02
    
    
      24
      3.0
      5.506476e-02
      1.756721e-02
      4.316921e-02
      4.497612e-02
      4.678303e-02
      6.101254e-02
      7.524204e-02
    
    
      25
      3.0
      1.406357e-01
      5.590006e-02
      9.075016e-02
      1.104275e-01
      1.301047e-01
      1.655785e-01
      2.010523e-01
    
    
      26
      3.0
      5.262338e-02
      4.049202e-02
      8.525442e-03
      3.486961e-02
      6.121379e-02
      7.467235e-02
      8.813091e-02
    
    
      27
      3.0
      9.821851e-02
      4.667700e-02
      4.817481e-02
      7.704065e-02
      1.059065e-01
      1.232404e-01
      1.405742e-01
    
    
      28
      3.0
      2.519284e-02
      9.449109e-03
      1.711129e-02
      1.999825e-02
      2.288522e-02
      2.923361e-02
      3.558200e-02
    
    
      29
      3.0
      5.616476e-02
      7.611849e-02
      5.880103e-03
      1.237816e-02
      1.887621e-02
      8.130709e-02
      1.437380e-01
    
    
      30
      3.0
      5.800423e-02
      1.505242e-02
      4.579136e-02
      4.959588e-02
      5.340041e-02
      6.411067e-02
      7.482093e-02
    
    
      31
      3.0
      1.422329e-01
      4.819415e-02
      1.049519e-01
      1.150222e-01
      1.250925e-01
      1.608734e-01
      1.966543e-01
    
    
      32
      3.0
      1.212447e-01
      4.119230e-02
      7.493459e-02
      1.049678e-01
      1.350010e-01
      1.443997e-01
      1.537985e-01
    
    
      33
      3.0
      1.212447e-01
      4.119230e-02
      7.493459e-02
      1.049678e-01
      1.350010e-01
      1.443997e-01
      1.537985e-01
    
    
      34
      3.0
      1.492370e-01
      9.395345e-02
      6.272971e-02
      9.926144e-02
      1.357932e-01
      1.924907e-01
      2.491883e-01
    
    
      35
      3.0
      1.084671e-01
      5.423996e-02
      5.451978e-02
      8.120318e-02
      1.078866e-01
      1.354408e-01
      1.629950e-01
    
    
      36
      3.0
      3.432188e-01
      3.503985e-01
      1.077237e-01
      1.418796e-01
      1.760354e-01
      4.609664e-01
      7.458973e-01
    
    
      37
      3.0
      1.406672e-01
      1.539974e-01
      3.629541e-02
      5.223427e-02
      6.817313e-02
      1.928531e-01
      3.175332e-01
    
    
      38
      3.0
      2.638073e-01
      2.567213e-01
      5.646120e-02
      1.202355e-01
      1.840097e-01
      3.674804e-01
      5.509511e-01
    
    
      39
      3.0
      3.952694e-03
      1.596501e-03
      2.966568e-03
      3.031722e-03
      3.096875e-03
      4.445757e-03
      5.794639e-03
    
    
      40
      3.0
      3.721769e-03
      5.211595e-03
      1.013048e-04
      7.351925e-04
      1.369080e-03
      5.532001e-03
      9.694921e-03
    
    
      41
      3.0
      3.023130e-02
      4.177840e-02
      5.882133e-03
      6.110885e-03
      6.339636e-03
      4.240588e-02
      7.847212e-02
    
    
      42
      3.0
      1.502867e-01
      8.717312e-02
      5.979505e-02
      1.085746e-01
      1.573541e-01
      1.955326e-01
      2.337110e-01
    
  








    Out[307]:





<matplotlib.axes._subplots.AxesSubplot at 0x11f54c780>



In [308]:

    
X = pca.transform(X)
TEST_X = pca.transform(TEST_X)



In [309]:

    
X.shape, TEST_X.shape









    Out[309]:





((59400, 30), (14850, 30))

Saving Processed Data



In [310]:

    
pickle.dump(X, open('tmp\pca_X.pkl', 'wb'))
pickle.dump(TEST_X, open('tmp\pca_TEST_X.pkl', 'wb'))
# pickle.dump(y, open('tmp\pca_y.pkl', 'wb'))

Unsupervised Learning

Unsupervised Learning Exploration(Gaussian Process, Neural Nets)

Loading Pre-Processed Data



In [311]:

    
load = 2

if load == 1:
    print('Loading PCA Processed Data')
    X = pickle.load(open('tmp\pca_X.pkl', 'rb'))
    TEST_X = pickle.load(open('tmp\pca_TEST_X.pkl', 'rb'))

elif load == 2:
    # this will load kbest
    print('Loading KBest Processed Data')
    X = pickle.load(open('tmp\kbest_X.pkl', 'rb'))
    TEST_X = pickle.load(open('tmp\kbest_TEST_X.pkl', 'rb'))
elif load == 3:
    # this will load processed data
    print('Loading normal Processed Data')
    X = pickle.load(open('tmp\processed_X.pkl', 'rb'))
    TEST_X = pickle.load(open('tmp\processed_TEST_X.pkl', 'rb'))

# # y = pickle.load(open('tmp\processed_y.pkl'))









    



Loading KBest Processed Data



In [312]:

    
print(X.shape, y.shape, TEST_X.shape)









    



(59400, 26) (59400,) (14850, 26)

Gaussian



In [313]:

    
from sklearn.mixture import GaussianMixture as GMM
from sklearn.metrics import silhouette_score



In [314]:

    
# For future analysis
GMM_Centers = []

__check_for  = 1000

print ('clusters | score for top 1000')

for i in range(2, 7):
    # TODO: Apply your clustering algorithm of choice to the reduced data 
    clusterer = GMM(n_components=i, random_state=42)
    clusterer.fit(X)

    # TODO: Predict the cluster for each data point
    preds = clusterer.predict(X)

    # TODO: Find the cluster centers
    GMM_Centers.append(clusterer.means_)

    # score = silhouette_score(X, preds)
    score = silhouette_score(X[:__check_for], preds[:__check_for])

    print(i, score)
    
# clusters | score for top 1000
# 2 0.484879234998
# 3 0.377180934294
# 4 0.334333476259
# 5 0.29213724894
# 6 0.27643712696









    



clusters | score for top 1000
2 0.0136799380517
3 0.00581313038817
4 -0.0236377736025
5 -0.0671052621016
6 -0.0265095892177

KMeans



In [315]:

    
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score



In [316]:

    
# For future analysis
KMM_Centers = []

# Testing each category
for i in range(2, 7):

    clusterer = KMeans(init='k-means++', n_clusters=i, n_init=10)
    clusterer.fit(X)

    preds = clusterer.predict(X)

    centers = clusterer.cluster_centers_
    
    KMM_Centers.append(centers)

#     score = silhouette_score(X, preds)
    score = silhouette_score(X[:__check_for], preds[:__check_for])
    print(i, score)
    
# clusters | score for top 1000
# 2 0.502005229628
# 3 0.377168744959
# 4 0.325091546516
# 5 0.303811069492
# 6 0.304265445159









    



2 0.54644092119
3 0.398792236781
4 0.301272045129
5 0.273347089426
6 0.259803278604



In [317]:

    
i = 2

clusterer = KMeans(init='k-means++', n_clusters=i, n_init=10)
clusterer.fit(X)
preds = clusterer.predict(X)



In [318]:

    
score = silhouette_score(X[:__check_for], preds[:__check_for])
print(i, score)









    



2 0.54644092119



In [319]:

    
print(X.shape, TEST_X.shape)









    



(59400, 26) (14850, 26)



In [320]:

    
X = pd.DataFrame(X)
X['new'] = clusterer.predict(X)



In [321]:

    
TEST_X = pd.DataFrame(TEST_X)
TEST_X['new'] = clusterer.predict(TEST_X)



In [322]:

    
print(X.shape, TEST_X.shape)









    



(59400, 27) (14850, 27)

Supervised Learning

Supervised Learning(GBT Trees, Nearest Neighbours, RF, One-vs-One)

Test-Train Split



In [326]:

    
from sklearn.model_selection import train_test_split

load = 3

if load == 1:
    print('Loading PCA Processed Data')
    X = pickle.load(open('tmp\pca_X.pkl', 'rb'))
    TEST_X = pickle.load(open('tmp\pca_TEST_X.pkl', 'rb'))

elif load == 2:
    # this will load kbest
    print('Loading KBest Processed Data')
    X = pickle.load(open('tmp\kbest_X.pkl', 'rb'))
    TEST_X = pickle.load(open('tmp\kbest_TEST_X.pkl', 'rb'))
elif load == 3:
    # this will load processed data
    print('Loading normal Processed Data')
    X = pickle.load(open('tmp\processed_X.pkl', 'rb'))
    TEST_X = pickle.load(open('tmp\processed_TEST_X.pkl', 'rb'))

y = pickle.load(open('tmp\processed_y.pkl', 'rb'))


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

print(X.shape, y.shape)









    



Loading normal Processed Data
(59400, 43) (59400,)

GBT Trees



In [327]:

    
from sklearn.ensemble import GradientBoostingClassifier



In [328]:

    
clf_gbt = GradientBoostingClassifier(random_state=192)

clf_gbt = clf_gbt.fit(X_train, y_train)

print('score:', clf_gbt.score(X_test, y_test))

# ('score:', 0.75252525252525249) k_best score

# ('score:', 0.75400673400673401) preprocessed









    



score: 0.754478114478

Nearest Neighbours



In [329]:

    
from sklearn.neighbors import KNeighborsClassifier



In [330]:

    
# modelling
clf_knn = KNeighborsClassifier()
clf_knn.fit(X_test, y_test)

# score
clf_knn.score(X_train, y_train)

# 0.55842873176206509 k_best
# 0.55840628507295176 preprocessed









    Out[330]:





0.55809203142536479

Random Forest



In [331]:

    
from sklearn.ensemble import RandomForestClassifier



In [332]:

    
clf_rf = RandomForestClassifier(random_state=192)
clf_rf = clf_rf.fit(X_train, y_train)

print('Score:' + str(clf_rf.score(X_test, y_test)))

# 0.79542087542087547 # (n_jobs=-1, random_state=192)
# 0.800942760943 k_best
# 0.8









    



Score:0.800336700337



In [333]:

    
print(list(zip(X.columns, clf_rf.feature_importances_)))









    



[('amount_tsh', 0.02146536138185149), ('date_recorded', 0.037974199115599641), ('funder', 0.030325938983991507), ('gps_height', 0.043389708344340985), ('installer', 0.022601464611514462), ('longitude', 0.074757246698181165), ('latitude', 0.069779960754673825), ('wpt_name', 0.052531093768187799), ('num_private', 0.00085531362401935834), ('basin', 0.0097794385940230006), ('subvillage', 0.045864844570713298), ('region', 0.010343928629942076), ('region_code', 0.013028343029034679), ('district_code', 0.014224576969325118), ('lga', 0.018912299830737403), ('ward', 0.024664127564419373), ('population', 0.031073011950909953), ('public_meeting', 0.0058449880064946561), ('recorded_by', 0.0), ('scheme_management', 0.01129753512379114), ('scheme_name', 0.017967903794259128), ('permit', 0.0060762408972778606), ('construction_year', 0.0307014820983606), ('extraction_type', 0.012930964287103133), ('extraction_type_group', 0.015486706002440337), ('extraction_type_class', 0.020069837797164772), ('management', 0.010962881901000164), ('management_group', 0.0061010181344350791), ('payment', 0.018674287804992432), ('payment_type', 0.0099973532478958398), ('water_quality', 0.0090763819563593369), ('quality_group', 0.0060378716297362755), ('quantity', 0.065465797479158608), ('quantity_group', 0.077355131193541871), ('source', 0.013859347353133804), ('source_type', 0.0074974134844467853), ('source_class', 0.0037229724940747353), ('waterpoint_type', 0.041010612379949227), ('waterpoint_type_group', 0.021262735451926647), ('date_recorded_weekday', 0.022163861609408846), ('date_recorded_date', 0.02861911016064498), ('date_recorded_month', 0.012313403345631139), ('date_recorded_year', 0.0039333039453074382)]



In [334]:

    
plt.title('Random Forest - Features Importance - Histogram')
plt.ylabel('No.of Features')
plt.xlabel('Feature Importance')

_ = sns.distplot(clf_rf.feature_importances_ * 100, bins=20, hist=True, kde=False)



In [335]:

    
plt.title('Random Forest - Features (relative*) Importance - Histogram')
plt.ylabel('No.of Features')
plt.xlabel('Feature Importance - Bin size is 5')

tmp = 100 * (clf_rf.feature_importances_  - min(clf_rf.feature_importances_)) / max(clf_rf.feature_importances_)

_ = sns.distplot(tmp, bins=20, hist=True, kde=False)



In [336]:

    
bag = []
kbest_selected_cols = []
for col, score in zip(X.columns, tmp):
    if score < 5:
        bag.append(col)
    else:
        kbest_selected_cols.append(col)

print('Removed Cols:', bag)
print('Rest of Cols', kbest_selected_cols)









    



Removed Cols: ['num_private', 'recorded_by', 'source_class']
Rest of Cols ['amount_tsh', 'date_recorded', 'funder', 'gps_height', 'installer', 'longitude', 'latitude', 'wpt_name', 'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga', 'ward', 'population', 'public_meeting', 'scheme_management', 'scheme_name', 'permit', 'construction_year', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'waterpoint_type', 'waterpoint_type_group', 'date_recorded_weekday', 'date_recorded_date', 'date_recorded_month', 'date_recorded_year']



In [337]:

    
X[kbest_selected_cols].size / 40., X[kbest_selected_cols].shape









    Out[337]:





(59400.0, (59400, 40))



In [338]:

    
# n_estimators=150, criterion='entropy', class_weight="balanced_subsample", 

clf_rf = RandomForestClassifier(random_state=192, n_jobs=-1)
# class_weight="balanced_subsample"/"balanced"
# criterion="gini"/"entropy"

clf_rf = clf_rf.fit(X_train[kbest_selected_cols], y_train)
# pred = clf_rf.predict_proba(X_test)
clf_rf.score(X_test[kbest_selected_cols], y_test)









    Out[338]:





0.79979797979797984

### SVM

X.shape

from sklearn import svm

clf_svm = svm.SVC(kernel='linear')

clf_svm?

limit = 500 clf_svm = clf_svm.fit(X_train[:limit], y_train[:limit]) print('') print('Score:' + str(clf_svm.score(X_test, y_test))) print('')

Multi Class



In [339]:

    
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

One Vs One



In [342]:

    
clf_multiclass_rf = OneVsOneClassifier(RandomForestClassifier(
    n_estimators=200,criterion='entropy', class_weight="balanced_subsample",
    random_state=192, n_jobs=-1
))

clf_multiclass_rf = clf_multiclass_rf.fit(X_train, y_train)

print('Classifier:', clf_multiclass_rf)

print('Score:', clf_multiclass_rf.score(X_train, y_train))
print('Score:', clf_multiclass_rf.score(X_test, y_test))

# Score: 0.999775533109
# Score: 0.813602693603









    



Classifier: OneVsOneClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='entropy', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=192, verbose=0, warm_start=False),
          n_jobs=1)
Score: 0.99975308642
Score: 0.81595959596

One vs Rest



In [344]:

    
clf_multiclass_rf = OneVsRestClassifier(RandomForestClassifier(
    n_estimators=200,criterion='entropy', class_weight="balanced_subsample",
    random_state=192, n_jobs=-1
))

clf_multiclass_rf = clf_multiclass_rf.fit(X_train, y_train)

print('Classifier:', clf_multiclass_rf)
print('Train Score: ', clf_multiclass_rf.score(X_train, y_train))
print('Test Score:', clf_multiclass_rf.score(X_test, y_test))









    



Classifier: OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='entropy', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=192, verbose=0, warm_start=False),
          n_jobs=1)
Train Score:  0.999775533109
Test Score: 0.81468013468

Parameter tuning

From above analysis we can see that Random Forest CLF performed better than most other and so here we shall optimise it.



In [345]:

    
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV



In [346]:

    
# max_features
np.sqrt(len(X_train.columns)), np.log(len(X_train.columns))









    Out[346]:





(6.5574385243020004, 3.7612001156935624)



In [347]:

    
np.log2(len(X_train.columns)), np.sqrt (len(X_train.columns)), len(X_train.columns)









    Out[347]:





(5.4262647547020979, 6.5574385243020004, 43)



In [348]:

    
'balanced_subsample balanced'.split(), 'gini entropy'.split()









    Out[348]:





(['balanced_subsample', 'balanced'], ['gini', 'entropy'])



In [349]:

    
parameters = {
    'n_estimators': [10, 50, 100, 150, 200],
    'class_weight': ['balanced_subsample', 'balanced'],
    'criterion': ['gini', 'entropy'],
    'max_features': ['log2', 'auto', 25],
    'random_state': [192]
}

# clf_rf = RandomForestClassifier(n_estimators=150, criterion='entropy', class_weight="balanced_subsample", n_jobs=-1, random_state=192)
# 0.81346801346801345

GS_CV = RandomizedSearchCV(RandomForestClassifier(), parameters)

GS_CV.fit(X, y)









    Out[349]:





RandomizedSearchCV(cv=None, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'n_estimators': [10, 50, 100, 150, 200], 'class_weight': ['balanced_subsample', 'balanced'], 'max_features': ['log2', 'auto', 25], 'random_state': [192], 'criterion': ['gini', 'entropy']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=0)



In [350]:

    
print(GS_CV.best_params_, GS_CV.best_score_)
# {'n_estimators': 200, 'max_features': 'log2', 'random_state': 192, 'criterion': 'entropy',
#  'class_weight': 'balanced_subsample'} 0.806717171717









    



{'n_estimators': 150, 'random_state': 192, 'max_features': 'log2', 'class_weight': 'balanced_subsample', 'criterion': 'gini'} 0.808114478114



In [351]:

    
cv_results = pd.DataFrame(GS_CV.cv_results_, columns=[u'mean_fit_time', u'mean_score_time', u'mean_test_score',
       u'mean_train_score', u'param_class_weight', u'param_criterion',
       u'param_max_features', u'param_n_estimators', u'params'])



In [352]:

    
cv_results.head(2)









    Out[352]:






  
    
      
      mean_fit_time
      mean_score_time
      mean_test_score
      mean_train_score
      param_class_weight
      param_criterion
      param_max_features
      param_n_estimators
      params
    
  
  
    
      0
      9.190857
      0.691325
      0.807593
      0.999747
      balanced_subsample
      gini
      log2
      100
      {'n_estimators': 100, 'random_state': 192, 'ma...
    
    
      1
      8.334739
      0.690683
      0.806347
      0.999747
      balanced
      gini
      log2
      100
      {'n_estimators': 100, 'random_state': 192, 'ma...



In [353]:

    
import seaborn as sns
sns.set(color_codes=True)

np.random.seed(sum(map(ord, "regression")))
tips = sns.load_dataset("tips")



In [354]:

    
ax=plt.figure(figsize=(8,8))
_ = sns.lmplot(x="mean_test_score", y="mean_train_score", hue="param_max_features", data=cv_results)









    





<matplotlib.figure.Figure at 0x125605908>

Checking "clf_rf" RF performance

sam_confusion_maxtrix(y_test, clf_rf.predict(X_test), ['func', 'non f', 'repair'])

XGBOOST

import xgboost as xgb

gbm = xgb.XGBClassifier(max_depth=3, n_estimators=100, learning_rate=0.05).fit(X_train, y_train) gbm_predictions = gbm.predict(X_test)

print(sum(gbm_predictions == y_test)/ (1.0 * len(y_test)) # 0.7279461279461279)

sam_confusion_maxtrix(y_test, predictions)

Submission

Model Selection

Check for which model is performing best and using it.
Check to apply the one-vs-many//one-vs-one wrapper.
Check for 'test_train_split' for which X,y to be used for training



In [378]:

    
GS_CV.best_params_









    Out[378]:





{'class_weight': 'balanced_subsample',
 'criterion': 'gini',
 'max_features': 'log2',
 'n_estimators': 150,
 'random_state': 192}



In [377]:

    
clf_rf = OneVsOneClassifier(RandomForestClassifier(n_estimators=150,
random_state=192,
max_features='log2',
class_weight='balanced_subsample',
criterion='gini'))

print (clf_rf)

clf_rf = clf_rf.fit(X, y)









    



OneVsOneClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features='log2',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=192, verbose=0, warm_start=False),
          n_jobs=1)



In [380]:

    
# saving the index
test_ids = RAW_TEST_X.index

# predicint the values
predictions = clf_rf.predict(TEST_X)
print(predictions.shape)

# Converting int to its respective Labels
predictions_labels = le.inverse_transform(predictions)

# setting up column name & save file
sub = pd.DataFrame(predictions_labels, columns=['status_group'])
sub.head()
sub.insert(loc=0, column='id', value=test_ids)
sub.reset_index()
sub.to_csv('submit.csv', index=False)
sub.head()









    



(14850,)






    Out[380]:






  
    
      
      id
      status_group
    
  
  
    
      0
      50785
      non functional
    
    
      1
      51630
      functional
    
    
      2
      17168
      functional
    
    
      3
      45559
      non functional
    
    
      4
      49871
      functional



In [ ]:



In [ ]:



In [ ]:

	funder	installer	wpt_name	subvillage	ward	scheme_name
id
69572	Roman	Roman	none	Mnyusi B	Mundindi	Roman
8776	Grumeti	GRUMETI	Zahanati	Nyamara	Natta	NaN
34310	Lottery Club	World vision	Kwa Mahundi	Majengo	Ngorika	Nyumba ya mungu pipe scheme
67743	Unicef	UNICEF	Zahanati Ya Nanyumbu	Mahakamani	Nanyumbu	NaN
19728	Action In A	Artisan	Shuleni	Kyanyamisa	Nyakasimbi	NaN

Col ID	Col Name	UniqCount	Col Values	UniqValCount
1	amount_tsh	98
2	date_recorded	356
3	funder	1897
4	gps_height	2428
5	installer	2145
6	longitude	57516
7	latitude	57517
8	wpt_name	37400
9	num_private	65
10	basin	9	Lake Nyasa	5085
	-	-	Pangani	8940
	-	-	Internal	7785
	-	-	Lake Rukwa	2454
	-	-	Lake Tanganyika	6432
	-	-	Wami / Ruvu	5987
	-	-	Ruvuma / Southern Coast	4493
	-	-	Lake Victoria	10248
	-	-	Rufiji	7976
11	subvillage	19287
12	region	21
13	region_code	27
14	district_code	20
15	lga	125
16	ward	2092
17	population	1049
18	public_meeting	2	False	5055
	-	-	True	51011
19	recorded_by	1
20	scheme_management	12
21	scheme_name	2696
22	permit	2	False	17492
	-	-	True	38852
23	construction_year	55
24	extraction_type	18
25	extraction_type_group	13
26	extraction_type_class	7	other	6430
	-	-	submersible	6179
	-	-	handpump	16456
	-	-	gravity	26780
	-	-	wind-powered	117
	-	-	rope pump	451
	-	-	motorpump	2987
27	management	12
28	management_group	5	other	943
	-	-	unknown	561
	-	-	user-group	52490
	-	-	parastatal	1768
	-	-	commercial	3638
29	payment	7	other	1054
	-	-	pay monthly	8300
	-	-	pay annually	3642
	-	-	pay when scheme fails	3914
	-	-	pay per bucket	8985
	-	-	unknown	8157
	-	-	never pay	25348
30	payment_type	7	other	1054
	-	-	monthly	8300
	-	-	per bucket	8985
	-	-	annually	3642
	-	-	unknown	8157
	-	-	on failure	3914
	-	-	never pay	25348
31	water_quality	8	coloured	490
	-	-	fluoride	200
	-	-	milky	804
	-	-	fluoride abandoned	17
	-	-	soft	50818
	-	-	salty abandoned	339
	-	-	salty	4856
	-	-	unknown	1876
32	quality_group	6	good	50818
	-	-	fluoride	217
	-	-	milky	804
	-	-	unknown	1876
	-	-	colored	490
	-	-	salty	5195
33	quantity	5	seasonal	4050
	-	-	unknown	789
	-	-	enough	33186
	-	-	dry	6246
	-	-	insufficient	15129
34	quantity_group	5	seasonal	4050
	-	-	unknown	789
	-	-	enough	33186
	-	-	dry	6246
	-	-	insufficient	15129
35	source	10
36	source_type	7	rainwater harvesting	2295
	-	-	dam	656
	-	-	shallow well	16824
	-	-	other	278
	-	-	river/lake	10377
	-	-	spring	17021
	-	-	borehole	11949
37	source_class	3	surface	13328
	-	-	unknown	278
	-	-	groundwater	45794
38	waterpoint_type	7	other	6380
	-	-	communal standpipe	28522
	-	-	communal standpipe multiple	6103
	-	-	dam	7
	-	-	hand pump	17488
	-	-	cattle trough	116
	-	-	improved spring	784
39	waterpoint_type_group	6	other	6380
	-	-	communal standpipe	34625
	-	-	hand pump	17488
	-	-	dam	7
	-	-	cattle trough	116
	-	-	improved spring	784

Column	Prev.	Current
funder	1898	1880
installer	2146	1866
wpt_name	37400	36717
subvillage	19288	19175
scheme_name	2697	2485

Column Name	VCount	Column Name	VCount	Corr
date_recorded	356	date_recorded_year	5	0.95920911743658788
extraction_type	18	extraction_type_group	13	0.94952351098756882
extraction_type_group	13	extraction_type	18	0.94952351098756882
source	10	source_type	7	0.94381787586073784
source_type	7	source	10	0.94381787586073784
waterpoint_type	7	waterpoint_type_group	6	0.98215380609123037
waterpoint_type_group	6	waterpoint_type	7	0.98215380609123037
date_recorded_year	5	date_recorded	356	0.95920911743658788

	count	mean	std	min	25%	50%	75%	max
0	30.0	5.264466e-02	1.776436e-01	3.066936e-05	0.002936	5.765839e-03	1.470519e-02	8.611451e-01
1	30.0	4.268263e-02	1.805436e-01	6.822979e-05	0.002029	3.609913e-03	1.266789e-02	9.957240e-01
2	30.0	5.619575e-02	1.766802e-01	2.795439e-05	0.000310	8.516267e-04	4.805287e-03	7.605836e-01
3	30.0	6.128836e-02	1.749199e-01	1.156348e-05	0.000272	1.745133e-03	1.154194e-02	8.095676e-01
4	30.0	5.542420e-02	1.769322e-01	2.312065e-05	0.000199	5.290925e-04	8.663409e-03	7.468425e-01
5	30.0	5.051806e-02	1.784452e-01	1.680544e-06	0.000161	3.803534e-04	3.975645e-03	7.725976e-01
6	30.0	5.121032e-02	1.782409e-01	1.508064e-05	0.000078	3.240449e-04	1.022508e-02	7.723962e-01
7	30.0	3.472833e-02	1.823050e-01	3.419095e-07	0.000004	9.423929e-06	5.997757e-04	9.997252e-01
8	30.0	6.470633e-02	1.736035e-01	6.221466e-07	0.000163	2.913288e-03	3.655878e-02	8.314784e-01
9	30.0	7.921026e-02	1.663410e-01	1.727048e-06	0.000400	9.006896e-03	4.520668e-02	6.911450e-01
10	30.0	3.740845e-02	1.817556e-01	6.543990e-07	0.000010	2.491947e-05	1.717698e-03	9.982215e-01
11	30.0	7.840541e-02	1.676441e-01	1.100873e-05	0.001062	9.910642e-03	6.561904e-02	6.748640e-01
12	30.0	6.984921e-02	1.715041e-01	1.098525e-05	0.001527	1.052479e-02	4.993550e-02	8.420199e-01
13	30.0	8.629159e-02	1.634623e-01	8.256655e-07	0.000558	1.480295e-02	1.052436e-01	7.931873e-01
14	30.0	4.144004e-02	1.808467e-01	1.565410e-05	0.001588	3.719466e-03	1.827063e-02	9.975896e-01
15	30.0	5.158451e-02	1.781293e-01	1.741198e-05	0.000082	2.249986e-04	8.506731e-03	8.282855e-01
16	30.0	5.239932e-02	1.778831e-01	1.409580e-04	0.000339	1.777523e-03	1.663163e-02	9.591436e-01
17	30.0	7.186709e-03	1.165815e-02	1.827096e-07	0.000047	2.581867e-03	7.245337e-03	5.017586e-02
18	30.0	3.336819e-17	4.963148e-17	-0.000000e+00	0.000000	5.505714e-20	5.551115e-17	1.804112e-16
19	30.0	9.037559e-02	1.603736e-01	1.010811e-05	0.000330	9.664426e-03	9.996771e-02	6.265809e-01
20	30.0	6.211051e-02	1.746196e-01	3.191952e-05	0.000256	7.286919e-04	8.350287e-03	6.770724e-01
21	30.0	1.119623e-02	1.680378e-02	5.902775e-08	0.000084	2.381170e-03	1.431135e-02	5.904086e-02
22	30.0	5.453542e-02	1.772159e-01	2.157749e-05	0.002852	5.227284e-03	1.833292e-02	8.602716e-01
23	30.0	8.423729e-02	1.432741e-01	1.544605e-06	0.000559	1.370984e-02	7.302268e-02	5.287920e-01
24	30.0	6.512927e-02	1.143630e-01	2.154448e-06	0.000751	1.080049e-02	7.076019e-02	4.278898e-01
25	30.0	5.975461e-02	1.349504e-01	2.600068e-06	0.000360	6.078542e-03	2.426885e-02	5.540015e-01
26	30.0	8.589454e-02	1.474672e-01	6.368114e-06	0.000185	6.729705e-03	1.241829e-01	6.021727e-01
27	30.0	3.566714e-02	6.451389e-02	1.418986e-07	0.000082	4.677488e-03	5.579973e-02	2.994893e-01
28	30.0	7.898384e-02	1.321893e-01	1.552095e-07	0.000261	1.441977e-02	9.583476e-02	5.094964e-01
29	30.0	6.045190e-02	9.003434e-02	7.602828e-06	0.000296	1.820820e-02	8.656660e-02	3.262628e-01
30	30.0	1.105546e-02	2.038700e-02	5.019569e-07	0.000140	1.066348e-03	7.806508e-03	6.510965e-02
31	30.0	1.126054e-02	1.717433e-02	1.447720e-06	0.000109	2.192364e-03	1.602182e-02	7.455432e-02
32	30.0	4.383445e-02	1.229576e-01	1.493985e-06	0.000068	3.036652e-03	1.744089e-02	4.999902e-01
33	30.0	4.383445e-02	1.229576e-01	1.493985e-06	0.000068	3.036652e-03	1.744089e-02	4.999902e-01
34	30.0	6.732498e-02	1.050369e-01	9.335828e-07	0.000380	2.000375e-02	8.911797e-02	4.649121e-01
35	30.0	7.507680e-02	1.130867e-01	1.813865e-07	0.000428	1.733277e-02	1.185510e-01	4.958592e-01
36	30.0	9.107952e-03	1.541552e-02	7.065533e-07	0.000058	1.870569e-03	1.079122e-02	6.098258e-02
37	30.0	6.488749e-02	1.287179e-01	3.217442e-06	0.000593	1.069135e-02	6.687406e-02	5.972969e-01
38	30.0	5.189630e-02	1.009729e-01	2.145346e-06	0.000354	1.177189e-02	5.937794e-02	4.666592e-01
39	30.0	5.473117e-02	1.771367e-01	9.233950e-08	0.000071	1.147458e-03	1.327330e-02	9.078710e-01
40	30.0	4.618602e-02	1.796512e-01	4.938538e-06	0.000577	3.669242e-03	1.390761e-02	9.883904e-01
41	30.0	7.356021e-02	1.691821e-01	3.337361e-06	0.000808	1.592026e-02	6.103160e-02	8.853109e-01
42	30.0	5.776790e-03	1.280072e-02	4.976312e-06	0.000332	1.674403e-03	5.824429e-03	6.893185e-02

	count	mean	std	min	25%	50%	75%	max
0	3.0	8.337468e-03	6.804167e-03	8.299133e-04	5.457564e-03	1.008521e-02	1.209124e-02	1.409727e-02
1	3.0	1.495772e-03	1.480684e-03	3.053713e-04	6.667497e-04	1.028128e-03	2.090972e-03	3.153815e-03
2	3.0	1.396777e-04	7.433377e-05	6.421568e-05	1.031019e-04	1.419882e-04	1.774088e-04	2.128293e-04
3	3.0	1.505486e-04	1.182363e-04	4.900360e-05	8.564571e-05	1.222878e-04	2.013211e-04	2.803545e-04
4	3.0	7.289322e-05	2.376144e-05	4.844370e-05	6.138937e-05	7.433504e-05	8.511798e-05	9.590092e-05
5	3.0	4.974313e-05	5.133707e-05	1.519399e-05	2.024762e-05	2.530125e-05	6.701770e-05	1.087341e-04
6	3.0	2.195375e-05	1.729867e-05	3.061823e-06	1.442160e-05	2.578138e-05	3.139971e-05	3.701803e-05
7	3.0	5.517552e-07	7.558838e-07	9.259986e-08	1.155466e-07	1.384934e-07	7.813328e-07	1.424172e-06
8	3.0	1.860713e-03	2.580830e-03	3.218733e-04	3.709395e-04	4.200058e-04	2.630133e-03	4.840260e-03
9	3.0	4.772289e-02	1.273324e-02	3.841346e-02	4.046765e-02	4.252183e-02	5.237760e-02	6.223338e-02
10	3.0	2.555641e-06	7.267233e-07	2.112506e-06	2.136292e-06	2.160077e-06	2.777208e-06	3.394339e-06
11	3.0	6.599947e-03	2.308804e-03	5.093635e-03	5.270892e-03	5.448149e-03	7.353103e-03	9.258056e-03
12	3.0	1.067965e-02	6.871940e-03	4.397211e-03	7.010128e-03	9.623045e-03	1.382087e-02	1.801870e-02
13	3.0	2.067925e-02	2.309495e-02	2.684797e-03	7.658224e-03	1.263165e-02	2.967648e-02	4.672132e-02
14	3.0	2.783953e-03	2.387193e-03	7.862720e-04	1.462090e-03	2.137909e-03	3.782793e-03	5.427677e-03
15	3.0	1.669431e-05	2.360449e-05	1.203457e-06	3.110820e-06	5.018182e-06	2.443974e-05	4.386130e-05
16	3.0	9.747306e-05	8.501960e-05	3.647203e-06	6.150795e-05	1.193687e-04	1.443860e-04	1.694033e-04
17	3.0	5.389127e-02	3.736855e-02	1.376729e-02	3.698717e-02	6.020705e-02	7.395326e-02	8.769947e-02
18	3.0	1.338180e-16	1.872716e-16	1.657216e-17	2.582890e-17	3.508564e-17	1.924410e-16	3.497963e-16
19	3.0	1.782513e-02	2.404402e-02	1.237305e-03	4.037789e-03	6.838272e-03	2.611904e-02	4.539980e-02
20	3.0	1.197332e-04	7.240755e-05	4.387354e-05	8.554620e-05	1.272189e-04	1.576630e-04	1.881071e-04
21	3.0	1.474554e-01	1.454648e-01	3.039995e-02	6.602955e-02	1.016591e-01	2.059832e-01	3.103072e-01
22	3.0	7.850430e-03	2.252974e-03	5.581121e-03	6.732293e-03	7.883464e-03	8.985085e-03	1.008671e-02
23	3.0	2.558823e-02	2.935264e-02	5.589832e-03	8.739308e-03	1.188878e-02	3.558742e-02	5.928606e-02
24	3.0	5.506476e-02	1.756721e-02	4.316921e-02	4.497612e-02	4.678303e-02	6.101254e-02	7.524204e-02
25	3.0	1.406357e-01	5.590006e-02	9.075016e-02	1.104275e-01	1.301047e-01	1.655785e-01	2.010523e-01
26	3.0	5.262338e-02	4.049202e-02	8.525442e-03	3.486961e-02	6.121379e-02	7.467235e-02	8.813091e-02
27	3.0	9.821851e-02	4.667700e-02	4.817481e-02	7.704065e-02	1.059065e-01	1.232404e-01	1.405742e-01
28	3.0	2.519284e-02	9.449109e-03	1.711129e-02	1.999825e-02	2.288522e-02	2.923361e-02	3.558200e-02
29	3.0	5.616476e-02	7.611849e-02	5.880103e-03	1.237816e-02	1.887621e-02	8.130709e-02	1.437380e-01
30	3.0	5.800423e-02	1.505242e-02	4.579136e-02	4.959588e-02	5.340041e-02	6.411067e-02	7.482093e-02
31	3.0	1.422329e-01	4.819415e-02	1.049519e-01	1.150222e-01	1.250925e-01	1.608734e-01	1.966543e-01
32	3.0	1.212447e-01	4.119230e-02	7.493459e-02	1.049678e-01	1.350010e-01	1.443997e-01	1.537985e-01
33	3.0	1.212447e-01	4.119230e-02	7.493459e-02	1.049678e-01	1.350010e-01	1.443997e-01	1.537985e-01
34	3.0	1.492370e-01	9.395345e-02	6.272971e-02	9.926144e-02	1.357932e-01	1.924907e-01	2.491883e-01
35	3.0	1.084671e-01	5.423996e-02	5.451978e-02	8.120318e-02	1.078866e-01	1.354408e-01	1.629950e-01
36	3.0	3.432188e-01	3.503985e-01	1.077237e-01	1.418796e-01	1.760354e-01	4.609664e-01	7.458973e-01
37	3.0	1.406672e-01	1.539974e-01	3.629541e-02	5.223427e-02	6.817313e-02	1.928531e-01	3.175332e-01
38	3.0	2.638073e-01	2.567213e-01	5.646120e-02	1.202355e-01	1.840097e-01	3.674804e-01	5.509511e-01
39	3.0	3.952694e-03	1.596501e-03	2.966568e-03	3.031722e-03	3.096875e-03	4.445757e-03	5.794639e-03
40	3.0	3.721769e-03	5.211595e-03	1.013048e-04	7.351925e-04	1.369080e-03	5.532001e-03	9.694921e-03
41	3.0	3.023130e-02	4.177840e-02	5.882133e-03	6.110885e-03	6.339636e-03	4.240588e-02	7.847212e-02
42	3.0	1.502867e-01	8.717312e-02	5.979505e-02	1.085746e-01	1.573541e-01	1.955326e-01	2.337110e-01

	mean_fit_time	mean_score_time	mean_test_score	mean_train_score	param_class_weight	param_criterion	param_max_features	param_n_estimators	params
0	9.190857	0.691325	0.807593	0.999747	balanced_subsample	gini	log2	100	{'n_estimators': 100, 'random_state': 192, 'ma...
1	8.334739	0.690683	0.806347	0.999747	balanced	gini	log2	100	{'n_estimators': 100, 'random_state': 192, 'ma...

	id	status_group
0	50785	non functional
1	51630	functional
2	17168	functional
3	45559	non functional
4	49871	functional