PUMP IT UP

Introduction: Using the data gathered from Taarifa and the Tanzanian Ministry of Water, can we predict which pumps are functional, which need some repairs, and which don't work at all? Predicting one of these three classes based and a smart understanding of which waterpoints will fail, can improve the maintenance operations and ensure that clean, potable water is available to communities across Tanzania.

This is also an intermediate-level competition by DataDriven! All code & support scripts are in Github Repo

Imports


In [63]:
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


%matplotlib inline
# %load_ext writeandexecute

plt.style.use('ggplot')
sns.set(color_codes=True)

# seed
np.random.seed(69572)

In [64]:
# import sys
# sys.path = sys.path + ['/Users/sampathkumarm/Desktop/devbox/Sam-DS/Kaggle/datadriven']

import scripts

import imp
imp.reload(scripts)

from scripts.sam_value_counts import sam_dataframe_cols_value_count_analysis, sam_dataframe_markup_value_counts
from scripts.sam_confusion_matrix import sam_plot_confusion_matrix, sam_confusion_maxtrix

In [65]:
import sys

from __future__ import absolute_import
from IPython.core.getipython import get_ipython
from IPython.core.magic import (Magics, magics_class,  cell_magic)

try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO

from markdown import markdown
from IPython.core.display import HTML
from IPython.display import display

@magics_class
class MarkdownMagics(Magics):
 
    @cell_magic
    def asmarkdown(self, line, cell):
        buffer = StringIO()
        stdout = sys.stdout
        sys.stdout = buffer
        try:
            exec(cell, locals(), self.shell.user_ns)
        except:
            sys.stdout = stdout
            raise
        sys.stdout = stdout
        return HTML("<p>{}</p>".format(markdown(buffer.getvalue(), extensions=['markdown.extensions.extra'])))
        return buffer.getvalue() + 'test'
    
    def timer_message(self, start_time):
#         print self
        time_diff = (now() - start_time).total_seconds()
        if time_diff < 0.001:
            time_diff = 0
            print('\n<pre>In', time_diff, 'Secs</pre>')
        else:
            print('\n<pre>In', time_diff, 'Secs</pre>')

    @cell_magic
    def timer(self, line, cell):
        import datetime
        now = datetime.datetime.now
        start_time = now()
        buffer = StringIO()
        stdout = sys.stdout
        sys.stdout = buffer
        try:
            exec(cell, locals(), self.shell.user_ns)
            self.timer_message(start_time)
        except:
            sys.stdout = stdout
            raise
        sys.stdout = stdout
        return HTML("<p>{}</p>".format(markdown(buffer.getvalue(), extensions=['markdown.extensions.extra'])))
        return buffer.getvalue() + 'test'
 
get_ipython().register_magics(MarkdownMagics)

Data Analysis


In [116]:
RAW_X = pd.read_csv('data/traning_set_values.csv', index_col='id')
RAW_y = pd.read_csv('data/training_set_labels.csv', index_col='id')
RAW_TEST_X = pd.read_csv('data/test_set_values.csv', index_col='id')

In [117]:
# proportion of labels  available
RAW_y.status_group.value_counts() / RAW_y.size


Out[117]:
functional                 0.543081
non functional             0.384242
functional needs repair    0.072677
Name: status_group, dtype: float64

In [118]:
print('Shape of RAW_X', RAW_X.shape)
print('Shape of RAW_y', RAW_y.shape)
print('Shape of RAW_TEST_X', RAW_TEST_X.shape)

# ('Shape of RAW_X', (59400, 39))
# ('Shape of RAW_y', (59400, 1))
# ('Shape of RAW_TEST_X', (14850, 39))


Shape of RAW_X (59400, 39)
Shape of RAW_y (59400, 1)
Shape of RAW_TEST_X (14850, 39)

In [69]:
for i, col in enumerate(RAW_X.columns):
    print('|%d|%s|%d|' % (i, col, len(RAW_X[col].value_counts())))


|0|amount_tsh|98|
|1|date_recorded|356|
|2|funder|1897|
|3|gps_height|2428|
|4|installer|2145|
|5|longitude|57516|
|6|latitude|57517|
|7|wpt_name|37400|
|8|num_private|65|
|9|basin|9|
|10|subvillage|19287|
|11|region|21|
|12|region_code|27|
|13|district_code|20|
|14|lga|125|
|15|ward|2092|
|16|population|1049|
|17|public_meeting|2|
|18|recorded_by|1|
|19|scheme_management|12|
|20|scheme_name|2696|
|21|permit|2|
|22|construction_year|55|
|23|extraction_type|18|
|24|extraction_type_group|13|
|25|extraction_type_class|7|
|26|management|12|
|27|management_group|5|
|28|payment|7|
|29|payment_type|7|
|30|water_quality|8|
|31|quality_group|6|
|32|quantity|5|
|33|quantity_group|5|
|34|source|10|
|35|source_type|7|
|36|source_class|3|
|37|waterpoint_type|7|
|38|waterpoint_type_group|6|

In [70]:
# integer colums
cols_ints = '''amount_tsh
gps_height
longitude
latitude
num_private
region_code
district_code
population
construction_year'''.splitlines()

# bool
cols_bool = 'public_meeting permit'.split()

# date
cols_date = ['date_recorded']

print('INT COlS: ', len(cols_ints))
print('BOOL COLS:', len(cols_bool))
print('Date COLS:', len(cols_date))


INT COlS:  9
BOOL COLS: 2
Date COLS: 1

In [71]:
len(RAW_X.columns)


Out[71]:
39

In [119]:
def show_object_dtypes(df,others=True):
    dtype = object
    if others:
        return df.dtypes[df.dtypes == dtype]
    else:
        return df.dtypes[df.dtypes != dtype]

In [120]:
show_object_dtypes(RAW_TEST_X, True)


Out[120]:
date_recorded            object
funder                   object
installer                object
wpt_name                 object
basin                    object
subvillage               object
region                   object
lga                      object
ward                     object
public_meeting           object
recorded_by              object
scheme_management        object
scheme_name              object
permit                   object
extraction_type          object
extraction_type_group    object
extraction_type_class    object
management               object
management_group         object
payment                  object
payment_type             object
water_quality            object
quality_group            object
quantity                 object
quantity_group           object
source                   object
source_type              object
source_class             object
waterpoint_type          object
waterpoint_type_group    object
dtype: object

In [121]:
show_object_dtypes(RAW_TEST_X, False)


Out[121]:
amount_tsh           float64
gps_height             int64
longitude            float64
latitude             float64
num_private            int64
region_code            int64
district_code          int64
population             int64
construction_year      int64
dtype: object

cols_values_counts_dataframe

As we can see in above describe output, we seem to have lots of categorical values so let start exploring them a bit.

Lets start taking into believe everything is a Categorical Columns and check their data


In [122]:
columns = RAW_X.columns
values_counts_bag = [len(RAW_X[column].value_counts()) for column in columns]

In [123]:
_ = sns.distplot(values_counts_bag, hist=True, kde=False,)


Example of how np-log transforms data

>>> np.log([0.001, 0.01, 0.1, 1, 10, 100, 1000])

array([-6.90775528, -4.60517019, -2.30258509,  0.        ,  2.30258509,
        4.60517019,  6.90775528])

As you can see in np-log example, we can learn that when a list of values vary significantly(exponentially) then their logarithms moves linearly. As we(I) feel comfortable in studying linear plot and linear information, we did a np.log to values counts.


In [77]:
cols_values_counts_dataframe = pd.DataFrame(np.log(values_counts_bag), index=columns, columns=['Value Counts'])

In [78]:
print('Values Counts:', values_counts_bag)

print('\nLog of Values Counts:', cols_values_counts_dataframe.T.values)

_ = sns.distplot(cols_values_counts_dataframe.T.values, hist=True, kde=False,)

plt.title('Historgram of  Object Feature`s (log2 of) Unique Values counts')
plt.xlabel('Features')


Values Counts: [98, 356, 1897, 2428, 2145, 57516, 57517, 37400, 65, 9, 19287, 21, 27, 20, 125, 2092, 1049, 2, 1, 12, 2696, 2, 55, 18, 13, 7, 12, 5, 7, 7, 8, 6, 5, 5, 10, 7, 3, 7, 6]

Log of Values Counts: [[  4.58496748   5.87493073   7.54802897   7.79482315   7.67089483
   10.95981845  10.95983584  10.52942598   4.17438727   2.19722458
    9.86718657   3.04452244   3.29583687   2.99573227   4.82831374
    7.64587583   6.95559261   0.69314718   0.           2.48490665
    7.89952447   0.69314718   4.00733319   2.89037176   2.56494936
    1.94591015   2.48490665   1.60943791   1.94591015   1.94591015
    2.07944154   1.79175947   1.60943791   1.60943791   2.30258509
    1.94591015   1.09861229   1.94591015   1.79175947]]
Out[78]:
<matplotlib.text.Text at 0x1193e1550>

In [79]:
cols_values_counts_dataframe.plot(kind='barh', figsize=(12, 12))
_ = plt.plot((2, 2), (0, 38))
_ = plt.plot((4, 4), (0, 38), '-g')
_ = plt.plot((6, 6), (0, 38), '-r')
_ = plt.plot((8, 8), (0, 38), '-y')
print('We seem to have some special categories where value counts are high.')

plt.title('Features Values Counts for comparision')
plt.xlabel ('Log of Unique Values')


We seem to have some special categories where value counts are high.
Out[79]:
<matplotlib.text.Text at 0x119477a20>

In [80]:
sam_dataframe_cols_value_count_analysis(RAW_X)


(1, 'waterpoint_type_group', 24)
(2, 'basin', 24)
(3, 'region', 24)
(4, 'region_code', 24)
(5, 'district_code', 24)
(6, 'public_meeting', 24)
(7, 'recorded_by', 24)
(8, 'scheme_management', 24)
(9, 'permit', 24)
(10, 'extraction_type', 24)
(11, 'extraction_type_group', 24)
(12, 'extraction_type_class', 24)
(13, 'management', 24)
(14, 'management_group', 24)
(15, 'payment', 24)
(16, 'payment_type', 24)
(17, 'water_quality', 24)
(18, 'quality_group', 24)
(19, 'quantity', 24)
(20, 'quantity_group', 24)
(21, 'source', 24)
(22, 'source_type', 24)
(23, 'source_class', 24)
(24, 'waterpoint_type', 24)
('Showing Plot for Columns:\n', ['basin', 'region', 'region_code', 'district_code', 'public_meeting', 'recorded_by', 'scheme_management', 'permit', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group'])

Checking rest of the columns


In [81]:
cols_value_count_limit_fraction = 0.01
cols_value_count_limit_log_value = np.log(RAW_X.shape[0] * cols_value_count_limit_fraction)


print('Total Number of Records:', RAW_X.shape[0], '- Log val is:', np.log(RAW_X.shape[0]))
print('%s percent of Number of Records:' % (cols_value_count_limit_fraction * 100),\
      RAW_X.shape[0] * cols_value_count_limit_fraction,\
      ' - Log val is:',  cols_value_count_limit_log_value)


Total Number of Records: 59400 - Log val is: 10.9920495054
1.0 percent of Number of Records: 594.0  - Log val is: 6.38687931936

cols_categorical_check

Here in this project, cols_categorical_check refers to list of columns for which caution check is considered. Reason for this check is, we would need more data to explain other columns & target cols with respect to it.

Lets consider these columns with more 5% of values as non categorical values and since our problem statement is choosing which category, we will try to minimise the category and see how our performance changes(improves or not)

To begin we will consider that those categories with more than cols_value_count_limit_fraction percentage as the upper limit allowed. Any column with other data will pruged to become some to other information


In [82]:
show_object_dtypes(RAW_X, True)


Out[82]:
date_recorded            object
funder                   object
installer                object
wpt_name                 object
basin                    object
subvillage               object
region                   object
lga                      object
ward                     object
public_meeting           object
recorded_by              object
scheme_management        object
scheme_name              object
permit                   object
extraction_type          object
extraction_type_group    object
extraction_type_class    object
management               object
management_group         object
payment                  object
payment_type             object
water_quality            object
quality_group            object
quantity                 object
quantity_group           object
source                   object
source_type              object
source_class             object
waterpoint_type          object
waterpoint_type_group    object
dtype: object

In [83]:
show_object_dtypes(RAW_X, False)


Out[83]:
amount_tsh           float64
gps_height             int64
longitude            float64
latitude             float64
num_private            int64
region_code            int64
district_code          int64
population             int64
construction_year      int64
dtype: object

In [84]:
cols_non_categorical = show_object_dtypes(RAW_X, True).index.tolist()

cols_date_numerics = show_object_dtypes(RAW_X, True).index.tolist()

In [85]:
list(cols_date_numerics)


Out[85]:
['date_recorded',
 'funder',
 'installer',
 'wpt_name',
 'basin',
 'subvillage',
 'region',
 'lga',
 'ward',
 'public_meeting',
 'recorded_by',
 'scheme_management',
 'scheme_name',
 'permit',
 'extraction_type',
 'extraction_type_group',
 'extraction_type_class',
 'management',
 'management_group',
 'payment',
 'payment_type',
 'water_quality',
 'quality_group',
 'quantity',
 'quantity_group',
 'source',
 'source_type',
 'source_class',
 'waterpoint_type',
 'waterpoint_type_group']

In [86]:
cols_categorical_check = []

for col, vc in cols_values_counts_dataframe.iterrows():
    if col in cols_non_categorical:
        if float(vc) > cols_value_count_limit_log_value:
            cols_categorical_check.append(col)

print('Columns we need to moderate are:', cols_categorical_check)


Columns we need to moderate are: ['funder', 'installer', 'wpt_name', 'subvillage', 'ward', 'scheme_name']

All cols_date_numerics, are date & other numeric data which can be made into buckets or reducing precision. Thus we can bound number of categories in data as the more variety of data we have, we need more information specific to each category which all might end with curse of dimensionality.

During pre-processing states we shall do following TODO

  • limiting check experiments on our cols_date_numerics & cols_categorical_check to be under cols_value_count_limit_fraction

In [87]:
print('Log limit for categories:', cols_value_count_limit_log_value)
print('Actual limit for categories:', cols_value_count_limit_fraction * RAW_X.shape[0])

RAW_X[cols_categorical_check].head()


Log limit for categories: 6.38687931936
Actual limit for categories: 594.0
Out[87]:
funder installer wpt_name subvillage ward scheme_name
id
69572 Roman Roman none Mnyusi B Mundindi Roman
8776 Grumeti GRUMETI Zahanati Nyamara Natta NaN
34310 Lottery Club World vision Kwa Mahundi Majengo Ngorika Nyumba ya mungu pipe scheme
67743 Unicef UNICEF Zahanati Ya Nanyumbu Mahakamani Nanyumbu NaN
19728 Action In A Artisan Shuleni Kyanyamisa Nyakasimbi NaN

In [88]:
RAW_X[cols_categorical_check].head(15)


Out[88]:
funder installer wpt_name subvillage ward scheme_name
id
69572 Roman Roman none Mnyusi B Mundindi Roman
8776 Grumeti GRUMETI Zahanati Nyamara Natta NaN
34310 Lottery Club World vision Kwa Mahundi Majengo Ngorika Nyumba ya mungu pipe scheme
67743 Unicef UNICEF Zahanati Ya Nanyumbu Mahakamani Nanyumbu NaN
19728 Action In A Artisan Shuleni Kyanyamisa Nyakasimbi NaN
9944 Mkinga Distric Coun DWE Tajiri Moa/Mwereme Moa Zingibali
19816 Dwsp DWSP Kwa Ngomho Ishinabulandi Samuye NaN
54551 Rwssp DWE Tushirikiane Nyawishi Center Chambo NaN
53934 Wateraid Water Aid Kwa Ramadhan Musa Imalauduki Itetemia NaN
46144 Isingiro Ho Artisan Kwapeto Mkonomre Kaisho NaN
49056 Private Private Mzee Hokororo Mizugo Tambani NaN
50409 Danida DANIDA Kwa Alid Nchimbi Ngondombwito Msindo NaN
36957 World Vision World vision Pamba Nkilifa Busilili NaN
50495 Lawatefuka Water Supply Lawatefuka water sup Kwa John Izack Mmari Omarini Siha Kaskazini BL Bondeni
53752 Biore WEDECO Mwabasabi Mwabasabi Nkoma None

In [89]:
_ = sns.distplot(RAW_X.gps_height, hist=True, kde=False, rug=False)



In [90]:
_ = sns.distplot(RAW_X.population, hist=True, kde=False, rug=False)



In [91]:
_ = sns.jointplot(x='longitude', y='latitude', data=RAW_X)
plt.xlabel('longitude')
plt.ylabel('latitude')


Out[91]:
<matplotlib.text.Text at 0x116c289b0>

In [92]:
%%asmarkdown

# To generate a Markup Table
tmp = sam_dataframe_markup_value_counts(dataframe=RAW_X, max_print_value_counts=10, show_plots=False, figsize=(9, 2))

for each in tmp:
    print(each)


Out[92]:

Col ID Col Name UniqCount Col Values UniqValCount
1 amount_tsh 98
2 date_recorded 356
3 funder 1897
4 gps_height 2428
5 installer 2145
6 longitude 57516
7 latitude 57517
8 wpt_name 37400
9 num_private 65
10 basin 9 Lake Nyasa 5085
- - Pangani 8940
- - Internal 7785
- - Lake Rukwa 2454
- - Lake Tanganyika 6432
- - Wami / Ruvu 5987
- - Ruvuma / Southern Coast 4493
- - Lake Victoria 10248
- - Rufiji 7976
11 subvillage 19287
12 region 21
13 region_code 27
14 district_code 20
15 lga 125
16 ward 2092
17 population 1049
18 public_meeting 2 False 5055
- - True 51011
19 recorded_by 1
20 scheme_management 12
21 scheme_name 2696
22 permit 2 False 17492
- - True 38852
23 construction_year 55
24 extraction_type 18
25 extraction_type_group 13
26 extraction_type_class 7 other 6430
- - submersible 6179
- - handpump 16456
- - gravity 26780
- - wind-powered 117
- - rope pump 451
- - motorpump 2987
27 management 12
28 management_group 5 other 943
- - unknown 561
- - user-group 52490
- - parastatal 1768
- - commercial 3638
29 payment 7 other 1054
- - pay monthly 8300
- - pay annually 3642
- - pay when scheme fails 3914
- - pay per bucket 8985
- - unknown 8157
- - never pay 25348
30 payment_type 7 other 1054
- - monthly 8300
- - per bucket 8985
- - annually 3642
- - unknown 8157
- - on failure 3914
- - never pay 25348
31 water_quality 8 coloured 490
- - fluoride 200
- - milky 804
- - fluoride abandoned 17
- - soft 50818
- - salty abandoned 339
- - salty 4856
- - unknown 1876
32 quality_group 6 good 50818
- - fluoride 217
- - milky 804
- - unknown 1876
- - colored 490
- - salty 5195
33 quantity 5 seasonal 4050
- - unknown 789
- - enough 33186
- - dry 6246
- - insufficient 15129
34 quantity_group 5 seasonal 4050
- - unknown 789
- - enough 33186
- - dry 6246
- - insufficient 15129
35 source 10
36 source_type 7 rainwater harvesting 2295
- - dam 656
- - shallow well 16824
- - other 278
- - river/lake 10377
- - spring 17021
- - borehole 11949
37 source_class 3 surface 13328
- - unknown 278
- - groundwater 45794
38 waterpoint_type 7 other 6380
- - communal standpipe 28522
- - communal standpipe multiple 6103
- - dam 7
- - hand pump 17488
- - cattle trough 116
- - improved spring 784
39 waterpoint_type_group 6 other 6380
- - communal standpipe 34625
- - hand pump 17488
- - dam 7
- - cattle trough 116
- - improved spring 784

Observations & TODO

  • Most of the data seems categorical

  • Need to check cols_date_numerics(TODO1)

    • we shall convert date -> day, month, year, weekday, total_no_of_day_from_reference_point. These splits for two reasons.
      • Reason1: It might be possible that in some location all specific set of complaints are registered on a start/mid/at end of the month. It might also be possible that they are registered on every Monday or so.
      • Reason2: Taking as much information as possible.
  • Need to check cols_categorical_check(TODO2)

    • longitutude & latitude seem to hold (0,0) instead of NULL which is acting as outlier for now
  • Following pairs looks closesly related - cleanup (TODO3)

    • quantity & quantity_group
    • quality_group & water_quality
    • extraction_type, extraction_type_class & extraction_type_group
  • Other - cleanup (TODO4)

    • recorded_by, seems to hold only a single value
    • population & amount_tsh, values are for some given as zero

Data Processing

Generic Transformations

Num/Bool Tranformations

  • date_recorded to Int
  • public_meeting to Int
  • permit to Int
  • longitude to Float(less precision)
  • latitude to Float(less precision)

Precision Description of Longititude and Latitude is available here at below link


In [124]:
# Reloading the data

RAW_X = pd.read_csv('data/traning_set_values.csv', index_col='id')
RAW_y = pd.read_csv('data/training_set_labels.csv', index_col='id')
RAW_TEST_X = pd.read_csv('data/test_set_values.csv', index_col='id')

Int Transformations


In [125]:
import datetime

strptime = datetime.datetime.strptime

DATE_FORMAT = "%Y-%m-%d"
REFERENCE_DATE_POINT = strptime('2000-01-01', DATE_FORMAT)

if RAW_X.date_recorded.dtype == 'O':

    # convert it to datetime format
    f = lambda x: strptime(str(x), DATE_FORMAT)
    RAW_X.date_recorded = RAW_X.date_recorded.apply(f)
    RAW_TEST_X.date_recorded = RAW_TEST_X.date_recorded.apply(f)

    # week day
    f = lambda x: x.weekday()
    RAW_X['date_recorded_weekday'] = RAW_X.date_recorded.apply(f)
    RAW_TEST_X['date_recorded_weekday'] = RAW_TEST_X.date_recorded.apply(f)

    # date
    f = lambda x: x.day
    RAW_X['date_recorded_date'] = RAW_X.date_recorded.apply(f)
    RAW_TEST_X['date_recorded_date'] = RAW_TEST_X.date_recorded.apply(f)

    # month
    f = lambda x: x.month
    RAW_X['date_recorded_month'] = RAW_X.date_recorded.apply(f)
    RAW_TEST_X['date_recorded_month'] = RAW_TEST_X.date_recorded.apply(f)

    # year
    f = lambda x: x.year
    RAW_X['date_recorded_year'] = RAW_X.date_recorded.apply(f)
    RAW_TEST_X['date_recorded_year'] = RAW_TEST_X.date_recorded.apply(f)

    # total days
    f = lambda x: (x - REFERENCE_DATE_POINT).days
    RAW_X.date_recorded = RAW_X.date_recorded.apply(f)
    RAW_TEST_X.date_recorded = RAW_TEST_X.date_recorded.apply(f)

In [126]:
# Longitude & Latitude -- zero values fix

# Filling Missing/OUTLIAR Values
_ = np.mean(RAW_X[u'latitude'][RAW_X.latitude < -1.0].values)

if not RAW_X.loc[RAW_X.latitude >= -1.0, u'latitude'].empty:
    RAW_X.loc[RAW_X.latitude >= -1.0, u'latitude'] = _
    RAW_TEST_X.loc[RAW_TEST_X.latitude >= -1.0, u'latitude'] = _


# Filling Missing/OUTLIAR Values
_ = np.mean(RAW_X[u'longitude'][RAW_X[u'longitude'] > 1.0].values)

if not RAW_X.loc[RAW_X[u'longitude'] <= 1.0, u'longitude'].empty:
    RAW_X.loc[RAW_X[u'longitude'] <= 1.0, u'longitude'] = _
    RAW_TEST_X.loc[RAW_TEST_X[u'longitude'] <= 1.0, u'longitude'] = _

In [127]:
def f(x):
    if x is True:
        return 1
    elif x is False:
        return 2
    else:
        return 3


if (RAW_X.public_meeting.dtype != 'bool') and (RAW_X.permit.dtype != 'bool'):

    # public_meeting
    RAW_X.public_meeting = RAW_X.public_meeting.apply(f)
    RAW_TEST_X.public_meeting = RAW_TEST_X.public_meeting.apply(f)

    # permit
    RAW_X.permit = RAW_X.permit.apply(f)
    RAW_TEST_X.permit = RAW_TEST_X.permit.apply(f)

print('Dtype of public_meetings & permit:',RAW_X.public_meeting.dtype, RAW_X.permit.dtype)
print('')
# checking
if list(RAW_TEST_X.dtypes[RAW_TEST_X.dtypes != RAW_X.dtypes]):
    raise Exception('RAW_X.dtypes and RAW_TEST_X.dtypes are not in Sync')
else:
    print('All in Good Shape')


Dtype of public_meetings & permit: int64 int64

All in Good Shape

In [128]:
show_object_dtypes(RAW_X, True)


Out[128]:
funder                   object
installer                object
wpt_name                 object
basin                    object
subvillage               object
region                   object
lga                      object
ward                     object
recorded_by              object
scheme_management        object
scheme_name              object
extraction_type          object
extraction_type_group    object
extraction_type_class    object
management               object
management_group         object
payment                  object
payment_type             object
water_quality            object
quality_group            object
quantity                 object
quantity_group           object
source                   object
source_type              object
source_class             object
waterpoint_type          object
waterpoint_type_group    object
dtype: object

In [129]:
show_object_dtypes(RAW_X, False)


Out[129]:
amount_tsh               float64
date_recorded              int64
gps_height                 int64
longitude                float64
latitude                 float64
num_private                int64
region_code                int64
district_code              int64
population                 int64
public_meeting             int64
permit                     int64
construction_year          int64
date_recorded_weekday      int64
date_recorded_date         int64
date_recorded_month        int64
date_recorded_year         int64
dtype: object

In [130]:
# Reducing geo location precision to 11 meters
LONG_LAT_PRECISION = 0.001

# Reducing Precision of Lat.
if RAW_X.longitude.mean() < 50:
    RAW_X.longitude = RAW_X.longitude // LONG_LAT_PRECISION
    RAW_X.latitude = RAW_X.latitude // LONG_LAT_PRECISION
    RAW_TEST_X.longitude = RAW_TEST_X.longitude // LONG_LAT_PRECISION
    RAW_TEST_X.latitude = RAW_TEST_X.latitude // LONG_LAT_PRECISION

In [131]:
_ = sns.jointplot(x='longitude', y='latitude', data=RAW_X)


Text Data Tranformations

For cols_categorical_check, we are going to basic clean action like, lower and upper case issue. Clearning of non ascii values.


In [132]:
def text_transformation(name):
    """Cleanup basic text issue in name(input).
    
    Removes text capitalisation, case, space and other non text ascii charecters
        except space.
    """
    if name:
        name = name.lower().strip()
        name = ''.join([i if 96 < ord(i) < 128 else ' ' for i in name])
        if 'and' in name:
            name = name.replace('and', ' ')

        # clear double space
        while '  ' in name:
            name = name.replace('  ', ' ')
        return name.strip()
    return ''

In [133]:
ord(' ')


Out[133]:
32

In [134]:
%%asmarkdown

print('''
|Column|Prev.|Current|
|------|-----|-------|''')
for col in cols_categorical_check:
    aa = len(RAW_X[col].unique())
    RAW_X[col] = RAW_X[col].fillna('').apply(text_transformation)
    RAW_TEST_X[col] = RAW_TEST_X[col].fillna('').apply(text_transformation)
    bb = len(RAW_X[col].unique())
    if aa != bb:
        print('|%s|%i|%i|' % (col, aa, bb))


Out[134]:

Column Prev. Current
funder 1898 1880
installer 2146 1866
wpt_name 37400 36717
subvillage 19288 19175
scheme_name 2697 2485


In [104]:
# saving transformed data
pickle.dump(obj=RAW_X, file=open('tmp\clean_X.pkl', 'wb'))
pickle.dump(RAW_TEST_X, open('tmp\clean_TEST_X.pkl', 'wb'))
# pickle.dump(y, open('tmp\y.pkl', 'wb'))

TEST_X, X = RAW_TEST_X, RAW_X

Custom Labeler

Loading Custom Labeler is for the the purpose of reducing categories varieties by ignoring groups with lower frequencies and covering 80%(default) of the original data.


In [136]:
from collections import defaultdict
from __future__ import print_function
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets

from scripts import sam_custom_labeler

CUST_CATEGORY_LABELER = sam_custom_labeler.CUST_CATEGORY_LABELER

In [137]:
help(CUST_CATEGORY_LABELER)


Help on class CUST_CATEGORY_LABELER in module scripts.sam_custom_labeler:

class CUST_CATEGORY_LABELER(builtins.object)
 |  Custom Mapper Function.
 |  
 |  Based on pd.Series.values_counts, a labler is prepared
 |   to cover one of following details
 |      1. cover top 80% of groups(DEFAULT) (or)
 |      2. top 500 groups
 |  
 |  A special `transform_analysis` function is provided to
 |   understand how value_counts are spread out
 |  
 |  Example:
 |      >>> # Test Data
 |      >>> ss = pd.Series(np.arange(5000) // 5)
 |      >>> ss = ss.map(lambda x: str(x))
 |      >>>
 |      >>> # creating labler
 |      >>> labler = CUST_CATEGORY_LABELER()
 |      >>> labler.fit(funder)
 |      >>>
 |      >>> # testing
 |      >>> _ =  labler.check_group_coverage(90)
 |      90 percentage of GROUPS coverage mean, 1691(in number) groups
 |      >>>
 |      >>> _ =  labler.check_data_coverage(90)
 |      90 percentage of DATA coverage mean, 666 (in number) groups
 |  
 |  Methods defined here:
 |  
 |  __init__(self)
 |      Defaults.
 |  
 |  check_data_coverage(self, data_coverage=None)
 |      Check the data coverage.
 |      
 |      Args:
 |          check_data_coverage(float): Range is (0.0, 100.0)
 |  
 |  check_group_coverage(self, groups_coverage=80)
 |      param: groups_coverage - can be provided as fraction/int.
 |      
 |      To convert fraction into proper count for inter checks.
 |      
 |      Args:
 |          * data_coverage(int): Range between (0 - 100)
 |              percentage(%) of the groups to be covered.
 |  
 |  fit(self, col_data)
 |      Fit the data to class.
 |      
 |      Args:
 |          data(ndarray)
 |  
 |  fit_transform(self, col_data)
 |      Fit data and then transform.
 |  
 |  transform(self, groups_coverage=None)
 |      Default transformation is based on coverage.
 |      
 |      If cumulative sum of groups frequencies then
 |       label is to only cover upto top 80% of groups.
 |  
 |  transform_analysis(self, data_coverage=None, groups_coverage=None)
 |      Post transform data view.
 |      
 |      Args:
 |          * data_coverage(int): Range between (0 - 100)
 |              percentage(%) of the amount data to be covered.
 |      
 |          * groups_coverage(int/float):
 |              Limit the amount groups(variety) coverage. All input can be
 |               provided as fraction or a specific count with in limit.
 |      
 |      Example:
 |          >>> labler = CUST_CATEGORY_LABELER()
 |          >>> labler.fit(RAW_X.funder)
 |          >>>
 |          >>> # to checking report for covering 85.50% data
 |          >>> labler.transform_analysis(data_coverage=85.50)
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors defined here:
 |  
 |  __dict__
 |      dictionary for instance variables (if defined)
 |  
 |  __weakref__
 |      list of weak references to the object (if defined)


In [138]:
labler = CUST_CATEGORY_LABELER()

def select_col(col):
    global labler
    labler = CUST_CATEGORY_LABELER()
    labler.fit(RAW_TEST_X[col])
    print('Selected', col)

ii = interact(select_col, col=['funder', 'installer', 'wpt_name', 'subvillage', 'ward', 'scheme_name'])

# To check data coverage
def f1(data=80):
    labler.check_data_coverage(data_coverage=data)

ii1 = interact(f1, data=(70, 100, .5))

# To check groups coverage
def f2(groups=80):
    labler.check_group_coverage(groups)
    
ii2 = interact(f2, groups=(50, 100., .5))

_ = '''
Please select one of these slider to chose among the
 data coverage or groups coverage
'''


80.0 percentage of GROUPS coverage mean, 777.6(in number) groups
  • funder:
    • 100.0 percentage of DATA coverage mean, 1881 (in number) groups
    • 97.0 percentage of DATA coverage mean, 592 (in number) groups ##
    • 90.5 percentage of DATA coverage mean, 237 (in number) groups
  • installer:

    • 100.0 percentage of DATA coverage mean, 1867 (in number) groups
    • 97.0 percentage of DATA coverage mean, 599 (in number) groups ##
  • wpt_name:

    • 80.0 percentage of DATA coverage mean, 24838 (in number) groups ##
  • subvillage:

    • 80.5 percentage of DATA coverage mean, 8715 (in number) groups ##
    • 83.0 percentage of DATA coverage mean, 9458 (in number) groups
  • ward:
    • 80.0 percentage of DATA coverage mean, 998 (in number) groups ##
    • 91.5 percentage of DATA coverage mean, 1397 (in number) groups
    • 100.0 percentage of DATA coverage mean, 2093 (in number) groups
  • scheme_name:
    • 100.0 percentage of DATA coverage mean, 2486 (in number) groups
    • 91.5 percentage of DATA coverage mean, 870 (in number) groups
    • 80.5 percentage of DATA coverage mean, 363 (in number) groups
    • 85.0 percentage of DATA coverage mean, 524 (in number) groups ##
      NOTE : Marked with double hashes are the selected values for coverage

In [139]:
##################################
######### TESTING ################
#################################

labler = CUST_CATEGORY_LABELER()
labler.fit(X.installer)

# default data coverage is 80
tmp = labler.transform()

print('data coveraged', labler.DATA_COVERAGE_LIMIT)
print('grous coveraged', len(tmp.value_counts()))

print('---------------------')
labler.DATA_COVERAGE_LIMIT = 90
tmp = labler.transform()

print('data coveraged', labler.DATA_COVERAGE_LIMIT)
print('grous coveraged', len(tmp.value_counts()))


80 percentage of DATA coverage mean, 81 (in number) groups
data coveraged 80
grous coveraged 82
---------------------
90 percentage of DATA coverage mean, 203 (in number) groups
data coveraged 90
grous coveraged 204

In [140]:
##################################
######### IMPLEMENT ##############
#################################

if 'custom_labler' not in dir():
    custom_labler = defaultdict(CUST_CATEGORY_LABELER)
    tmp = { 'funder': 97,
      'installer': 97,
      'wpt_name': 80,
      'subvillage': 80,
      'ward': 80,
      'scheme_name': 85
      }

    for col, limit  in tmp.items():
        labler = custom_labler[col]
        labler.DATA_COVERAGE_LIMIT = limit
        labler.fit(X[col])
        print('')
        print('-' * 15, col.upper())

    #     custom_labler[col].check_data_coverage(limit)
        RAW_X[col] = labler.transform()
else:
    print('"custom_labler" seems is already defined, please check')
    
print(RAW_X.shape, RAW_TEST_X.shape, all(RAW_X.columns == RAW_TEST_X.columns))


"custom_labler" seems is already defined, please check
(59400, 43) (14850, 43) True
drop_cols = ['wpt_name',] RAW_X.drop(drop_cols, axis=1, inplace=True) RAW_TEST_X.drop(drop_cols, axis=1, inplace=True) print('Removed Cols:', drop_cols)

Label Encoder

Label Encoder with DefaultDict for quick data transformation http://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn


In [141]:
from collections import defaultdict
from sklearn import preprocessing

In [142]:
print(RAW_X.shape, RAW_TEST_X.shape)


(59400, 43) (14850, 43)

In [143]:
d = defaultdict(preprocessing.LabelEncoder)

RAW_X.scheme_management = RAW_X.scheme_management.fillna('Other')
RAW_TEST_X.scheme_management = RAW_TEST_X.scheme_management.fillna('Other')

# Labels Fit
sam = pd.concat([RAW_X, RAW_TEST_X]).apply(lambda x: d[x.name].fit(x))

# Labels Transform - Training Data
X = RAW_X.apply(lambda x: d[x.name].transform(x))
TEST_X = RAW_TEST_X.apply(lambda x: d[x.name].transform(x))

le = preprocessing.LabelEncoder().fit(RAW_y[u'status_group'])
y = le.transform(RAW_y[u'status_group'])

In [144]:
show_object_dtypes(RAW_X, True)


Out[144]:
funder                   object
installer                object
wpt_name                 object
basin                    object
subvillage               object
region                   object
lga                      object
ward                     object
recorded_by              object
scheme_management        object
scheme_name              object
extraction_type          object
extraction_type_group    object
extraction_type_class    object
management               object
management_group         object
payment                  object
payment_type             object
water_quality            object
quality_group            object
quantity                 object
quantity_group           object
source                   object
source_type              object
source_class             object
waterpoint_type          object
waterpoint_type_group    object
dtype: object

In [145]:
show_object_dtypes(X, True)


Out[145]:
Series([], dtype: object)

In [59]:
sam_dataframe_cols_value_count_analysis(X)


(1, 'date_recorded_year', 28)
(2, 'basin', 28)
(3, 'region', 28)
(4, 'region_code', 28)
(5, 'district_code', 28)
(6, 'public_meeting', 28)
(7, 'recorded_by', 28)
(8, 'scheme_management', 28)
(9, 'permit', 28)
(10, 'extraction_type', 28)
(11, 'extraction_type_group', 28)
(12, 'extraction_type_class', 28)
(13, 'management', 28)
(14, 'management_group', 28)
(15, 'payment', 28)
(16, 'payment_type', 28)
(17, 'water_quality', 28)
(18, 'quality_group', 28)
(19, 'quantity', 28)
(20, 'quantity_group', 28)
(21, 'source', 28)
(22, 'source_type', 28)
(23, 'source_class', 28)
(24, 'waterpoint_type', 28)
(25, 'waterpoint_type_group', 28)
('Showing Plot for Columns:\n', ['basin', 'region', 'region_code', 'district_code', 'public_meeting', 'recorded_by', 'scheme_management', 'permit', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group', 'date_recorded_weekday', 'date_recorded_date', 'date_recorded_month', 'date_recorded_year'])

Pickle

Pickle Save


In [62]:
# saving transformed data
pickle.dump(X, open('tmp\processed_X.pkl', 'wb'))
pickle.dump(TEST_X, open('tmp\processed_TEST_X.pkl', 'wb'))
pickle.dump(y, open('tmp\processed_y.pkl', 'wb'))

# saving label transformers
pickle.dump(d, open('tmp\d.pkl', 'wb'))
pickle.dump(le, open('tmp\le.pkl', 'wb'))

Feature Selection


In [ ]:
X = pickle.load(open('tmp\processed_X.pkl', 'rb'))
TEST_X = pickle.load(open('tmp\processed_TEST_X.pkl', 'rb'))
y = pickle.load(open('tmp\processed_y.pkl', 'rb'))

# # Load this when you are about to do text transformation and submission
# d = pickle.load(open('tmp\d.pkl'))
# le = pickle.load(open('tmp\le.pkl'))

print(X.shape, y.shape, y[:5])

Correlation Threshold

To remove all feature with correlaiton more than 80%


In [182]:
if list(X.dtypes[X.dtypes == 'O']):
    print('Please check there are still some OBJECT COLUMNS PRESENT')
else:
    ss = X.corr().fillna(0)

    # postive or negitive - both good
    ss = ss.applymap(lambda x: x if x and x > 0 else -1 * x)
    
    # wish to know only strong corr
    plt.figure(figsize=(15, 15))
    sns.heatmap(ss)



In [189]:
# wish to know only strong corr
plt.figure(figsize=(18, 18))
sns.heatmap(ss.applymap(lambda x: x if x > 0.90 else 0))


Out[189]:
<matplotlib.axes._subplots.AxesSubplot at 0x11b3e9470>

In [248]:
len(X[_col_].value_counts()), len(X[_row_].value_counts())


Out[248]:
5

In [273]:
np.set_printoptions(precision=2)

bag = []

for _col_ in ss.index:
    for _row_ in ss.columns:
        if _col_ not in bag:
            if (ss[_col_][_row_] > 0.8 and (ss[_col_][_row_] < 1.0)):
                try:
                    print((_col_, len(X[_col_].value_counts()),
                           _row_, len(X[_row_].value_counts()),
                           ss[_col_][_row_]))
                except KeyError:
                    # few extra cols are added
                    pass
#         bag.append(_row_)
#         bag.append(_col_)

del _col_, _row_, bag


('date_recorded', 356, 'date_recorded_year', 5, 0.95920911743658788)
('extraction_type', 18, 'extraction_type_group', 13, 0.94952351098756882)
('extraction_type_group', 13, 'extraction_type', 18, 0.94952351098756882)
('source', 10, 'source_type', 7, 0.94381787586073784)
('source_type', 7, 'source', 10, 0.94381787586073784)
('waterpoint_type', 7, 'waterpoint_type_group', 6, 0.98215380609123037)
('waterpoint_type_group', 6, 'waterpoint_type', 7, 0.98215380609123037)
('date_recorded_year', 5, 'date_recorded', 356, 0.95920911743658788)

In [276]:
%%asmarkdown

print ('''
|Column Name|VCount|Column Name|VCount|Corr|
|-----------|------|-----------|------|----|''')

tmp = '''
('date_recorded', 356, 'date_recorded_year', 5, 0.95920911743658788)
('extraction_type', 18, 'extraction_type_group', 13, 0.94952351098756882)
('extraction_type_group', 13, 'extraction_type', 18, 0.94952351098756882)
('source', 10, 'source_type', 7, 0.94381787586073784)
('source_type', 7, 'source', 10, 0.94381787586073784)
('waterpoint_type', 7, 'waterpoint_type_group', 6, 0.98215380609123037)
('waterpoint_type_group', 6, 'waterpoint_type', 7, 0.98215380609123037)
('date_recorded_year', 5, 'date_recorded', 356, 0.95920911743658788)
'''

while ' ' in tmp:
    tmp = tmp.replace(' ', '')

tmp = tmp.strip().replace('\'', '')
print(tmp.replace(",", '|').replace('(', '|').replace(')', '|'))

del tmp


Out[276]:

Column Name VCount Column Name VCount Corr
date_recorded 356 date_recorded_year 5 0.95920911743658788
extraction_type 18 extraction_type_group 13 0.94952351098756882
extraction_type_group 13 extraction_type 18 0.94952351098756882
source 10 source_type 7 0.94381787586073784
source_type 7 source 10 0.94381787586073784
waterpoint_type 7 waterpoint_type_group 6 0.98215380609123037
waterpoint_type_group 6 waterpoint_type 7 0.98215380609123037
date_recorded_year 5 date_recorded 356 0.95920911743658788


In [285]:
from sklearn.feature_selection import chi2

X['date_recorded'].shape


Out[285]:
(59400,)

In [288]:
X.dtypes


Out[288]:
amount_tsh               int64
date_recorded            int64
funder                   int64
gps_height               int64
installer                int64
longitude                int64
latitude                 int64
wpt_name                 int64
num_private              int64
basin                    int64
subvillage               int64
region                   int64
region_code              int64
district_code            int64
lga                      int64
ward                     int64
population               int64
public_meeting           int64
recorded_by              int64
scheme_management        int64
scheme_name              int64
permit                   int64
construction_year        int64
extraction_type          int64
extraction_type_group    int64
extraction_type_class    int64
management               int64
management_group         int64
payment                  int64
payment_type             int64
water_quality            int64
quality_group            int64
quantity                 int64
quantity_group           int64
source                   int64
source_type              int64
source_class             int64
waterpoint_type          int64
waterpoint_type_group    int64
date_recorded_weekday    int64
date_recorded_date       int64
date_recorded_month      int64
date_recorded_year       int64
dtype: object

In [289]:
from scripts.sam_variance_check import get_low_variance_columns

In [290]:
X, removed_features, ranking_variance_thresholds = get_low_variance_columns(dframe=X,
                                                                            threshold=(0.85 * (1 - 0.85)),
                                                                            autoremove=True)

print('\nLow Variance Columns', removed_features)
print('Shape of X is', X.shape)


Finding low-variance features.
Found 1 low-variance columns.
                
Removing low-variance features.
Reassembling the dataframe (with low-variance features removed).
Succesfully removed low-variance columns.

Low Variance Columns ['recorded_by']
Shape of X is (59400, 42)

In [291]:
if removed_features:
    TEST_X.drop(removed_features, axis=1, inplace=True)
    print('cleanup completed!')


cleanup completed!

In [292]:
print('Shape of X is', X.shape)
print('Shape of TEST_X is', TEST_X.shape)


Shape of X is (59400, 42)
Shape of TEST_X is (14850, 42)

Select K Best

  • For regression: f_regression, mutual_info_regression
  • For classification: chi2, f_classif, mutual_info_classif

Random Forest Classifier score: RandomForestClassifier(n_estimators=150, criterion='entropy', class_weight="balanced_subsample", n_jobs=-1)

  • chi2 0.81225589225589223
  • f_classic 0.81138047138047142
  • mutual_info_classif 0.81037037037037041

In [293]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif

In [294]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def game(X, y):
#     print(X.shape, y.shape[0])
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
    
    clf_rf = RandomForestClassifier(n_jobs=-1, random_state=192)
    clf_rf = clf_rf.fit(X_train, y_train)
    
    train_score = clf_rf.score(X_train, y_train)
    test_score = clf_rf.score(X_test, y_test)
#     print('Train Score', train_score)
#     print('Test  Score', test_score)
    return train_score, test_score

In [295]:
X.shape, y.shape


Out[295]:
((59400, 42), (59400,))
ranking_selectkbest = dict(zip(cols_names, fit.scores_)) kbest_selected_cols = [_ for _ in cols_names[:kbest_cols]] % pprint ranking_selectkbest print('Removed Columns:\n\t', ','.join([ _ for _ in X.columns if _ not in kbest_selected_cols ])) print('\nSelected Columns:\n\t', ','.join(kbest_selected_cols))

In [296]:
kbest_cols = 26

for fns in [chi2, f_classif, mutual_info_classif]:
    print((fns,game(SelectKBest(score_func=fns, k=kbest_cols).fit(X, y).transform(X), y)))


(<function chi2 at 0x11d9afea0>, (0.98410774410774415, 0.79912457912457913))
(<function f_classif at 0x11d9af598>, (0.97263748597081934, 0.79050505050505049))
(<function mutual_info_classif at 0x119229ae8>, (0.98356902356902354, 0.79569023569023567))

In [297]:
print('''
(chi2, 0.98428731762065091, 0.79966329966329963)
(f_classif, 0.97432098765432096, 0.79286195286195282)
(mutual_info_classif, 0.98410774410774415, 0.79447811447811445)
'''.replace('(', '|').replace(')', '|').replace(', ', '|'))


|chi2|0.98428731762065091|0.79966329966329963|
|f_classif|0.97432098765432096|0.79286195286195282|
|mutual_info_classif|0.98410774410774415|0.79447811447811445|

bag = [] kbest_cols = 40 # for k in range(1, 40, 4): # for k in range(23, 33, 2): for k in range(26, 29): kbest_cols = k fit = SelectKBest(score_func=chi2, k=kbest_cols).fit(X, y) cols_names = X.columns kbest_selected_cols = [_ for _ in cols_names[:kbest_cols]] kbest_X = pd.DataFrame(fit.transform(X)) kbest_TEST_X = pd.DataFrame(fit.transform(TEST_X)) # kbest_X.columns = kbest_selected_cols # kbest_TEST_X.columns = kbest_selected_cols # print('Before KBest', X.shape, TEST_X.shape, len(y)) # print('After KBest', kbest_X.shape, kbest_TEST_X.shape, len(y)) train_score, test_score = game(kbest_X, y) bag.append({'cols': kbest_cols, 'train': train_score, 'test': test_score}) print(', '.join(kbest_selected_cols).upper()) bag

kbest conclusion :

Best selected columns

AMOUNT_TSH, DATE_RECORDED, FUNDER, GPS_HEIGHT, INSTALLER, LONGITUDE, LATITUDE, NUM_PRIVATE, BASIN, SUBVILLAGE, REGION, REGION_CODE, DISTRICT_CODE, LGA, WARD, POPULATION, PUBLIC_MEETING, SCHEME_MANAGEMENT, SCHEME_NAME, PERMIT, CONSTRUCTION_YEAR, EXTRACTION_TYPE, EXTRACTION_TYPE_GROUP, EXTRACTION_TYPE_CLASS, MANAGEMENT, MANAGEMENT_GROUP, PAYMENT, PAYMENT_TYPE
# results of previous runs
[{'cols': 1, 'test': 0.52659932659932662, 'train': 0.57483726150392822},
 {'cols': 5, 'test': 0.68962962962962959, 'train': 0.94240179573512906},
 {'cols': 9, 'test': 0.7211447811447812, 'train': 0.97638608305274976},
 {'cols': 13, 'test': 0.75380471380471381, 'train': 0.97955106621773291},
 {'cols': 17, 'test': 0.76134680134680133, 'train': 0.98071829405162736},
 {'cols': 21, 'test': 0.76511784511784509, 'train': 0.98076318742985413},
 {'cols': 25, 'test': 0.80033670033670035, 'train': 0.98316498316498313},
 {'cols': 29, 'test': 0.80053872053872055, 'train': 0.98379349046015707},
 {'cols': 33, 'test': 0.80040404040404045, 'train': 0.98390572390572395},
 {'cols': 37, 'test': 0.79993265993265994, 'train': 0.98341189674523011}]

[{'cols': 23, 'test': 0.7976430976430976, 'train': 0.9836812570145903},
 {'cols': 25, 'test': 0.80033670033670035, 'train': 0.98316498316498313},
 {'cols': 27, 'test': 0.80101010101010106, 'train': 0.9829405162738496},
 {'cols': 29, 'test': 0.80053872053872055, 'train': 0.98379349046015707},
 {'cols': 31, 'test': 0.80000000000000004, 'train': 0.98381593714927051}]

[{'cols': 26, 'test': 0.80309764309764309, 'train': 0.98359147025813698},
 {'cols': 27, 'test': 0.80101010101010106, 'train': 0.9829405162738496},
 {'cols': 28, 'test': 0.80222222222222217, 'train': 0.98334455667789}]

As per Okham Razor's rules, we are going to select the simplest and well performing. Luckily, we have got kbest_selected_cols at 26 which is comparitively top performer among other K-selections and also lower than actualy number of columns


In [298]:
kbest_cols = 26

fit = SelectKBest(score_func=chi2, k=kbest_cols).fit(X, y)
cols_names = X.columns
kbest_selected_cols =  [_ for _ in cols_names[:kbest_cols]]

kbest_X = pd.DataFrame(fit.transform(X))
kbest_TEST_X = pd.DataFrame(fit.transform(TEST_X))

In [299]:
kbest_X.shape, kbest_TEST_X.shape, y.shape


Out[299]:
((59400, 26), (14850, 26), (59400,))

In [300]:
pickle.dump(kbest_X, open('tmp\kbest_X.pkl', 'wb'))
pickle.dump(kbest_TEST_X, open('tmp\kbest_TEST_X.pkl', 'wb'))
pickle.dump(y, open('tmp\kbest_y.pkl', 'wb'))

PCA


In [301]:
load = 2

if load ==1:
    # this will load kbest
    print('Loading KBest Processed Data')
    X = pickle.load(open('tmp\kbest_X.pkl', 'rb'))
    TEST_X = pickle.load(open('tmp\kbest_TEST_X.pkl', 'rb'))
    y = pickle.load(open('tmp\kbest_y.pkl', 'rb'))
elif load ==2:
    # this will load processed data
    print('Loading normal Processed Data')
    X = pickle.load(open('tmp\processed_X.pkl', 'rb'))
    TEST_X = pickle.load(open('tmp\processed_TEST_X.pkl', 'rb'))

# # y = pickle.load(open('tmp\processed_y.pkl'))


Loading normal Processed Data

PCA


In [302]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [303]:
X.shape


Out[303]:
(59400, 43)

In [304]:
# feature extraction
pca = PCA(n_components=30)
fit = pca.fit(X)

plt.figure(figsize=(12, 3))

_ = plt.scatter (range(len(fit.explained_variance_ratio_)), fit.explained_variance_ratio_.cumsum())

_ = plt.xlabel('cumilative sum of explained variance')
_ = plt.ylabel('score')


print(fit.explained_variance_ratio_.cumsum())
print()
print(('Score', game(pca.transform(X), y)))


# (0.97580246913580249, 0.60511784511784517) # KBest dataset
# (0.97564534231200895, 0.60552188552188557) # Normal Dataset


[ 0.8   0.9   0.96  0.99  0.99  1.    1.    1.    1.    1.    1.    1.    1.
  1.    1.    1.    1.    1.    1.    1.    1.    1.    1.    1.    1.    1.
  1.    1.    1.    1.  ]

('Score', (0.98552188552188558, 0.7781818181818182))

In [305]:
ss = pd.DataFrame(fit.components_)
ss = ss.applymap(lambda x: x if x > 0 else -1 * x)
display(ss.describe().T)

ss.plot(kind='bar', figsize=(125, 10))


count mean std min 25% 50% 75% max
0 30.0 5.264466e-02 1.776436e-01 3.066936e-05 0.002936 5.765839e-03 1.470519e-02 8.611451e-01
1 30.0 4.268263e-02 1.805436e-01 6.822979e-05 0.002029 3.609913e-03 1.266789e-02 9.957240e-01
2 30.0 5.619575e-02 1.766802e-01 2.795439e-05 0.000310 8.516267e-04 4.805287e-03 7.605836e-01
3 30.0 6.128836e-02 1.749199e-01 1.156348e-05 0.000272 1.745133e-03 1.154194e-02 8.095676e-01
4 30.0 5.542420e-02 1.769322e-01 2.312065e-05 0.000199 5.290925e-04 8.663409e-03 7.468425e-01
5 30.0 5.051806e-02 1.784452e-01 1.680544e-06 0.000161 3.803534e-04 3.975645e-03 7.725976e-01
6 30.0 5.121032e-02 1.782409e-01 1.508064e-05 0.000078 3.240449e-04 1.022508e-02 7.723962e-01
7 30.0 3.472833e-02 1.823050e-01 3.419095e-07 0.000004 9.423929e-06 5.997757e-04 9.997252e-01
8 30.0 6.470633e-02 1.736035e-01 6.221466e-07 0.000163 2.913288e-03 3.655878e-02 8.314784e-01
9 30.0 7.921026e-02 1.663410e-01 1.727048e-06 0.000400 9.006896e-03 4.520668e-02 6.911450e-01
10 30.0 3.740845e-02 1.817556e-01 6.543990e-07 0.000010 2.491947e-05 1.717698e-03 9.982215e-01
11 30.0 7.840541e-02 1.676441e-01 1.100873e-05 0.001062 9.910642e-03 6.561904e-02 6.748640e-01
12 30.0 6.984921e-02 1.715041e-01 1.098525e-05 0.001527 1.052479e-02 4.993550e-02 8.420199e-01
13 30.0 8.629159e-02 1.634623e-01 8.256655e-07 0.000558 1.480295e-02 1.052436e-01 7.931873e-01
14 30.0 4.144004e-02 1.808467e-01 1.565410e-05 0.001588 3.719466e-03 1.827063e-02 9.975896e-01
15 30.0 5.158451e-02 1.781293e-01 1.741198e-05 0.000082 2.249986e-04 8.506731e-03 8.282855e-01
16 30.0 5.239932e-02 1.778831e-01 1.409580e-04 0.000339 1.777523e-03 1.663163e-02 9.591436e-01
17 30.0 7.186709e-03 1.165815e-02 1.827096e-07 0.000047 2.581867e-03 7.245337e-03 5.017586e-02
18 30.0 3.336819e-17 4.963148e-17 -0.000000e+00 0.000000 5.505714e-20 5.551115e-17 1.804112e-16
19 30.0 9.037559e-02 1.603736e-01 1.010811e-05 0.000330 9.664426e-03 9.996771e-02 6.265809e-01
20 30.0 6.211051e-02 1.746196e-01 3.191952e-05 0.000256 7.286919e-04 8.350287e-03 6.770724e-01
21 30.0 1.119623e-02 1.680378e-02 5.902775e-08 0.000084 2.381170e-03 1.431135e-02 5.904086e-02
22 30.0 5.453542e-02 1.772159e-01 2.157749e-05 0.002852 5.227284e-03 1.833292e-02 8.602716e-01
23 30.0 8.423729e-02 1.432741e-01 1.544605e-06 0.000559 1.370984e-02 7.302268e-02 5.287920e-01
24 30.0 6.512927e-02 1.143630e-01 2.154448e-06 0.000751 1.080049e-02 7.076019e-02 4.278898e-01
25 30.0 5.975461e-02 1.349504e-01 2.600068e-06 0.000360 6.078542e-03 2.426885e-02 5.540015e-01
26 30.0 8.589454e-02 1.474672e-01 6.368114e-06 0.000185 6.729705e-03 1.241829e-01 6.021727e-01
27 30.0 3.566714e-02 6.451389e-02 1.418986e-07 0.000082 4.677488e-03 5.579973e-02 2.994893e-01
28 30.0 7.898384e-02 1.321893e-01 1.552095e-07 0.000261 1.441977e-02 9.583476e-02 5.094964e-01
29 30.0 6.045190e-02 9.003434e-02 7.602828e-06 0.000296 1.820820e-02 8.656660e-02 3.262628e-01
30 30.0 1.105546e-02 2.038700e-02 5.019569e-07 0.000140 1.066348e-03 7.806508e-03 6.510965e-02
31 30.0 1.126054e-02 1.717433e-02 1.447720e-06 0.000109 2.192364e-03 1.602182e-02 7.455432e-02
32 30.0 4.383445e-02 1.229576e-01 1.493985e-06 0.000068 3.036652e-03 1.744089e-02 4.999902e-01
33 30.0 4.383445e-02 1.229576e-01 1.493985e-06 0.000068 3.036652e-03 1.744089e-02 4.999902e-01
34 30.0 6.732498e-02 1.050369e-01 9.335828e-07 0.000380 2.000375e-02 8.911797e-02 4.649121e-01
35 30.0 7.507680e-02 1.130867e-01 1.813865e-07 0.000428 1.733277e-02 1.185510e-01 4.958592e-01
36 30.0 9.107952e-03 1.541552e-02 7.065533e-07 0.000058 1.870569e-03 1.079122e-02 6.098258e-02
37 30.0 6.488749e-02 1.287179e-01 3.217442e-06 0.000593 1.069135e-02 6.687406e-02 5.972969e-01
38 30.0 5.189630e-02 1.009729e-01 2.145346e-06 0.000354 1.177189e-02 5.937794e-02 4.666592e-01
39 30.0 5.473117e-02 1.771367e-01 9.233950e-08 0.000071 1.147458e-03 1.327330e-02 9.078710e-01
40 30.0 4.618602e-02 1.796512e-01 4.938538e-06 0.000577 3.669242e-03 1.390761e-02 9.883904e-01
41 30.0 7.356021e-02 1.691821e-01 3.337361e-06 0.000808 1.592026e-02 6.103160e-02 8.853109e-01
42 30.0 5.776790e-03 1.280072e-02 4.976312e-06 0.000332 1.674403e-03 5.824429e-03 6.893185e-02
Out[305]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a807550>

In [306]:
# feature extraction
lda = LinearDiscriminantAnalysis(n_components=16)
fit = lda.fit(X, y)

plt.figure(figsize=(12, 3))

_ = plt.scatter (range(len(fit.explained_variance_ratio_)), fit.explained_variance_ratio_.cumsum())

_ = plt.xlabel('cumilative sum of explained variance')
_ = plt.ylabel('score')


print(fit.explained_variance_ratio_.cumsum())

print(('\nScore', game(lda.transform(X), y)))


# (0.97580246913580249, 0.60511784511784517) # KBest dataset
# (0.97564534231200895, 0.60552188552188557) # Normal Dataset


/Users/sampathm/miniconda3/lib/python3.5/site-packages/sklearn/discriminant_analysis.py:387: UserWarning: Variables are collinear.
  warnings.warn("Variables are collinear.")
[ 0.84  1.  ]
('\nScore', (0.97340067340067338, 0.64296296296296296))

In [307]:
ss = pd.DataFrame(fit.coef_)
ss = ss.applymap(lambda x: x if x > 0 else -1 * x)
display(ss.describe().T)

ss.plot(kind='bar', figsize=(125, 10))


count mean std min 25% 50% 75% max
0 3.0 8.337468e-03 6.804167e-03 8.299133e-04 5.457564e-03 1.008521e-02 1.209124e-02 1.409727e-02
1 3.0 1.495772e-03 1.480684e-03 3.053713e-04 6.667497e-04 1.028128e-03 2.090972e-03 3.153815e-03
2 3.0 1.396777e-04 7.433377e-05 6.421568e-05 1.031019e-04 1.419882e-04 1.774088e-04 2.128293e-04
3 3.0 1.505486e-04 1.182363e-04 4.900360e-05 8.564571e-05 1.222878e-04 2.013211e-04 2.803545e-04
4 3.0 7.289322e-05 2.376144e-05 4.844370e-05 6.138937e-05 7.433504e-05 8.511798e-05 9.590092e-05
5 3.0 4.974313e-05 5.133707e-05 1.519399e-05 2.024762e-05 2.530125e-05 6.701770e-05 1.087341e-04
6 3.0 2.195375e-05 1.729867e-05 3.061823e-06 1.442160e-05 2.578138e-05 3.139971e-05 3.701803e-05
7 3.0 5.517552e-07 7.558838e-07 9.259986e-08 1.155466e-07 1.384934e-07 7.813328e-07 1.424172e-06
8 3.0 1.860713e-03 2.580830e-03 3.218733e-04 3.709395e-04 4.200058e-04 2.630133e-03 4.840260e-03
9 3.0 4.772289e-02 1.273324e-02 3.841346e-02 4.046765e-02 4.252183e-02 5.237760e-02 6.223338e-02
10 3.0 2.555641e-06 7.267233e-07 2.112506e-06 2.136292e-06 2.160077e-06 2.777208e-06 3.394339e-06
11 3.0 6.599947e-03 2.308804e-03 5.093635e-03 5.270892e-03 5.448149e-03 7.353103e-03 9.258056e-03
12 3.0 1.067965e-02 6.871940e-03 4.397211e-03 7.010128e-03 9.623045e-03 1.382087e-02 1.801870e-02
13 3.0 2.067925e-02 2.309495e-02 2.684797e-03 7.658224e-03 1.263165e-02 2.967648e-02 4.672132e-02
14 3.0 2.783953e-03 2.387193e-03 7.862720e-04 1.462090e-03 2.137909e-03 3.782793e-03 5.427677e-03
15 3.0 1.669431e-05 2.360449e-05 1.203457e-06 3.110820e-06 5.018182e-06 2.443974e-05 4.386130e-05
16 3.0 9.747306e-05 8.501960e-05 3.647203e-06 6.150795e-05 1.193687e-04 1.443860e-04 1.694033e-04
17 3.0 5.389127e-02 3.736855e-02 1.376729e-02 3.698717e-02 6.020705e-02 7.395326e-02 8.769947e-02
18 3.0 1.338180e-16 1.872716e-16 1.657216e-17 2.582890e-17 3.508564e-17 1.924410e-16 3.497963e-16
19 3.0 1.782513e-02 2.404402e-02 1.237305e-03 4.037789e-03 6.838272e-03 2.611904e-02 4.539980e-02
20 3.0 1.197332e-04 7.240755e-05 4.387354e-05 8.554620e-05 1.272189e-04 1.576630e-04 1.881071e-04
21 3.0 1.474554e-01 1.454648e-01 3.039995e-02 6.602955e-02 1.016591e-01 2.059832e-01 3.103072e-01
22 3.0 7.850430e-03 2.252974e-03 5.581121e-03 6.732293e-03 7.883464e-03 8.985085e-03 1.008671e-02
23 3.0 2.558823e-02 2.935264e-02 5.589832e-03 8.739308e-03 1.188878e-02 3.558742e-02 5.928606e-02
24 3.0 5.506476e-02 1.756721e-02 4.316921e-02 4.497612e-02 4.678303e-02 6.101254e-02 7.524204e-02
25 3.0 1.406357e-01 5.590006e-02 9.075016e-02 1.104275e-01 1.301047e-01 1.655785e-01 2.010523e-01
26 3.0 5.262338e-02 4.049202e-02 8.525442e-03 3.486961e-02 6.121379e-02 7.467235e-02 8.813091e-02
27 3.0 9.821851e-02 4.667700e-02 4.817481e-02 7.704065e-02 1.059065e-01 1.232404e-01 1.405742e-01
28 3.0 2.519284e-02 9.449109e-03 1.711129e-02 1.999825e-02 2.288522e-02 2.923361e-02 3.558200e-02
29 3.0 5.616476e-02 7.611849e-02 5.880103e-03 1.237816e-02 1.887621e-02 8.130709e-02 1.437380e-01
30 3.0 5.800423e-02 1.505242e-02 4.579136e-02 4.959588e-02 5.340041e-02 6.411067e-02 7.482093e-02
31 3.0 1.422329e-01 4.819415e-02 1.049519e-01 1.150222e-01 1.250925e-01 1.608734e-01 1.966543e-01
32 3.0 1.212447e-01 4.119230e-02 7.493459e-02 1.049678e-01 1.350010e-01 1.443997e-01 1.537985e-01
33 3.0 1.212447e-01 4.119230e-02 7.493459e-02 1.049678e-01 1.350010e-01 1.443997e-01 1.537985e-01
34 3.0 1.492370e-01 9.395345e-02 6.272971e-02 9.926144e-02 1.357932e-01 1.924907e-01 2.491883e-01
35 3.0 1.084671e-01 5.423996e-02 5.451978e-02 8.120318e-02 1.078866e-01 1.354408e-01 1.629950e-01
36 3.0 3.432188e-01 3.503985e-01 1.077237e-01 1.418796e-01 1.760354e-01 4.609664e-01 7.458973e-01
37 3.0 1.406672e-01 1.539974e-01 3.629541e-02 5.223427e-02 6.817313e-02 1.928531e-01 3.175332e-01
38 3.0 2.638073e-01 2.567213e-01 5.646120e-02 1.202355e-01 1.840097e-01 3.674804e-01 5.509511e-01
39 3.0 3.952694e-03 1.596501e-03 2.966568e-03 3.031722e-03 3.096875e-03 4.445757e-03 5.794639e-03
40 3.0 3.721769e-03 5.211595e-03 1.013048e-04 7.351925e-04 1.369080e-03 5.532001e-03 9.694921e-03
41 3.0 3.023130e-02 4.177840e-02 5.882133e-03 6.110885e-03 6.339636e-03 4.240588e-02 7.847212e-02
42 3.0 1.502867e-01 8.717312e-02 5.979505e-02 1.085746e-01 1.573541e-01 1.955326e-01 2.337110e-01
Out[307]:
<matplotlib.axes._subplots.AxesSubplot at 0x11f54c780>

In [308]:
X = pca.transform(X)
TEST_X = pca.transform(TEST_X)

In [309]:
X.shape, TEST_X.shape


Out[309]:
((59400, 30), (14850, 30))

Saving Processed Data


In [310]:
pickle.dump(X, open('tmp\pca_X.pkl', 'wb'))
pickle.dump(TEST_X, open('tmp\pca_TEST_X.pkl', 'wb'))
# pickle.dump(y, open('tmp\pca_y.pkl', 'wb'))

Unsupervised Learning

  • Unsupervised Learning Exploration(Gaussian Process, Neural Nets)

Loading Pre-Processed Data


In [311]:
load = 2

if load == 1:
    print('Loading PCA Processed Data')
    X = pickle.load(open('tmp\pca_X.pkl', 'rb'))
    TEST_X = pickle.load(open('tmp\pca_TEST_X.pkl', 'rb'))

elif load == 2:
    # this will load kbest
    print('Loading KBest Processed Data')
    X = pickle.load(open('tmp\kbest_X.pkl', 'rb'))
    TEST_X = pickle.load(open('tmp\kbest_TEST_X.pkl', 'rb'))
elif load == 3:
    # this will load processed data
    print('Loading normal Processed Data')
    X = pickle.load(open('tmp\processed_X.pkl', 'rb'))
    TEST_X = pickle.load(open('tmp\processed_TEST_X.pkl', 'rb'))

# # y = pickle.load(open('tmp\processed_y.pkl'))


Loading KBest Processed Data

In [312]:
print(X.shape, y.shape, TEST_X.shape)


(59400, 26) (59400,) (14850, 26)

Gaussian


In [313]:
from sklearn.mixture import GaussianMixture as GMM
from sklearn.metrics import silhouette_score

In [314]:
# For future analysis
GMM_Centers = []

__check_for  = 1000

print ('clusters | score for top 1000')

for i in range(2, 7):
    # TODO: Apply your clustering algorithm of choice to the reduced data 
    clusterer = GMM(n_components=i, random_state=42)
    clusterer.fit(X)

    # TODO: Predict the cluster for each data point
    preds = clusterer.predict(X)

    # TODO: Find the cluster centers
    GMM_Centers.append(clusterer.means_)

    # score = silhouette_score(X, preds)
    score = silhouette_score(X[:__check_for], preds[:__check_for])

    print(i, score)
    
# clusters | score for top 1000
# 2 0.484879234998
# 3 0.377180934294
# 4 0.334333476259
# 5 0.29213724894
# 6 0.27643712696


clusters | score for top 1000
2 0.0136799380517
3 0.00581313038817
4 -0.0236377736025
5 -0.0671052621016
6 -0.0265095892177

KMeans


In [315]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [316]:
# For future analysis
KMM_Centers = []

# Testing each category
for i in range(2, 7):

    clusterer = KMeans(init='k-means++', n_clusters=i, n_init=10)
    clusterer.fit(X)

    preds = clusterer.predict(X)

    centers = clusterer.cluster_centers_
    
    KMM_Centers.append(centers)

#     score = silhouette_score(X, preds)
    score = silhouette_score(X[:__check_for], preds[:__check_for])
    print(i, score)
    
# clusters | score for top 1000
# 2 0.502005229628
# 3 0.377168744959
# 4 0.325091546516
# 5 0.303811069492
# 6 0.304265445159


2 0.54644092119
3 0.398792236781
4 0.301272045129
5 0.273347089426
6 0.259803278604

In [317]:
i = 2

clusterer = KMeans(init='k-means++', n_clusters=i, n_init=10)
clusterer.fit(X)
preds = clusterer.predict(X)

In [318]:
score = silhouette_score(X[:__check_for], preds[:__check_for])
print(i, score)


2 0.54644092119

In [319]:
print(X.shape, TEST_X.shape)


(59400, 26) (14850, 26)

In [320]:
X = pd.DataFrame(X)
X['new'] = clusterer.predict(X)

In [321]:
TEST_X = pd.DataFrame(TEST_X)
TEST_X['new'] = clusterer.predict(TEST_X)

In [322]:
print(X.shape, TEST_X.shape)


(59400, 27) (14850, 27)

Supervised Learning

  • Supervised Learning(GBT Trees, Nearest Neighbours, RF, One-vs-One)

Test-Train Split


In [326]:
from sklearn.model_selection import train_test_split

load = 3

if load == 1:
    print('Loading PCA Processed Data')
    X = pickle.load(open('tmp\pca_X.pkl', 'rb'))
    TEST_X = pickle.load(open('tmp\pca_TEST_X.pkl', 'rb'))

elif load == 2:
    # this will load kbest
    print('Loading KBest Processed Data')
    X = pickle.load(open('tmp\kbest_X.pkl', 'rb'))
    TEST_X = pickle.load(open('tmp\kbest_TEST_X.pkl', 'rb'))
elif load == 3:
    # this will load processed data
    print('Loading normal Processed Data')
    X = pickle.load(open('tmp\processed_X.pkl', 'rb'))
    TEST_X = pickle.load(open('tmp\processed_TEST_X.pkl', 'rb'))

y = pickle.load(open('tmp\processed_y.pkl', 'rb'))


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

print(X.shape, y.shape)


Loading normal Processed Data
(59400, 43) (59400,)

GBT Trees


In [327]:
from sklearn.ensemble import GradientBoostingClassifier

In [328]:
clf_gbt = GradientBoostingClassifier(random_state=192)

clf_gbt = clf_gbt.fit(X_train, y_train)

print('score:', clf_gbt.score(X_test, y_test))

# ('score:', 0.75252525252525249) k_best score

# ('score:', 0.75400673400673401) preprocessed


score: 0.754478114478

Nearest Neighbours


In [329]:
from sklearn.neighbors import KNeighborsClassifier

In [330]:
# modelling
clf_knn = KNeighborsClassifier()
clf_knn.fit(X_test, y_test)

# score
clf_knn.score(X_train, y_train)

# 0.55842873176206509 k_best
# 0.55840628507295176 preprocessed


Out[330]:
0.55809203142536479

Random Forest


In [331]:
from sklearn.ensemble import RandomForestClassifier

In [332]:
clf_rf = RandomForestClassifier(random_state=192)
clf_rf = clf_rf.fit(X_train, y_train)

print('Score:' + str(clf_rf.score(X_test, y_test)))

# 0.79542087542087547 # (n_jobs=-1, random_state=192)
# 0.800942760943 k_best
# 0.8


Score:0.800336700337

In [333]:
print(list(zip(X.columns, clf_rf.feature_importances_)))


[('amount_tsh', 0.02146536138185149), ('date_recorded', 0.037974199115599641), ('funder', 0.030325938983991507), ('gps_height', 0.043389708344340985), ('installer', 0.022601464611514462), ('longitude', 0.074757246698181165), ('latitude', 0.069779960754673825), ('wpt_name', 0.052531093768187799), ('num_private', 0.00085531362401935834), ('basin', 0.0097794385940230006), ('subvillage', 0.045864844570713298), ('region', 0.010343928629942076), ('region_code', 0.013028343029034679), ('district_code', 0.014224576969325118), ('lga', 0.018912299830737403), ('ward', 0.024664127564419373), ('population', 0.031073011950909953), ('public_meeting', 0.0058449880064946561), ('recorded_by', 0.0), ('scheme_management', 0.01129753512379114), ('scheme_name', 0.017967903794259128), ('permit', 0.0060762408972778606), ('construction_year', 0.0307014820983606), ('extraction_type', 0.012930964287103133), ('extraction_type_group', 0.015486706002440337), ('extraction_type_class', 0.020069837797164772), ('management', 0.010962881901000164), ('management_group', 0.0061010181344350791), ('payment', 0.018674287804992432), ('payment_type', 0.0099973532478958398), ('water_quality', 0.0090763819563593369), ('quality_group', 0.0060378716297362755), ('quantity', 0.065465797479158608), ('quantity_group', 0.077355131193541871), ('source', 0.013859347353133804), ('source_type', 0.0074974134844467853), ('source_class', 0.0037229724940747353), ('waterpoint_type', 0.041010612379949227), ('waterpoint_type_group', 0.021262735451926647), ('date_recorded_weekday', 0.022163861609408846), ('date_recorded_date', 0.02861911016064498), ('date_recorded_month', 0.012313403345631139), ('date_recorded_year', 0.0039333039453074382)]

In [334]:
plt.title('Random Forest - Features Importance - Histogram')
plt.ylabel('No.of Features')
plt.xlabel('Feature Importance')

_ = sns.distplot(clf_rf.feature_importances_ * 100, bins=20, hist=True, kde=False)



In [335]:
plt.title('Random Forest - Features (relative*) Importance - Histogram')
plt.ylabel('No.of Features')
plt.xlabel('Feature Importance - Bin size is 5')

tmp = 100 * (clf_rf.feature_importances_  - min(clf_rf.feature_importances_)) / max(clf_rf.feature_importances_)

_ = sns.distplot(tmp, bins=20, hist=True, kde=False)



In [336]:
bag = []
kbest_selected_cols = []
for col, score in zip(X.columns, tmp):
    if score < 5:
        bag.append(col)
    else:
        kbest_selected_cols.append(col)

print('Removed Cols:', bag)
print('Rest of Cols', kbest_selected_cols)


Removed Cols: ['num_private', 'recorded_by', 'source_class']
Rest of Cols ['amount_tsh', 'date_recorded', 'funder', 'gps_height', 'installer', 'longitude', 'latitude', 'wpt_name', 'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga', 'ward', 'population', 'public_meeting', 'scheme_management', 'scheme_name', 'permit', 'construction_year', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'waterpoint_type', 'waterpoint_type_group', 'date_recorded_weekday', 'date_recorded_date', 'date_recorded_month', 'date_recorded_year']

In [337]:
X[kbest_selected_cols].size / 40., X[kbest_selected_cols].shape


Out[337]:
(59400.0, (59400, 40))

In [338]:
# n_estimators=150, criterion='entropy', class_weight="balanced_subsample", 

clf_rf = RandomForestClassifier(random_state=192, n_jobs=-1)
# class_weight="balanced_subsample"/"balanced"
# criterion="gini"/"entropy"

clf_rf = clf_rf.fit(X_train[kbest_selected_cols], y_train)
# pred = clf_rf.predict_proba(X_test)
clf_rf.score(X_test[kbest_selected_cols], y_test)


Out[338]:
0.79979797979797984
### SVM
X.shape
from sklearn import svm
clf_svm = svm.SVC(kernel='linear')
clf_svm?
limit = 500 clf_svm = clf_svm.fit(X_train[:limit], y_train[:limit]) print('') print('Score:' + str(clf_svm.score(X_test, y_test))) print('')

Multi Class


In [339]:
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier

One Vs One


In [342]:
clf_multiclass_rf = OneVsOneClassifier(RandomForestClassifier(
    n_estimators=200,criterion='entropy', class_weight="balanced_subsample",
    random_state=192, n_jobs=-1
))

clf_multiclass_rf = clf_multiclass_rf.fit(X_train, y_train)

print('Classifier:', clf_multiclass_rf)

print('Score:', clf_multiclass_rf.score(X_train, y_train))
print('Score:', clf_multiclass_rf.score(X_test, y_test))

# Score: 0.999775533109
# Score: 0.813602693603


Classifier: OneVsOneClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='entropy', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=192, verbose=0, warm_start=False),
          n_jobs=1)
Score: 0.99975308642
Score: 0.81595959596

One vs Rest


In [344]:
clf_multiclass_rf = OneVsRestClassifier(RandomForestClassifier(
    n_estimators=200,criterion='entropy', class_weight="balanced_subsample",
    random_state=192, n_jobs=-1
))

clf_multiclass_rf = clf_multiclass_rf.fit(X_train, y_train)

print('Classifier:', clf_multiclass_rf)
print('Train Score: ', clf_multiclass_rf.score(X_train, y_train))
print('Test Score:', clf_multiclass_rf.score(X_test, y_test))


Classifier: OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='entropy', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=-1,
            oob_score=False, random_state=192, verbose=0, warm_start=False),
          n_jobs=1)
Train Score:  0.999775533109
Test Score: 0.81468013468

Parameter tuning

From above analysis we can see that Random Forest CLF performed better than most other and so here we shall optimise it.


In [345]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [346]:
# max_features
np.sqrt(len(X_train.columns)), np.log(len(X_train.columns))


Out[346]:
(6.5574385243020004, 3.7612001156935624)

In [347]:
np.log2(len(X_train.columns)), np.sqrt (len(X_train.columns)), len(X_train.columns)


Out[347]:
(5.4262647547020979, 6.5574385243020004, 43)

In [348]:
'balanced_subsample balanced'.split(), 'gini entropy'.split()


Out[348]:
(['balanced_subsample', 'balanced'], ['gini', 'entropy'])

In [349]:
parameters = {
    'n_estimators': [10, 50, 100, 150, 200],
    'class_weight': ['balanced_subsample', 'balanced'],
    'criterion': ['gini', 'entropy'],
    'max_features': ['log2', 'auto', 25],
    'random_state': [192]
}

# clf_rf = RandomForestClassifier(n_estimators=150, criterion='entropy', class_weight="balanced_subsample", n_jobs=-1, random_state=192)
# 0.81346801346801345

GS_CV = RandomizedSearchCV(RandomForestClassifier(), parameters)

GS_CV.fit(X, y)


Out[349]:
RandomizedSearchCV(cv=None, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'n_estimators': [10, 50, 100, 150, 200], 'class_weight': ['balanced_subsample', 'balanced'], 'max_features': ['log2', 'auto', 25], 'random_state': [192], 'criterion': ['gini', 'entropy']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=0)

In [350]:
print(GS_CV.best_params_, GS_CV.best_score_)
# {'n_estimators': 200, 'max_features': 'log2', 'random_state': 192, 'criterion': 'entropy',
#  'class_weight': 'balanced_subsample'} 0.806717171717


{'n_estimators': 150, 'random_state': 192, 'max_features': 'log2', 'class_weight': 'balanced_subsample', 'criterion': 'gini'} 0.808114478114

In [351]:
cv_results = pd.DataFrame(GS_CV.cv_results_, columns=[u'mean_fit_time', u'mean_score_time', u'mean_test_score',
       u'mean_train_score', u'param_class_weight', u'param_criterion',
       u'param_max_features', u'param_n_estimators', u'params'])

In [352]:
cv_results.head(2)


Out[352]:
mean_fit_time mean_score_time mean_test_score mean_train_score param_class_weight param_criterion param_max_features param_n_estimators params
0 9.190857 0.691325 0.807593 0.999747 balanced_subsample gini log2 100 {'n_estimators': 100, 'random_state': 192, 'ma...
1 8.334739 0.690683 0.806347 0.999747 balanced gini log2 100 {'n_estimators': 100, 'random_state': 192, 'ma...

In [353]:
import seaborn as sns
sns.set(color_codes=True)

np.random.seed(sum(map(ord, "regression")))
tips = sns.load_dataset("tips")

In [354]:
ax=plt.figure(figsize=(8,8))
_ = sns.lmplot(x="mean_test_score", y="mean_train_score", hue="param_max_features", data=cv_results)


<matplotlib.figure.Figure at 0x125605908>
  • Checking "clf_rf" RF performance
sam_confusion_maxtrix(y_test, clf_rf.predict(X_test), ['func', 'non f', 'repair'])

XGBOOST

import xgboost as xgb
gbm = xgb.XGBClassifier(max_depth=3, n_estimators=100, learning_rate=0.05).fit(X_train, y_train) gbm_predictions = gbm.predict(X_test)
print(sum(gbm_predictions == y_test)/ (1.0 * len(y_test)) # 0.7279461279461279)
sam_confusion_maxtrix(y_test, predictions)

Submission

Model Selection

  • Check for which model is performing best and using it.
  • Check to apply the one-vs-many//one-vs-one wrapper.
  • Check for 'test_train_split' for which X,y to be used for training

In [378]:
GS_CV.best_params_


Out[378]:
{'class_weight': 'balanced_subsample',
 'criterion': 'gini',
 'max_features': 'log2',
 'n_estimators': 150,
 'random_state': 192}

In [377]:
clf_rf = OneVsOneClassifier(RandomForestClassifier(n_estimators=150,
random_state=192,
max_features='log2',
class_weight='balanced_subsample',
criterion='gini'))

print (clf_rf)

clf_rf = clf_rf.fit(X, y)


OneVsOneClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features='log2',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=1,
            oob_score=False, random_state=192, verbose=0, warm_start=False),
          n_jobs=1)

In [380]:
# saving the index
test_ids = RAW_TEST_X.index

# predicint the values
predictions = clf_rf.predict(TEST_X)
print(predictions.shape)

# Converting int to its respective Labels
predictions_labels = le.inverse_transform(predictions)

# setting up column name & save file
sub = pd.DataFrame(predictions_labels, columns=['status_group'])
sub.head()
sub.insert(loc=0, column='id', value=test_ids)
sub.reset_index()
sub.to_csv('submit.csv', index=False)
sub.head()


(14850,)
Out[380]:
id status_group
0 50785 non functional
1 51630 functional
2 17168 functional
3 45559 non functional
4 49871 functional

In [ ]:


In [ ]:


In [ ]: