Introduction: Using the data gathered from Taarifa and the Tanzanian Ministry of Water, can we predict which pumps are functional, which need some repairs, and which don't work at all? Predicting one of these three classes based and a smart understanding of which waterpoints will fail, can improve the maintenance operations and ensure that clean, potable water is available to communities across Tanzania.
This is also an intermediate-level competition by DataDriven! All code & support scripts are in Github Repo
In [63]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# %load_ext writeandexecute
plt.style.use('ggplot')
sns.set(color_codes=True)
# seed
np.random.seed(69572)
In [64]:
# import sys
# sys.path = sys.path + ['/Users/sampathkumarm/Desktop/devbox/Sam-DS/Kaggle/datadriven']
import scripts
import imp
imp.reload(scripts)
from scripts.sam_value_counts import sam_dataframe_cols_value_count_analysis, sam_dataframe_markup_value_counts
from scripts.sam_confusion_matrix import sam_plot_confusion_matrix, sam_confusion_maxtrix
In [65]:
import sys
from __future__ import absolute_import
from IPython.core.getipython import get_ipython
from IPython.core.magic import (Magics, magics_class, cell_magic)
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
from markdown import markdown
from IPython.core.display import HTML
from IPython.display import display
@magics_class
class MarkdownMagics(Magics):
@cell_magic
def asmarkdown(self, line, cell):
buffer = StringIO()
stdout = sys.stdout
sys.stdout = buffer
try:
exec(cell, locals(), self.shell.user_ns)
except:
sys.stdout = stdout
raise
sys.stdout = stdout
return HTML("<p>{}</p>".format(markdown(buffer.getvalue(), extensions=['markdown.extensions.extra'])))
return buffer.getvalue() + 'test'
def timer_message(self, start_time):
# print self
time_diff = (now() - start_time).total_seconds()
if time_diff < 0.001:
time_diff = 0
print('\n<pre>In', time_diff, 'Secs</pre>')
else:
print('\n<pre>In', time_diff, 'Secs</pre>')
@cell_magic
def timer(self, line, cell):
import datetime
now = datetime.datetime.now
start_time = now()
buffer = StringIO()
stdout = sys.stdout
sys.stdout = buffer
try:
exec(cell, locals(), self.shell.user_ns)
self.timer_message(start_time)
except:
sys.stdout = stdout
raise
sys.stdout = stdout
return HTML("<p>{}</p>".format(markdown(buffer.getvalue(), extensions=['markdown.extensions.extra'])))
return buffer.getvalue() + 'test'
get_ipython().register_magics(MarkdownMagics)
In [116]:
RAW_X = pd.read_csv('data/traning_set_values.csv', index_col='id')
RAW_y = pd.read_csv('data/training_set_labels.csv', index_col='id')
RAW_TEST_X = pd.read_csv('data/test_set_values.csv', index_col='id')
In [117]:
# proportion of labels available
RAW_y.status_group.value_counts() / RAW_y.size
Out[117]:
In [118]:
print('Shape of RAW_X', RAW_X.shape)
print('Shape of RAW_y', RAW_y.shape)
print('Shape of RAW_TEST_X', RAW_TEST_X.shape)
# ('Shape of RAW_X', (59400, 39))
# ('Shape of RAW_y', (59400, 1))
# ('Shape of RAW_TEST_X', (14850, 39))
In [69]:
for i, col in enumerate(RAW_X.columns):
print('|%d|%s|%d|' % (i, col, len(RAW_X[col].value_counts())))
In [70]:
# integer colums
cols_ints = '''amount_tsh
gps_height
longitude
latitude
num_private
region_code
district_code
population
construction_year'''.splitlines()
# bool
cols_bool = 'public_meeting permit'.split()
# date
cols_date = ['date_recorded']
print('INT COlS: ', len(cols_ints))
print('BOOL COLS:', len(cols_bool))
print('Date COLS:', len(cols_date))
In [71]:
len(RAW_X.columns)
Out[71]:
In [119]:
def show_object_dtypes(df,others=True):
dtype = object
if others:
return df.dtypes[df.dtypes == dtype]
else:
return df.dtypes[df.dtypes != dtype]
In [120]:
show_object_dtypes(RAW_TEST_X, True)
Out[120]:
In [121]:
show_object_dtypes(RAW_TEST_X, False)
Out[121]:
In [122]:
columns = RAW_X.columns
values_counts_bag = [len(RAW_X[column].value_counts()) for column in columns]
In [123]:
_ = sns.distplot(values_counts_bag, hist=True, kde=False,)
Example of how np-log transforms data
>>> np.log([0.001, 0.01, 0.1, 1, 10, 100, 1000])
array([-6.90775528, -4.60517019, -2.30258509, 0. , 2.30258509,
4.60517019, 6.90775528])
As you can see in np-log example, we can learn that when a list of values vary significantly(exponentially) then their logarithms moves linearly. As we(I) feel comfortable in studying linear plot and linear information, we did a np.log to values counts.
In [77]:
cols_values_counts_dataframe = pd.DataFrame(np.log(values_counts_bag), index=columns, columns=['Value Counts'])
In [78]:
print('Values Counts:', values_counts_bag)
print('\nLog of Values Counts:', cols_values_counts_dataframe.T.values)
_ = sns.distplot(cols_values_counts_dataframe.T.values, hist=True, kde=False,)
plt.title('Historgram of Object Feature`s (log2 of) Unique Values counts')
plt.xlabel('Features')
Out[78]:
In [79]:
cols_values_counts_dataframe.plot(kind='barh', figsize=(12, 12))
_ = plt.plot((2, 2), (0, 38))
_ = plt.plot((4, 4), (0, 38), '-g')
_ = plt.plot((6, 6), (0, 38), '-r')
_ = plt.plot((8, 8), (0, 38), '-y')
print('We seem to have some special categories where value counts are high.')
plt.title('Features Values Counts for comparision')
plt.xlabel ('Log of Unique Values')
Out[79]:
In [80]:
sam_dataframe_cols_value_count_analysis(RAW_X)
Checking rest of the columns
In [81]:
cols_value_count_limit_fraction = 0.01
cols_value_count_limit_log_value = np.log(RAW_X.shape[0] * cols_value_count_limit_fraction)
print('Total Number of Records:', RAW_X.shape[0], '- Log val is:', np.log(RAW_X.shape[0]))
print('%s percent of Number of Records:' % (cols_value_count_limit_fraction * 100),\
RAW_X.shape[0] * cols_value_count_limit_fraction,\
' - Log val is:', cols_value_count_limit_log_value)
Here in this project, cols_categorical_check
refers to list of columns for which caution check is considered. Reason for this check is, we would need more data to explain other columns & target cols with respect to it.
Lets consider these columns with more 5% of values as non categorical values and since our problem statement is choosing which category, we will try to minimise the category and see how our performance changes(improves or not)
To begin we will consider that those categories with more than cols_value_count_limit_fraction
percentage as the upper limit allowed. Any column with other data will pruged to become some to other information
In [82]:
show_object_dtypes(RAW_X, True)
Out[82]:
In [83]:
show_object_dtypes(RAW_X, False)
Out[83]:
In [84]:
cols_non_categorical = show_object_dtypes(RAW_X, True).index.tolist()
cols_date_numerics = show_object_dtypes(RAW_X, True).index.tolist()
In [85]:
list(cols_date_numerics)
Out[85]:
In [86]:
cols_categorical_check = []
for col, vc in cols_values_counts_dataframe.iterrows():
if col in cols_non_categorical:
if float(vc) > cols_value_count_limit_log_value:
cols_categorical_check.append(col)
print('Columns we need to moderate are:', cols_categorical_check)
All cols_date_numerics
, are date & other numeric data which can be made into buckets or reducing precision. Thus we can bound number of categories in data as the more variety of data we have, we need more information specific to each category which all might end with curse of dimensionality.
During pre-processing states we shall do following TODO
cols_date_numerics
& cols_categorical_check
to be under cols_value_count_limit_fraction
In [87]:
print('Log limit for categories:', cols_value_count_limit_log_value)
print('Actual limit for categories:', cols_value_count_limit_fraction * RAW_X.shape[0])
RAW_X[cols_categorical_check].head()
Out[87]:
In [88]:
RAW_X[cols_categorical_check].head(15)
Out[88]:
In [89]:
_ = sns.distplot(RAW_X.gps_height, hist=True, kde=False, rug=False)
In [90]:
_ = sns.distplot(RAW_X.population, hist=True, kde=False, rug=False)
In [91]:
_ = sns.jointplot(x='longitude', y='latitude', data=RAW_X)
plt.xlabel('longitude')
plt.ylabel('latitude')
Out[91]:
In [92]:
%%asmarkdown
# To generate a Markup Table
tmp = sam_dataframe_markup_value_counts(dataframe=RAW_X, max_print_value_counts=10, show_plots=False, figsize=(9, 2))
for each in tmp:
print(each)
Out[92]:
Most of the data seems categorical
Need to check cols_date_numerics(TODO1)
Need to check cols_categorical_check(TODO2)
Following pairs looks closesly related - cleanup (TODO3)
Other - cleanup (TODO4)
Num/Bool Tranformations
Precision Description of Longititude and Latitude is available here at below link
In [124]:
# Reloading the data
RAW_X = pd.read_csv('data/traning_set_values.csv', index_col='id')
RAW_y = pd.read_csv('data/training_set_labels.csv', index_col='id')
RAW_TEST_X = pd.read_csv('data/test_set_values.csv', index_col='id')
In [125]:
import datetime
strptime = datetime.datetime.strptime
DATE_FORMAT = "%Y-%m-%d"
REFERENCE_DATE_POINT = strptime('2000-01-01', DATE_FORMAT)
if RAW_X.date_recorded.dtype == 'O':
# convert it to datetime format
f = lambda x: strptime(str(x), DATE_FORMAT)
RAW_X.date_recorded = RAW_X.date_recorded.apply(f)
RAW_TEST_X.date_recorded = RAW_TEST_X.date_recorded.apply(f)
# week day
f = lambda x: x.weekday()
RAW_X['date_recorded_weekday'] = RAW_X.date_recorded.apply(f)
RAW_TEST_X['date_recorded_weekday'] = RAW_TEST_X.date_recorded.apply(f)
# date
f = lambda x: x.day
RAW_X['date_recorded_date'] = RAW_X.date_recorded.apply(f)
RAW_TEST_X['date_recorded_date'] = RAW_TEST_X.date_recorded.apply(f)
# month
f = lambda x: x.month
RAW_X['date_recorded_month'] = RAW_X.date_recorded.apply(f)
RAW_TEST_X['date_recorded_month'] = RAW_TEST_X.date_recorded.apply(f)
# year
f = lambda x: x.year
RAW_X['date_recorded_year'] = RAW_X.date_recorded.apply(f)
RAW_TEST_X['date_recorded_year'] = RAW_TEST_X.date_recorded.apply(f)
# total days
f = lambda x: (x - REFERENCE_DATE_POINT).days
RAW_X.date_recorded = RAW_X.date_recorded.apply(f)
RAW_TEST_X.date_recorded = RAW_TEST_X.date_recorded.apply(f)
In [126]:
# Longitude & Latitude -- zero values fix
# Filling Missing/OUTLIAR Values
_ = np.mean(RAW_X[u'latitude'][RAW_X.latitude < -1.0].values)
if not RAW_X.loc[RAW_X.latitude >= -1.0, u'latitude'].empty:
RAW_X.loc[RAW_X.latitude >= -1.0, u'latitude'] = _
RAW_TEST_X.loc[RAW_TEST_X.latitude >= -1.0, u'latitude'] = _
# Filling Missing/OUTLIAR Values
_ = np.mean(RAW_X[u'longitude'][RAW_X[u'longitude'] > 1.0].values)
if not RAW_X.loc[RAW_X[u'longitude'] <= 1.0, u'longitude'].empty:
RAW_X.loc[RAW_X[u'longitude'] <= 1.0, u'longitude'] = _
RAW_TEST_X.loc[RAW_TEST_X[u'longitude'] <= 1.0, u'longitude'] = _
In [127]:
def f(x):
if x is True:
return 1
elif x is False:
return 2
else:
return 3
if (RAW_X.public_meeting.dtype != 'bool') and (RAW_X.permit.dtype != 'bool'):
# public_meeting
RAW_X.public_meeting = RAW_X.public_meeting.apply(f)
RAW_TEST_X.public_meeting = RAW_TEST_X.public_meeting.apply(f)
# permit
RAW_X.permit = RAW_X.permit.apply(f)
RAW_TEST_X.permit = RAW_TEST_X.permit.apply(f)
print('Dtype of public_meetings & permit:',RAW_X.public_meeting.dtype, RAW_X.permit.dtype)
print('')
# checking
if list(RAW_TEST_X.dtypes[RAW_TEST_X.dtypes != RAW_X.dtypes]):
raise Exception('RAW_X.dtypes and RAW_TEST_X.dtypes are not in Sync')
else:
print('All in Good Shape')
In [128]:
show_object_dtypes(RAW_X, True)
Out[128]:
In [129]:
show_object_dtypes(RAW_X, False)
Out[129]:
In [130]:
# Reducing geo location precision to 11 meters
LONG_LAT_PRECISION = 0.001
# Reducing Precision of Lat.
if RAW_X.longitude.mean() < 50:
RAW_X.longitude = RAW_X.longitude // LONG_LAT_PRECISION
RAW_X.latitude = RAW_X.latitude // LONG_LAT_PRECISION
RAW_TEST_X.longitude = RAW_TEST_X.longitude // LONG_LAT_PRECISION
RAW_TEST_X.latitude = RAW_TEST_X.latitude // LONG_LAT_PRECISION
In [131]:
_ = sns.jointplot(x='longitude', y='latitude', data=RAW_X)
In [132]:
def text_transformation(name):
"""Cleanup basic text issue in name(input).
Removes text capitalisation, case, space and other non text ascii charecters
except space.
"""
if name:
name = name.lower().strip()
name = ''.join([i if 96 < ord(i) < 128 else ' ' for i in name])
if 'and' in name:
name = name.replace('and', ' ')
# clear double space
while ' ' in name:
name = name.replace(' ', ' ')
return name.strip()
return ''
In [133]:
ord(' ')
Out[133]:
In [134]:
%%asmarkdown
print('''
|Column|Prev.|Current|
|------|-----|-------|''')
for col in cols_categorical_check:
aa = len(RAW_X[col].unique())
RAW_X[col] = RAW_X[col].fillna('').apply(text_transformation)
RAW_TEST_X[col] = RAW_TEST_X[col].fillna('').apply(text_transformation)
bb = len(RAW_X[col].unique())
if aa != bb:
print('|%s|%i|%i|' % (col, aa, bb))
Out[134]:
In [104]:
# saving transformed data
pickle.dump(obj=RAW_X, file=open('tmp\clean_X.pkl', 'wb'))
pickle.dump(RAW_TEST_X, open('tmp\clean_TEST_X.pkl', 'wb'))
# pickle.dump(y, open('tmp\y.pkl', 'wb'))
TEST_X, X = RAW_TEST_X, RAW_X
In [136]:
from collections import defaultdict
from __future__ import print_function
from ipywidgets import interact, interactive, fixed
import ipywidgets as widgets
from scripts import sam_custom_labeler
CUST_CATEGORY_LABELER = sam_custom_labeler.CUST_CATEGORY_LABELER
In [137]:
help(CUST_CATEGORY_LABELER)
In [138]:
labler = CUST_CATEGORY_LABELER()
def select_col(col):
global labler
labler = CUST_CATEGORY_LABELER()
labler.fit(RAW_TEST_X[col])
print('Selected', col)
ii = interact(select_col, col=['funder', 'installer', 'wpt_name', 'subvillage', 'ward', 'scheme_name'])
# To check data coverage
def f1(data=80):
labler.check_data_coverage(data_coverage=data)
ii1 = interact(f1, data=(70, 100, .5))
# To check groups coverage
def f2(groups=80):
labler.check_group_coverage(groups)
ii2 = interact(f2, groups=(50, 100., .5))
_ = '''
Please select one of these slider to chose among the
data coverage or groups coverage
'''
installer:
wpt_name:
subvillage:
In [139]:
##################################
######### TESTING ################
#################################
labler = CUST_CATEGORY_LABELER()
labler.fit(X.installer)
# default data coverage is 80
tmp = labler.transform()
print('data coveraged', labler.DATA_COVERAGE_LIMIT)
print('grous coveraged', len(tmp.value_counts()))
print('---------------------')
labler.DATA_COVERAGE_LIMIT = 90
tmp = labler.transform()
print('data coveraged', labler.DATA_COVERAGE_LIMIT)
print('grous coveraged', len(tmp.value_counts()))
In [140]:
##################################
######### IMPLEMENT ##############
#################################
if 'custom_labler' not in dir():
custom_labler = defaultdict(CUST_CATEGORY_LABELER)
tmp = { 'funder': 97,
'installer': 97,
'wpt_name': 80,
'subvillage': 80,
'ward': 80,
'scheme_name': 85
}
for col, limit in tmp.items():
labler = custom_labler[col]
labler.DATA_COVERAGE_LIMIT = limit
labler.fit(X[col])
print('')
print('-' * 15, col.upper())
# custom_labler[col].check_data_coverage(limit)
RAW_X[col] = labler.transform()
else:
print('"custom_labler" seems is already defined, please check')
print(RAW_X.shape, RAW_TEST_X.shape, all(RAW_X.columns == RAW_TEST_X.columns))
Label Encoder with DefaultDict for quick data transformation http://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
In [141]:
from collections import defaultdict
from sklearn import preprocessing
In [142]:
print(RAW_X.shape, RAW_TEST_X.shape)
In [143]:
d = defaultdict(preprocessing.LabelEncoder)
RAW_X.scheme_management = RAW_X.scheme_management.fillna('Other')
RAW_TEST_X.scheme_management = RAW_TEST_X.scheme_management.fillna('Other')
# Labels Fit
sam = pd.concat([RAW_X, RAW_TEST_X]).apply(lambda x: d[x.name].fit(x))
# Labels Transform - Training Data
X = RAW_X.apply(lambda x: d[x.name].transform(x))
TEST_X = RAW_TEST_X.apply(lambda x: d[x.name].transform(x))
le = preprocessing.LabelEncoder().fit(RAW_y[u'status_group'])
y = le.transform(RAW_y[u'status_group'])
In [144]:
show_object_dtypes(RAW_X, True)
Out[144]:
In [145]:
show_object_dtypes(X, True)
Out[145]:
In [59]:
sam_dataframe_cols_value_count_analysis(X)
In [62]:
# saving transformed data
pickle.dump(X, open('tmp\processed_X.pkl', 'wb'))
pickle.dump(TEST_X, open('tmp\processed_TEST_X.pkl', 'wb'))
pickle.dump(y, open('tmp\processed_y.pkl', 'wb'))
# saving label transformers
pickle.dump(d, open('tmp\d.pkl', 'wb'))
pickle.dump(le, open('tmp\le.pkl', 'wb'))
In [ ]:
X = pickle.load(open('tmp\processed_X.pkl', 'rb'))
TEST_X = pickle.load(open('tmp\processed_TEST_X.pkl', 'rb'))
y = pickle.load(open('tmp\processed_y.pkl', 'rb'))
# # Load this when you are about to do text transformation and submission
# d = pickle.load(open('tmp\d.pkl'))
# le = pickle.load(open('tmp\le.pkl'))
print(X.shape, y.shape, y[:5])
In [182]:
if list(X.dtypes[X.dtypes == 'O']):
print('Please check there are still some OBJECT COLUMNS PRESENT')
else:
ss = X.corr().fillna(0)
# postive or negitive - both good
ss = ss.applymap(lambda x: x if x and x > 0 else -1 * x)
# wish to know only strong corr
plt.figure(figsize=(15, 15))
sns.heatmap(ss)
In [189]:
# wish to know only strong corr
plt.figure(figsize=(18, 18))
sns.heatmap(ss.applymap(lambda x: x if x > 0.90 else 0))
Out[189]:
In [248]:
len(X[_col_].value_counts()), len(X[_row_].value_counts())
Out[248]:
In [273]:
np.set_printoptions(precision=2)
bag = []
for _col_ in ss.index:
for _row_ in ss.columns:
if _col_ not in bag:
if (ss[_col_][_row_] > 0.8 and (ss[_col_][_row_] < 1.0)):
try:
print((_col_, len(X[_col_].value_counts()),
_row_, len(X[_row_].value_counts()),
ss[_col_][_row_]))
except KeyError:
# few extra cols are added
pass
# bag.append(_row_)
# bag.append(_col_)
del _col_, _row_, bag
In [276]:
%%asmarkdown
print ('''
|Column Name|VCount|Column Name|VCount|Corr|
|-----------|------|-----------|------|----|''')
tmp = '''
('date_recorded', 356, 'date_recorded_year', 5, 0.95920911743658788)
('extraction_type', 18, 'extraction_type_group', 13, 0.94952351098756882)
('extraction_type_group', 13, 'extraction_type', 18, 0.94952351098756882)
('source', 10, 'source_type', 7, 0.94381787586073784)
('source_type', 7, 'source', 10, 0.94381787586073784)
('waterpoint_type', 7, 'waterpoint_type_group', 6, 0.98215380609123037)
('waterpoint_type_group', 6, 'waterpoint_type', 7, 0.98215380609123037)
('date_recorded_year', 5, 'date_recorded', 356, 0.95920911743658788)
'''
while ' ' in tmp:
tmp = tmp.replace(' ', '')
tmp = tmp.strip().replace('\'', '')
print(tmp.replace(",", '|').replace('(', '|').replace(')', '|'))
del tmp
Out[276]:
In [285]:
from sklearn.feature_selection import chi2
X['date_recorded'].shape
Out[285]:
To remove all features that are either one or zero (on or off) in more than 80% of the samples.
http://scikit-learn.org/stable/modules/feature_selection.html#removing-features-with-low-variance
In [288]:
X.dtypes
Out[288]:
In [289]:
from scripts.sam_variance_check import get_low_variance_columns
In [290]:
X, removed_features, ranking_variance_thresholds = get_low_variance_columns(dframe=X,
threshold=(0.85 * (1 - 0.85)),
autoremove=True)
print('\nLow Variance Columns', removed_features)
print('Shape of X is', X.shape)
In [291]:
if removed_features:
TEST_X.drop(removed_features, axis=1, inplace=True)
print('cleanup completed!')
In [292]:
print('Shape of X is', X.shape)
print('Shape of TEST_X is', TEST_X.shape)
Random Forest Classifier score: RandomForestClassifier(n_estimators=150, criterion='entropy', class_weight="balanced_subsample", n_jobs=-1)
In [293]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
In [294]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
def game(X, y):
# print(X.shape, y.shape[0])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
clf_rf = RandomForestClassifier(n_jobs=-1, random_state=192)
clf_rf = clf_rf.fit(X_train, y_train)
train_score = clf_rf.score(X_train, y_train)
test_score = clf_rf.score(X_test, y_test)
# print('Train Score', train_score)
# print('Test Score', test_score)
return train_score, test_score
In [295]:
X.shape, y.shape
Out[295]:
In [296]:
kbest_cols = 26
for fns in [chi2, f_classif, mutual_info_classif]:
print((fns,game(SelectKBest(score_func=fns, k=kbest_cols).fit(X, y).transform(X), y)))
In [297]:
print('''
(chi2, 0.98428731762065091, 0.79966329966329963)
(f_classif, 0.97432098765432096, 0.79286195286195282)
(mutual_info_classif, 0.98410774410774415, 0.79447811447811445)
'''.replace('(', '|').replace(')', '|').replace(', ', '|'))
kbest conclusion :
Best selected columns
AMOUNT_TSH, DATE_RECORDED, FUNDER, GPS_HEIGHT, INSTALLER, LONGITUDE, LATITUDE, NUM_PRIVATE, BASIN, SUBVILLAGE, REGION, REGION_CODE, DISTRICT_CODE, LGA, WARD, POPULATION, PUBLIC_MEETING, SCHEME_MANAGEMENT, SCHEME_NAME, PERMIT, CONSTRUCTION_YEAR, EXTRACTION_TYPE, EXTRACTION_TYPE_GROUP, EXTRACTION_TYPE_CLASS, MANAGEMENT, MANAGEMENT_GROUP, PAYMENT, PAYMENT_TYPE
# results of previous runs
[{'cols': 1, 'test': 0.52659932659932662, 'train': 0.57483726150392822},
{'cols': 5, 'test': 0.68962962962962959, 'train': 0.94240179573512906},
{'cols': 9, 'test': 0.7211447811447812, 'train': 0.97638608305274976},
{'cols': 13, 'test': 0.75380471380471381, 'train': 0.97955106621773291},
{'cols': 17, 'test': 0.76134680134680133, 'train': 0.98071829405162736},
{'cols': 21, 'test': 0.76511784511784509, 'train': 0.98076318742985413},
{'cols': 25, 'test': 0.80033670033670035, 'train': 0.98316498316498313},
{'cols': 29, 'test': 0.80053872053872055, 'train': 0.98379349046015707},
{'cols': 33, 'test': 0.80040404040404045, 'train': 0.98390572390572395},
{'cols': 37, 'test': 0.79993265993265994, 'train': 0.98341189674523011}]
[{'cols': 23, 'test': 0.7976430976430976, 'train': 0.9836812570145903},
{'cols': 25, 'test': 0.80033670033670035, 'train': 0.98316498316498313},
{'cols': 27, 'test': 0.80101010101010106, 'train': 0.9829405162738496},
{'cols': 29, 'test': 0.80053872053872055, 'train': 0.98379349046015707},
{'cols': 31, 'test': 0.80000000000000004, 'train': 0.98381593714927051}]
[{'cols': 26, 'test': 0.80309764309764309, 'train': 0.98359147025813698},
{'cols': 27, 'test': 0.80101010101010106, 'train': 0.9829405162738496},
{'cols': 28, 'test': 0.80222222222222217, 'train': 0.98334455667789}]
As per Okham Razor's rules, we are going to select the simplest and well performing. Luckily, we have got kbest_selected_cols at 26 which is comparitively top performer among other K-selections and also lower than actualy number of columns
In [298]:
kbest_cols = 26
fit = SelectKBest(score_func=chi2, k=kbest_cols).fit(X, y)
cols_names = X.columns
kbest_selected_cols = [_ for _ in cols_names[:kbest_cols]]
kbest_X = pd.DataFrame(fit.transform(X))
kbest_TEST_X = pd.DataFrame(fit.transform(TEST_X))
In [299]:
kbest_X.shape, kbest_TEST_X.shape, y.shape
Out[299]:
In [300]:
pickle.dump(kbest_X, open('tmp\kbest_X.pkl', 'wb'))
pickle.dump(kbest_TEST_X, open('tmp\kbest_TEST_X.pkl', 'wb'))
pickle.dump(y, open('tmp\kbest_y.pkl', 'wb'))
In [301]:
load = 2
if load ==1:
# this will load kbest
print('Loading KBest Processed Data')
X = pickle.load(open('tmp\kbest_X.pkl', 'rb'))
TEST_X = pickle.load(open('tmp\kbest_TEST_X.pkl', 'rb'))
y = pickle.load(open('tmp\kbest_y.pkl', 'rb'))
elif load ==2:
# this will load processed data
print('Loading normal Processed Data')
X = pickle.load(open('tmp\processed_X.pkl', 'rb'))
TEST_X = pickle.load(open('tmp\processed_TEST_X.pkl', 'rb'))
# # y = pickle.load(open('tmp\processed_y.pkl'))
PCA
In [302]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
In [303]:
X.shape
Out[303]:
In [304]:
# feature extraction
pca = PCA(n_components=30)
fit = pca.fit(X)
plt.figure(figsize=(12, 3))
_ = plt.scatter (range(len(fit.explained_variance_ratio_)), fit.explained_variance_ratio_.cumsum())
_ = plt.xlabel('cumilative sum of explained variance')
_ = plt.ylabel('score')
print(fit.explained_variance_ratio_.cumsum())
print()
print(('Score', game(pca.transform(X), y)))
# (0.97580246913580249, 0.60511784511784517) # KBest dataset
# (0.97564534231200895, 0.60552188552188557) # Normal Dataset
In [305]:
ss = pd.DataFrame(fit.components_)
ss = ss.applymap(lambda x: x if x > 0 else -1 * x)
display(ss.describe().T)
ss.plot(kind='bar', figsize=(125, 10))
Out[305]:
In [306]:
# feature extraction
lda = LinearDiscriminantAnalysis(n_components=16)
fit = lda.fit(X, y)
plt.figure(figsize=(12, 3))
_ = plt.scatter (range(len(fit.explained_variance_ratio_)), fit.explained_variance_ratio_.cumsum())
_ = plt.xlabel('cumilative sum of explained variance')
_ = plt.ylabel('score')
print(fit.explained_variance_ratio_.cumsum())
print(('\nScore', game(lda.transform(X), y)))
# (0.97580246913580249, 0.60511784511784517) # KBest dataset
# (0.97564534231200895, 0.60552188552188557) # Normal Dataset
In [307]:
ss = pd.DataFrame(fit.coef_)
ss = ss.applymap(lambda x: x if x > 0 else -1 * x)
display(ss.describe().T)
ss.plot(kind='bar', figsize=(125, 10))
Out[307]:
In [308]:
X = pca.transform(X)
TEST_X = pca.transform(TEST_X)
In [309]:
X.shape, TEST_X.shape
Out[309]:
Saving Processed Data
In [310]:
pickle.dump(X, open('tmp\pca_X.pkl', 'wb'))
pickle.dump(TEST_X, open('tmp\pca_TEST_X.pkl', 'wb'))
# pickle.dump(y, open('tmp\pca_y.pkl', 'wb'))
Loading Pre-Processed Data
In [311]:
load = 2
if load == 1:
print('Loading PCA Processed Data')
X = pickle.load(open('tmp\pca_X.pkl', 'rb'))
TEST_X = pickle.load(open('tmp\pca_TEST_X.pkl', 'rb'))
elif load == 2:
# this will load kbest
print('Loading KBest Processed Data')
X = pickle.load(open('tmp\kbest_X.pkl', 'rb'))
TEST_X = pickle.load(open('tmp\kbest_TEST_X.pkl', 'rb'))
elif load == 3:
# this will load processed data
print('Loading normal Processed Data')
X = pickle.load(open('tmp\processed_X.pkl', 'rb'))
TEST_X = pickle.load(open('tmp\processed_TEST_X.pkl', 'rb'))
# # y = pickle.load(open('tmp\processed_y.pkl'))
In [312]:
print(X.shape, y.shape, TEST_X.shape)
In [313]:
from sklearn.mixture import GaussianMixture as GMM
from sklearn.metrics import silhouette_score
In [314]:
# For future analysis
GMM_Centers = []
__check_for = 1000
print ('clusters | score for top 1000')
for i in range(2, 7):
# TODO: Apply your clustering algorithm of choice to the reduced data
clusterer = GMM(n_components=i, random_state=42)
clusterer.fit(X)
# TODO: Predict the cluster for each data point
preds = clusterer.predict(X)
# TODO: Find the cluster centers
GMM_Centers.append(clusterer.means_)
# score = silhouette_score(X, preds)
score = silhouette_score(X[:__check_for], preds[:__check_for])
print(i, score)
# clusters | score for top 1000
# 2 0.484879234998
# 3 0.377180934294
# 4 0.334333476259
# 5 0.29213724894
# 6 0.27643712696
In [315]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
In [316]:
# For future analysis
KMM_Centers = []
# Testing each category
for i in range(2, 7):
clusterer = KMeans(init='k-means++', n_clusters=i, n_init=10)
clusterer.fit(X)
preds = clusterer.predict(X)
centers = clusterer.cluster_centers_
KMM_Centers.append(centers)
# score = silhouette_score(X, preds)
score = silhouette_score(X[:__check_for], preds[:__check_for])
print(i, score)
# clusters | score for top 1000
# 2 0.502005229628
# 3 0.377168744959
# 4 0.325091546516
# 5 0.303811069492
# 6 0.304265445159
In [317]:
i = 2
clusterer = KMeans(init='k-means++', n_clusters=i, n_init=10)
clusterer.fit(X)
preds = clusterer.predict(X)
In [318]:
score = silhouette_score(X[:__check_for], preds[:__check_for])
print(i, score)
In [319]:
print(X.shape, TEST_X.shape)
In [320]:
X = pd.DataFrame(X)
X['new'] = clusterer.predict(X)
In [321]:
TEST_X = pd.DataFrame(TEST_X)
TEST_X['new'] = clusterer.predict(TEST_X)
In [322]:
print(X.shape, TEST_X.shape)
In [326]:
from sklearn.model_selection import train_test_split
load = 3
if load == 1:
print('Loading PCA Processed Data')
X = pickle.load(open('tmp\pca_X.pkl', 'rb'))
TEST_X = pickle.load(open('tmp\pca_TEST_X.pkl', 'rb'))
elif load == 2:
# this will load kbest
print('Loading KBest Processed Data')
X = pickle.load(open('tmp\kbest_X.pkl', 'rb'))
TEST_X = pickle.load(open('tmp\kbest_TEST_X.pkl', 'rb'))
elif load == 3:
# this will load processed data
print('Loading normal Processed Data')
X = pickle.load(open('tmp\processed_X.pkl', 'rb'))
TEST_X = pickle.load(open('tmp\processed_TEST_X.pkl', 'rb'))
y = pickle.load(open('tmp\processed_y.pkl', 'rb'))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)
print(X.shape, y.shape)
In [327]:
from sklearn.ensemble import GradientBoostingClassifier
In [328]:
clf_gbt = GradientBoostingClassifier(random_state=192)
clf_gbt = clf_gbt.fit(X_train, y_train)
print('score:', clf_gbt.score(X_test, y_test))
# ('score:', 0.75252525252525249) k_best score
# ('score:', 0.75400673400673401) preprocessed
In [329]:
from sklearn.neighbors import KNeighborsClassifier
In [330]:
# modelling
clf_knn = KNeighborsClassifier()
clf_knn.fit(X_test, y_test)
# score
clf_knn.score(X_train, y_train)
# 0.55842873176206509 k_best
# 0.55840628507295176 preprocessed
Out[330]:
In [331]:
from sklearn.ensemble import RandomForestClassifier
In [332]:
clf_rf = RandomForestClassifier(random_state=192)
clf_rf = clf_rf.fit(X_train, y_train)
print('Score:' + str(clf_rf.score(X_test, y_test)))
# 0.79542087542087547 # (n_jobs=-1, random_state=192)
# 0.800942760943 k_best
# 0.8
In [333]:
print(list(zip(X.columns, clf_rf.feature_importances_)))
In [334]:
plt.title('Random Forest - Features Importance - Histogram')
plt.ylabel('No.of Features')
plt.xlabel('Feature Importance')
_ = sns.distplot(clf_rf.feature_importances_ * 100, bins=20, hist=True, kde=False)
In [335]:
plt.title('Random Forest - Features (relative*) Importance - Histogram')
plt.ylabel('No.of Features')
plt.xlabel('Feature Importance - Bin size is 5')
tmp = 100 * (clf_rf.feature_importances_ - min(clf_rf.feature_importances_)) / max(clf_rf.feature_importances_)
_ = sns.distplot(tmp, bins=20, hist=True, kde=False)
In [336]:
bag = []
kbest_selected_cols = []
for col, score in zip(X.columns, tmp):
if score < 5:
bag.append(col)
else:
kbest_selected_cols.append(col)
print('Removed Cols:', bag)
print('Rest of Cols', kbest_selected_cols)
In [337]:
X[kbest_selected_cols].size / 40., X[kbest_selected_cols].shape
Out[337]:
In [338]:
# n_estimators=150, criterion='entropy', class_weight="balanced_subsample",
clf_rf = RandomForestClassifier(random_state=192, n_jobs=-1)
# class_weight="balanced_subsample"/"balanced"
# criterion="gini"/"entropy"
clf_rf = clf_rf.fit(X_train[kbest_selected_cols], y_train)
# pred = clf_rf.predict_proba(X_test)
clf_rf.score(X_test[kbest_selected_cols], y_test)
Out[338]:
In [339]:
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier
One Vs One
In [342]:
clf_multiclass_rf = OneVsOneClassifier(RandomForestClassifier(
n_estimators=200,criterion='entropy', class_weight="balanced_subsample",
random_state=192, n_jobs=-1
))
clf_multiclass_rf = clf_multiclass_rf.fit(X_train, y_train)
print('Classifier:', clf_multiclass_rf)
print('Score:', clf_multiclass_rf.score(X_train, y_train))
print('Score:', clf_multiclass_rf.score(X_test, y_test))
# Score: 0.999775533109
# Score: 0.813602693603
One vs Rest
In [344]:
clf_multiclass_rf = OneVsRestClassifier(RandomForestClassifier(
n_estimators=200,criterion='entropy', class_weight="balanced_subsample",
random_state=192, n_jobs=-1
))
clf_multiclass_rf = clf_multiclass_rf.fit(X_train, y_train)
print('Classifier:', clf_multiclass_rf)
print('Train Score: ', clf_multiclass_rf.score(X_train, y_train))
print('Test Score:', clf_multiclass_rf.score(X_test, y_test))
In [345]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
In [346]:
# max_features
np.sqrt(len(X_train.columns)), np.log(len(X_train.columns))
Out[346]:
In [347]:
np.log2(len(X_train.columns)), np.sqrt (len(X_train.columns)), len(X_train.columns)
Out[347]:
In [348]:
'balanced_subsample balanced'.split(), 'gini entropy'.split()
Out[348]:
In [349]:
parameters = {
'n_estimators': [10, 50, 100, 150, 200],
'class_weight': ['balanced_subsample', 'balanced'],
'criterion': ['gini', 'entropy'],
'max_features': ['log2', 'auto', 25],
'random_state': [192]
}
# clf_rf = RandomForestClassifier(n_estimators=150, criterion='entropy', class_weight="balanced_subsample", n_jobs=-1, random_state=192)
# 0.81346801346801345
GS_CV = RandomizedSearchCV(RandomForestClassifier(), parameters)
GS_CV.fit(X, y)
Out[349]:
In [350]:
print(GS_CV.best_params_, GS_CV.best_score_)
# {'n_estimators': 200, 'max_features': 'log2', 'random_state': 192, 'criterion': 'entropy',
# 'class_weight': 'balanced_subsample'} 0.806717171717
In [351]:
cv_results = pd.DataFrame(GS_CV.cv_results_, columns=[u'mean_fit_time', u'mean_score_time', u'mean_test_score',
u'mean_train_score', u'param_class_weight', u'param_criterion',
u'param_max_features', u'param_n_estimators', u'params'])
In [352]:
cv_results.head(2)
Out[352]:
In [353]:
import seaborn as sns
sns.set(color_codes=True)
np.random.seed(sum(map(ord, "regression")))
tips = sns.load_dataset("tips")
In [354]:
ax=plt.figure(figsize=(8,8))
_ = sns.lmplot(x="mean_test_score", y="mean_train_score", hue="param_max_features", data=cv_results)
Model Selection
In [378]:
GS_CV.best_params_
Out[378]:
In [377]:
clf_rf = OneVsOneClassifier(RandomForestClassifier(n_estimators=150,
random_state=192,
max_features='log2',
class_weight='balanced_subsample',
criterion='gini'))
print (clf_rf)
clf_rf = clf_rf.fit(X, y)
In [380]:
# saving the index
test_ids = RAW_TEST_X.index
# predicint the values
predictions = clf_rf.predict(TEST_X)
print(predictions.shape)
# Converting int to its respective Labels
predictions_labels = le.inverse_transform(predictions)
# setting up column name & save file
sub = pd.DataFrame(predictions_labels, columns=['status_group'])
sub.head()
sub.insert(loc=0, column='id', value=test_ids)
sub.reset_index()
sub.to_csv('submit.csv', index=False)
sub.head()
Out[380]:
In [ ]:
In [ ]:
In [ ]: