In [1]:
from IPython.display import display, display_markdown
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import scipy.stats as stats
from sklearn import preprocessing
from sklearn.cross_validation import train_test_split  # sklearn.model_selection in scikit-learn >= 0.18
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score
In [2]:
training_data = pd.read_csv('training_set_values.csv', index_col=0)
training_label = pd.read_csv('training_set_labels.csv', index_col=0)
test_data = pd.read_csv('test_set_values.csv', index_col=0)
# Merge the test data and training data so that the same data management operations are applied to both
data = training_data.append(test_data).sort_index()
data.head()
Out[2]:
In [3]:
# Lots of waterpoints have no value for amount_tsh (recorded as 0). For that field the
# missing data will be replaced by the mean so that fewer rows are dropped before the model fit
imp = preprocessing.Imputer(missing_values=0, strategy='mean')
imp.fit(data['amount_tsh'].values.reshape(-1, 1))
data['water_amount'] = imp.transform(data['amount_tsh'].values.reshape(-1, 1)).ravel()
imp = preprocessing.Imputer(missing_values=0, strategy='median')
imp.fit(data['construction_year'].values.reshape(-1, 1))
data['construction_year'] = imp.transform(data['construction_year'].values.reshape(-1, 1)).ravel()
imp = preprocessing.Imputer(missing_values=0, strategy='mean')
imp.fit(data['gps_height'].values.reshape(-1, 1))
data['height'] = imp.transform(data['gps_height'].values.reshape(-1, 1)).ravel()
# Recode missing data as NaN
for field in ('longitude', 'latitude'):
    data[field] = data[field].map(lambda x: x if x else pd.np.nan)
def group_installer(data):
    def gather_installer(x):
        installer_map = {
            'organisation' : ('bank', 'msf', 'wwf', 'unicef', 'unisef', 'oxfam', 'oxfarm', 'club', 'care', 'without', 'faim', 'rain', 'red', 'angels', 'fundat', 'foundation'),
            'church' : ('church', 'churc', 'rcchurch', 'roman', 'missionsry', 'lutheran', 'islamic', 'islam', 'vision'),
            'private' : ('consulting', 'engineer', 'private', 'ltd', 'co.ltd', 'contractor', 'enterp', 'enterpr', 'company', 'contract'),
            'community' : ('village', 'community', 'communit', 'district', 'council', 'commu', 'villigers', 'villagers'),
            'government' : ('government', 'gov', 'govt', 'gover', 'gove', 'governme', 'ministry'),
            'other' : ('0', 'nan', 'known', 'other', 'unknown'),  # Group 'unknown' with 'other' since they mean the same for the interpretation
            'danida' : ('danida', 'danid'),
            'foreign government' : ('netherlands', 'germany', 'european')
        }
        for substr in x.split():
            for subsubstr in substr.split('/'):
                for key in installer_map:
                    if subsubstr in installer_map[key]:
                        return key
        return x
    lower_data = data.map(lambda x: str(x).lower())
    tmp_data = lower_data.map(gather_installer)
    top10 = list(tmp_data.value_counts()[:10].index)
    return tmp_data.map(lambda x: x if x in top10 else 'other')
data['installer'] = group_installer(data.installer)
data['funder'] = group_installer(data.funder)
# Select the training rows by waterpoint id (label-based indexing)
clean_data = (data.loc[training_data.index]
              .join(training_label['status_group'])
              .dropna())
# Create two binary columns: one collapsing 'functional' and 'functional needs repair',
# the other collapsing 'non functional' and 'functional needs repair'
clean_data['functional'] = clean_data['status_group'].map({'functional' : 1,
                                                           'functional needs repair' : 1,
                                                           'non functional' : 0})
clean_data['no_repairs'] = clean_data['status_group'].map({'functional' : 1,
                                                           'functional needs repair' : 0,
                                                           'non functional' : 0})
In [6]:
# Extract predictors and convert categorical variables into dummy (binary) variables
predictors_name = ['water_amount', 'height', 'longitude', 'latitude',
# predictors_name = ['amount_tsh', 'gps_height', 'longitude', 'latitude',
                   'basin', 'region', 'population', 'public_meeting', 'management_group',
                   'permit', 'construction_year', 'extraction_type_class', 'payment_type',
                   'quality_group', 'quantity_group', 'source_type', 'waterpoint_type_group',
                   'installer', 'funder']
categorical_predictors = ('basin', 'region', 'management_group', 'extraction_type_class',
                          'payment_type', 'quality_group', 'quantity_group',
                          'source_type', 'waterpoint_type_group', 'installer', 'funder')
process_data = pd.DataFrame()
for name in predictors_name:
    if name in categorical_predictors:
        classes = data[name].unique()
        deployed_categories = preprocessing.label_binarize(data[name], classes=classes)
        # Avoid class name collisions across fields
        class_names = list()
        for c in classes:
            if c in process_data.columns or c == 'other':
                class_names.append('_'.join((c, name)))
            else:
                class_names.append(c)
        tmp_df = pd.DataFrame(deployed_categories,
                              columns=class_names,
                              index=data.index)
        process_data = process_data.join(tmp_df)
    else:
        process_data[name] = data[name]
predictors_columns = process_data.columns
deployed_data = (process_data.loc[training_data.index]
                 .join(training_label['status_group'])
                 .dropna())
# Create two binary columns: one collapsing 'functional' and 'functional needs repair',
# the other collapsing 'non functional' and 'functional needs repair'
deployed_data['functional'] = deployed_data['status_group'].map({'functional' : 1,
                                                                  'functional needs repair' : 1,
                                                                  'non functional' : 0})
deployed_data['no_repairs'] = deployed_data['status_group'].map({'functional' : 1,
                                                                  'functional needs repair' : 0,
                                                                  'non functional' : 0})
predictors = deployed_data[predictors_columns]
In [7]:
# Fit an Extra Trees model to the data and look at the 15 most important features
model = ExtraTreesClassifier()
model.fit(predictors, deployed_data['status_group'])
# Display the relative importance of each attribute
cm = sns.light_palette("yellow", as_cmap=True)
(pd.Series(model.feature_importances_, index=predictors.columns, name='importance')
   .sort_values(ascending=False)
   .to_frame()
   .iloc[:15]
   .style.background_gradient(cmap=cm))
Out[7]:
In [8]:
data.shape
Out[8]:
In [9]:
# Let's tune the random forest, varying the number of most important features kept from 10 to 60
importance = (pd.Series(model.feature_importances_, index=predictors.columns, name='importance')
                .sort_values(ascending=False))
pd.np.random.seed(12345)
pred_train, pred_test, tar_train, tar_test = train_test_split(predictors,
                                                              deployed_data['status_group'],
                                                              test_size=.4)
In [21]:
features = range(10, 61)
accuracy = pd.np.zeros(len(features))
for counter, idx in enumerate(features):
    classifier = RandomForestClassifier(n_estimators=25)
    classifier = classifier.fit(pred_train[importance.index[:idx]], tar_train)
    predictions = classifier.predict(pred_test[importance.index[:idx]])
    accuracy[counter] = accuracy_score(tar_test, predictions)
plt.plot(features, accuracy)
plt.xlabel("Number of features")
plt.ylabel("Accuracy score")
plt.show()
In [10]:
# After choosing the number of features (keeping the 50 most important ones), let's tune the number of trees
trees = range(1, 31)
accuracy = pd.np.zeros(len(trees))
for idx in trees:
    classifier = RandomForestClassifier(n_estimators=idx)
    classifier = classifier.fit(pred_train[importance.index[:50]], tar_train)
    predictions = classifier.predict(pred_test[importance.index[:50]])
    accuracy[idx-1] = accuracy_score(tar_test, predictions)
plt.plot(trees, accuracy)
plt.xlabel("Number of trees")
plt.ylabel("Accuracy score")
plt.show()
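The two loops above tune the feature count and the tree count one at a time on a single train/test split. As a hedged alternative sketch (not part of the original notebook), the same kind of tuning could be done with cross-validation via scikit-learn's GridSearchCV; the parameter range below is purely illustrative.
# Hedged sketch: cross-validated tuning of n_estimators instead of a single split
# (GridSearchCV lives in sklearn.grid_search on scikit-learn versions < 0.18)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(RandomForestClassifier(random_state=12345),
                      param_grid={'n_estimators': [10, 20, 25, 30]},
                      scoring='accuracy',
                      cv=3)
# Reuses `importance`, `pred_train` and `tar_train` from the cells above
search.fit(pred_train[importance.index[:50]], tar_train)
print(search.best_params_, search.best_score_)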
In [11]:
# Let's refit on the whole training set with the optimized settings (50 features, 28 trees)
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=28)
model = model.fit(predictors[importance.index[:50]], deployed_data['status_group'])
In [12]:
clean_test_data = process_data.loc[test_data.index].dropna()
predictions = model.predict(clean_test_data[importance.index[:50]])
In [13]:
pred = pd.Series(predictions, index=clean_test_data.index, name='status_group')
missing_index = list()
for i in test_data.index:
    if i not in clean_test_data.index:
        missing_index.append(i)
# For the waterpoints dropped because of missing values, draw a label at random
# using the class proportions observed in the training labels
data_list = list()
pd.np.random.seed(12345)
for rnd in pd.np.random.rand(len(missing_index)):
    if rnd < 0.072677:
        data_list.append('functional needs repair')
    elif rnd < 0.384242 + 0.072677:
        data_list.append('non functional')
    else:
        data_list.append('functional')
fill = pd.Series(data_list, index=missing_index)
pred = pred.append(fill)
to_file = pred[test_data.index]
to_file.to_csv('optimizeRndForest.csv', index_label='id', header=('status_group',))
With the previous model I got a score of 0.7593 on DrivenData. We can do better...
Let's focus on the most important features.
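One way to turn the importance table into a shortlist programmatically (a hedged sketch; the notebook instead hand-picks the fields in the next cell) is to keep the columns carrying most of the cumulative importance.
# Hedged sketch: shortlist columns by cumulative feature importance
# (`importance` is the sorted Series computed earlier; the 0.95 threshold is an arbitrary illustrative choice)
cumulative = importance.cumsum() / importance.sum()
selected_columns = list(importance[cumulative <= 0.95].index)
print(len(selected_columns), 'columns kept out of', len(importance))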
In [26]:
# Extract predictors and convert categorical variables into dummy (binary) variables
# predictors_name = ['water_amount', 'height', 'longitude', 'latitude',
# 'basin', 'region', 'population', 'public_meeting', 'management_group',
# 'permit', 'construction_year', 'extraction_type_class', 'payment_type',
# 'quality_group', 'quantity_group', 'source_type', 'waterpoint_type_group',
# 'installer', 'funder']
# categorical_predictors = ('basin', 'region', 'management_group', 'extraction_type_class',
# 'payment_type', 'quality_group', 'quantity_group',
# 'source_type', 'waterpoint_type_group', 'installer', 'funder')
predictors_name = ['height', 'longitude', 'latitude', 'population',
                   'permit', 'construction_year', 'extraction_type_class', 'payment_type',
                   'quantity_group', 'waterpoint_type_group']
categorical_predictors = ('extraction_type_class', 'payment_type', 'quantity_group',
                          'waterpoint_type_group')
process_data = pd.DataFrame()
for name in predictors_name:
    if name in categorical_predictors:
        classes = data[name].unique()
        deployed_categories = preprocessing.label_binarize(data[name], classes=classes)
        # Avoid class name collisions across fields
        class_names = list()
        for c in classes:
            if c in process_data.columns:
                class_names.append('_'.join((c, name)))
            else:
                class_names.append(c)
        tmp_df = pd.DataFrame(deployed_categories,
                              columns=class_names,
                              index=data.index)
        process_data = process_data.join(tmp_df)
    else:
        process_data[name] = data[name]
predictors_columns = process_data.columns
deployed_data = (process_data.loc[training_data.index]
                 .join(training_label['status_group'])
                 .dropna())
# Create two binary columns: one collapsing 'functional' and 'functional needs repair',
# the other collapsing 'non functional' and 'functional needs repair'
deployed_data['functional'] = deployed_data['status_group'].map({'functional' : 1,
                                                                  'functional needs repair' : 1,
                                                                  'non functional' : 0})
deployed_data['no_repairs'] = deployed_data['status_group'].map({'functional' : 1,
                                                                  'functional needs repair' : 0,
                                                                  'non functional' : 0})
predictors = deployed_data[predictors_columns]
In [24]:
# Fit an Extra Trees model to the data and look at the 20 most important features
model = ExtraTreesClassifier()
model.fit(predictors, deployed_data['status_group'])
# Display the relative importance of each attribute
cm = sns.light_palette("yellow", as_cmap=True)
(pd.Series(model.feature_importances_, index=predictors.columns, name='importance')
   .sort_values(ascending=False)
   .to_frame()
   .iloc[:20]
   .style.background_gradient(cmap=cm))
Out[24]:
In [25]:
# Visualize status only for the training set
field = 'status_group'
display_markdown("### {} distribution".format(' '.join(field.split('_')).capitalize()),
                 raw=True)
tmp_df = training_label[field].value_counts(dropna=False)
tmp_df.index.name = tmp_df.name
tmp_df.name = 'unnormalized'
tmp_df2 = training_label[field].value_counts(dropna=False, normalize=True)
tmp_df2.name = 'normalized'
tmp_df = tmp_df.to_frame().join(tmp_df2)
display(tmp_df.T)
fig = plt.figure()
ax = fig.add_subplot(111)
ax = sns.countplot(field, data=training_label, ax=ax)
lbls = ax.get_xticklabels()
if len(lbls) > 7:
    ax.set_xticklabels(lbls, rotation=90)
plt.show()
In [26]:
pd.set_option('display.max_columns', 25)
list_fields = list(categorical_predictors)
list_fields.extend(('public_meeting', 'permit'))
for field in list_fields:
    field_name = ' '.join(field.split('_'))
    display_markdown("### {} distribution".format(field_name.capitalize()),
                     raw=True)
    tmp_df = data[field].value_counts(dropna=False)
    tmp_df.index.name = tmp_df.name
    tmp_df.name = 'unnormalized'
    tmp_df2 = data[field].value_counts(dropna=False, normalize=True)
    tmp_df2.name = 'normalized'
    tmp_df = tmp_df.to_frame().join(tmp_df2)
    # display(data[field].value_counts(dropna=False).to_frame().T)
    display(tmp_df.T)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax = sns.countplot(field, data=data, ax=ax)
    lbls = ax.get_xticklabels()
    if len(lbls) > 7:
        ax.set_xticklabels(lbls, rotation=90)
    ax.set_xlabel(field_name)
    plt.show()
In [27]:
# for field in ('amount_tsh', 'gps_height', 'latitude', 'longitude', 'construction_year', 'population'):
for field in ('gps_height', 'latitude', 'longitude', 'construction_year', 'population'):
    field_name = ' '.join(field.split('_'))
    display_markdown("### {} distribution".format(field_name.capitalize()),
                     raw=True)
    clean_field = training_data[field].map(lambda x: x if abs(x) > 1e-8 else pd.np.nan)
    clean_field = clean_field.dropna()
    display(clean_field.describe().to_frame().T)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax = sns.distplot(clean_field)
    ax.set_xlabel(field_name)
    plt.show()
If the response variable is categorical with more than two categories, it is advised to collapse it into two categories. So for presenting the bivariate distributions, the 'functional' and 'functional needs repair' categories will be collapsed; the distribution of the data falling into that new category will then be depicted.
The distributions will be presented using bar charts. For quantitative explanatory variables, the data will be collapsed into two categories split at the median.
Along with the visualization, hypothesis testing will be carried out. As the response variable is categorical, the method will be the chi-square test of independence.
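As a minimal sketch of that procedure on toy data (the values below are illustrative only, not from the waterpoint dataset):
# Minimal sketch: median split of a quantitative variable followed by a chi-square test
import pandas as pd
import scipy.stats as stats

toy = pd.DataFrame({'functional': [1, 0, 1, 1, 0, 1, 0, 1],
                    'population': [12, 300, 250, 8, 410, 520, 5, 30]})
# Collapse the quantitative explanatory variable into two groups split at the median
toy['population_grp'] = pd.qcut(toy['population'], 2, labels=['low', 'high'])
# Chi-square test of independence between the binary response and the median split
ct = pd.crosstab(toy['functional'], toy['population_grp'])
chi2, p, dof, expected = stats.chi2_contingency(ct)
print(p)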
In [43]:
list_fields = list(categorical_predictors)
# list_fields.extend(('public_meeting', 'permit'))
list_fields.extend(('permit', ))
for field in list_fields:
    field_name = ' '.join(field.split('_'))
    display_markdown("### {}".format(field_name.capitalize()),
                     raw=True)
    var_analysis = clean_data[['status_group', 'functional', 'no_repairs', field]]
    if len(var_analysis[field].unique()) > 2:
        # Chi-square test
        display_markdown("Contingency table of observed counts", raw=True)
        ct = pd.crosstab(var_analysis['status_group'], var_analysis[field])
        display(ct)
        ct_norm = ct/ct.sum(axis=0)
        display(ct_norm)
        ct = pd.crosstab(var_analysis['functional'], var_analysis[field])
        chisqr = stats.chi2_contingency(ct)
        if chisqr[1] > 0.05:
            display_markdown("There is *no* significant relationship between {} and the functional"
                             " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                             raw=True)
        else:
            display_markdown("There is a significant relationship between {} and the functional"
                             " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                             raw=True)
        ct = pd.crosstab(var_analysis['no_repairs'], var_analysis[field])
        chisqr = stats.chi2_contingency(ct)
        if chisqr[1] > 0.05:
            display_markdown("There is *no* significant relationship between {} and the no repairs"
                             " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                             raw=True)
        else:
            display_markdown("There is a significant relationship between {} and the no repairs"
                             " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                             raw=True)
        fig = plt.figure(figsize=[1.5*l for l in plt.rcParams['figure.figsize']])
        ax = fig.add_subplot(121)
        ax = sns.barplot(x=field, y='functional', data=var_analysis, ci=None, ax=ax)
        ax.set_xlabel(field_name)
        lbls = ax.get_xticklabels()
        if len(lbls) > 5:
            ax.set_xticklabels(lbls, rotation=90)
        ax = fig.add_subplot(122)
        ax = sns.barplot(x=field,
                         y='no_repairs',
                         data=var_analysis.where(var_analysis['functional'] == 1),
                         ci=None,
                         ax=ax,
                         order=[l.get_text() for l in lbls])
        ax.set_xlabel(field_name)
        lbls = ax.get_xticklabels()
        if len(lbls) > 5:
            ax.set_xticklabels(lbls, rotation=90)
        plt.show()
        # Post hoc pairwise comparisons, Bonferroni-adjusted via the Ha column below
        display_markdown("Test against the *functional* status", raw=True)
        categories = var_analysis[field].unique()
        list_field = list(categories)
        p_values = dict()
        for i in range(len(list_field)):
            for j in range(i+1, len(list_field)):
                cat1 = list_field[i]
                cat2 = list_field[j]
                explanatory = var_analysis[field].map(dict(((cat1, cat1),
                                                            (cat2, cat2))))
                comparison = pd.crosstab(var_analysis['functional'], explanatory)
                # display(Markdown("Crosstable to compare {} and {}".format(cat1, cat2)))
                # display(comparison)
                # display(comparison/comparison.sum(axis=0))
                chi_square, p, _, expected_counts = stats.chi2_contingency(comparison)
                p_values[(cat1, cat2)] = p
        df = pd.DataFrame(p_values, index=['p-value', ])
        display(df.stack(level=[0, 1])['p-value']
                  .rename('p-value')
                  .to_frame()
                  .assign(Ha=lambda x: x['p-value'] < 0.05 / len(p_values)))
        display_markdown("Test against the *no repairs* status", raw=True)
        categories = var_analysis[field].unique()
        list_field = list(categories)
        p_values = dict()
        for i in range(len(list_field)):
            for j in range(i+1, len(list_field)):
                cat1 = list_field[i]
                cat2 = list_field[j]
                explanatory = var_analysis[field].map(dict(((cat1, cat1),
                                                            (cat2, cat2))))
                comparison = pd.crosstab(var_analysis['no_repairs'], explanatory)
                # display(Markdown("Crosstable to compare {} and {}".format(cat1, cat2)))
                # display(comparison)
                # display(comparison/comparison.sum(axis=0))
                chi_square, p, _, expected_counts = stats.chi2_contingency(comparison)
                p_values[(cat1, cat2)] = p
        df = pd.DataFrame(p_values, index=['p-value', ])
        display(df.stack(level=[0, 1])['p-value']
                  .rename('p-value')
                  .to_frame()
                  .assign(Ha=lambda x: x['p-value'] < 0.05 / len(p_values)))
    else:
        # Chi-square test (note: here the test uses the full three-category status table)
        display_markdown("Contingency table of observed counts", raw=True)
        ct = pd.crosstab(var_analysis['status_group'], var_analysis[field])
        display(ct)
        ct_norm = ct/ct.sum(axis=0)
        display(ct_norm)
        chisqr = stats.chi2_contingency(ct)
        if chisqr[1] > 0.05:
            display_markdown("There is *no* significant relationship between {} and the functional"
                             " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                             raw=True)
        else:
            display_markdown("There is a significant relationship between {} and the functional"
                             " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                             raw=True)
        fig = plt.figure(figsize=[1.5*l for l in plt.rcParams['figure.figsize']])
        ax = fig.add_subplot(121)
        ax = sns.barplot(x=field, y='functional', data=var_analysis, ci=None, ax=ax,
                         order=[True, False])
        ax.set_xlabel(field_name)
        lbls = ax.get_xticklabels()
        if len(lbls) > 5:
            ax.set_xticklabels(lbls, rotation=90)
        ax = fig.add_subplot(122)
        ax = sns.barplot(x=field,
                         y='no_repairs',
                         data=var_analysis.where(var_analysis['functional'] == 1),
                         ci=None,
                         ax=ax,
                         order=[True, False])
        ax.set_xlabel(field_name)
        lbls = ax.get_xticklabels()
        if len(lbls) > 5:
            ax.set_xticklabels(lbls, rotation=90)
        plt.show()
In [45]:
# for field in ('amount_tsh', 'gps_height', 'latitude', 'longitude', 'construction_year', 'population'):
for field in ('gps_height', 'latitude', 'longitude', 'construction_year', 'population'):
    field_name = ' '.join(field.split('_'))
    display_markdown("### {}".format(field_name.capitalize()),
                     raw=True)
    var_analysis = clean_data[['status_group', 'functional', 'no_repairs']]
    clean_field = clean_data[field].map(lambda x: x if abs(x) > 1e-8 else pd.np.nan)
    var_analysis = var_analysis.join(clean_field).dropna()
    var_analysis[field+'grp4'] = pd.qcut(var_analysis[field],
                                         2,
                                         labels=["1=50th%tile",
                                                 "2=100th%tile"])
    #                                    4,
    #                                    labels=["1=25th%tile", "2=50th%tile",
    #                                            "3=75th%tile", "4=100th%tile"])
    # Chi-square test
    display_markdown("Contingency table of observed counts", raw=True)
    ct = pd.crosstab(var_analysis['status_group'], var_analysis[field+'grp4'])
    display(ct)
    ct_norm = ct/ct.sum(axis=0)
    display(ct_norm)
    ct = pd.crosstab(var_analysis['functional'], var_analysis[field+'grp4'])
    chisqr = stats.chi2_contingency(ct)
    if chisqr[1] > 0.05:
        display_markdown("There is *no* significant relationship between {} and the functional"
                         " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                         raw=True)
    else:
        display_markdown("There is a significant relationship between {} and the functional"
                         " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                         raw=True)
    ct = pd.crosstab(var_analysis['no_repairs'], var_analysis[field+'grp4'])
    chisqr = stats.chi2_contingency(ct)
    if chisqr[1] > 0.05:
        display_markdown("There is *no* significant relationship between {} and the no repairs"
                         " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                         raw=True)
    else:
        display_markdown("There is a significant relationship between {} and the no repairs"
                         " status of the waterpoint (p-value = {:.2g}).".format(field_name, chisqr[1]),
                         raw=True)
    fig = plt.figure(figsize=[1.5*l for l in plt.rcParams['figure.figsize']])
    ax = fig.add_subplot(121)
    ax = sns.barplot(x=field+'grp4', y='functional', data=var_analysis, ci=None, ax=ax)
    ax.set_xlabel(field_name)
    ax = fig.add_subplot(122)
    ax = sns.barplot(x=field+'grp4',
                     y='no_repairs',
    #                data=var_analysis.where(var_analysis['functional'] == 1),
                     data=var_analysis,
                     ci=None,
                     ax=ax)
    ax.set_xlabel(field_name)
    plt.show()
In [30]:
# Fit an Extra Trees model to the data and look at the 20 most important features
from sklearn.ensemble import ExtraTreesClassifier
model = ExtraTreesClassifier()
model.fit(predictors, deployed_data['status_group'])
# Display the relative importance of each attribute
cm = sns.light_palette("yellow", as_cmap=True)
(pd.Series(model.feature_importances_, index=predictors.columns, name='importance')
   .sort_values(ascending=False)
   .to_frame()
   .iloc[:20]
   .style.background_gradient(cmap=cm))
Out[30]:
In [33]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=25)
model = model.fit(predictors, deployed_data['status_group'])
clean_test_data = process_data.loc[test_data.index].dropna()
predictions = model.predict(clean_test_data)
pred = pd.Series(predictions, index=clean_test_data.index, name='status_group')
missing_index = list()
for i in test_data.index:
    if i not in clean_test_data.index:
        missing_index.append(i)
# For the waterpoints dropped because of missing values, draw a label at random
# using the class proportions observed in the training labels
data_list = list()
pd.np.random.seed(12345)
for rnd in pd.np.random.rand(len(missing_index)):
    if rnd < 0.072677:
        data_list.append('functional needs repair')
    elif rnd < 0.384242 + 0.072677:
        data_list.append('non functional')
    else:
        data_list.append('functional')
fill = pd.Series(data_list, index=missing_index)
pred = pred.append(fill)
to_file = pred[test_data.index]
to_file.to_csv('randomForest.csv', index_label='id', header=('status_group',))
len(missing_index)
Out[33]:
In [65]:
# Logistic regression trial
import statsmodels.formula.api as smf
# Rename columns to be pythonic
logit_expl = predictors.copy()
logit_expl.columns = ['_'.join(name.split()) for name in logit_expl.columns]
logit_expl.columns = ['_'.join(name.split('-')) for name in logit_expl.columns]
# Need to center the quantitative variables
for field in ('height', 'latitude', 'longitude', 'construction_year', 'population'):
    logit_expl[field] = logit_expl[field] - logit_expl[field].mean()
logit_expl['permit'] = pd.to_numeric(logit_expl['permit'])
formula = 'functional ~ '
formula = formula + ' + '.join(logit_expl.columns)
centered_data = logit_expl.join(deployed_data['functional']).dropna()
model = smf.logit(formula=formula, data=centered_data).fit()
model.summary()
Out[65]:
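A possible follow-up (a hedged sketch, not in the original notebook): the fitted coefficients are log-odds, so exponentiating them together with their confidence intervals gives odds ratios, which are easier to interpret.
# Hedged sketch: odds ratios with 95% confidence intervals from the fitted logit model
# (assumes `model` is the statsmodels result returned by smf.logit(...).fit() above)
import numpy as np
conf = model.conf_int()
conf['OR'] = model.params
conf.columns = ['Lower CI', 'Upper CI', 'OR']
print(np.exp(conf))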
In [ ]: