PUMP IT

Using data from Taarifa and the Tanzanian Ministry of Water, can you predict which pumps are functional, which need some repairs, and which don't work at all? This is an intermediate-level practice competition. Predict one of these three classes based on a number of variables about what kind of pump is operating, when it was installed, and how it is managed. A smart understanding of which waterpoints will fail can improve maintenance operations and ensure that clean, potable water is available to communities across Tanzania.

An interactive course exploring this dataset is currently offered by DataCamp.com!

Competition End Date: Jan. 28, 2017, 11:59 p.m.

This competition is for learning and exploring, so the deadline may be extended in the future.

Import Libraries



In [1]:

    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)

np.random.seed(69572)

%matplotlib inline

%load_ext writeandexecute

# plt.figure(figsize=(120,10))

small = (4,3)
mid = (10, 8)
large = (12, 8)









    



'writeandexecute' magic loaded.

Custom Functions

MarkUP Fns



In [2]:

    
from __future__ import absolute_import
from IPython.core.getipython import get_ipython
from IPython.core.magic import (Magics, magics_class,  cell_magic)
import sys
from StringIO import StringIO
from markdown import markdown
from IPython.core.display import HTML
 
@magics_class
class MarkdownMagics(Magics):
 
    @cell_magic
    def asmarkdown(self, line, cell):
        buffer = StringIO()
        stdout = sys.stdout
        sys.stdout = buffer
        try:
            exec(cell, locals(), self.shell.user_ns)
        except:
            sys.stdout = stdout
            raise
        sys.stdout = stdout
        return HTML("<p>{}</p>".format(markdown(buffer.getvalue(), extensions=['markdown.extensions.extra'])))
        return buffer.getvalue() + 'test'
 
get_ipython().register_magics(MarkdownMagics)

DataFrame Value Counts



In [3]:

    
def raw_markup_value_counts(dataframe, max_print_value_counts=30, show_plots=False):
    '''
    prints value counts of each feature in data frame
    '''
    mydf = pd.DataFrame.copy(dataframe)
    i = 0
    raw_markup_data = []
    pp = raw_markup_data.append
    pp('''|Col ID|Col Name|UniqCount|Col Values|UniqValCount|''')
    pp('''|------|--------|---------|----------|------------|''')
    for col in mydf.dtypes.index:
        i += 1
        sam = mydf[col]
        tmp = len(sam.value_counts())
        if tmp < max_print_value_counts:
            flag = True
            for key, val in dict(sam.value_counts()).iteritems():
                if flag:
                    pp('|%i|%s|%i|%s|%s|' % (
                            i, col, len(sam.value_counts()), key, val))
                    flag = False
                else:
                    pp('||-|-|%s|%s|' % (key, val))
            if show_plots:
                plt.figure(i)
                ax = sam.value_counts().plot(kind='barh', figsize=(12, 5))
                _ = plt.title(col.upper())
                _ = plt.xlabel('counts')
        else:
            pp('|%i|%s|%i|||' % (i, col, len(sam.value_counts())))
    return raw_markup_data

Confusion Matrix



In [4]:

    
from __future__ import division
import itertools
from sklearn.metrics import confusion_matrix


def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')    

    
def confusion_maxtrix_stuff(y_test, y_pred, class_names):
    '''
    Example
    >>> confusion_maxtrix_stuff(y_test,
                                y_pred,
                                class_names=RAW_y.status_group.value_counts().keys()
                                ):
    '''
    # Compute confusion matrix
    cnf_matrix = confusion_matrix(y_test, y_pred)
    np.set_printoptions(precision=2)
    # Plot non-normalized confusion matrix
    plt.figure(figsize=(8,8))
    plot_confusion_matrix(cnf_matrix, classes=class_names,
                          title='Confusion matrix, without normalization')
    # Plot normalized confusion matrix
    plt.figure(figsize=(8,8))
    plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                          title='Normalized confusion matrix')
    plt.show()

Import & Explore Data



In [5]:

    
RAW_X = pd.read_csv('traning_set_values.csv', index_col='id')
RAW_y = pd.read_csv('training_set_labels.csv', index_col='id')
RAW_TEST_X = pd.read_csv('test_set_values.csv', index_col='id')

%%asmarkdown # show me the graphs tmp = raw_markup_value_counts(RAW_X, 10, True)

Pre Processing

Log_Lat_Help: Link

Num Data Tranformation

date_recorded --> Int
longitude --> Float(less precision)
latitude --> Float(less precision)
public_meeting --> Bool
permit --> Bool



In [6]:

    
from datetime import datetime
strptime = datetime.strptime

DATE_FORMAT = "%Y-%m-%d"
REFERENCE_DATE_POINT = strptime('2000-01-01', DATE_FORMAT)

# Reducing geo location precision to 11 meters
LONG_LAT_PRECISION = 0.001

def sam_datetime_to_number(x):
    return (strptime(str(x), DATE_FORMAT) - REFERENCE_DATE_POINT).days


# Transforming Date to Int.
if RAW_X.date_recorded.dtype == 'O':
    RAW_X.date_recorded = RAW_X.date_recorded.map(sam_datetime_to_number)
    RAW_TEST_X.date_recorded = RAW_TEST_X.date_recorded.map(sam_datetime_to_number)


# Filling Missing/OUTLIAR Values
_ = np.mean(RAW_X[u'latitude'][RAW_X.latitude < -1.0].values)
if not RAW_X.loc[RAW_X.latitude >= -1.0, u'latitude'].empty:
    RAW_X.loc[RAW_X.latitude >= -1.0, u'latitude'] = _
    RAW_TEST_X.loc[RAW_TEST_X.latitude >= -1.0, u'latitude'] = _


# Filling Missing/OUTLIAR Values
_ = np.mean(RAW_X[u'longitude'][RAW_X[u'longitude'] > 1.0].values)
if not RAW_X.loc[RAW_X[u'longitude'] <= 1.0, u'longitude'].empty:
    RAW_X.loc[RAW_X[u'longitude'] <= 1.0, u'longitude'] = _
    RAW_TEST_X.loc[RAW_TEST_X[u'longitude'] <= 1.0, u'longitude'] = _


# Reducing Precision of Lat.
if RAW_X.longitude.mean() < 50:
    RAW_X.longitude = RAW_X.longitude // LONG_LAT_PRECISION
    RAW_X.latitude = RAW_X.latitude // LONG_LAT_PRECISION
    RAW_TEST_X.longitude = RAW_TEST_X.longitude // LONG_LAT_PRECISION
    RAW_TEST_X.latitude = RAW_TEST_X.latitude // LONG_LAT_PRECISION

    
# Filling Missing/OUTLIAR Values
if RAW_X.public_meeting.dtype != 'bool':
    RAW_X.public_meeting = RAW_X.public_meeting == True
    RAW_TEST_X.public_meeting = RAW_TEST_X.public_meeting == True


if RAW_X.permit.dtype != 'bool':
    RAW_X.permit = RAW_X.permit == True
    RAW_TEST_X.permit = RAW_TEST_X.permit == True



In [7]:

    
if list(RAW_TEST_X.dtypes[RAW_TEST_X.dtypes != RAW_X.dtypes]):
    raise Exception('RAW_X.dtypes and RAW_TEST_X.dtypes are not in Sync')

Text Data Tranformations



In [8]:

    
def text_transformation(name):
    if name:
        name = name.lower().strip()
        name = ''.join([i if 96 < ord(i) < 128 else ' ' for i in name])
        if 'and' in name:
            name = name.replace('and', ' ')
        if '/' in name:
            name = name.replace('/', ' ')
        while '  ' in name:
            name = name.replace('  ', ' ')
        return name.strip()
    return

for col in RAW_X.dtypes[RAW_X.dtypes == object].index:
    aa = len(RAW_X[col].unique())
    RAW_X[col] = RAW_X[col].fillna('').apply(text_transformation)
    RAW_TEST_X[col] = RAW_TEST_X[col].fillna('').apply(text_transformation)
    bb = len(RAW_X[col].unique())
    if aa != bb:
        print col, aa, bb









    



funder 1898 1881
installer 2146 1867
wpt_name 37400 36717
subvillage 19288 19176
scheme_name 2697 2485

RAW_X.funder.unique()

"setting a limit factor of 100 categorical values are allowed" MAX_CATEGORICAL_GROUPS_LIMIT = 150 icols = {} for col in RAW_X.dtypes[RAW_X.dtypes == object].index: tmp = len(RAW_X[col].unique()) if tmp > MAX_CATEGORICAL_GROUPS_LIMIT: icols[col] = tmp icols

from collections import Counter import pickle

funder_dict = Counter(RAW_X.funder) tmp = funder_dict.keys() tmp.sort() pickle.dump(funder_dict, open('funder.pkl', 'w'))

%pprint on

tmp2 = funder_dict.keys() tmp2 = Counter(tmp).values() tmp2.sort() tmp2 = np.array(tmp2) _ = plt.plot(tmp2)

Cols vs Uniq distribution

x = [] y = [] i = 0 for col in RAW_X.dtypes[RAW_X.dtypes == object].index: # print col, len(RAW_X[col].unique()) i += 1 x.append(i) y.append(len(RAW_X[col].unique())) y.sort() plt.figure(figsize=(12, 2)) plt.title('Cols vs Uniq categorical lables') plt.xlabel('Columns ID\'s') plt.ylabel('Uniq values') _ = plt.semilogy(x, np.array(y))

_ = plt.semilogy(x, np.array(y))

y = np.array(sorted(y)) plt.hist(np.log(y))

_ = plt.scatter(np.arange(len(y)), y)

Data Distribution

y = np.log(dict(RAW_X.installer.value_counts()).values()) print 'Total no.of unique values', len(y) print 'Max', y.max() print 'Min', y.min() print 'Values more than 1', len(y[y > 0]) #_ = plt.plot(np.arange(len(y)), y)

# discard all minor values y1 = y[y > 0] # find the max limt for accepting print y1.mean(), y1.std() # _ = plt.hist(y1) print 'with restpect to mean, 2 standard deviation', y1.mean() + (2 * y1.std())

y = np.log(dict(RAW_X.installer.value_counts()).values()) # Discard all values appeared only once. y = y[y > 0] y = y[y <= 5.13106006715] print 'with restpect to mean, 2 standard deviation', y1.mean() + (2 * y1.std()) plt.hist(y)

m = y1.mean() s = y1.std() for i in range(3): print m - (s * i), m + (s * i)

x = np.array([1, 2, 3, 4, 5]) y = np.power(x, 2) # Effectively y = x**2 e = np.array([1.5, 2.6, 3.7, 4.6, 5.5]) plt.errorbar(x, y, e, linestyle='None', marker='^') plt.show()

Finding closely matching words

import difflib tmp2 = difflib.get_close_matches('govern', tmp1, n=100, cutoff=0.66) for each in tmp2: print '%s : %s' % (each, tmp[each])

Vector Transformation

Feature Selection: http://machinelearningmastery.com/feature-selection-machine-learning-python/



In [9]:

    
from collections import defaultdict
from sklearn import preprocessing

# http://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
d = defaultdict(preprocessing.LabelEncoder)

# Labels Fit
sam = pd.concat([RAW_X, RAW_TEST_X]).apply(lambda x: d[x.name].fit(x))

# Labels Transform - Training Data
X = RAW_X.apply(lambda x: d[x.name].transform(x))
TEST_X = RAW_TEST_X.apply(lambda x: d[x.name].transform(x))

le = preprocessing.LabelEncoder().fit(RAW_y[u'status_group'])
y = le.transform(RAW_y[u'status_group'])



In [ ]:

    
# g = sns.PairGrid(X[:1000])
# g.map(plt.scatter);

UniVariate Analysis



In [10]:

    
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2



In [11]:

    
test = SelectKBest(score_func=chi2, k=30)
fit = test.fit(X, y)
cols_names = RAW_X.columns

np.set_printoptions(precision=2)
print(fit.scores_), len(fit.scores_)

col_importances = list(zip(fit.scores_, cols_names))
col_importances.sort(reverse=True)
selected_cols =  [_[-1] for _ in col_importances[:30] ]









    



[  7.49e+04   9.58e+03   7.19e+04   5.13e+05   2.42e+04   4.86e+05
   5.24e+04   1.18e+05   2.11e+02   2.66e+02   1.36e+04   1.81e+03
   1.70e+03   6.45e+02   9.15e+03   3.11e+03   5.16e+03   4.42e+01
        nan   4.39e+02   7.78e+04   2.15e+01   2.43e+04   2.64e+03
   3.43e+03   4.96e+03   1.77e+02   3.02e+01   8.66e+02   4.63e+02
   9.91e-01   4.23e+02   6.73e+02   6.73e+02   4.59e+02   5.68e+02
   3.95e+02   3.35e+03   2.54e+03] 39



In [12]:

    
features = pd.DataFrame(fit.transform(X))
features.columns = selected_cols



In [13]:

    
print len(X.columns), features.shape, len(y)









    



39 (59400, 30) 59400



In [16]:

    
X = pd.DataFrame(fit.transform(X))
TEST_X = pd.DataFrame(fit.transform(TEST_X))

X.columns = selected_cols
TEST_X.columns = selected_cols

PCA



In [ ]:

    
from sklearn.decomposition import PCA



In [ ]:

    
# feature extraction
pca = PCA(n_components=18)
fit = pca.fit(X)



In [ ]:

    
plt.scatter (range(len(fit.explained_variance_ratio_)), fit.explained_variance_ratio_.cumsum())



In [ ]:

    
X = pca.transform(X)
TEST_X = pca.transform(TEST_X)

Test-Train Split



In [17]:

    
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

# X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.25, random_state=42, stratify=y)

# X_train.boxplot() plt.figure(figsize=(12,10)) _ = sns.boxplot(X_train)

Model Training

Random Forest



In [85]:

    
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, class_weight="balanced_subsample", n_jobs=-1)
# class_weight="balanced_subsample"/"balanced"
# criterion="gini"/"entropy"



In [86]:

    
clf









    Out[86]:





RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)



In [87]:

    
clf = clf.fit(X_train, y_train)
pred = clf.predict_proba(X_test)
clf.score(X_test, y_test) # 0.79303132333435367 # 0.80252525252525253 # 0.80303030303030298 # 0.80345117845117842

# 0.79814814814814816
# (n_estimators=100, class_weight="balanced_subsample", n_jobs=-1) 0.80782828282828278
# (n_estimators=100, class_weight="balanced_subsample", n_jobs=-1) 0.81186868686868685









    Out[87]:





0.81186868686868685



In [81]:

    
clf?



In [55]:

    
plt.figure(figsize=(12, 3))

# making importance relative
a, b = min(clf.feature_importances_), max(clf.feature_importances_)
cols_imp = (clf.feature_importances_ - a) /b
_ = plt.scatter(range(30), cols_imp)
_ = plt.plot((0, 29), (0.05,0.05), '-r')
_ = plt.xlabel('Columns')
_ = plt.ylabel('Relative Col Importance')

Scoring

Random Forest Score



In [ ]:

    
from sklearn import metrics



In [59]:

    
print map(lambda x: len(x), [X_test, y_test])

clf.score(X_test, y_test) # 0.79303132333435367 # 0.80252525252525253 # 0.80303030303030298 # 0.80345117845117842









    



[11880, 11880]






    Out[59]:





0.80025252525252522



In [60]:

    
print .79303132333435367 - 0.80345117845117842

print .8285 - 0.80345117845117842, .8285 - .79303132333435367









    



-0.0104198551168
0.0250488215488 0.0354686766656

confusion_maxtrix_stuff(y_test, clf.predict(X_test))

XGBOOST

import xgboost as xgb

# gbm = xgb.XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.05).fit(X_train, y_train) gbm = xgb.XGBClassifier(max_depth=3, n_estimators=100, learning_rate=0.05).fit(X_train, y_train) gbm_predictions = gbm.predict(X_test) print sum(gbm_predictions == y_test)/ len(y_pred) # 0.7279461279461279 confusion_maxtrix_stuff(y_test, predictions)

Submission



In [88]:

    
test_ids = RAW_TEST_X.index

predictions = clf.predict(TEST_X)
print (predictions.shape)
predictions_labels = le.inverse_transform(predictions)
# sub = pd.DataFrame(predictions, columns=list(le.classes_))
sub = pd.DataFrame(predictions_labels, columns=['status_group'])
sub.head()


sub.insert(0, 'id', test_ids)
sub.reset_index()
sub.to_csv('submit.csv', index = False)
sub.head()









    



(14850,)






    Out[88]:






  
    
      
      id
      status_group
    
  
  
    
      0
      50785
      non functional
    
    
      1
      51630
      functional
    
    
      2
      17168
      functional
    
    
      3
      45559
      non functional
    
    
      4
      49871
      functional



In [89]:

    
X.shape









    Out[89]:





(59400, 30)



In [ ]:



In [ ]:

	id	status_group
0	50785	non functional
1	51630	functional
2	17168	functional
3	45559	non functional
4	49871	functional