PUMP IT


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb

from collections import defaultdict
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

np.random.seed(69572)
# plt.figure(figsize=(120,10))

In [2]:
RAW_X = pd.read_csv('data/traning_set_values.csv', index_col='id')
y = pd.read_csv('data/training_set_labels.csv', index_col='id')
RAW_TEST_X = pd.read_csv('data/test_set_values.csv', index_col='id')

In [3]:
# The transformations below are required for label encoding

# Boolean columns: treat missing values as False
tmp = ['public_meeting', 'permit']
RAW_X[tmp] = RAW_X[tmp].fillna(False)
RAW_TEST_X[tmp] = RAW_TEST_X[tmp].fillna(False)
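
A quick, illustrative check (not part of the original run) of which columns still contain missing values after this step:

missing = RAW_X.isnull().sum()
print(missing[missing > 0])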

In [4]:
# Categorical columns with NaNs: fill the gaps with a placeholder category 'Other'
na_cols = 'funder installer subvillage scheme_management scheme_name'.split()
RAW_X[na_cols] = RAW_X[na_cols].fillna('Other')
RAW_TEST_X[na_cols] = RAW_TEST_X[na_cols].fillna('Other')

# Concatenate train and test so the label encoders are fit on every category present in either set
COMPLETE_RAW_DATA = pd.concat([RAW_X.copy(), RAW_TEST_X.copy()])

In [5]:
print(RAW_X.shape)
print(RAW_TEST_X.shape)
print(COMPLETE_RAW_DATA.shape)


(59400, 39)
(14850, 39)
(74250, 39)

In [6]:
# http://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn
# One LabelEncoder per column, created on demand
d = defaultdict(preprocessing.LabelEncoder)

# Fit an encoder for every column on the combined train + test data
COMPLETE_RAW_DATA.apply(lambda x: d[x.name].fit(x))


Out[6]:
amount_tsh               LabelEncoder()
date_recorded            LabelEncoder()
funder                   LabelEncoder()
gps_height               LabelEncoder()
installer                LabelEncoder()
longitude                LabelEncoder()
latitude                 LabelEncoder()
wpt_name                 LabelEncoder()
num_private              LabelEncoder()
basin                    LabelEncoder()
subvillage               LabelEncoder()
region                   LabelEncoder()
region_code              LabelEncoder()
district_code            LabelEncoder()
lga                      LabelEncoder()
ward                     LabelEncoder()
population               LabelEncoder()
public_meeting           LabelEncoder()
recorded_by              LabelEncoder()
scheme_management        LabelEncoder()
scheme_name              LabelEncoder()
permit                   LabelEncoder()
construction_year        LabelEncoder()
extraction_type          LabelEncoder()
extraction_type_group    LabelEncoder()
extraction_type_class    LabelEncoder()
management               LabelEncoder()
management_group         LabelEncoder()
payment                  LabelEncoder()
payment_type             LabelEncoder()
water_quality            LabelEncoder()
quality_group            LabelEncoder()
quantity                 LabelEncoder()
quantity_group           LabelEncoder()
source                   LabelEncoder()
source_type              LabelEncoder()
source_class             LabelEncoder()
waterpoint_type          LabelEncoder()
waterpoint_type_group    LabelEncoder()
dtype: object
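
A minimal sketch of how the fitted per-column encoders can be inspected; 'basin' is used only as an example column:

print(d['basin'].classes_[:5])                        # categories seen across train + test
print(d['basin'].transform(d['basin'].classes_[:5]))  # their integer codes, i.e. [0 1 2 3 4]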

In [7]:
# Encode the training features with the fitted per-column encoders
X = RAW_X.apply(lambda x: d[x.name].transform(x))

# Encode the test features with the same encoders
RAW_TEST_X = RAW_TEST_X.apply(lambda x: d[x.name].transform(x))

# Encode the target labels
le = preprocessing.LabelEncoder().fit(y)
y = le.transform(y)


/Users/sampathm/miniconda3/lib/python3.5/site-packages/sklearn/preprocessing/label.py:112: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
/Users/sampathm/miniconda3/lib/python3.5/site-packages/sklearn/preprocessing/label.py:147: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
  y = column_or_1d(y, warn=True)
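
The DataConversionWarning above is raised because a one-column DataFrame is passed where a 1-D array is expected; an equivalent, warning-free version of the label-encoding step (illustrative alternative) would be:

# pass the labels as a flat 1-D array instead of a one-column DataFrame
le = preprocessing.LabelEncoder().fit(y.values.ravel())
y = le.transform(y.values.ravel())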

In [8]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


/Users/sampathm/miniconda3/lib/python3.5/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

Benchmark


In [9]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

In [10]:
clf = clf.fit(X_train, y_train)

In [11]:
pred = clf.predict_proba(X_test)

In [12]:
plt.figure(figsize=(20,10))

ax = plt.scatter(range(len(clf.feature_importances_)), clf.feature_importances_, alpha=0.6)



In [13]:
plt.figure(figsize=(20,10))

ax = plt.hist(clf.feature_importances_, alpha=0.6)
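
The two plots above show the importances only by column position; a small sketch (illustrative, assuming X_train is still a DataFrame) ties each value back to its column name:

importances = pd.Series(clf.feature_importances_, index=X_train.columns)
print(importances.sort_values(ascending=False).head(10))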



In [14]:
# len(pred)

clf.score(X_test, y_test) # 0.79303132333435367 # 0.79206203448627688


Out[14]:
0.79206203448627688
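
The untuned forest above is only meant as a benchmark; a common next step, sketched here with assumed hyperparameter values, is simply to grow more trees and re-score:

clf = RandomForestClassifier(n_estimators=200, n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))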

In [16]:
test_ids = RAW_TEST_X.index

In [18]:
# predictions = clf.predict_proba(test_submission)
predictions = clf.predict(RAW_TEST_X)
print (predictions.shape)
predictions_labels = le.inverse_transform(predictions)
# sub = pd.DataFrame(predictions, columns=list(le.classes_))
sub = pd.DataFrame(predictions_labels, columns=['status_group'])
sub.head()


(14850,)
Out[18]:
status_group
0 functional
1 functional
2 functional
3 non functional
4 functional

In [19]:
sub.insert(0, 'id', test_ids)
# note: reset_index() returns a new frame and was not assigned, so the original no-op call is dropped
sub.to_csv('submit.csv', index=False)
sub.head()


Out[19]:
id status_group
0 50785 functional
1 51630 functional
2 17168 functional
3 45559 non functional
4 49871 functional
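
A final, illustrative sanity check before uploading: the submission should have one row per test id and exactly the two expected columns.

assert sub.shape[0] == RAW_TEST_X.shape[0]          # one prediction per test row
assert list(sub.columns) == ['id', 'status_group']  # required submission format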