Mostly a replicaiton of what was done if Pump-02, where we did data transformation and GridCV Selection.

As we futher took the data transformation of Variance Threshold checking and KBest Features selection, we need to recheck our evaluation methods



In [1]:

    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import xgboost as xgb

from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from scripts.tools import sam_pickle_load, df_check_stats, check_metric
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import chi2, SelectPercentile, VarianceThreshold, SelectKBest
from sklearn.ensemble import GradientBoostingClassifier

%matplotlib inline


np.set_printoptions(precision=5)
np.random.seed(69572)
plt.style.use('ggplot')
sns.set(color_codes=True)

crazy_list = dir()



In [2]:

    
for each in dir():
    if each not in crazy_list:
        del each

print('Length of dir():', len(dir()))









    



Length of dir(): 45



In [3]:

    
X, y, TEST_X = sam_pickle_load(prefix='tmp/Iteration2_final_')

# preprocess dataset, split into training and test part
# X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=.25, random_state=42)









    



LOAD PREFIX USED:  tmp/Iteration2_final_



In [4]:

    
X.head()









    Out[4]:







  
    
      
      date_recorded
      funder
      gps_height
      installer
      longitude
      latitude
      basin
      region
      population
      public_meeting
      ...
      payment_type
      quality_group
      quantity
      quantity_group
      source
      waterpoint_type
      date_recorded_month
      Area
      Population02
      Population12
    
    
      id
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      69572
      54
      704
      1390
      789
      33611
      6653
      1
      3
      109
      1
      ...
      0
      2
      1
      1
      8
      1
      2
      10
      2
      1
    
    
      8776
      322
      793
      1399
      789
      30636
      62952
      4
      9
      279
      0
      ...
      2
      2
      2
      2
      5
      1
      2
      5
      10
      11
    
    
      34310
      313
      793
      685
      962
      53201
      46457
      5
      8
      250
      1
      ...
      5
      2
      1
      1
      0
      2
      1
      14
      4
      7
    
    
      67743
      285
      894
      262
      884
      59962
      611
      7
      12
      58
      1
      ...
      2
      2
      0
      0
      3
      2
      0
      3
      7
      4
    
    
      19728
      111
      793
      1061
      789
      5830
      64510
      4
      4
      0
      1
      ...
      2
      2
      3
      3
      5
      1
      6
      7
      17
      17
    
  

5 rows × 25 columns



In [5]:

    
X.shape









    Out[5]:





(59400, 25)



In [6]:

    
X.dtypes









    Out[6]:





date_recorded          int64
funder                 int64
gps_height             int64
installer              int64
longitude              int64
latitude               int64
basin                  int64
region                 int64
population             int64
public_meeting         int64
scheme_management      int64
permit                 int64
construction_year      int64
extraction_type        int64
management             int64
payment_type           int64
quality_group          int64
quantity               int64
quantity_group         int64
source                 int64
waterpoint_type        int64
date_recorded_month    int64
Area                   int64
Population02           int64
Population12           int64
dtype: object

XGboost Learning Curve

Understanding of how well the models trains might help us to know if the model if learning as time procceds or its stagnated after learning just few sample of data.



In [7]:

    
import xgboost as xgb



In [8]:

    
from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds.
          - An object to be used as a cross-validation generator.
          - An iterable yielding train/test splits.

        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.

        Refer :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure(figsize=(12, 5))
    plt.title(title, fontsize=9)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples", fontsize=8)
    plt.ylabel("Score", fontsize=8)
    plt.xticks(fontsize=8)
    plt.yticks(fontsize=8)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    print('---------------------------------------')
    print('train_sizes', train_sizes)
    print('train_scores', train_scores)
    print('test_scores', test_scores)
    print('---------------------------------------')

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    return plt



In [9]:

    
estimator = xgb.XGBClassifier()

plot_learning_curve(estimator=estimator, title='XGB Learning Curve', X=X, y=y, ylim=None, cv=5,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 8))









    



---------------------------------------
train_sizes [ 4751 10861 16971 23080 29190 35299 41409 47519]
train_scores [[ 0.76468  0.75879  0.75879  0.75879  0.75879]
 [ 0.75251  0.74993  0.74993  0.74993  0.74993]
 [ 0.75594  0.75317  0.7524   0.7524   0.7524 ]
 [ 0.74866  0.74467  0.74987  0.74987  0.74987]
 [ 0.74752  0.74573  0.7469   0.75067  0.75067]
 [ 0.74719  0.74676  0.74824  0.74492  0.74492]
 [ 0.74822  0.74781  0.74636  0.74602  0.74629]
 [ 0.74833  0.74806  0.74863  0.74768  0.7465 ]]
test_scores [[ 0.73933  0.72721  0.72929  0.72971  0.73363]
 [ 0.74312  0.73479  0.73569  0.7367   0.7428 ]
 [ 0.74506  0.73698  0.73998  0.7415   0.74592]
 [ 0.74674  0.73731  0.7415   0.74234  0.74836]
 [ 0.74413  0.74009  0.74268  0.74402  0.74962]
 [ 0.74657  0.74144  0.74487  0.742    0.74701]
 [ 0.74506  0.74219  0.74192  0.74436  0.74912]
 [ 0.74573  0.74127  0.74192  0.74369  0.75004]]
---------------------------------------






    Out[9]:





<module 'matplotlib.pyplot' from '/Users/sampathm/miniconda3/lib/python3.5/site-packages/matplotlib/pyplot.py'>



In [10]:

    
np.linspace(.1, 1.0, 8)









    Out[10]:





array([ 0.1    ,  0.22857,  0.35714,  0.48571,  0.61429,  0.74286,
        0.87143,  1.     ])



In [11]:

    
estimator









    Out[11]:





XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

As you can see from above diagram, there are 3 features to identify

Sudden drop at 10K sample point.
Light Green Aura of Green Line & Green Line is slowly reducing till it reached 30K sample.
After 30K Sample point, both the score and the green Aura sighly declined.
After 30K Sample point, Red Aura slowly started decreasing futher.

Based on these features, I believe we can understand that our Xgboost learning kind of stagnated aroudn 30K records point. Overall, we can learn that Algorithm is learning well and reached it stagnation point with current parameters. So either we tune the Algorithm feature to keep learning or we can limit the training data at 30K Limit.

I believe limiting data is bad as it would be a waste of valuble data, we will continue parameter tuning to keep learning.



In [ ]:



In [ ]:



In [ ]:



In [ ]:

Fine Tuning

Pipeling is a meathod of combining several transformaiton techniques and a classifier to run them one after the other



In [12]:

    
print('Length of dir():', len(dir()))

for each in dir():
    if each not in crazy_list:
        del each

print('Length of dir():', len(dir()))









    



Length of dir(): 67
Length of dir(): 66



In [13]:

    
X, y, TEST_X = sam_pickle_load(prefix='tmp/Iteration2_final_')

# preprocess dataset, split into training and test part
# X_train, X_test, y_train, y_test =  train_test_split(X, y, test_size=.25, random_state=42)









    



LOAD PREFIX USED:  tmp/Iteration2_final_



In [14]:

    
X.shape









    Out[14]:





(59400, 25)



In [15]:

    
threshold_fns = lambda x: (x * (1 - x))



In [16]:

    
plt.figure(figsize=(12, 5))

plt.title('Variance Threshold Fns : x * ( 1 - x )')
plt.plot(np.linspace(0, 1, 15), [threshold_fns(_) for _ in np.linspace(0, 1, 15)])
plt.xlabel('Input')
plt.ylabel('Outuput - Threshold Values')









    Out[16]:





<matplotlib.text.Text at 0x102604b70>

Variance Threshold func, is to set the variance limit of a columns throught a Threshold Limit. Thus doing variance threshold check and removing features which more than threshold generally help ML algorithms to understand patterns.

Also removing such features which has too high threshold also help ML Algo, to focus on exsiting data rather than wasting time and resource on difficult to learn features.



In [17]:

    
#############################################################################
# Variance Check
vt = VarianceThreshold(threshold=(0.85 * (1 - 0.85)))
threshold_fns = lambda x: (x * (1 - x))
# Select K Best
selection = SelectKBest(chi2)
# Features Selector - Union
combined_features = FeatureUnion([("vt", vt), ("univ_select", selection)])
#############################################################################
# Classifier
clf = xgb.XGBClassifier()
#############################################################################
# Piplining
pipeline = Pipeline([("features", combined_features), ("clf", clf)])


param_grid = dict(features__vt__threshold=[threshold_fns(.65), threshold_fns(.75), threshold_fns(.85)],
                  features__univ_select__k=[15, 20, 25],
                  clf__n_estimators=[100, 150],
                  clf__max_depth=[3, 5],
                  clf__learning_rate=[.3, .1]
                  )

RS = RandomizedSearchCV(pipeline, param_grid, n_iter=10, n_jobs=-1, verbose=1)



In [18]:

    
RS









    Out[18]:





RandomizedSearchCV(cv=None, error_score='raise',
          estimator=Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('vt', VarianceThreshold(threshold=0.1275)), ('univ_select', SelectKBest(k=10, score_func=<function chi2 at 0x108731048>))],
       transformer_weights=None)), ('clf', XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_byt...logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1))]),
          fit_params={}, iid=True, n_iter=10, n_jobs=-1,
          param_distributions={'features__vt__threshold': [0.22749999999999998, 0.1875, 0.1275], 'features__univ_select__k': [15, 20, 25], 'clf__max_depth': [3, 5], 'clf__n_estimators': [100, 150], 'clf__learning_rate': [0.3, 0.1]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score=True, scoring=None, verbose=1)



In [19]:

    
RS.fit(X,y)









    



Fitting 3 folds for each of 10 candidates, totalling 30 fits






    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-19-b5a49927f8ab> in <module>()
----> 1 RS.fit(X,y)

~/miniconda3/lib/python3.5/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups)
   1188                                           self.n_iter,
   1189                                           random_state=self.random_state)
-> 1190         return self._fit(X, y, groups, sampled_params)

~/miniconda3/lib/python3.5/site-packages/sklearn/model_selection/_search.py in _fit(self, X, y, groups, parameter_iterable)
    562                                   return_times=True, return_parameters=True,
    563                                   error_score=self.error_score)
--> 564           for parameters in parameter_iterable
    565           for train, test in cv_iter)
    566 

~/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
    766                 # consumption.
    767                 self._iterating = False
--> 768             self.retrieve()
    769             # Make sure that we get a last message telling us we are done
    770             elapsed_time = time.time() - self._start_time

~/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in retrieve(self)
    717                     ensure_ready = self._managed_backend
    718                     backend.abort_everything(ensure_ready=ensure_ready)
--> 719                 raise exception
    720 
    721     def __call__(self, iterable):

~/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py in retrieve(self)
    680                 # check if timeout supported in backend future implementation
    681                 if 'timeout' in getfullargspec(job.get).args:
--> 682                     self._output.extend(job.get(timeout=self.timeout))
    683                 else:
    684                     self._output.extend(job.get())

~/miniconda3/lib/python3.5/multiprocessing/pool.py in get(self, timeout)
    600 
    601     def get(self, timeout=None):
--> 602         self.wait(timeout)
    603         if not self.ready():
    604             raise TimeoutError

~/miniconda3/lib/python3.5/multiprocessing/pool.py in wait(self, timeout)
    597 
    598     def wait(self, timeout=None):
--> 599         self._event.wait(timeout)
    600 
    601     def get(self, timeout=None):

~/miniconda3/lib/python3.5/threading.py in wait(self, timeout)
    547             signaled = self._flag
    548             if not signaled:
--> 549                 signaled = self._cond.wait(timeout)
    550             return signaled
    551 

~/miniconda3/lib/python3.5/threading.py in wait(self, timeout)
    291         try:    # restore state no matter what (e.g., KeyboardInterrupt)
    292             if timeout is None:
--> 293                 waiter.acquire()
    294                 gotit = True
    295             else:

KeyboardInterrupt:



In [ ]:

    
RS.best_estimator_



In [ ]:

    
RS.best_params_, RS.best_score_

Learning Curve of Best Param



In [ ]:

    
#############################################################################
# Variance Check
vt = VarianceThreshold(threshold=0.22749999999999998)

# Select K Best
selection = SelectKBest(chi2, k=15)

# Features Selector - Union
combined_features = FeatureUnion([("vt", vt), ("univ_select", selection)])
#############################################################################
# Classifier
clf = xgb.XGBClassifier(learning_rate=0.3, n_estimators=100, max_depth=5)
#############################################################################
# Piplining
# pipeline = Pipeline([("features", combined_features), ("clf", clf)])



In [ ]:

    
X_new = combined_features.fit_transform(X, y)



In [ ]:

    
np.linspace(.1, 1.0, 8)



In [ ]:

    
plot_learning_curve(estimator=clf,
                    title='Tuned - XGB Learning Curve',
#                     X=X, # Optimised to X_new
                    X=X_new,
                    y=y, ylim=None, cv=3,
                    n_jobs=1,
#                     train_sizes=np.linspace(.1, 1.0, 8), # optimising search space
                    train_sizes=[0.61429, 0.74286, 0.87143, 1.0 ]
                   )

Submit



In [ ]:

    
# Load Data
X, y, TEST_X = sam_pickle_load(prefix='tmp/Iteration2_final_')

# Variance Check
vt = VarianceThreshold(threshold=0.22749999999999998)

# Select K Best
selection = SelectKBest(chi2, k=15)

# Features Selector - Union
combined_features = FeatureUnion([("vt", vt), ("univ_select", selection)])

# saving the index
test_ids = TEST_X.index

# Data Transformations
X = combined_features.fit_transform(X, y)
TEST_X = combined_features.transform(TEST_X)



In [ ]:

    
# Classifier modeling
clf = xgb.XGBClassifier(learning_rate=0.3, n_estimators=100, max_depth=5)

# Classifier training
clf = clf.fit(X_new, y)



In [ ]:

    
# loading pickle to de-transform y into lables
le = pickle.load(open('tmp/le.pkl', 'rb'))

# predicint the values
predictions = clf.predict(TEST_X)
print(predictions.shape)

# Converting int to its respective Labels
predictions_labels = le.inverse_transform(predictions)

# setting up column name & save file
sub = pd.DataFrame(predictions_labels, columns=['status_group'])
sub.head()
sub.insert(loc=0, column='id', value=test_ids)
sub.reset_index()
sub.to_csv('submit.csv', index=False)
sub.head()



In [ ]:

	date_recorded	funder	gps_height	installer	longitude	latitude	basin	region	population	public_meeting	...	payment_type	quality_group	quantity	quantity_group	source	waterpoint_type	date_recorded_month	Area	Population02	Population12
id
69572	54	704	1390	789	33611	6653	1	3	109	1	...	0	2	1	1	8	1	2	10	2	1
8776	322	793	1399	789	30636	62952	4	9	279	0	...	2	2	2	2	5	1	2	5	10	11
34310	313	793	685	962	53201	46457	5	8	250	1	...	5	2	1	1	0	2	1	14	4	7
67743	285	894	262	884	59962	611	7	12	58	1	...	2	2	0	0	3	2	0	3	7	4
19728	111	793	1061	789	5830	64510	4	4	0	1	...	2	2	3	3	5	1	6	7	17	17

Refinement

XGboost Learning Curve

Fine Tuning

Submit