Mostly a replication of what was done in Pump-02, where we did data transformation and GridSearchCV selection.
As we take the data transformation further with VarianceThreshold checks and SelectKBest feature selection, we need to re-check our evaluation methods.
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import xgboost as xgb
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from scripts.tools import sam_pickle_load, df_check_stats, check_metric
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_selection import chi2, SelectPercentile, VarianceThreshold, SelectKBest
from sklearn.ensemble import GradientBoostingClassifier
%matplotlib inline
np.set_printoptions(precision=5)
np.random.seed(69572)
plt.style.use('ggplot')
sns.set(color_codes=True)
crazy_list = dir()
In [2]:
for each in dir():
    if each not in crazy_list and each != 'crazy_list':
        # `del each` would only unbind the loop variable; delete the actual name
        del globals()[each]
print('Length of dir():', len(dir()))
In [3]:
X, y, TEST_X = sam_pickle_load(prefix='tmp/Iteration2_final_')
# preprocess dataset, split into training and test part
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
In [4]:
X.head()
Out[4]:
In [5]:
X.shape
Out[5]:
In [6]:
X.dtypes
Out[6]:
In [7]:
import xgboost as xgb
In [8]:
from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum y values plotted.
    cv : int, cross-validation generator or an iterable, optional
        Determines the cross-validation splitting strategy.
        Possible inputs for cv are:
          - None, to use the default 3-fold cross-validation,
          - integer, to specify the number of folds,
          - an object to be used as a cross-validation generator,
          - an iterable yielding train/test splits.
        For integer/None inputs, if ``y`` is binary or multiclass,
        :class:`StratifiedKFold` is used. If the estimator is not a classifier
        or if ``y`` is neither binary nor multiclass, :class:`KFold` is used.
        Refer to the :ref:`User Guide <cross_validation>` for the various
        cross-validators that can be used here.
    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure(figsize=(12, 5))
    plt.title(title, fontsize=9)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples", fontsize=8)
    plt.ylabel("Score", fontsize=8)
    plt.xticks(fontsize=8)
    plt.yticks(fontsize=8)
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    print('---------------------------------------')
    print('train_sizes', train_sizes)
    print('train_scores', train_scores)
    print('test_scores', test_scores)
    print('---------------------------------------')
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    return plt
In [9]:
estimator = xgb.XGBClassifier()
plot_learning_curve(estimator=estimator, title='XGB Learning Curve', X=X, y=y, ylim=None, cv=5,
n_jobs=1, train_sizes=np.linspace(.1, 1.0, 8))
Out[9]:
In [10]:
np.linspace(.1, 1.0, 8)
Out[10]:
In [11]:
estimator
Out[11]:
As you can see from the diagram above, there are three things to read off the learning curve: the training score, the cross-validation score, and the point where they flatten out.
Based on these, I believe our XGBoost learning more or less stagnated around the 30K-records mark. Overall, the algorithm is learning well and has reached its stagnation point with the current parameters. So we can either tune the algorithm's parameters so it keeps learning, or cap the training data at roughly 30K records.
I believe capping the data is a bad idea, as it would waste valuable data, so we will continue with parameter tuning to keep the model learning.
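To put a number on that flattening, here is a minimal sketch (not part of the original run, and assuming the same X and y are still in scope) that reuses learning_curve with a fresh default XGBClassifier and prints the gap between the mean training and cross-validation scores at the largest training size:
In [ ]:
# Sketch only: quantify the train/CV gap at the largest training size.
sizes, tr_scores, cv_scores = learning_curve(xgb.XGBClassifier(), X, y, cv=5,
                                             train_sizes=np.linspace(.1, 1.0, 8))
print('largest train size:', sizes[-1])
print('mean train score  :', tr_scores[-1].mean())
print('mean CV score     :', cv_scores[-1].mean())
print('train/CV gap      :', tr_scores[-1].mean() - cv_scores[-1].mean())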
In [12]:
print('Length of dir():', len(dir()))
for each in dir():
    if each not in crazy_list and each != 'crazy_list':
        del globals()[each]
print('Length of dir():', len(dir()))
In [13]:
X, y, TEST_X = sam_pickle_load(prefix='tmp/Iteration2_final_')
# preprocess dataset, split into training and test part
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)
In [14]:
X.shape
Out[14]:
In [15]:
threshold_fns = lambda x: x * (1 - x)  # variance of a Boolean feature that is 1 in a fraction x of rows
In [16]:
plt.figure(figsize=(12, 5))
plt.title('Variance Threshold Fns : x * ( 1 - x )')
plt.plot(np.linspace(0, 1, 15), [threshold_fns(_) for _ in np.linspace(0, 1, 15)])
plt.xlabel('Input')
plt.ylabel('Output - Threshold Values')
Out[16]:
The VarianceThreshold step sets a minimum variance a column must have to be kept: features whose variance falls below the threshold are removed. The threshold function x * (1 - x) is the variance of a Boolean feature that takes one value in a fraction x of the samples, so a threshold of 0.85 * (1 - 0.85) drops Boolean features that hold the same value in more than roughly 85% of the rows.
Removing such near-constant features generally helps the ML algorithm understand patterns in the remaining data, rather than wasting time and resources on features that carry almost no signal.
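To make the threshold concrete, here is a minimal, self-contained sketch on a toy frame (the columns below are made up, not from the pump dataset): a Boolean column that holds the same value in 95% of rows has variance 0.95 * 0.05 ≈ 0.0475, which is below 0.85 * (1 - 0.85) = 0.1275, so VarianceThreshold drops it.
In [ ]:
# Toy example (illustrative data): only the 'balanced' column survives the cut.
toy = pd.DataFrame({'almost_constant': [1] * 19 + [0],    # variance ≈ 0.0475
                    'balanced': [0, 1] * 10})             # variance = 0.25
vt_demo = VarianceThreshold(threshold=0.85 * (1 - 0.85))  # = 0.1275
print(vt_demo.fit_transform(toy).shape)  # (20, 1)
print(vt_demo.variances_)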
In [17]:
#############################################################################
# Variance Check
vt = VarianceThreshold(threshold=(0.85 * (1 - 0.85)))
threshold_fns = lambda x: (x * (1 - x))
# Select K Best
selection = SelectKBest(chi2)
# Features Selector - Union
combined_features = FeatureUnion([("vt", vt), ("univ_select", selection)])
#############################################################################
# Classifier
clf = xgb.XGBClassifier()
#############################################################################
# Pipelining
pipeline = Pipeline([("features", combined_features), ("clf", clf)])
param_grid = dict(features__vt__threshold=[threshold_fns(.65), threshold_fns(.75), threshold_fns(.85)],
features__univ_select__k=[15, 20, 25],
clf__n_estimators=[100, 150],
clf__max_depth=[3, 5],
clf__learning_rate=[.3, .1]
)
RS = RandomizedSearchCV(pipeline, param_grid, n_iter=10, n_jobs=-1, verbose=1)
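A side note (not from the original run): FeatureUnion simply concatenates the outputs of its transformers, so a column that passes both the variance check and the chi2 selection shows up twice in the combined matrix. A minimal sketch to check the resulting width, reusing the pieces defined above:
In [ ]:
# Sketch only: how wide does the concatenated feature matrix get?
demo_union = FeatureUnion([("vt", VarianceThreshold(threshold=threshold_fns(.85))),
                           ("univ_select", SelectKBest(chi2, k=15))])
print('combined width:', demo_union.fit_transform(X, y).shape[1])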
In [18]:
RS
Out[18]:
In [19]:
RS.fit(X,y)
In [ ]:
RS.best_estimator_
In [ ]:
RS.best_params_, RS.best_score_
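Beyond the single best score, RandomizedSearchCV stores per-candidate results in cv_results_. A minimal sketch (assuming RS has been fitted above) to rank the sampled settings and see how close the runners-up are:
In [ ]:
# Sketch only: sampled parameter settings ranked by mean CV score, best first.
cv_res = pd.DataFrame(RS.cv_results_)
cv_res.sort_values('mean_test_score', ascending=False)[
    ['mean_test_score', 'std_test_score', 'params']].head()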
Learning Curve with the Best Parameters
In [ ]:
#############################################################################
# Variance Check
vt = VarianceThreshold(threshold=0.22749999999999998)  # ≈ threshold_fns(.65)
# Select K Best
selection = SelectKBest(chi2, k=15)
# Features Selector - Union
combined_features = FeatureUnion([("vt", vt), ("univ_select", selection)])
#############################################################################
# Classifier
clf = xgb.XGBClassifier(learning_rate=0.3, n_estimators=100, max_depth=5)
#############################################################################
# Pipelining
# pipeline = Pipeline([("features", combined_features), ("clf", clf)])
In [ ]:
X_new = combined_features.fit_transform(X, y)
In [ ]:
np.linspace(.1, 1.0, 8)
In [ ]:
plot_learning_curve(estimator=clf,
title='Tuned - XGB Learning Curve',
# X=X, # Optimised to X_new
X=X_new,
y=y, ylim=None, cv=3,
n_jobs=1,
# train_sizes=np.linspace(.1, 1.0, 8), # optimising search space
train_sizes=[0.61429, 0.74286, 0.87143, 1.0 ]
)
In [ ]:
# Load Data
X, y, TEST_X = sam_pickle_load(prefix='tmp/Iteration2_final_')
# Variance Check
vt = VarianceThreshold(threshold=0.22749999999999998)  # ≈ threshold_fns(.65)
# Select K Best
selection = SelectKBest(chi2, k=15)
# Features Selector - Union
combined_features = FeatureUnion([("vt", vt), ("univ_select", selection)])
# saving the index
test_ids = TEST_X.index
# Data Transformations
X = combined_features.fit_transform(X, y)
TEST_X = combined_features.transform(TEST_X)
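A quick sanity check (a minimal sketch, assuming the cell above ran): TEST_X was transformed with the FeatureUnion fitted on the training data, so both matrices should end up with the same number of columns.
In [ ]:
# Sketch only: the fitted FeatureUnion must give train and test the same width.
print('train shape:', X.shape)
print('test  shape:', TEST_X.shape)
assert X.shape[1] == TEST_X.shape[1]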
In [ ]:
# Classifier modeling
clf = xgb.XGBClassifier(learning_rate=0.3, n_estimators=100, max_depth=5)
# Classifier training
clf = clf.fit(X, y)  # X was already transformed by combined_features above
In [ ]:
# loading the pickled LabelEncoder to transform predicted integers back into labels
le = pickle.load(open('tmp/le.pkl', 'rb'))
# predicting the values
predictions = clf.predict(TEST_X)
print(predictions.shape)
# Converting int to its respective Labels
predictions_labels = le.inverse_transform(predictions)
# setting up column name & save file
sub = pd.DataFrame(predictions_labels, columns=['status_group'])
sub.head()
sub.insert(loc=0, column='id', value=test_ids)
sub.reset_index()
sub.to_csv('submit.csv', index=False)
sub.head()
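Before uploading, it is worth re-reading the file that was just written and checking the basics: a minimal sketch, assuming submit.csv sits in the working directory.
In [ ]:
# Sketch only: basic checks on the generated submission file.
check = pd.read_csv('submit.csv')
print(check.shape)                    # row count should match TEST_X
print(check.columns.tolist())         # expecting ['id', 'status_group']
print('missing values:', check.isnull().sum().sum())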