In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

In [3]:
import os
data_path = '/home/zelig/Documents/dataQuest/misc/MLPB/Problems/Classify Dart Throwers/_Data'
os.chdir(data_path)

In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

dtrain = train[['XCoord', 'YCoord']]
y_train = train['Competitor']
dtest = test[['XCoord', 'YCoord']]
y_test = test['Competitor']

dtrain = dtrain.assign(distFromOrg = np.sqrt(np.square(dtrain['XCoord'])+np.square(dtrain['YCoord'])))
dtest = dtest.assign(distFromOrg = np.sqrt(np.square(dtest['XCoord'])+np.square(dtest['YCoord'])))

In [5]:
colors = ('red', 'blue', 'lightgreen', 'cyan')
throwers = y_train.unique()
fig = plt.figure(figsize=(10,5))
fig.add_axes()
ax = fig.add_subplot(121)
for col,thr in zip(colors,throwers):    
    ax.scatter(dtrain.loc[y_train==thr,'XCoord'],dtrain.loc[y_train==thr,'YCoord'],c=col,label=thr)
    ax.set(title='Training Darts', xlabel='X Coord',ylabel='Y Coord')
    ax.set_yticks(np.arange(-1,1.5,0.5))
    ax.legend(loc='best')


K-Nearest Neighbors (Base Model1)


In [6]:
pipe = Pipeline([['sc',StandardScaler()] , ['knn', KNeighborsClassifier()]])
params = {'knn__n_neighbors':range(1,30)}
grid = GridSearchCV(estimator=pipe, param_grid=params, cv=5)
grid.fit(dtrain,y_train)
clf1 = grid.best_estimator_

In [7]:
y_pred = clf1.predict(dtest)
accuracy_score(y_test,y_pred)


Out[7]:
0.72972972972972971

In [ ]:
fails = np.extract(y_test!=y_pred,y_test)
val, count = np.unique(fails,return_counts=True)
print(np.asarray((val,count)).T)

In [ ]:
y_test.value_counts()

In [ ]:
val, count = np.unique(y_pred,return_counts=True)
print(np.asarray((val,count)).T)

Support Vector Machine (Base Model2)


In [8]:
pipe = Pipeline([['sc',StandardScaler()] , ['svm', LinearSVC(dual=False)]])

params = {'svm__C':[0.001,0.01,0.1,1,10,100,1000], 
          'svm__penalty':['l1', 'l2'], 
          'svm__multi_class':['ovr', 'crammer_singer']}
grid = GridSearchCV(estimator=pipe, param_grid=params, cv=5)
grid.fit(dtrain,y_train)
clf2 = grid.best_estimator_

In [9]:
y_pred = clf2.predict(dtest)
accuracy_score(y_test,y_pred)


Out[9]:
0.83783783783783783

In [ ]:
fails = np.extract(y_test!=y_pred,y_test)
val, count = np.unique(fails,return_counts=True)
print(np.asarray((val,count)).T)

In [ ]:
y_test.value_counts()

In [ ]:
val, count = np.unique(y_pred,return_counts=True)
print(np.asarray((val,count)).T)

Stacking (Meta Ensembling)

1. Partition the training data into test folds


In [10]:
splitter = StratifiedKFold(n_splits=5, shuffle=True)
kfolds = splitter.split(dtrain,y_train)

kfolds = [fold for _,fold in kfolds]

2. Create a dataset called train_meta with the same row Ids and fold Ids as the training dataset. Similarly create a dataset called test_meta with the same row Ids as the test dataset and columns M1 and M2. For each base model:

2.1 Fit the model to the training fold and make predictions on the test fold. Store these predictions in train_meta to be used as features for the stacking model.

2.2 Fit each base model to the full training dataset and make predictions on the test dataset. Store these predictions inside test_metal.


In [11]:
def get_train_meta(clf,kfolds):
    meta = pd.Series(index=dtrain.index)
    for test_fold in kfolds:
        train_fold = ~dtrain.index.isin(test_fold)
        clf.fit(dtrain.ix[train_fold], y_train.ix[train_fold])
        meta.ix[test_fold] = clf.predict(dtrain.ix[test_fold])
    return meta

In [12]:
def get_test_meta(clf):
    return pd.Series(clf.predict(dtest))

In [13]:
params = clf1.get_params()['steps'][1][1].get_params()
clf = KNeighborsClassifier().set_params(**params)
train_meta_clf1 = get_train_meta(clf,kfolds)
test_meta_clf1 = get_test_meta(clf1)

params = clf2.get_params()['steps'][1][1].get_params()
clf = LinearSVC(dual=False).set_params(**params)
train_meta_clf2 = get_train_meta(clf,kfolds)
test_meta_clf2 = get_test_meta(clf2)

In [14]:
sc = StandardScaler()
dtrain_std = pd.DataFrame(sc.fit_transform(dtrain))
dtest_std = pd.DataFrame(sc.fit_transform(dtest))

In [15]:
train_meta = dtrain_std.assign(meta_clf1=train_meta_clf1).assign(meta_clf2=train_meta_clf2)
test_meta = dtest_std.assign(meta_clf1=test_meta_clf1).assign(meta_clf2=test_meta_clf2)

In [16]:
X_train = pd.get_dummies(train_meta)

In [ ]:
X_train.head()

3. Fit a new model, S (i.e the stacking model) to train_meta, using M1 and M2 as features. Optionally, include other features from the original training dataset or engineered features


In [17]:
X_train = pd.get_dummies(train_meta)
lr = LogisticRegression()
params = {'C':[.001, .01, .1, 1, 10, 100], 'penalty':['l1', 'l2']}

grid = GridSearchCV(estimator=lr, param_grid=params, cv=5)
grid.fit(X_train,y_train)
clf = grid.best_estimator_

In [ ]:
clf.get_params()

In [ ]:
grid.cv_results_

4. Use the stacked model S to make final predictions on test_meta


In [18]:
X_test = pd.get_dummies(test_meta)

In [19]:
blind_spots = [c for c in X_train.columns if c not in X_test.columns]

In [20]:
for att in blind_spots:
    X_test[att] = np.zeros(len(X_test), dtype=int)

In [ ]:
X_test.head()

In [21]:
y_pred = clf.predict(X_test)
accuracy_score(y_test,y_pred)


Out[21]:
0.7567567567567568

In [ ]:


In [ ]: