In [1]:
from sklearn import datasets
from polyssifier import poly
import numpy as np
import pandas as pd
from sklearn.datasets import make_moons, make_classification
import matplotlib.pyplot as plt 
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [3]:
data, label = make_moons(n_samples=500, noise=0.4)
print(data.shape)
print(label.shape)

plt.plot(data[label == 0, 0], data[label == 0, 1], '.', alpha=.5)
plt.plot(data[label == 1, 0], data[label == 1, 1], 'r.', alpha=.5)
report = poly(data, label, n_folds=3, verbose=True, save=False, scale=True,
              feature_selection=False, scoring='auc', concurrency=1)
report.plot_scores()


(500, 2)
(500,)
                                 mean       std       min       max
Multilayer Perceptron train  0.945326  0.003201  0.941682  0.947685
                      test   0.927588  0.008685  0.920743  0.937358
Nearest Neighbors     train  0.941582  0.005668  0.936547  0.947721
                      test   0.919061  0.011822  0.911453  0.932681
SVM                   train  0.917366  0.011477  0.910655  0.930618
                      test   0.903057  0.002547  0.900131  0.904776
Linear SVM            train  0.902608  0.003627  0.899100  0.906343
                      test   0.903144  0.006267  0.897227  0.909711
Decision Tree         train  1.000000  0.000000  1.000000  1.000000
                      test   0.777897  0.032787  0.740964  0.803571
Random Forest         train  0.999731  0.000234  0.999570  1.000000
                      test   0.886372  0.012659  0.872478  0.897251
Logistic Regression   train  0.902572  0.003588  0.899064  0.906235
                      test   0.902028  0.007877  0.893744  0.909421
Naive Bayes           train  0.902736  0.003232  0.900355  0.906415
                      test   0.901061  0.005643  0.894760  0.905647
Voting                train  0.861987  0.021716  0.844311  0.886228
                      test   0.821954  0.013370  0.807229  0.833333
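
The summary already flags overfitting: Decision Tree and Random Forest score (near-)perfect AUC on the training folds but fall well behind on the test folds. A minimal sketch to quantify that gap from report.scores (shown in the next cell; it assumes the two-level classifier/train-test column index that Out[4] displays):

In [ ]:
# Mean test AUC per classifier, plus the train-test gap as a rough
# overfitting indicator.
train = report.scores.xs('train', axis=1, level=1).mean()
test = report.scores.xs('test', axis=1, level=1).mean()
print(test.sort_values(ascending=False))
print((train - test).sort_values(ascending=False))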

In [4]:
report.scores


Out[4]:
                          fold 0              fold 1              fold 2
                       train     test      train     test      train     test
Multilayer Perceptron  0.941682  0.937358  0.947685  0.920743  0.946610  0.924663
Nearest Neighbors      0.936547  0.932681  0.940478  0.913050  0.947721  0.911453
SVM                    0.910655  0.904266  0.930618  0.900131  0.910825  0.904776
Linear SVM             0.902381  0.902494  0.899100  0.909711  0.906343  0.897227
Decision Tree          1.000000  0.803571  1.000000  0.789157  1.000000  0.740964
Random Forest          1.000000  0.897251  0.999624  0.889389  0.999570  0.872478
Logistic Regression    0.902417  0.902920  0.899064  0.909421  0.906235  0.893744
Naive Bayes            0.901437  0.902778  0.900355  0.905647  0.906415  0.894760
Voting                 0.855422  0.833333  0.886228  0.825301  0.844311  0.807229

In [8]:
plt.figure(figsize=(20, 10))
for n, (key, pred) in enumerate(report.predictions.items()):
    plt.subplot(2, 5, n + 1)
    class0 = np.where(pred == 0)[0]  # samples predicted as class 0
    class1 = np.where(pred == 1)[0]  # samples predicted as class 1
    plt.plot(data[class1, 0], data[class1, 1], 'r.', alpha=0.8)
    plt.plot(data[class0, 0], data[class0, 1], 'b.', alpha=0.8)
    plt.legend(['class 1', 'class 0'])
    plt.title('{0} ({1:.2f}%)'.format(key, 100 * np.mean(report.scores[key]['test'])))
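
The panels above color each point by its predicted class. To see where each model actually fails, compare the cross-validated predictions against the true labels (a sketch; it assumes each entry of report.predictions aligns sample-for-sample with data and label):

In [ ]:
plt.figure(figsize=(20, 10))
for n, (key, pred) in enumerate(report.predictions.items()):
    plt.subplot(2, 5, n + 1)
    wrong = np.asarray(pred) != label  # cross-validated errors
    plt.plot(data[~wrong, 0], data[~wrong, 1], 'g.', alpha=0.3)
    plt.plot(data[wrong, 0], data[wrong, 1], 'kx', alpha=0.8)
    plt.legend(['correct', 'misclassified'])
    plt.title('{0}: {1} errors'.format(key, wrong.sum()))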



In [22]:
data, label = make_classification(n_samples=500, n_features=20,
                                  n_informative=5, n_redundant=2,
                                  n_repeated=0, n_classes=2,
                                  n_clusters_per_class=2, weights=None,
                                  flip_y=0.01, class_sep=1.0,
                                  hypercube=True, shift=0.0,
                                  scale=1.0, shuffle=True,
                                  random_state=None)
report = poly(data, label, n_folds=3, verbose=True,
              feature_selection=False, save=False, concurrency=1)
report.plot_scores()


                                 mean       std       min       max
Multilayer Perceptron train  1.000000  0.000000  1.000000  1.000000
                      test   0.948804  0.013261  0.933735  0.958692
Nearest Neighbors     train  0.954424  0.008247  0.945191  0.961060
                      test   0.921777  0.028079  0.889917  0.942915
SVM                   train  0.997144  0.000153  0.996988  0.997295
                      test   0.942016  0.013340  0.928571  0.955250
Linear SVM            train  0.948698  0.008763  0.939651  0.957146
                      test   0.932068  0.013884  0.919679  0.947074
Decision Tree         train  1.000000  0.000000  1.000000  1.000000
                      test   0.792001  0.014543  0.778256  0.807229
Random Forest         train  0.999892  0.000172  0.999693  1.000000
                      test   0.948124  0.013647  0.932444  0.957323
Logistic Regression   train  0.950606  0.010007  0.940300  0.960284
                      test   0.930293  0.015136  0.916523  0.946500
Naive Bayes           train  0.967118  0.005045  0.962063  0.972152
                      test   0.947105  0.011460  0.937177  0.959646
Voting                train  0.955847  0.006256  0.948831  0.960843
                      test   0.883773  0.007123  0.879518  0.891997

In [23]:
report.plot_features()


2017-05-05 09:55:01,655 - Report - INFO - Plotting Linear SVM coefs to temp_Linear SVM_feature_ranking.png
2017-05-05 09:55:02,022 - Report - INFO - Plotting Decision Tree coefs to temp_Decision Tree_feature_ranking.png
2017-05-05 09:55:02,373 - Report - INFO - Plotting Random Forest coefs to temp_Random Forest_feature_ranking.png
2017-05-05 09:55:02,718 - Report - INFO - Plotting Logistic Regression coefs to temp_Logistic Regression_feature_ranking.png
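
Only four ranking plots are written, presumably because only the linear and tree models expose a simple coefficient or importance vector. The Random Forest panel can be cross-checked outside polyssifier with plain scikit-learn (a sketch, not polyssifier's internal procedure):

In [ ]:
from sklearn.ensemble import RandomForestClassifier

# Fit a forest on the full data set and rank the 20 features by
# impurity-based importance.
rf = RandomForestClassifier(n_estimators=100).fit(data, label)
for i in np.argsort(rf.feature_importances_)[::-1][:5]:
    print('feature {0}: importance {1:.3f}'.format(i, rf.feature_importances_[i]))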

In [21]:
# `fs` presumably holds per-fold coefficients from an earlier run with
# feature_selection=True; that cell is not shown in this notebook.
fs['Linear SVM']


Out[21]:
array([[ 0.99237181,  0.86387006, -0.09422622,  0.36759695],
       [ 0.56995341,  0.35415505, -0.2165773 ,  0.04691248],
       [ 0.35034115,  0.20246074, -0.22229275, -0.11547062]])

In [11]:
for key, val in report.confusions.items():
    print(key)
    print(val)


Multilayer Perceptron
[[ 219.   29.]
 [  31.  221.]]
Nearest Neighbors
[[ 217.   31.]
 [  38.  214.]]
SVM
[[ 221.   27.]
 [  38.  214.]]
Linear SVM
[[ 197.   51.]
 [  42.  210.]]
Decision Tree
[[ 185.   63.]
 [  56.  196.]]
Random Forest
[[ 208.   40.]
 [  40.  212.]]
Logistic Regression
[[ 197.   51.]
 [  45.  207.]]
Naive Bayes
[[ 197.   51.]
 [  51.  201.]]
Voting
[[ 224.   24.]
 [  45.  207.]]
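
Each 2x2 matrix condenses to a few familiar metrics. A short sketch, assuming the scikit-learn convention of rows = true class and columns = predicted class:

In [ ]:
# Accuracy, sensitivity (recall on class 1) and specificity from each
# confusion matrix.
for key, cm in report.confusions.items():
    (tn, fp), (fn, tp) = cm
    print('{0}: acc={1:.3f}  sens={2:.3f}  spec={3:.3f}'.format(
        key, (tp + tn) / cm.sum(), tp / (tp + fn), tn / (tn + fp)))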

In [13]:
report.test_proba.head()


Out[13]:
   Multilayer Perceptron  Nearest Neighbors       SVM  Linear SVM  Decision Tree  Random Forest  Logistic Regression  Naive Bayes  Voting
0               0.000005                0.6  0.371396    0.289353            1.0            0.4             0.306328     0.481176     1.0
1               0.999995                0.4  0.628604    0.266117            0.0            0.6             0.693672     0.518824     1.0
2               0.000046                0.2  0.081778   -2.392563            0.0            0.0             0.323797     0.293630     0.0
3               0.999954                0.8  0.918222   -0.211439            1.0            1.0             0.676203     0.706370     0.0
4               1.000000                1.0  0.760359    0.694488            0.0            0.6             0.998923     0.999957     1.0
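
These per-sample scores are enough to draw full ROC curves with scikit-learn. A sketch, assuming the rows of report.test_proba align with label; note that the Linear SVM column holds decision-function values rather than probabilities (hence the negative entries), which roc_curve accepts just as well:

In [ ]:
from sklearn.metrics import roc_curve, auc

plt.figure(figsize=(8, 6))
for key in report.test_proba.columns:
    fpr, tpr, _ = roc_curve(label, report.test_proba[key])
    plt.plot(fpr, tpr, label='{0} (AUC={1:.2f})'.format(key, auc(fpr, tpr)))
plt.plot([0, 1], [0, 1], 'k--')  # chance line
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right');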

In [ ]: