In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
%matplotlib inline

In [2]:
from rep.utils import train_test_split
from sklearn.metrics import roc_auc_score

sig_data = pd.read_csv('toy_datasets/toyMC_sig_mass.csv', sep='\t')
bck_data = pd.read_csv('toy_datasets/toyMC_bck_mass.csv', sep='\t')

labels = np.array([1] * len(sig_data) + [0] * len(bck_data))
data = pd.concat([sig_data, bck_data])
variables = ["FlightDistance", "FlightDistanceError", "IP", "VertexChi2", "pt", "p0_pt", "p1_pt", "p2_pt", 'LifeTime','dira']

In [3]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, train_size=0.5)

In [4]:
X_train.shape


Out[4]:
(72449, 40)

Neurolab

Тип сети задаётся опциональным параметром net_type. fit и predict работают в полном соответствии с sklearn'овской спецификацией, никаких предварительных преобразований входных данных производить не нужно. Также, в соответствии с идеологией REP, поддерживается аргумент features, отвечающий за то, по каким признакам идёт построение модели.


In [5]:
import neurolab as nl
f2 = nl.trans.SoftMax()
f = nl.trans.LogSig()
from rep.estimators import NeurolabClassifier
clf = NeurolabClassifier(show=1, layers=[300], transf=[f, f], epochs=10, trainf=nl.train.train_rprop, features=variables)

In [6]:
%time _ = clf.fit(X_train, y_train)


Epoch: 1; Error: 18112.25;
Epoch: 2; Error: 10399.4660253;
Epoch: 3; Error: 10223.0773994;
Epoch: 4; Error: 31063.7017444;
Epoch: 5; Error: 9045.97502239;
Epoch: 6; Error: 10005.9160544;
Epoch: 7; Error: 9074.48056294;
Epoch: 8; Error: 9724.98435978;
Epoch: 9; Error: 8622.03201182;
Epoch: 10; Error: 9058.43591732;
The maximum number of train epochs is reached
CPU times: user 1min 43s, sys: 264 ms, total: 1min 43s
Wall time: 1min 43s

In [7]:
predict_labels = clf.predict(X_test)
predict_proba = clf.predict_proba(X_test)

In [8]:
from sklearn.metrics.metrics import accuracy_score
score = accuracy_score(y_test, predict_labels)
print(score)


0.861131815045

In [9]:
print predict_labels
print predict_proba

print np.allclose(predict_proba.sum(axis=1), 1)
np.unique(predict_proba.sum(axis=1))


[1 1 1 ..., 1 1 1]
[[ 0.05866649  0.94133351]
 [ 0.05850089  0.94149911]
 [ 0.06303893  0.93696107]
 ..., 
 [ 0.06159636  0.93840364]
 [ 0.07083678  0.92916322]
 [ 0.11325916  0.88674084]]
True
Out[9]:
array([ 1.,  1.,  1.])

In [10]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, predict_proba[:, 1])


Out[10]:
0.75127785280820347

Проверим, что set_params работает


In [18]:
clf.set_params(epochs=5, show=0)
%time clf.fit(X_train, y_train)


CPU times: user 51.4 s, sys: 104 ms, total: 51.5 s
Wall time: 51.5 s
Out[18]:
NeurolabClassifier(_prepare_clf=<function newff at 0x7f32dee77c80>,
          _transform_features=<function _min_max_transform at 0x7f32dc9f86e0>,
          _transform_labels=<function _one_hot_transform at 0x7f32dc9f8668>,
          classes_=array([0, 1]),
          clf=<neurolab.core.Net object at 0x7f32df4ca310>, epochs=5,
          initf=<function init_zeros at 0x7f32dee67050>, layers=[300],
          net_type='feed-forward', show=0,
          trainf=<neurolab.core.Trainer object at 0x7f32dee6aed0>,
          transf=[<neurolab.trans.LogSig instance at 0x7f32dee64cb0>, <neurolab.trans.LogSig instance at 0x7f32dee64cb0>])

In [19]:
predict_proba = clf.predict_proba(X_test)

In [20]:
roc_auc_score(y_test, predict_proba[:, 1])


Out[20]:
0.70186063509124186

In [21]:
from sklearn.metrics import classification_report
from sklearn.metrics import zero_one_loss

print "Accuracy:", zero_one_loss(y_test, predict_labels)
print "Classification report:"
print classification_report(y_test, predict_labels)


Accuracy: 0.138868184955
Classification report:
             precision    recall  f1-score   support

          0       0.00      0.00      0.00     10061
          1       0.86      1.00      0.93     62389

avg / total       0.74      0.86      0.80     72450

Проверим, что сеть нормально (де-)сериализуется на диск


In [22]:
import pickle

pickle.dump(clf, open("dump.p", "wb"))
clf_loaded = pickle.load(open("dump.p", "rb"))

In [23]:
predict_proba = clf_loaded.predict_proba(X_test)
roc_auc_score(y_test, predict_proba[:, 1])


Out[23]:
0.70186063509124186

In [ ]: