This notebook demonstrates the MatrixNet service wrapper, which is provided by the Reproducible Experiment Platform (REP) package. This service is available to CERN users.
To use MatrixNet, first acquire a token:
1. Click `Add token` in the left panel.
2. Select `MatrixNet` and click `Create token`.
3. Create a `~/.rep-matrixnet.config.json` file with the following content:
{
"url": "https://ml.cern.yandex.net/v1",
"token": "<your_token>"
}
In [1]:
!cd toy_datasets; wget -O MiniBooNE_PID.txt -nc --no-check-certificate https://archive.ics.uci.edu/ml/machine-learning-databases/00199/MiniBooNE_PID.txt
In [2]:
import numpy, pandas
from rep.utils import train_test_split
from sklearn.metrics import roc_auc_score
# Single place for the dataset location, used by both reads below.
DATA_PATH = 'toy_datasets/MiniBooNE_PID.txt'

# Feature table: the first line of the file is skipped (it is read separately below
# for the class counts). Fields are separated by runs of whitespace, so the
# separator is r'\s+' (raw string, one-or-more): a pattern such as '\s*' can match
# the empty string, which `re.split` on Python 3.7+ treats as a split point.
data = pandas.read_csv(DATA_PATH, sep=r'\s+', skiprows=[0], header=None, engine='python')

# First line of the file: columns 1 and 2 give how many rows get label 1 and label 0
# (presumably column 0 is the empty field before the leading space -- TODO confirm).
class_counts = pandas.read_csv(DATA_PATH, sep=' ', nrows=1, header=None)
n_positive = int(class_counts[1].values[0])
n_negative = int(class_counts[2].values[0])

# Rows are labelled in file order: first all label-1 rows, then all label-0 rows.
labels = [1] * n_positive + [0] * n_negative
assert len(labels) == len(data), 'class counts in the header do not match the number of rows'

# Give the unnamed columns readable names: feature_0, feature_1, ...
data.columns = ['feature_{}'.format(key) for key in data.columns]
In [3]:
# Preview the first rows of the loaded feature table
data.head()
Out[3]:
In [4]:
# Split features and labels into train and test halves.
# random_state pins the shuffle so that Restart & Run All reproduces the same split.
train_data, test_data, train_labels, test_labels = train_test_split(
    data, labels, train_size=0.5, random_state=42)
In [5]:
# Names of the first ten feature columns; only these are passed to the classifier
variables = data.columns[:10].tolist()
In [6]:
from rep.estimators import MatrixNetClassifier
In [7]:
import rep
# Display the file the rep package was imported from (shows which installation is in use)
rep.__file__
Out[7]:
In [8]:
# Show the classifier's documentation (call form of print works on Python 2 and 3)
print(MatrixNetClassifier.__doc__)
In [9]:
# Build the classifier; its configuration is taken from $HOME/.rep-matrixnet.config.json
mn = MatrixNetClassifier(
    features=variables,
    iterations=300,
    sync=False,
)
# Start training
mn.fit(train_data, train_labels)
# Note: with sync=False the training is asynchronous -- the dataset has been passed to
# the server, and other Python operations can run while the classifier trains there
print('asynchronous training started')
In [10]:
import time

# Check status of training.
# Each message is built as one string so the output is identical on Python 2 and 3.
print('Is training complete? {}'.format(mn.training_status()))
# Wait a little before asking how far the training has got
time.sleep(15)
# Get the number of iterations done so far
print('Number of iterations already done {}'.format(mn.get_iterations()))
# Synchronize (wait until the training is complete)
mn.synchronize()
print('Is training complete? {}'.format(mn.training_status()))
Note: if training has failed, call
mn.resubmit()
In [11]:
import rep
In [12]:
# Predict probabilities for each class on the test data
prob = mn.predict_proba(test_data)
# Call form of print: valid on both Python 2 and Python 3
print(prob)
In [13]:
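# Optional: uncomment to print the staged predictions one by one.
# (The loop variable is not called `prob`, so the predictions stored above stay intact.)
# for stage_prob in mn.staged_predict_proba(test_data):
#     print(stage_prob)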
In [14]:
# ROC AUC on the test set, scored on column 1 of the predicted probabilities.
# The message is built as one string so the output is identical on Python 2 and 3.
print('AUC {}'.format(roc_auc_score(test_labels, prob[:, 1])))
In [15]:
%matplotlib inline
from rep.report.metrics import RocAuc
mn.test_on(test_data, test_labels).learning_curve(RocAuc())
mn.predict_proba??
In [16]:
# Predictions of the trained classifier for the test data (shown as the cell output)
mn.predict(test_data)
Out[16]:
In [17]:
# Feature importances reported by the trained classifier (shown as the cell output)
mn.get_feature_importances()
Out[17]: