In [ ]:
# import os
# os.environ['LIBRARY_PATH'] = os.environ['LD_LIBRARY_PATH'] = '/home/apanin/cuda-8.0/lib64'
# os.environ['PATH'] = "/usr/local/cuda-8.0/bin/:/home/apanin/cuda-8.0/lib64:"+os.environ['PATH']

In [ ]:
# %env THEANO_FLAGS=device=cuda0,gpuarray.preallocate=0.5,floatX=float32
# import theano
# import theano.tensor as T
# from lasagne import *
# from lasagne.layers import *

In [2]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

Loading Data


In [4]:
# Column names that are left out of the feature set further down
# (one entry per subsystem, judging by the variable name).
subsystemNames = [
    'L1tcalo', 'L1tmu', 'Hlt', 'Pix',
    'Strip', 'Ecal', 'Hcal', 'Dt',
    'Rpc', 'Es', 'Csc', 'Track',
    'Egamma', 'Muon', 'Jetmet', 'Lumi',
]

In [5]:
# Identifier / target column names of the dataset.
# NOTE(review): this list is not referenced by any cell visible in this notebook;
# the feature selection below spells out its own exclusion list (without 'lumi').
Ids_labels = ['runId','lumiId','lumi','isSig']

In [6]:
# Absolute path of the merged dataset on the author's machine; adjust it when
# running elsewhere.
file_name_merged = '/home/fedor/notebook/ml4dc/ok_files/merged.pickle'
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# when it comes from a trusted source.
data = pd.read_pickle(file_name_merged)

In [7]:
# Discard every row that has at least one missing value.
data = data.dropna(axis=0, how='any')

In [8]:
# Overlay the normalised lumi distributions of the bad (isSig == 0) and the
# good (isSig == 1) rows on a single set of axes.
for sig_flag, tag in ((0, "bad"), (1, "good")):
    plt.hist(data["lumi"][data['isSig'] == sig_flag],
             label=tag, normed=True, alpha=0.5, bins=40)
plt.xlabel("lumi")
plt.legend()
plt.show()



In [9]:
# Run / lumisection ids of the rows labelled good (isSig == 1) whose lumi is
# below 0.01.
# Both conditions are combined into one boolean mask: chaining two boolean
# selections (data[a][b]) forces pandas to reindex the second mask against the
# already-filtered frame and emits
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index."
data.loc[(data["lumi"] < 0.01) & (data['isSig'] == 1), ['runId', 'lumiId']]


/home/apanin/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  if __name__ == '__main__':
Out[9]:
runId lumiId
229 275963 82
9772 281727 1245
9773 281727 1246
9774 281727 1247
9775 281727 1341
9776 281727 1401
9777 281727 1405
10154 281613 262
10155 281613 263
10156 281613 264
10228 281727 797
10229 281727 1036
10230 281727 1037
10231 281727 1038
10232 281727 1039
10233 281727 1040
10234 281727 1050
10235 281727 1051
10236 281727 1052
10237 281727 1053
10238 281727 1054
10239 281727 1055
10240 281727 1071
10241 281727 1072
10242 281727 1073
10243 281727 1074
10244 281727 1075
10245 281727 1076
10246 281727 1309
10247 281727 1311
... ... ...
10584 283685 165
10586 283685 167
10587 283685 168
10588 283685 169
10589 283685 170
10591 283685 172
10592 283685 173
10593 283685 174
10594 283685 175
10596 283685 177
10598 283685 179
10599 283685 180
10601 283685 182
10602 283685 183
10603 283685 184
10604 283685 185
10605 283685 186
10607 283685 190
10608 283685 191
10609 283685 192
10612 283685 195
10613 283685 196
10614 283685 197
10616 283685 199
10617 283685 200
10618 283685 201
10619 283685 202
10634 283685 98
10635 283685 99
10636 283685 100

74 rows × 2 columns


In [10]:
# Positional indices of the rows whose lumi is at least 0.01.
nonempty = np.nonzero((data["lumi"] >= 0.01).values)[0]

In [11]:
# Keep only the rows with lumi >= 0.01.
# The mask is rebuilt from the current frame rather than taken from the
# previously computed positional indices, so re-running this cell leaves
# `data` unchanged instead of applying stale positions to an already-filtered
# frame (which would select the wrong rows or go out of bounds).
data = data[data["lumi"] >= 0.01]

In [12]:
# Feature columns: every column of `data` except those named in subsystemNames
# and the runId / lumiId / isSig columns.
# NOTE(review): 'lumi' is not in the exclusion list, so it is kept as a
# feature — confirm that this is intended.
non_feature_columns = set(subsystemNames + ['runId', 'lumiId', 'isSig'])
features = [x for x in data.columns.get_values()
            if x not in non_feature_columns]

In [13]:
# Standardise every feature column to zero mean and unit standard deviation.
# Columns without any spread are skipped, which avoids a division by zero.
# NOTE(review): the mean / std are taken over all rows, i.e. before the
# train/test split performed later, so test rows influence the scaling.
for f in features:
    xs = data[f].values
    spread = np.std(xs)

    if not spread > 0.0:
        continue
    data[f] = (xs - np.mean(xs)) / spread

In [14]:
# Split the frame into the feature matrix and the isSig target column.
data_features, labels = data[features], data["isSig"]

In [15]:
# Mean of the isSig column (the fraction of good rows, assuming isSig only
# takes the values 0 and 1).
np.mean(labels)


Out[15]:
0.9799013678235787

In [16]:
# (number of rows, number of feature columns) of the feature matrix.
data_features.shape


Out[16]:
(10747, 750)

In [17]:
# Class sizes. Counting through the sum assumes isSig is 1 for good rows and
# 0 for bad rows.
num_good = np.sum(labels)
num_bad = len(labels) - num_good

In [18]:
# Per-sample weights that balance the two classes: each sample receives
# 0.5 / (size of its own class), rescaled by the total number of samples, so
# that each class carries half of the total weight.
class_size = np.where(labels == 1.0, num_good, num_bad)
weights = (0.5 / class_size) * len(labels)

In [19]:
# Show the sizes of the good and the bad class.
num_good, num_bad


Out[19]:
(10531, 216)

In [20]:
from sklearn.model_selection import train_test_split

# Stratified split of the row positions: 10% of the rows go to the test set,
# the class proportions of `labels` are preserved, and the seed is fixed.
row_positions = np.arange(len(labels), dtype='int32')
indx_train, indx_test = train_test_split(
    row_positions,
    test_size=0.1,
    stratify=labels,
    random_state=1
)

In [21]:
# Materialise both partitions as float32 arrays, together with the matching
# slices of the per-sample weights.

# --- training partition
X_train = np.array(data_features.iloc[indx_train], dtype='float32')
y_train = np.array(labels.iloc[indx_train], dtype='float32')
weights_train = weights[indx_train]

# --- test partition
X_test = np.array(data_features.iloc[indx_test], dtype='float32')
y_test = np.array(labels.iloc[indx_test], dtype='float32')
weights_test = weights[indx_test]

In [22]:
# Number of test samples labelled 0 (total minus the sum of the 0/1 labels).
len(y_test)-np.sum(y_test)


Out[22]:
22.0

In [23]:
from sklearn.metrics import roc_curve, auc, roc_auc_score,recall_score, confusion_matrix

In [24]:
from sklearn.model_selection import GridSearchCV,cross_val_score, StratifiedKFold, KFold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

In [25]:
# Seed shared by all estimators that use randomness, so that re-running the
# notebook fits the same models (the train/test split is seeded as well).
RANDOM_STATE = 1

# Display names, in the same order as `classifiers` below.
names = ["LogisticRegression",
         "RBF SVM",
         "Random Forest",
         "AdaBoost",
         "Naive Bayes"]

classifiers = [
    # The l1 penalty needs a solver that supports it; 'liblinear' is named
    # explicitly so the model does not depend on the library's default solver.
    LogisticRegression(penalty='l1', C=0.02, solver='liblinear',
                       random_state=RANDOM_STATE),
    # probability=True enables predict_proba, which the evaluation relies on.
    SVC(kernel='rbf', C=100, gamma=0.001, probability=True,
        random_state=RANDOM_STATE),
    RandomForestClassifier(max_depth=15, n_estimators=40, max_leaf_nodes=50,
                           min_samples_leaf=5, random_state=RANDOM_STATE),
    AdaBoostClassifier(n_estimators=60, random_state=RANDOM_STATE),
    GaussianNB()]

# The two lists are consumed pairwise, so they must stay aligned.
assert len(names) == len(classifiers)

In [28]:
# Cut on the predicted probability of class 1 that is used for the extra
# per-class counts printed at the end of each iteration.
SCORE_CUT = 0.6

plt.figure(figsize=(8, 8))
# Diagonal of a classifier that guesses at random, drawn for reference.
plt.plot([0, 1], [0, 1], '--', color='black')

# Fit every classifier on the weighted training set, draw its ROC curve on the
# test set and print its confusion matrices.
# Every print below takes exactly one argument, so the parenthesised form
# prints the same text under Python 2 and also parses under Python 3.
for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train, sample_weight=weights_train)
    probas = clf.predict_proba(X_test)
    p_good = probas[:, 1]          # predicted probability of class 1
    pred_test = np.round(p_good)   # hard decision at 0.5

    fpr, tpr, _ = roc_curve(y_test, p_good, sample_weight=weights_test)
    auc_score = roc_auc_score(y_test, p_good)
    # Labels and predictions are flipped so that the recall refers to class 0.
    rec_score = recall_score(1 - y_test, 1 - pred_test)
    plt.plot(fpr, tpr, label= name+' AUC = %.3lf' % auc_score+', recall = %.3lf' % rec_score)

    print(name)
    print('confusion_matrix for train set')
    print(confusion_matrix(y_train, np.round(clf.predict_proba(X_train)[:, 1])))
    # This header used to read "train set" although the matrix is computed on
    # the test set.
    print('confusion_matrix for test set')
    print(confusion_matrix(y_test, pred_test))
    # Number of class-0 and of class-1 test samples scored at or below the cut.
    print('%d %d' % (np.sum(p_good[y_test == 0] <= SCORE_CUT),
                     np.sum(p_good[y_test == 1] <= SCORE_CUT)))

plt.legend(loc='lower right', fontsize=20)
plt.title('ROC curve', fontsize=24)
plt.xlabel('FPR', fontsize=20)
plt.ylabel('TPR', fontsize=20)
plt.show()


LogisticRegression
confusion_matrix for train set
[[ 190    4]
 [ 850 8628]]
confusion_matrix for train set
[[ 19   3]
 [ 97 956]]
19 132
RBF SVM
confusion_matrix for train set
[[ 194    0]
 [   0 9478]]
confusion_matrix for train set
[[  13    9]
 [   1 1052]]
13 3
Random Forest
confusion_matrix for train set
[[ 194    0]
 [  16 9462]]
confusion_matrix for train set
[[  12   10]
 [   2 1051]]
13 9
AdaBoost
confusion_matrix for train set
[[ 194    0]
 [ 211 9267]]
confusion_matrix for train set
[[  15    7]
 [  25 1028]]
21 874
Naive Bayes
confusion_matrix for train set
[[ 175   19]
 [4470 5008]]
confusion_matrix for train set
[[ 19   3]
 [481 572]]
19 484

In [ ]:


In [29]:
from sklearn import svm


# One-class SVM fitted on the class-1 training samples only. It is then asked
# to classify the class-1 training samples, the class-1 test samples and all
# class-0 samples (test and train stacked together).
good_train = X_train[y_train == 1]
good_test = X_test[y_test == 1]
bad_all = np.vstack((X_test[y_test == 0], X_train[y_train == 0]))

clf = svm.OneClassSVM(nu = 0.05)
clf.fit(good_train)

y_pred_train = clf.predict(good_train)
y_pred_test = clf.predict(good_test)
y_pred_outliers = clf.predict(bad_all)

In [30]:
# Misclassification counts: class-1 samples predicted as -1 (train and test),
# and class-0 samples predicted as +1.
n_error_train = int(np.count_nonzero(y_pred_train == -1))
n_error_test = int(np.count_nonzero(y_pred_test == -1))
n_error_outliers = int(np.count_nonzero(y_pred_outliers == 1))

In [31]:
# Number of class-1 samples predicted as +1, for the train and the test subset.
(int(np.count_nonzero(y_pred_train == 1)),
 int(np.count_nonzero(y_pred_test == 1)))


Out[31]:
(9003, 1007)

In [32]:
# Class-1 training samples that the one-class SVM predicted as -1.
n_error_train


Out[32]:
475

In [33]:
# Class-1 test samples that the one-class SVM predicted as -1.
n_error_test


Out[33]:
46

In [34]:
# Class-0 samples (test and train combined) that the one-class SVM predicted
# as +1.
n_error_outliers


Out[34]:
180

In [ ]: