In [1]:
%%time
#Imports requisite packages
import os
import time
import numpy
import pickle
import cProfile
import itertools
import matplotlib
from sklearn.svm import *
from sklearn.metrics import *
from sklearn.preprocessing import *
from matplotlib import pyplot as plt
from sklearn.cross_validation import *
from sklearn.feature_selection import *
#%jsroot on9
%matplotlib inline
matplotlib.use('Agg')
In [ ]:
#Converts ROOT tree to numpy array
# %%time
# tChain = rt.TChain('MyAnalysis/MyTree')
# tChain.Add("ntuples/*.root")
# array = root_numpy.tree2array(tChain)
# print 'Total number of entries: ',tChain.GetEntries()
#Saves dataset for pickling
# dataset, target = outputs(array)
# outFile = open('dataWithMet.pkl', 'wb')
# pickle.dump(dataset, outFile)
# pickle.dump(target, outFile)
# outFile.close()
#Loads pickled dataset
#inFile = open('dataWithMet.pkl', 'rb')
#dataset = pickle.load(inFile, encoding="latin1")
#target = pickle.load(inFile, encoding="latin1")
#inFile.close()
In [2]:
%%time
#Takes the converted tree and turns it into an
#array usable by sklearn.
def outputs(array):
#Only uses events with non-zero luminosity
goodEvents = array[array['lumi'] != 0]
ind = numpy.lexsort((goodEvents['lumiId'],goodEvents['runId']))
events = goodEvents[ind]
dataset = numpy.empty([len(goodEvents),30])
target = numpy.empty([len(goodEvents)])
badOnes = numpy.array([])
#Fills dataset array with proper features
for j, event in enumerate(events):
try:
dataset[j,0:7] = event['qPFJetPt']
dataset[j,7:14] = event['qPFJetEta']
dataset[j,14:21] = event['qPFJetPhi']
dataset[j,21:28] = event["qMetPt"]
dataset[j,28:35] = event["qMetPhi"]
dataset[j,35:42] = event['qNVtx']
dataset[j,42] = event['crossSection']
dataset[j,43] = event['lumi']
target[j] = event['isSig']
except ValueError:
badOnes = numpy.append(badOnes,j)
#Takes out corrupt events
mask = numpy.zeros(len(dataset), dtype=bool)
mask[badOnes.astype(int)] = True
mask = ~mask
dataset = dataset[mask]
target = target[mask]
return dataset, target
In [3]:
#Function that plots confusion matrix, taken from sklearn website
def plot_confusion_matrix(cm, classes,
normalize=False,
title='Confusion matrix',
cmap=plt.cm.Blues):
#This function prints and plots the confusion matrix.
#Normalization can be applied by setting `normalize=True`.
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, numpy.newaxis]
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = numpy.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
print("Normalized confusion matrix")
else:
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = numpy.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)
print('Confusion matrix, without normalization')
print(cm)
thresh = cm.max()*.7
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, cm[i, j],
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
In [4]:
#Loads pickled dataset with MET data
inFile = open('dataWithMet.pkl', 'rb')
dataset = pickle.load(inFile, encoding = "latin1")
target = pickle.load(inFile, encoding = "latin1")
inFile.close()
#Loads pickled label data
inFile = open('jetMetTarget.pkl', 'rb')
target = pickle.load(inFile, encoding = "latin1")
inFile.close()
#Loads pickled runIDs and lumiIDs
inFile = open('ids.pkl', 'rb')
ids = pickle.load(inFile, encoding = "latin1")
origIds = ids
inFile.close()
dataset = numpy.append(dataset, ids,axis=1)
In [5]:
sigInd = numpy.where(target==1)
backInd = numpy.where(target==0)
sigVals = dataset[sigInd]
backVals = dataset[backInd]
sigTarget = target[sigInd]
backTarget = target[backInd]
sigTrain, sigTest, sigTrainTarget, sigTestTarget = train_test_split(sigVals, sigTarget, test_size = .5, random_state = 42)
sigTrainIds = sigTrain[:,-2:]
sigTestIds = sigTest[:,-2:]
backValsIds = backVals[:,-2:]
sigTrain = sigTrain[:,:-2]
sigTest = sigTest[:,:-2]
backVals = backVals[:,:-2]
In [6]:
#Scales the data to zero mean and unit variance
#and changes labels from 0 to -1
scaler = StandardScaler() #RobustScaler works worse
scaler.fit(sigTrain)
sigTrain = scaler.transform(sigTrain)
sigTest = scaler.transform(sigTest)
backVals = scaler.transform(backVals)
backTarget[backTarget == 0] = -1
In [7]:
#Runs the classifier with given nuVal and gammaVal
#Some type of cross-validation must be added - Andrey said on meeting he knows some template to do this and compare
#models in a nice and easy way
nuVal = 0.05
gammaVal = 0.06
osvmClf = OneClassSVM(nu=nuVal, kernel = 'rbf', gamma = gammaVal)
osvmClf.fit(sigTrain)
Out[7]:
In [8]:
#Here if you need to pickle classifier
# outFile = open("osvm_nu_%s_gamma_%s.pkl" % (nuVal, gammaVal), 'wb')
# pickle.dump(osvmClf, outFile)
# outFile.close()
In [9]:
# Here if you need to import pickled classifier
# inFile = open("osvm_nu_%s_gamma_%s.pkl" % (nuVal, gammaVal), 'rb')
# osvmClf = pickle.load(inFile, encoding = "latin1")
# inFile.close()
In [10]:
#Runs predictions on classifier for later calculations
y_pred_train = osvmClf.predict(sigTrain)
y_pred_test = osvmClf.predict(sigTest)
y_pred_outliers = osvmClf.predict(backVals)
In [11]:
#Finds numbers of false negatives/positives and true positives
falseNegTrain = y_pred_train[y_pred_train == -1].size
falseNegTest = y_pred_test[y_pred_test == -1].size
falsePos = y_pred_outliers[y_pred_outliers == 1].size
truePosTrain = y_pred_train[y_pred_train == 1].size
truePosTest = y_pred_test[y_pred_test == 1].size
In [12]:
#Plots classification results for signal and background
osvmArrs = []
osvmHists = []
#Separates decision function results into signal and background
#along with training and testing
osvmArrs.append(osvmClf.decision_function(sigTrain).ravel())
osvmArrs.append(osvmClf.decision_function(sigTest).ravel())
osvmArrs.append(osvmClf.decision_function(backVals).ravel())
In [13]:
#Sets up plot boundaries
plotMin = min(min(osvmArrs[0]), min(osvmArrs[1]), min(osvmArrs[2]))
plotMax = max(max(osvmArrs[0]), max(osvmArrs[1]), max(osvmArrs[2]))
binz = numpy.linspace(plotMin, plotMax, 60)
#Creates first histogram of Un-normalized Classification
plt.figure(figsize=(7, 10))
plt.subplot(211)
plt.hist(osvmArrs[0], normed = False, bins = binz, edgecolor = 'red',
facecolor = 'white', alpha=1, label = "Signal Train", linewidth = 1.5)
plt.hist(osvmArrs[1], normed = False, bins = binz, edgecolor = 'green',
facecolor = 'white', alpha=1, label = "Signal Test")
plt.hist(osvmArrs[2], normed = False, bins = binz, edgecolor = 'blue',
facecolor = 'white', alpha=.5, label = "Background")
plt.title("Classification Plot, OneClassSVM, Un-normalized, Nu = %s, Gamma = %s" % (nuVal,gammaVal))
plt.xlabel("Decision Function Score")
plt.ylabel("Counts per Bin")
plt.legend(loc = "upper left")
#Creates second histogram of Normalized Classification
plt.subplot(212)
plt.hist(osvmArrs[0], normed = True, bins = binz, edgecolor = 'red',
facecolor = 'white', alpha=1, label = "Signal Train", linewidth = 1.5)
plt.hist(osvmArrs[1], normed = True, bins = binz, edgecolor = 'green',
facecolor = 'white', alpha=1, label = "Signal Test")
plt.hist(osvmArrs[2], normed = True, bins = binz, edgecolor = 'blue',
facecolor = 'white', alpha=.5, label = "Background")
plt.title("Classification Plot, OneClassSVM, Normalized, Nu = %s, Gamma = %s" % (nuVal,gammaVal))
plt.xlabel("Decision Function Score")
plt.ylabel("Counts per Bin")
plt.legend()
#Prints relevant statistics below
print("Loss Rate : ", (falseNegTest/(truePosTest+falseNegTest)*100))
print("Pollution Rate : ", (falsePos/(truePosTest+falsePos))*100)
print("Number of errors on training set : ", falseNegTrain, " Percentage: ", (falseNegTrain/len(sigTrain)*100))
print("Number of errors on test set : ", falseNegTest, " Percentage: ", (falseNegTest/len(sigTest)*100))
print("Number of errors on outliers set : ", falsePos, " Percentage: ", (falsePos/len(backVals)*100))
In [14]:
#Creates ROC curve
yTest = numpy.append(sigTestTarget, backTarget)
osvmScore = numpy.append(osvmArrs[1], osvmArrs[2])
fpr, tpr, _ = roc_curve(yTest, osvmScore)
roc_auc = auc(fpr, tpr)
plt.figure()
lw = 2;
plt.plot(fpr, tpr, color='darkorange',
lw = lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw = lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC OSVM, Tuned, Nu = %s, Gamma = %s' % (nuVal,gammaVal))
plt.legend(loc="lower right")
plt.show()
In [19]:
decFuncs = numpy.concatenate([osvmArrs[0], osvmArrs[1], osvmArrs[2]])
ids = numpy.concatenate([sigTrainIds, sigTestIds, backValsIds])
sortIndices = numpy.lexsort((ids[:,1], ids[:,0]))
ids = ids[sortIndices]
decFuncs = decFuncs[sortIndices]
#by changing these two values we can examine badly classified lumisections starting from chosen decision function value
DecFuncPosBoundary = 25
DecFuncNegBoundary = -25
extremeFalsePos = [i for i,j in enumerate(decFuncs) if j > DecFuncPosBoundary and target[i] == 0]
extremeFalseNeg = [i for i,j in enumerate(decFuncs) if j < DecFuncNegBoundary and target[i] == 1]
#efp = background events that are very wrongly misclassified as signal events
efp = ids[extremeFalsePos]
efp = [[int(float(j)) for j in i] for i in efp] #make integers from float
#efn = sig events that are very wrongly classified as background events
efn = ids[extremeFalseNeg]
efn = [[int(float(j)) for j in i] for i in efn] #make integers from float
In [20]:
from itertools import groupby
from collections import defaultdict
from operator import itemgetter
#MAKE JSON-LIKE DICTIONARY
#Have a list of bad lumisections and create a list of list
def make_compact(my_list):
ranges=[]
for k, g in groupby(enumerate(my_list), lambda x:x[0]-x[1]):
group = list(map(itemgetter(1), g))
ranges.append([group[0], group[-1]])
return ranges
def make_dictionary(list):
d={}
for key, val in list: #creates a dictionary and listing all the values
d.setdefault(key, []).append(val)
d2 = {}
for k, v in d.items(): #take values and make them compact
d2[k] = make_compact(v)
return d2
json_dictionary1 = make_dictionary(efp)
json_dictionary2 = make_dictionary(efn)
import pprint
# pprint.pprint((make_dictionary(efp)))
pprint.pprint((json_dictionary1))
In [ ]:
In [ ]: