Imports


In [1]:
%%time
#Imports requisite packages
import os
import time
import numpy
import pickle
import cProfile
import itertools
import matplotlib
from sklearn.svm import *
from sklearn.metrics import *
from sklearn.preprocessing import *
from matplotlib import pyplot as plt
from sklearn.cross_validation import *
from sklearn.feature_selection import *

#%jsroot on9
%matplotlib inline
matplotlib.use('Agg')


CPU times: user 1.14 s, sys: 267 ms, total: 1.4 s
Wall time: 2.14 s
/Users/fsiroky/anaconda3/lib/python3.6/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
/Users/fsiroky/anaconda3/lib/python3.6/site-packages/matplotlib/__init__.py:1401: UserWarning:  This call to matplotlib.use() has no effect
because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

  warnings.warn(_use_error_msg)

Root to Numpy arrays and pickling


In [ ]:
#Converts ROOT tree to numpy array

# %%time
# tChain = rt.TChain('MyAnalysis/MyTree')
# tChain.Add("ntuples/*.root")
# array = root_numpy.tree2array(tChain)
# print 'Total number of entries: ',tChain.GetEntries()

#Saves dataset for pickling
# dataset, target = outputs(array)
# outFile = open('dataWithMet.pkl', 'wb')
# pickle.dump(dataset, outFile)
# pickle.dump(target, outFile)
# outFile.close()

#Loads pickled dataset
#inFile = open('dataWithMet.pkl', 'rb')
#dataset = pickle.load(inFile, encoding="latin1")
#target = pickle.load(inFile, encoding="latin1")
#inFile.close()

Function Definitions


In [2]:
%%time
#Takes the converted tree and turns it into an
#array usable by sklearn.
def outputs(array):
    #Only uses events with non-zero luminosity
    goodEvents = array[array['lumi'] != 0]
    ind = numpy.lexsort((goodEvents['lumiId'],goodEvents['runId']))
    events = goodEvents[ind]
    dataset = numpy.empty([len(goodEvents),30])
    target = numpy.empty([len(goodEvents)])
    badOnes = numpy.array([])

    #Fills dataset array with proper features
    for j, event in enumerate(events):
        try:
            dataset[j,0:7] = event['qPFJetPt']
            dataset[j,7:14] = event['qPFJetEta']
            dataset[j,14:21] = event['qPFJetPhi']
            dataset[j,21:28] = event["qMetPt"]
            dataset[j,28:35] = event["qMetPhi"]
            dataset[j,35:42] = event['qNVtx']
            dataset[j,42] = event['crossSection']
            dataset[j,43] = event['lumi']
            target[j] = event['isSig']
        except ValueError:
            badOnes = numpy.append(badOnes,j)
            
    #Takes out corrupt events
    mask = numpy.zeros(len(dataset), dtype=bool)
    mask[badOnes.astype(int)] = True
    mask = ~mask
    dataset = dataset[mask]
    target = target[mask]
       
    return dataset, target


CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.34 µs

In [3]:
#Function that plots confusion matrix, taken from sklearn website
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):

    #This function prints and plots the confusion matrix.
    #Normalization can be applied by setting `normalize=True`.     
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, numpy.newaxis]
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = numpy.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)
        print("Normalized confusion matrix")
    else:
        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = numpy.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)
        print('Confusion matrix, without normalization')

    print(cm)
    
    thresh = cm.max()*.7
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

Data Prep


In [4]:
#Loads pickled dataset with MET data
inFile  = open('dataWithMet.pkl', 'rb')
dataset = pickle.load(inFile, encoding = "latin1")
target  = pickle.load(inFile, encoding = "latin1")
inFile.close()

#Loads pickled label data
inFile = open('jetMetTarget.pkl', 'rb')
target = pickle.load(inFile, encoding = "latin1")
inFile.close()

#Loads pickled runIDs and lumiIDs
inFile = open('ids.pkl', 'rb')
ids    = pickle.load(inFile, encoding = "latin1")
origIds = ids
inFile.close()

dataset = numpy.append(dataset, ids,axis=1)

In [5]:
sigInd     = numpy.where(target==1)
backInd    = numpy.where(target==0)
sigVals    = dataset[sigInd]
backVals   = dataset[backInd]
sigTarget  = target[sigInd]
backTarget = target[backInd]

sigTrain, sigTest, sigTrainTarget, sigTestTarget = train_test_split(sigVals, sigTarget, test_size = .5, random_state = 42)
sigTrainIds = sigTrain[:,-2:]
sigTestIds  = sigTest[:,-2:]
backValsIds = backVals[:,-2:]
sigTrain    = sigTrain[:,:-2]
sigTest     = sigTest[:,:-2]
backVals    = backVals[:,:-2]

In [6]:
#Scales the data to zero mean and unit variance
#and changes labels from 0 to -1
scaler = StandardScaler()  #RobustScaler works worse
scaler.fit(sigTrain)

sigTrain = scaler.transform(sigTrain)
sigTest  = scaler.transform(sigTest)
backVals = scaler.transform(backVals)

backTarget[backTarget == 0] = -1

ML model and plotting


In [7]:
#Runs the classifier with given nuVal and gammaVal
#Some type of cross-validation must be added - Andrey said on meeting he knows some template to do this and compare
#models in a nice and easy way
nuVal    = 0.05
gammaVal = 0.06
osvmClf  = OneClassSVM(nu=nuVal, kernel = 'rbf', gamma = gammaVal)
osvmClf.fit(sigTrain)


Out[7]:
OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma=0.06, kernel='rbf',
      max_iter=-1, nu=0.05, random_state=None, shrinking=True, tol=0.001,
      verbose=False)

In [8]:
#Here if you need to pickle classifier
# outFile = open("osvm_nu_%s_gamma_%s.pkl" % (nuVal, gammaVal), 'wb')
# pickle.dump(osvmClf, outFile)
# outFile.close()

In [9]:
# Here if you need to import pickled classifier
# inFile = open("osvm_nu_%s_gamma_%s.pkl" % (nuVal, gammaVal), 'rb')
# osvmClf = pickle.load(inFile, encoding = "latin1")
# inFile.close()

In [10]:
#Runs predictions on classifier for later calculations
y_pred_train = osvmClf.predict(sigTrain)
y_pred_test = osvmClf.predict(sigTest)
y_pred_outliers = osvmClf.predict(backVals)

In [11]:
#Finds numbers of false negatives/positives and true positives
falseNegTrain = y_pred_train[y_pred_train == -1].size
falseNegTest = y_pred_test[y_pred_test == -1].size
falsePos = y_pred_outliers[y_pred_outliers == 1].size
truePosTrain = y_pred_train[y_pred_train == 1].size
truePosTest = y_pred_test[y_pred_test == 1].size

In [12]:
#Plots classification results for signal and background
osvmArrs = []
osvmHists = []

#Separates decision function results into signal and background
#along with training and testing
osvmArrs.append(osvmClf.decision_function(sigTrain).ravel())
osvmArrs.append(osvmClf.decision_function(sigTest).ravel())
osvmArrs.append(osvmClf.decision_function(backVals).ravel())

In [13]:
#Sets up plot boundaries
plotMin = min(min(osvmArrs[0]), min(osvmArrs[1]), min(osvmArrs[2]))
plotMax = max(max(osvmArrs[0]), max(osvmArrs[1]), max(osvmArrs[2]))
binz = numpy.linspace(plotMin, plotMax, 60)

#Creates first histogram of Un-normalized Classification
plt.figure(figsize=(7, 10))
plt.subplot(211)
plt.hist(osvmArrs[0], normed = False, bins = binz, edgecolor = 'red',   
         facecolor = 'white', alpha=1, label = "Signal Train", linewidth = 1.5)
plt.hist(osvmArrs[1], normed = False, bins = binz, edgecolor = 'green', 
         facecolor = 'white', alpha=1, label = "Signal Test")
plt.hist(osvmArrs[2], normed = False, bins = binz, edgecolor = 'blue',  
         facecolor = 'white', alpha=.5, label = "Background")
plt.title("Classification Plot, OneClassSVM, Un-normalized, Nu = %s, Gamma = %s" % (nuVal,gammaVal))
plt.xlabel("Decision Function Score")
plt.ylabel("Counts per Bin")
plt.legend(loc = "upper left")

#Creates second histogram of Normalized Classification
plt.subplot(212)
plt.hist(osvmArrs[0], normed = True, bins = binz, edgecolor = 'red',   
         facecolor = 'white', alpha=1, label = "Signal Train", linewidth = 1.5)
plt.hist(osvmArrs[1], normed = True, bins = binz, edgecolor = 'green', 
         facecolor = 'white', alpha=1, label = "Signal Test")
plt.hist(osvmArrs[2], normed = True, bins = binz, edgecolor = 'blue',  
         facecolor = 'white', alpha=.5, label = "Background")
plt.title("Classification Plot, OneClassSVM, Normalized, Nu = %s, Gamma = %s" % (nuVal,gammaVal))
plt.xlabel("Decision Function Score")
plt.ylabel("Counts per Bin")
plt.legend()

#Prints relevant statistics below
print("Loss Rate                                           : ", (falseNegTest/(truePosTest+falseNegTest)*100))
print("Pollution Rate                                      : ", (falsePos/(truePosTest+falsePos))*100)
print("Number of errors on training set : ", falseNegTrain, " Percentage: ", (falseNegTrain/len(sigTrain)*100))
print("Number of errors on test set     : ", falseNegTest, " Percentage: ", (falseNegTest/len(sigTest)*100))
print("Number of errors on outliers set : ", falsePos, "  Percentage: ", (falsePos/len(backVals)*100))


Loss Rate                                           :  5.10517587501091
Pollution Rate                                      :  0.7313599094245905
Number of errors on training set :  5726  Percentage:  4.997817927904338
Number of errors on test set     :  5849  Percentage:  5.10517587501091
Number of errors on outliers set :  801   Percentage:  24.2874469375379

In [14]:
#Creates ROC curve
yTest = numpy.append(sigTestTarget, backTarget)
osvmScore = numpy.append(osvmArrs[1], osvmArrs[2])
fpr, tpr, _ = roc_curve(yTest, osvmScore)
roc_auc = auc(fpr, tpr)

plt.figure()
lw = 2;
plt.plot(fpr, tpr, color='darkorange',
        lw = lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw = lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC OSVM, Tuned, Nu = %s, Gamma = %s' % (nuVal,gammaVal))
plt.legend(loc="lower right")
plt.show()


JSON tool for examining very badly classified lumisections


In [19]:
decFuncs = numpy.concatenate([osvmArrs[0], osvmArrs[1], osvmArrs[2]])
ids      = numpy.concatenate([sigTrainIds, sigTestIds, backValsIds])


sortIndices   = numpy.lexsort((ids[:,1], ids[:,0]))
ids           = ids[sortIndices]
decFuncs      = decFuncs[sortIndices]

#by changing these two values we can examine badly classified lumisections starting from chosen decision function value
DecFuncPosBoundary = 25 
DecFuncNegBoundary = -25
extremeFalsePos = [i for i,j in enumerate(decFuncs) if j >  DecFuncPosBoundary and target[i] ==  0]
extremeFalseNeg = [i for i,j in enumerate(decFuncs) if j <  DecFuncNegBoundary and target[i] ==  1]

#efp = background events that are very wrongly misclassified as signal events
efp = ids[extremeFalsePos]
efp = [[int(float(j)) for j in i] for i in efp] #make integers from float

#efn = sig events that are very wrongly classified as background events
efn = ids[extremeFalseNeg]
efn = [[int(float(j)) for j in i] for i in efn] #make integers from float

In [20]:
from itertools import groupby
from collections import defaultdict
from operator import itemgetter


#MAKE JSON-LIKE DICTIONARY
#Have a list of bad lumisections and create a list of list

def make_compact(my_list):
    ranges=[]
    for k, g in groupby(enumerate(my_list), lambda x:x[0]-x[1]):
        group = list(map(itemgetter(1), g))
        ranges.append([group[0], group[-1]])
    return ranges

def make_dictionary(list):
    d={}
    for key, val in list: #creates a dictionary and listing all the values
        d.setdefault(key, []).append(val)
    d2 = {}
    for k, v in d.items(): #take values and make them compact
        d2[k] = make_compact(v)
    return d2

json_dictionary1 = make_dictionary(efp)
json_dictionary2 = make_dictionary(efn)

import pprint
# pprint.pprint((make_dictionary(efp)))
pprint.pprint((json_dictionary1))


{273426: [[1, 4], [7, 7], [10, 14], [16, 17], [19, 19], [21, 22], [25, 25]],
 274157: [[108, 112],
          [114, 117],
          [120, 120],
          [122, 123],
          [126, 130],
          [132, 136],
          [139, 139],
          [145, 146],
          [148, 149],
          [152, 152],
          [154, 156],
          [159, 164],
          [166, 167],
          [214, 214],
          [216, 216],
          [403, 403]],
 274959: [[64, 64],
          [100, 100],
          [190, 190],
          [212, 212],
          [232, 232],
          [272, 272],
          [281, 281]],
 275758: [[3, 4]],
 275922: [[4, 5]],
 276453: [[44, 44],
          [47, 47],
          [49, 49],
          [51, 51],
          [54, 55],
          [57, 57],
          [60, 60],
          [63, 63],
          [65, 66],
          [73, 73],
          [75, 76],
          [78, 80],
          [84, 84],
          [88, 88],
          [90, 91],
          [93, 93],
          [99, 99],
          [108, 109],
          [118, 119],
          [121, 123],
          [125, 125]],
 278309: [[1, 1]],
 282663: [[127, 127]]}

In [ ]:


In [ ]: