In [2]:
import json

# Event files for the Derby and Belmont Stakes
eventFiles = {"Kentucky Derby":"2014-05-03_kentucky_derby", "Belmont Stakes":"2014-06-07_belmont_stakes"}
eventData = {}

eventPath = "../events"
for event in eventFiles.keys():
    eventFileName = eventPath + "/" + eventFiles[event] + ".json"
    with open(eventFileName, 'r') as f:
        eventData[event] = json.load(f)

#print json.dumps(eventData, sort_keys=True, indent=4, separators=(',', ': '))
print "Events:", eventData.keys()


Events: ['Kentucky Derby', 'Belmont Stakes']

In [5]:
from datetime import datetime
import pytz
import time

# Now build a map of windows to events from the input event files
eventTimeMap = {}
for eventName in eventData.keys():
    eventList = eventData[eventName]["datasets"][0]["events"]
    eventTimeMap[eventName] = {}

    for event in eventList:
        timeStr = None
        if ( "EDT" in event["time"] ):
            timeObj = datetime.strptime(event["time"][:-3], "%Y-%m-%d %H:%M:%S")
            edtObj = pytz.timezone('America/New_York')
            timeObj = edtObj.localize(timeObj)
            utcTime = pytz.utc.normalize(timeObj.astimezone(pytz.utc))
            timeStr = utcTime.strftime("%Y-%m-%d %H:%M:%SGMT+00:00")
        else:
            timeStr = event["time"]

        eventTimeMap[eventName][timeStr] = event
    
    print "Number of events in", eventName, ":", len(eventTimeMap[eventName].keys())


Number of events in Kentucky Derby : 3
Number of events in Belmont Stakes : 3

In [6]:
import sys
sys.path.append("../code/")

from events import eventutils

print "Successful import."


Successful import.

In [7]:
from sklearn.externals import joblib

currentDataLocation = "../csvs/withStops/10_min-3_size/unnormal_kdd_test"

pickledSvmFile = currentDataLocation + "/svm.pkl"
pickledRFFile = currentDataLocation + "/forests.pkl"

svmClassifier = joblib.load(pickledSvmFile)
rfClassifier = joblib.load(pickledRFFile)

print "SVM Type:", type(svmClassifier)
print "RF Type:", type(rfClassifier)


SVM Type: <class 'sklearn.svm.classes.SVC'>
RF Type: <class 'sklearn.ensemble.forest.RandomForestClassifier'>

In [9]:
for eventName in eventFiles.keys():

    inputDataFilePath = currentDataLocation + "/normal." + eventFiles[eventName] + ".json"
    with open(inputDataFilePath, 'r') as inputFile:
        eventData[eventName]['data'] = json.load(inputFile)

    eventData[eventName]['windows'] = eventutils.getWindowsFromData(eventData[eventName]['data'])
    print "Number of windows in", eventName, ":", len(eventData[eventName]['windows'])


Number of windows in Belmont Stakes : 18
Number of windows in Kentucky Derby : 18

In [10]:
import numpy as np
from sklearn import svm

windowSize = 10
offset = 60

detectedEvents = {}

print "Starting classification..."
for eventName in eventData.keys():
    print "Classifying keywords for event:", eventName
    
    localTimeMap = eventTimeMap[eventName]
    windows = eventData[eventName]['windows']
    inputData = eventData[eventName]['data']

    positiveTimeMap = {}
    
    for windowId in sorted(windows):
        
    #    print "-"*10, "New Window", "-"*10
    #    print "Window ID:", windowId
        
        windowActualTime = time.gmtime((int(windowId)/1000) + ((windowSize - 1)*offset))
        windowStartTime = time.gmtime((int(windowId)/1000))
        timeActualStr = time.strftime("%Y-%m-%d %H:%M:%SGMT+00:00", windowActualTime)
        timeStartStr = time.strftime("%Y-%m-%d %H:%M:%SGMT+00:00", windowStartTime)
        
#        if ( timeActualStr in localTimeMap ):
#           print "Found event:", localTimeMap[timeActualStr]["name"]
#        else:
#           print "No known event."

        samples = eventutils.getSamplesInWindow(inputData, windowId)
        (wordArray, dataMatrix) = eventutils.getDataMatrix(samples)

        svmResults = svmClassifier.predict(dataMatrix)
        rfResults = rfClassifier.predict(dataMatrix)
        
        positives = None
        svmPositives = np.array(wordArray)[np.nonzero(svmResults)]
        rfPositives = np.array(wordArray)[np.nonzero(rfResults)]
        
        positiveTimeMap[timeActualStr] = (svmPositives, rfPositives)
        
    detectedEvents[eventName] = positiveTimeMap

print "Classification complete!"


Starting classification...
Classifying keywords for event: Kentucky Derby
Classifying keywords for event: Belmont Stakes
Classification complete!

In [11]:
# Set target keywords.
# Each race gets a "targets" dict: target name -> record holding plot styling
# ("color", "marker"), the keyword list to look for, and three initially empty
# result series ("actuals", "svm", "rf") that a later cell fills in.

def _makeTarget(color, keywords):
    """Return a new target record with its own empty result lists."""
    target = {"color": color, "marker": "o", "actuals": [], "svm": [], "rf": []}
    target['keywords'] = keywords
    return target

# (race name, ((target name, color, keywords), ...)) in the order they are added.
_targetSpecs = (
    ("Kentucky Derby", (
        ("Race Start", "blue",
            ['gate', 'race', 'underway', 'start',
             'kentucky', 'derby', 'churchill', 'downs']),
        ("Race Finish", "red",
            ['win', 'wins', 'won',
             'finish', 'finishes', 'finished', 'over', 'line']),
        ("Winning Horse", "yellow",
            ['california', 'chrome']),
    )),
    ("Belmont Stakes", (
        ("Race Start", "blue",
            ['gate', 'race', 'underway', 'start',
             'belmont', 'stakes', 'triple', 'crown']),
        ("Race Finish", "red",
            ['win', 'wins', 'won',
             'finish', 'finishes', 'finished', 'over', 'line']),
        ("Winning Horse", "yellow",
            ['tonalist']),
    )),
)

for raceName, targetSpecs in _targetSpecs:
    eventData[raceName]["targets"] = {}
    for targetName, targetColor, targetKeywords in targetSpecs:
        eventData[raceName]["targets"][targetName] = _makeTarget(targetColor, targetKeywords)

In [12]:
for eventName in eventData.keys():
    
    event = eventData[eventName]
    localTimeMap = eventTimeMap[eventName]
    windows = event['windows']
    inputData = event['data']
    positives = detectedEvents[eventName]

    xData = range(len(windows))
    xLabels = []

    targetEvents = event["targets"]

    for windowId in sorted(windows):
        
        windowActualTime = time.gmtime((int(windowId)/1000) + ((windowSize - 1)*offset))
        windowStartTime = time.gmtime((int(windowId)/1000))
        timeActualStr = time.strftime("%Y-%m-%d %H:%M:%SGMT+00:00", windowActualTime)
        timeStartStr = time.strftime("%Y-%m-%d %H:%M:%SGMT+00:00", windowStartTime)
        shortTimeStr = time.strftime("%H:%M:%S GMT", windowActualTime)
        
        xLabels.append(shortTimeStr)
        
        eventList = []
        if ( timeActualStr in localTimeMap ):
            thisEvent = localTimeMap[timeActualStr]
            eventList = thisEvent["keywords"]
        
        detectedListSvm = detectedEvents[eventName][timeActualStr][0]
        detectedListRf = detectedEvents[eventName][timeActualStr][1]
        
        for targetEvent in targetEvents.keys():
            targetActual = 0
            detectedSvm = 0
            detectedRf = 0
            
            targetWords = targetEvents[targetEvent]["keywords"]
            
            for targetWord in targetWords:
                if ( targetWord in eventList ):
                    targetActual = 1
                    
                if ( targetWord in detectedListSvm ):
                    detectedSvm = 1
                    
                if ( targetWord in detectedListRf ):
                    detectedRf = 1
                    
            targetEvents[targetEvent]["actuals"].append(targetActual)
            targetEvents[targetEvent]["svm"].append(detectedSvm)
            targetEvents[targetEvent]["rf"].append(detectedRf)
        
    eventData[eventName]['xData'] = xData
    eventData[eventName]['xLabels'] = xLabels

    print eventName, ":"
    headerStr = "label"
    for targetEvent in targetEvents.keys():
        headerStr += ", %s" % (targetEvent)
    print headerStr
    for x in xData:
        
        dataStr = "%s" % (xLabels[x])
        for targetEvent in targetEvents.keys():
            for dataKey in ["actuals", "svm", "rf"]:
                dataStr += ", %s" % (targetEvents[targetEvent][dataKey][x])
        print dataStr


Kentucky Derby :
label, Winning Horse, Race Finish, Race Start
22:27:00 GMT, 0, 0, 0, 0, 0, 0, 0, 0, 0
22:28:00 GMT, 0, 0, 0, 0, 0, 0, 0, 0, 0
22:29:00 GMT, 0, 0, 0, 0, 0, 0, 0, 0, 0
22:30:00 GMT, 0, 0, 0, 0, 0, 0, 0, 0, 1
22:31:00 GMT, 0, 0, 0, 0, 0, 0, 0, 1, 1
22:32:00 GMT, 0, 0, 0, 0, 0, 0, 1, 1, 1
22:33:00 GMT, 0, 0, 1, 0, 0, 0, 0, 1, 1
22:34:00 GMT, 0, 1, 1, 0, 0, 0, 0, 0, 0
22:35:00 GMT, 1, 1, 1, 1, 1, 1, 0, 0, 0
22:36:00 GMT, 0, 1, 1, 0, 1, 1, 0, 0, 0
22:37:00 GMT, 0, 1, 1, 0, 1, 1, 0, 0, 0
22:38:00 GMT, 0, 1, 1, 0, 1, 1, 0, 0, 0
22:39:00 GMT, 0, 1, 1, 0, 1, 1, 0, 0, 0
22:40:00 GMT, 0, 1, 1, 0, 0, 1, 0, 0, 0
22:41:00 GMT, 0, 1, 1, 0, 0, 0, 0, 0, 0
22:42:00 GMT, 0, 0, 0, 0, 0, 0, 0, 0, 0
22:43:00 GMT, 0, 0, 0, 0, 0, 0, 0, 0, 0
22:44:00 GMT, 0, 0, 0, 0, 0, 0, 0, 0, 0
Belmont Stakes :
label, Winning Horse, Race Finish, Race Start
22:49:00 GMT, 0, 0, 0, 0, 0, 0, 0, 0, 0
22:50:00 GMT, 0, 0, 0, 0, 0, 0, 0, 0, 0
22:51:00 GMT, 0, 0, 0, 0, 0, 0, 0, 0, 1
22:52:00 GMT, 0, 0, 0, 0, 0, 0, 0, 1, 1
22:53:00 GMT, 0, 0, 0, 0, 0, 0, 0, 1, 1
22:54:00 GMT, 0, 0, 0, 0, 0, 0, 1, 1, 1
22:55:00 GMT, 0, 0, 0, 0, 0, 0, 0, 1, 1
22:56:00 GMT, 0, 0, 0, 0, 0, 0, 0, 1, 1
22:57:00 GMT, 1, 0, 1, 1, 0, 1, 0, 0, 0
22:58:00 GMT, 0, 1, 1, 0, 1, 1, 0, 1, 1
22:59:00 GMT, 0, 1, 1, 0, 1, 1, 0, 1, 1
23:00:00 GMT, 0, 1, 1, 0, 1, 1, 0, 1, 1
23:01:00 GMT, 0, 1, 1, 0, 1, 1, 0, 1, 1
23:02:00 GMT, 0, 1, 1, 0, 0, 1, 0, 1, 1
23:03:00 GMT, 0, 1, 1, 0, 0, 0, 0, 0, 1
23:04:00 GMT, 0, 1, 1, 0, 0, 0, 0, 0, 0
23:05:00 GMT, 0, 0, 0, 0, 0, 0, 0, 0, 0
23:06:00 GMT, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [13]:
# Draw one figure per (race, target): the known signal against what each
# classifier detected, over the race's windows.
# (series key on the target record, keyword arguments for ax.plot)
seriesStyles = (
    ("actuals", dict(color="green", marker='+', lw=7, markersize=20, label="Actual")),
    ("svm", dict(color="blue", marker='o', markersize=5, label="SVM")),
    ("rf", dict(color="red", marker='x', markersize=5, label="RF")),
)

for eventName in eventFiles.keys():
    event = eventData[eventName]
    xData = event['xData']
    xLabels = event['xLabels']

    for targetEventName, targetEvent in event["targets"].items():

        fig, ax = plt.subplots()
        ax.set_title(eventName + ": " + targetEventName)

        # Modulus of 1 keeps every tick; raise it to thin out the labels.
        smallerXTicks = [x for x in xData if x % 1 == 0]
        ax.set_xticks(smallerXTicks)
        ax.set_xticklabels([xLabels[x] for x in smallerXTicks], rotation=45)

        # Headroom above 1 so the legend does not cover the 0/1 signal.
        ax.set_ylim(-0.1, 1.5)

        for seriesKey, plotStyle in seriesStyles:
            ax.plot(xData, targetEvent[seriesKey], **plotStyle)
        ax.legend()



In [ ]:


In [ ]: