In [2]:
import json
# Event files for the Derby and Belmont Stakes
eventFiles = {"Kentucky Derby":"2014-05-03_kentucky_derby", "Belmont Stakes":"2014-06-07_belmont_stakes"}
eventData = {}
eventPath = "../events"
for event in eventFiles.keys():
eventFileName = eventPath + "/" + eventFiles[event] + ".json"
with open(eventFileName, 'r') as f:
eventData[event] = json.load(f)
#print json.dumps(eventData, sort_keys=True, indent=4, separators=(',', ': '))
print "Events:", eventData.keys()
In [5]:
from datetime import datetime
import pytz
import time
# Now build a map of windows to events from the input event files
eventTimeMap = {}
for eventName in eventData.keys():
eventList = eventData[eventName]["datasets"][0]["events"]
eventTimeMap[eventName] = {}
for event in eventList:
timeStr = None
if ( "EDT" in event["time"] ):
timeObj = datetime.strptime(event["time"][:-3], "%Y-%m-%d %H:%M:%S")
edtObj = pytz.timezone('America/New_York')
timeObj = edtObj.localize(timeObj)
utcTime = pytz.utc.normalize(timeObj.astimezone(pytz.utc))
timeStr = utcTime.strftime("%Y-%m-%d %H:%M:%SGMT+00:00")
else:
timeStr = event["time"]
eventTimeMap[eventName][timeStr] = event
print "Number of events in", eventName, ":", len(eventTimeMap[eventName].keys())
In [6]:
import sys
sys.path.append("../code/")
from events import eventutils
print "Successful import."
In [7]:
from sklearn.externals import joblib
currentDataLocation = "../csvs/withStops/10_min-3_size/unnormal_kdd_test"
pickledSvmFile = currentDataLocation + "/svm.pkl"
pickledRFFile = currentDataLocation + "/forests.pkl"
svmClassifier = joblib.load(pickledSvmFile)
rfClassifier = joblib.load(pickledRFFile)
print "SVM Type:", type(svmClassifier)
print "RF Type:", type(rfClassifier)
In [9]:
for eventName in eventFiles.keys():
inputDataFilePath = currentDataLocation + "/normal." + eventFiles[eventName] + ".json"
with open(inputDataFilePath, 'r') as inputFile:
eventData[eventName]['data'] = json.load(inputFile)
eventData[eventName]['windows'] = eventutils.getWindowsFromData(eventData[eventName]['data'])
print "Number of windows in", eventName, ":", len(eventData[eventName]['windows'])
In [10]:
import numpy as np
from sklearn import svm
windowSize = 10
offset = 60
detectedEvents = {}
print "Starting classification..."
for eventName in eventData.keys():
print "Classifying keywords for event:", eventName
localTimeMap = eventTimeMap[eventName]
windows = eventData[eventName]['windows']
inputData = eventData[eventName]['data']
positiveTimeMap = {}
for windowId in sorted(windows):
# print "-"*10, "New Window", "-"*10
# print "Window ID:", windowId
windowActualTime = time.gmtime((int(windowId)/1000) + ((windowSize - 1)*offset))
windowStartTime = time.gmtime((int(windowId)/1000))
timeActualStr = time.strftime("%Y-%m-%d %H:%M:%SGMT+00:00", windowActualTime)
timeStartStr = time.strftime("%Y-%m-%d %H:%M:%SGMT+00:00", windowStartTime)
# if ( timeActualStr in localTimeMap ):
# print "Found event:", localTimeMap[timeActualStr]["name"]
# else:
# print "No known event."
samples = eventutils.getSamplesInWindow(inputData, windowId)
(wordArray, dataMatrix) = eventutils.getDataMatrix(samples)
svmResults = svmClassifier.predict(dataMatrix)
rfResults = rfClassifier.predict(dataMatrix)
positives = None
svmPositives = np.array(wordArray)[np.nonzero(svmResults)]
rfPositives = np.array(wordArray)[np.nonzero(rfResults)]
positiveTimeMap[timeActualStr] = (svmPositives, rfPositives)
detectedEvents[eventName] = positiveTimeMap
print "Classification complete!"
In [11]:
# Set target keywords.
# Each event gets a "targets" dict keyed by target name. Every target record
# carries plot styling ("color", "marker"), three initially empty series
# ("actuals", "svm", "rf") and the "keywords" that identify the target.
def makeTarget(color, keywords):
    """Return a new target record with its own empty series lists."""
    return {"color": color, "marker": "o", "actuals": [], "svm": [], "rf": [],
            "keywords": list(keywords)}


# The race-finish vocabulary is the same for both races.
raceFinishKeywords = ['win', 'wins', 'won',
                      'finish', 'finishes', 'finished', 'over', 'line']

targetSpecs = [
    ("Kentucky Derby", [
        ("Race Start", "blue", ['gate', 'race', 'underway', 'start',
                                'kentucky', 'derby', 'churchill', 'downs']),
        ("Race Finish", "red", raceFinishKeywords),
        ("Winning Horse", "yellow", ['california', 'chrome']),
    ]),
    ("Belmont Stakes", [
        ("Race Start", "blue", ['gate', 'race', 'underway', 'start',
                                'belmont', 'stakes', 'triple', 'crown']),
        ("Race Finish", "red", raceFinishKeywords),
        ("Winning Horse", "yellow", ['tonalist']),
    ]),
]

for raceName, raceTargets in targetSpecs:
    eventData[raceName]["targets"] = {}
    for targetName, targetColor, targetKeywords in raceTargets:
        eventData[raceName]["targets"][targetName] = makeTarget(targetColor, targetKeywords)
In [12]:
for eventName in eventData.keys():
event = eventData[eventName]
localTimeMap = eventTimeMap[eventName]
windows = event['windows']
inputData = event['data']
positives = detectedEvents[eventName]
xData = range(len(windows))
xLabels = []
targetEvents = event["targets"]
for windowId in sorted(windows):
windowActualTime = time.gmtime((int(windowId)/1000) + ((windowSize - 1)*offset))
windowStartTime = time.gmtime((int(windowId)/1000))
timeActualStr = time.strftime("%Y-%m-%d %H:%M:%SGMT+00:00", windowActualTime)
timeStartStr = time.strftime("%Y-%m-%d %H:%M:%SGMT+00:00", windowStartTime)
shortTimeStr = time.strftime("%H:%M:%S GMT", windowActualTime)
xLabels.append(shortTimeStr)
eventList = []
if ( timeActualStr in localTimeMap ):
thisEvent = localTimeMap[timeActualStr]
eventList = thisEvent["keywords"]
detectedListSvm = detectedEvents[eventName][timeActualStr][0]
detectedListRf = detectedEvents[eventName][timeActualStr][1]
for targetEvent in targetEvents.keys():
targetActual = 0
detectedSvm = 0
detectedRf = 0
targetWords = targetEvents[targetEvent]["keywords"]
for targetWord in targetWords:
if ( targetWord in eventList ):
targetActual = 1
if ( targetWord in detectedListSvm ):
detectedSvm = 1
if ( targetWord in detectedListRf ):
detectedRf = 1
targetEvents[targetEvent]["actuals"].append(targetActual)
targetEvents[targetEvent]["svm"].append(detectedSvm)
targetEvents[targetEvent]["rf"].append(detectedRf)
eventData[eventName]['xData'] = xData
eventData[eventName]['xLabels'] = xLabels
print eventName, ":"
headerStr = "label"
for targetEvent in targetEvents.keys():
headerStr += ", %s" % (targetEvent)
print headerStr
for x in xData:
dataStr = "%s" % (xLabels[x])
for targetEvent in targetEvents.keys():
for dataKey in ["actuals", "svm", "rf"]:
dataStr += ", %s" % (targetEvents[targetEvent][dataKey][x])
print dataStr
In [13]:
# Plot, for every event and every target, the actual series against the SVM
# and random-forest detections (one figure per event/target pair).
# Imported here so the cell works on a fresh kernel; the notebook otherwise
# relies on plt/pylab having been injected by an earlier, no longer visible cell.
import matplotlib.pyplot as plt

# Label every tickStep-th window on the x axis (1 = label all of them).
tickStep = 1
for eventName in eventFiles.keys():
    event = eventData[eventName]
    xData = event['xData']
    xLabels = event['xLabels']
    targetEvents = event["targets"]
    # Materialised as lists so they can be reused for every figure.
    tickPositions = list(xData)[::tickStep]
    tickLabels = [xLabels[x] for x in tickPositions]
    for targetEventName in targetEvents.keys():
        targetEvent = targetEvents[targetEventName]
        fig, ax = plt.subplots()
        ax.set_title(eventName + ": " + targetEventName)
        ax.set_xticks(tickPositions)
        ax.set_xticklabels(tickLabels, rotation=45)
        # Series values are 0/1; the extra headroom keeps the legend clear of the data.
        ax.set_ylim(-0.1, 1.5)
        ax.plot(xData, targetEvent["actuals"], color="green", marker='+', lw=7, markersize=20, label="Actual")
        ax.plot(xData, targetEvent["svm"], color="blue", marker='o', markersize=5, label="SVM")
        ax.plot(xData, targetEvent["rf"], color="red", marker='x', markersize=5, label="RF")
        ax.legend()
In [ ]:
In [ ]: