In [1]:
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
import numpy as np
import re
import time
import datetime
import matplotlib.dates as mdates
from math import ceil
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
In [2]:
def extract_split_data(data):
    """Parse one serialized GSR session string into a list of integer readings.

    ``data`` looks like ``'["<value>#<suffix>", ...]'``: a bracketed,
    comma-separated list of quoted ``value#suffix`` entries.  Entries whose
    unquoted text is 21 characters or shorter are treated as malformed and
    skipped (heuristic preserved from the original code).

    Returns an empty list when ``data`` contains no bracketed section
    (the original raised IndexError in that case).
    """
    content = re.findall(r"\[(.*?)\]", data)  # raw string: avoids invalid-escape warning
    if not content:
        return []
    values = []
    for entry in content[0].split(","):
        entry = entry.strip()[1:-1]  # drop the surrounding quote characters
        if len(entry) > 21:
            # Keep only the reading before the first '#'; the suffix is unused.
            values.append(int(entry.split("#", 1)[0]))
    return values
In [3]:
def _plot_gsr_phase(ax_std, ax_slope, segments, phase):
    """Plot the STD histogram and slope-sign bar chart for one game phase."""
    stds, slopes = [], []
    for seg in segments:
        slopes.append((seg[-1] - seg[0]) / (len(seg) - 1))
        stds.append(round(np.std(seg), 2))
    positives = sum(1 for s in slopes if s > 0)
    negatives = sum(1 for s in slopes if s < 0)
    ax_std.set_title("Distribution of STD for {} of game".format(phase))
    ax_std.hist(stds, bins=50, range=(min(stds), max(stds)))
    ax_slope.set_title("Distribution of Slope(+,-) for {} of game".format(phase))
    # Unified to 1-based bar positions; the original mixed range(len(...))
    # (0-based) for the beginning panel with dict keys (1-based) elsewhere.
    ax_slope.bar([1, 2], [positives, negatives], color='g')
    ax_slope.set_xticklabels(("", "", "Positive", "", "", "", "Negative"))


def gsr_analysis(span, plotting=True, data_dir="/Users/xueguoliang/Desktop/Data_v2"):
    """Collect fixed-width GSR windows from the beginning, middle and end of
    every recorded game session, optionally plotting their distributions.

    Parameters
    ----------
    span : int
        Number of GSR samples per extracted window; sessions with
        ``len(rate) <= span`` are skipped.
    plotting : bool
        When True, draw per-phase STD histograms and slope-sign bar charts.
    data_dir : str
        Directory containing the per-player ``*.csv`` exports
        (semicolon-delimited, with a ``GSR`` column).  Defaults to the
        original hard-coded location for backward compatibility.

    Returns
    -------
    (list, list, list)
        Windows taken from the start, middle and end of each session.
    """
    session_count = 0
    very_beginning = []
    near_end = []
    during_game = []
    half = span // 2 + 1  # half-width of the mid-game window
    for url in glob.glob(os.path.join(data_dir, "*.csv")):
        player = pd.read_csv(url, delimiter=";")
        for session in player['GSR']:
            rate = extract_split_data(session)
            if len(rate) > span:
                session_count += 1
                very_beginning.append(rate[0:span])
                # NOTE(review): this slice stops one sample short of the very
                # last reading; kept as-is to preserve the original windows.
                near_end.append(rate[-1 - span:-1])
                # The original even/odd branches were identical, so a single
                # slice around the midpoint suffices.
                mid = len(rate) // 2
                during_game.append(rate[mid - half + 1:mid + half - 1])
    print("We have collected {} games.".format(session_count))
    print("The size of GSR sample is {}.".format(span))
    if plotting:
        fig, ax = plt.subplots(3, 2, figsize=(15, 15))
        phases = [(very_beginning, "Beginning"),
                  (during_game, "During"),
                  (near_end, "End")]
        for row, (segments, phase) in enumerate(phases):
            _plot_gsr_phase(ax[row][0], ax[row][1], segments, phase)
        plt.show()
    return very_beginning, during_game, near_end
In [4]:
# Sweep several window sizes to see how the sample span affects the plots.
test = [20, 50, 100, 150, 250]
for window_size in test:
    gsr_analysis(window_size)
In [5]:
begin, during, end = gsr_analysis(20, False)
In [6]:
# Summary-statistic features for the beginning-of-game windows (class label 0).
dict_begin = {
    "label": [0] * len(begin),
    "variance": [np.var(seg) for seg in begin],
    "max": [np.max(seg) for seg in begin],
    "min": [np.min(seg) for seg in begin],
    "first_quartile": [np.percentile(seg, 25) for seg in begin],
    "third_quartile": [np.percentile(seg, 75) for seg in begin],
    "average": [np.average(seg) for seg in begin],
    "median": [np.median(seg) for seg in begin],
}
f1 = pd.DataFrame(dict_begin)
print(f1.info())
In [7]:
# Summary-statistic features for the mid-game windows (class label 1).
dict_during = {
    "label": [1] * len(during),
    "variance": [np.var(seg) for seg in during],
    "max": [np.max(seg) for seg in during],
    "min": [np.min(seg) for seg in during],
    "first_quartile": [np.percentile(seg, 25) for seg in during],
    "third_quartile": [np.percentile(seg, 75) for seg in during],
    "average": [np.average(seg) for seg in during],
    "median": [np.median(seg) for seg in during],
}
f2 = pd.DataFrame(dict_during)
print(f2.info())
In [8]:
# Summary-statistic features for the end-of-game windows (class label 2).
dict_end = {
    "label": [2] * len(end),
    "variance": [np.var(seg) for seg in end],
    "max": [np.max(seg) for seg in end],
    "min": [np.min(seg) for seg in end],
    "first_quartile": [np.percentile(seg, 25) for seg in end],
    "third_quartile": [np.percentile(seg, 75) for seg in end],
    "average": [np.average(seg) for seg in end],
    "median": [np.median(seg) for seg in end],
}
f3 = pd.DataFrame(dict_end)
In [9]:
# Stack the three labelled feature frames into one training table.
feature_frames = [f1, f2, f3]
final_data = pd.concat(feature_frames, ignore_index=True)
In [10]:
epochs = 10  # number of reshuffle/split/refit rounds run in the next cell
In [11]:
# Repeatedly reshuffle, split and refit to gauge how stable the tree's
# performance is across random train/test partitions.
for epoch in range(epochs):
    # NOTE: train_test_split already shuffles; the extra sample(frac=1)
    # is kept from the original behaviour but is redundant.
    final_data = final_data.sample(frac=1)
    train, test = train_test_split(final_data, test_size=0.2)
    y = train["label"]
    X = train.drop("label", axis=1)
    tree_clf = DecisionTreeClassifier(random_state=0, max_depth=5)
    tree_clf.fit(X, y)
    # Bug fix: the original printed the score on the TRAINING data, which
    # overstates accuracy; report the held-out score alongside it.
    print("epoch {}: train score = {:.3f}, test score = {:.3f}".format(
        epoch,
        tree_clf.score(X, y),
        tree_clf.score(test.drop("label", axis=1), test["label"])))
In [12]:
# Inspect which summary statistics the fitted tree relies on most.
# The importances array is ordered to match X.columns (printed for reference).
print(X.columns)
tree_clf.feature_importances_
Out[12]:
In [13]:
# Dump the fitted tree in Graphviz .dot format for visual inspection.
export_graphviz(tree_clf,
                out_file="bird_tree.dot",
                feature_names=X.columns,
                class_names=["begin", "during", "end"],
                rounded=True,
                filled=True)
In [14]:
from subprocess import check_call
# Render the .dot file to PNG; requires the Graphviz `dot` binary on PATH.
check_call(['dot','-Tpng','bird_tree.dot','-o','bird_tree.png'])
Out[14]:
In [15]:
# Held-out evaluation of the tree fitted in the final training epoch.
X_test = test.drop("label", axis=1)
y_test = test["label"]
tree_clf.score(X_test, y_test)
Out[15]: