In [1]:
import csv
sports = []  # This is a python "list" data structure (it is "mutable")
# The file has a list of sports, one per line.
# There are spaces in some names, but no commas or weird punctuation
# newline='' is how the csv module documentation says files should be opened
# before they are handed to csv.reader.
with open('../data/SportsDataset_ListOfSports.csv', 'r', newline='') as csvfile:
    # the join() call merges all fields of a row back into a single name
    sports.extend(' '.join(row) for row in csv.reader(csvfile))

# Make a look-up table: if you input the name of the sport, it tells you the index
# Also, print out a list of all the sports, to make sure it looks OK
Sport2Index = {}
for ind, sprt in enumerate(sports):
    Sport2Index[sprt] = ind
    print('Sport #', ind, 'is', sprt)
# And example usage of the index lookup:
#print('The sport "', sports[7],'" has 0-based index', Sport2Index[sports[7]])
# -- And read in the list of questions --
# this csv file has only a single row
questions = []
# newline='' is how the csv module documentation says files should be opened
# before they are handed to csv.reader.
with open('../data/SportsDataset_ListOfAttributes.csv', 'r', newline='') as csvfile:
    for row in csv.reader(csvfile):
        # csv.reader yields an empty list for a blank line; ignore those so a
        # stray blank line after the data cannot replace the questions with [].
        if row:
            questions = row

# Look-up table: question text -> 0-based index into `questions`
Question2Index = {}
for ind, quest in enumerate(questions):
    Question2Index[quest] = ind
    #print('Question #', ind,': ',quest)
# And example usage of the index lookup:
#print('The question "', questions[10],'" has 0-based index', Question2Index[questions[10]])
# -- And read in the training data --
# Numeric encoding of answers (not referenced by the loading code below).
YesNoDict = { "Yes": 1, "No": -1, "Unsure": 0, "": 0 }

# Load from the csv file.
# Note: the file only has "1"s, because blanks mean "No"
X = []
# newline='' is how the csv module documentation says files should be opened
# before they are handed to csv.reader.
with open('../data/SportsDataset_DataAttributes.csv', 'r', newline='') as csvfile:
    for row in csv.reader(csvfile):
        # Blank cells (including whitespace-only ones) become -1;
        # every other cell is parsed as an integer, not kept as a string.
        X.append([int(col) if col.strip() else -1 for col in row])

# This data file is listed in the same order as the sports
# The variable "y" contains the index of the sport.
# range() is lazy, so materialise it into a real list of ints.
y = list(range(len(sports)))
In [17]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
# Fit a single decision tree on the whole data set.
# random_state is pinned so that re-running the cell rebuilds the same tree;
# without it scikit-learn may break ties between equally good splits differently
# from run to run.
#clf = tree.DecisionTreeClassifier(max_depth=8,min_samples_leaf=2)
clf = tree.DecisionTreeClassifier(max_depth=13, min_samples_leaf=1, random_state=0)
clf.fit(X, y)
# Try changing the training data, so that we don't get 100% accuracy:
# (copy each row: X.copy() is shallow, so editing X2[15] would also edit X[15])
#X2 = [row[:] for row in X]
#X2[15][-1] = -1
#clf.fit(X2,y)
In [31]:
# -- Visualize the decision tree --
import graphviz
# Because out_file is given, export_graphviz writes the .dot file to disk and
# returns None, so dot_data is None here. Set out_file=None instead if the
# commented-out graphviz.Source(...) lines below are re-enabled.
dot_data = tree.export_graphviz(
    clf,
    out_file='sportsTree.dot',
    feature_names=questions,
    class_names=sports,
    impurity=False,
    filled=True,
    rounded=True,
    label='none',  # documented options are 'all', 'root' and 'none' (the string, not None)
    proportion=True,
)
# export to out_file = 'sportsTree.dot', then in vim, use `%s/\\n\[.*\]\\n/\\n/g` to remove labels
#graph = graphviz.Source( dot_data )
#graph.render('sportsTree')
#graph
# NOTE(review): nothing in this cell creates sportsTree.png; presumably it is
# rendered from sportsTree.dot outside the notebook -- confirm.
from IPython.display import Image
Image(url='sportsTree.png')
Out[31]:
In [13]:
# let's see how well we do
# You can also use clf.score(X,y)
def correctPercentage( predictions, actual ):
    """Return the fraction of predictions that match the true labels.

    Despite the name, the result is a fraction between 0.0 and 1.0,
    not a value between 0 and 100.

    Parameters:
        predictions: sequence of predicted labels.
        actual: indexable sequence of true labels, read at the same
            positions as `predictions`.

    Returns:
        Number of matching positions divided by len(predictions), or 0.0
        when `predictions` is empty (instead of dividing by zero).
    """
    total = len(predictions)
    if total == 0:
        # nothing was predicted, so nothing can be counted as correct
        return 0.0
    correct = sum(1 for i, guess in enumerate(predictions) if guess == actual[i])
    return correct / total
In [18]:
# Fit a small random forest on the same data and compare training accuracy
# against the single decision tree.
# random_state is pinned because a random forest is stochastic; without a seed
# the numbers below change on every run.
clf2 = RandomForestClassifier(max_depth=10, n_estimators=10, random_state=0)
clf2 = clf2.fit(X, y)
print(correctPercentage( clf.predict(X), y ))
print(correctPercentage( clf2.predict(X), y ))
clf.score(X,y)
Out[18]:
In [ ]:
# cross validate (hard to do, due to small amount of data)
from sklearn.model_selection import cross_val_score
clf3 = tree.DecisionTreeClassifier(random_state=0, max_depth=8)
# NOTE(review): y is built as range(len(sports)), i.e. one sample per class.
# The default cross-validation for classifiers is stratified and may reject or
# warn about such data -- confirm before relying on these scores.
# Keep the scores in a variable and print them: only the last expression of a
# cell is displayed, so a bare cross_val_score(...) call here would be lost.
cv_scores = cross_val_score(clf3, X, y)#, cv=2)
print('cross-validation scores:', cv_scores)
len(X)
Some information about the classifier is here: http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier. There is also a movie list here: https://docs.google.com/spreadsheets/d/1-849aPzi8Su_c5HwwDFERrogXjvSaZFfp_y9MHeO1IA/edit?usp=sharing
In [19]:
# Helpers for walking the fitted tree by hand: numpy for argmax, and
# scikit-learn's low-level _tree module for its TREE_LEAF / TREE_UNDEFINED markers.
import numpy as np
from sklearn.tree import _tree

# Low-level tree structure of the fitted classifier `clf`
tree_ = clf.tree_
#dir(_tree.Tree) # inspect what we have to work with
#dir(_tree)
In [20]:
def parseInput(str):
    """Turn a typed answer into its numeric code.

    Only the first non-blank character matters, and capitalization is ignored.

    Parameters:
        str: the raw text typed by the user. (The name shadows the built-in
            `str`; it is kept so existing keyword callers keep working.)

    Returns:
        1 if the answer starts with 'y', -1 if it starts with 'n', and 0 for
        anything else, including an empty or whitespace-only answer.
    """
    # Strip first so that " yes" counts as yes, and so that an empty answer
    # (just pressing Enter) falls through to 0 instead of raising IndexError.
    answer = str.strip().lower()
    if answer.startswith('y'):
        return 1
    if answer.startswith('n'):
        return -1
    return 0
def askQuestion(node=0):
    """Play the guessing game by walking the fitted tree `tree_` from `node`.

    At each internal node the matching entry of `questions` is printed and the
    user's answer (encoded by parseInput) is compared with the node's
    threshold: answers at or below it follow the left child, the rest follow
    the right child. At a leaf, the entry of `sports` with the largest stored
    value is printed as the guess.

    Parameters:
        node: index of the tree node to start from (default 0).

    Returns:
        None; the questions and the final guess are printed.
    """
    current = node
    while True:
        feature = tree_.feature[current]
        reached_leaf = feature == _tree.TREE_LEAF or feature == _tree.TREE_UNDEFINED
        if reached_leaf:
            # at a leaf node, so make the prediction:
            # pick the position with the largest value stored at this node
            per_sport = tree_.value[current][0]
            best = np.argmax(per_sport)
            print('GUESS: ', sports[best])
            return
        # internal node: ask its question, then step to the matching child
        print(questions[feature])
        reply = parseInput(input(" [Yes/no/unsure] "))
        if reply <= tree_.threshold[current]:
            current = tree_.children_left[current]
        else:
            current = tree_.children_right[current]
# Alternative to the tree walk: ask every question in the list
def fullSport():
    """Ask every entry of `questions` in order and collect the answers.

    Returns:
        A list with one parseInput code (1, -1 or 0) per question, in the
        same order as `questions`.
    """
    answers = []
    for prompt in questions:
        print(prompt)
        answers.append(parseInput(input(" [Yes/no/unsure] ")))
    return answers
In [25]:
# Play the game: start the interactive question walk at node 0 (the default start)
askQuestion(node=0)
In [57]:
# Alternatively, answer every question for one sport and compare what the
# random forest and the single decision tree predict from the full answer vector.
x = fullSport()
for caption, model in (
    ('PREDICTION (random forests): ', clf2),
    ('PREDICTION (decision tree ): ', clf),
):
    # predict() takes a batch, so wrap x in a list and take the first result
    print(caption, sports[ model.predict([x])[0] ] )