In [1]:
import os
import numpy as np
import pandas as pd
data_folder = os.path.join(os.getcwd(), "data")
data_filename = os.path.join(data_folder, "NBA_2014_games-february.csv")
In [2]:
results = pd.read_csv(data_filename)
results.loc[:5]
Out[2]:
In [3]:
# Don't read the first row, as it is blank, and parse the date column as a date
results = pd.read_csv(data_filename, parse_dates=["Date"], skiprows=[0,])
# Fix the name of the columns
results.columns = ["Date", "Score Type", "Visitor Team", "VisitorPts", "Home Team", "HomePts", "OT", "Notes"]
results.loc[:5]
In [4]:
results["HomeWin"] = results["VisitorPts"] < results["HomePts"]
# Our "class values"
y_true = results["HomeWin"].values
results.loc[:5]
Out[4]:
In [5]:
print("Home Win percentage: {0:.1f}%".format(100 * results["HomeWin"].sum() / results["HomeWin"].count()))
In [6]:
results["HomeLastWin"] = False
results["VisitorLastWin"] = False
# This creates two new columns, all set to False
results.loc[:5]
Out[6]:
In [7]:
# Now compute the actual values for these
# Did the home and visitor teams win their last game?
from collections import defaultdict
won_last = defaultdict(int)
for index, row in results.iterrows():  # Note that this is not efficient
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeLastWin"] = won_last[home_team]
    row["VisitorLastWin"] = won_last[visitor_team]
    results.loc[index] = row
    # Record the current result for each team's next game
    won_last[home_team] = row["HomeWin"]
    won_last[visitor_team] = not row["HomeWin"]
results.loc[20:25]
Out[7]:
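The iterrows() loop above is flagged as not efficient. For larger datasets, the same two features can be built without a Python-level loop. This is a sketch (not from the original notebook) that reshapes the games into one row per team appearance and uses groupby/shift:
In [ ]:
# One row per team appearance, in game order
home_long = pd.DataFrame({"Team": results["Home Team"], "Won": results["HomeWin"]})
home_long["Role"] = "home"
visitor_long = pd.DataFrame({"Team": results["Visitor Team"], "Won": ~results["HomeWin"]})
visitor_long["Role"] = "visitor"
long_form = pd.concat([home_long, visitor_long]).sort_index(kind="mergesort")
# For each team, look up the result of its previous game (False if none)
long_form["PrevWon"] = long_form.groupby("Team")["Won"].shift(1, fill_value=False)
results["HomeLastWin"] = long_form.loc[long_form["Role"] == "home", "PrevWon"].values
results["VisitorLastWin"] = long_form.loc[long_form["Role"] == "visitor", "PrevWon"].values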
In [8]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier(random_state=14)
In [9]:
from sklearn.model_selection import cross_val_score
# Create a dataset with just the necessary information
X_previouswins = results[["HomeLastWin", "VisitorLastWin"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_previouswins, y_true, scoring='accuracy')
print("Using just the last result from the home and visitor teams")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
In [10]:
# What about win streaks?
results["HomeWinStreak"] = 0
results["VisitorWinStreak"] = 0
# How long is each team's current winning streak?
from collections import defaultdict
win_streak = defaultdict(int)
for index, row in results.iterrows():  # Note that this is not efficient
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    row["HomeWinStreak"] = win_streak[home_team]
    row["VisitorWinStreak"] = win_streak[visitor_team]
    results.loc[index] = row
    # Update the streaks based on the current result
    if row["HomeWin"]:
        win_streak[home_team] += 1
        win_streak[visitor_team] = 0
    else:
        win_streak[home_team] = 0
        win_streak[visitor_team] += 1
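A quick sanity check that the streaks accumulate as expected, mirroring the earlier spot-check of rows 20 to 25:
In [ ]:
results.loc[20:25, ["Home Team", "HomeWinStreak", "Visitor Team", "VisitorWinStreak"]]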
In [11]:
clf = DecisionTreeClassifier(random_state=14)
X_winstreak = results[["HomeLastWin", "VisitorLastWin", "HomeWinStreak", "VisitorWinStreak"]].values
scores = cross_val_score(clf, X_winstreak, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
In [12]:
# Let's see which team is better on the ladder, using the previous year's standings
ladder_filename = os.path.join(data_folder, "leagues_NBA_2013_standings_expanded-standings.csv")
ladder = pd.read_csv(ladder_filename, skiprows=[0,1])
ladder
Out[12]:
In [13]:
# We can create a new feature -- HomeTeamRanksHigher
results["HomeTeamRanksHigher"] = 0
for index, row in results.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    # The Hornets became the Pelicans in 2013, so the previous year's
    # ladder lists them under the old name
    if home_team == "New Orleans Pelicans":
        home_team = "New Orleans Hornets"
    elif visitor_team == "New Orleans Pelicans":
        visitor_team = "New Orleans Hornets"
    home_rank = ladder[ladder["Team"] == home_team]["Rk"].values[0]
    visitor_rank = ladder[ladder["Team"] == visitor_team]["Rk"].values[0]
    row["HomeTeamRanksHigher"] = int(home_rank > visitor_rank)
    results.loc[index] = row
results[:5]
Out[13]:
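The same feature can be computed without the loop by mapping each team name to its rank in the ladder. A sketch with equivalent logic (not from the original notebook), handling the Pelicans/Hornets renaming up front:
In [ ]:
# Series mapping team name -> rank; assumes team names are unique in the ladder
team_ranks = ladder.set_index("Team")["Rk"]
renames = {"New Orleans Pelicans": "New Orleans Hornets"}
home_ranks = results["Home Team"].replace(renames).map(team_ranks)
visitor_ranks = results["Visitor Team"].replace(renames).map(team_ranks)
results["HomeTeamRanksHigher"] = (home_ranks > visitor_ranks).astype(int)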
In [14]:
X_homehigher = results[["HomeLastWin", "VisitorLastWin", "HomeTeamRanksHigher"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_homehigher, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
In [15]:
from sklearn.model_selection import GridSearchCV
parameter_space = {
    "max_depth": list(range(1, 21)),
}
clf = DecisionTreeClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_homehigher, y_true)
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))
In [16]:
# Who won the last match? We ignore home/visitor for this bit
last_match_winner = defaultdict(int)
results["HomeTeamWonLast"] = 0
for index, row in results.iterrows():
    home_team = row["Home Team"]
    visitor_team = row["Visitor Team"]
    teams = tuple(sorted([home_team, visitor_team]))  # Sort for a consistent ordering
    # Record in the row which team won the last encounter between this pair
    row["HomeTeamWonLast"] = 1 if last_match_winner[teams] == row["Home Team"] else 0
    results.loc[index] = row
    # Who won this one?
    winner = row["Home Team"] if row["HomeWin"] else row["Visitor Team"]
    last_match_winner[teams] = winner
results.loc[:5]
Out[16]:
In [17]:
X_home_higher = results[["HomeTeamRanksHigher", "HomeTeamWonLast"]].values
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_home_higher, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
In [18]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
encoding = LabelEncoder()
encoding.fit(results["Home Team"].values)
home_teams = encoding.transform(results["Home Team"].values)
visitor_teams = encoding.transform(results["Visitor Team"].values)
X_teams = np.vstack([home_teams, visitor_teams]).T
onehot = OneHotEncoder()
X_teams = onehot.fit_transform(X_teams).toarray()
clf = DecisionTreeClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
In [19]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_teams, y_true, scoring='accuracy')
print("Using full team labels is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
In [20]:
X_all = np.hstack([X_home_higher, X_teams])
print(X_all.shape)
In [21]:
clf = RandomForestClassifier(random_state=14)
scores = cross_val_score(clf, X_all, y_true, scoring='accuracy')
print("Using whether the home team is ranked higher")
print("Accuracy: {0:.1f}%".format(np.mean(scores) * 100))
In [22]:
# For reference, the main RandomForestClassifier parameters and their defaults:
# n_estimators=10, criterion='gini', max_depth=None,
# min_samples_split=2, min_samples_leaf=1, max_features='auto',
# max_leaf_nodes=None, bootstrap=True, oob_score=False,
# n_jobs=1, random_state=None, verbose=0
parameter_space = {
    "max_features": [2, 10, 'auto'],
    "n_estimators": [100,],
    "criterion": ["gini", "entropy"],
    "min_samples_leaf": [2, 4, 6],
}
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space)
grid.fit(X_all, y_true)
print("Accuracy: {0:.1f}%".format(grid.best_score_ * 100))
print(grid.best_estimator_)
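A quick look at where the tuned forest puts its weight. The first two columns of X_all are HomeTeamRanksHigher and HomeTeamWonLast; the rest are the one-hot team indicators. A sketch (GridSearchCV refits the best estimator on the full data by default):
In [ ]:
importances = grid.best_estimator_.feature_importances_
print(importances[:2])        # the two hand-crafted features
print(importances[2:].sum())  # combined weight of the team indicators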
In [ ]: