In [1]:
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
def removepunct(str_in, punct=(".", ",", "!", "'", '"', '\n', '-')):
    """Normalize a team name so it can be used as a dictionary key.

    Every substring listed in ``punct`` is removed and the result is
    lower-cased. Spaces are kept.

    Parameters
    ----------
    str_in : str, unicode or bytes
        The raw name. Byte strings are decoded as UTF-8 first.
    punct : iterable of str, optional
        Substrings to strip. Defaults to the set this notebook has always
        removed: ``. , ! ' " newline -``.

    Returns
    -------
    unicode on Python 2, str on Python 3.
    """
    # `unicode` only exists on Python 2; fall back to `str` on Python 3 so the
    # same notebook runs on both interpreters.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    # Decode byte strings up front so non-ASCII names do not fail on the
    # implicit ASCII decode that unicode(byte_string) performs on Python 2.
    if isinstance(str_in, bytes):
        text = str_in.decode('utf-8')
    else:
        text = str_in

    for mark in punct:
        text = text.replace(mark, '')
    return text_type(text.lower())
In [3]:
def parse_round(x):
    """Extract an integer round number from a round label.

    Parameters
    ----------
    x : str, int, float or other
        A label such as ``'9-10 BYE'`` or an already numeric round.

    Returns
    -------
    int or None
        For strings, the first run of digits as an int (``'9-10 BYE'`` -> 9),
        or None when the string has no digits. For real numbers (including
        numpy scalars), the value truncated to int, or None for NaN/inf.
        Booleans and every other type give None.
    """
    import numbers  # local import: only needed here

    # Python 2 has a separate `unicode` text type; accept it when it exists.
    try:
        string_types = (str, unicode)
    except NameError:
        string_types = (str,)

    if isinstance(x, string_types):
        match = re.search(r'[0-9]+', x)
        if match is None:
            # No digits in the label (e.g. 'BYE'): nothing to parse.
            return None
        return int(match.group(0))

    # bool is a subclass of int; exclude it so True/False stay unparsed.
    if isinstance(x, numbers.Real) and not isinstance(x, bool):
        try:
            return int(x)
        except (ValueError, OverflowError):
            # int(nan) raises ValueError, int(inf) raises OverflowError.
            return None

    return None
In [4]:
# Game results are read live from a Google Sheets CSV export, so the contents
# (and every number below) can change between runs of this notebook.
mens_mixed_url = 'https://docs.google.com/spreadsheets/d/15TNjIJDkopneZ6PWg4IGhTQNvTl-T5U8u_i9zgJPEXA/export?format=csv&id=15TNjIJDkopneZ6PWg4IGhTQNvTl-T5U8u_i9zgJPEXA&gid=0'
# Starting Elo table, read from a CSV on the master branch of a GitHub repo.
elos_workbook = 'https://raw.githubusercontent.com/robfox92/HockeyElo/master/Elos_2016b_week_1.csv'
# Both reads need network access; the raw frames are kept so later cells can
# derive cleaned copies from them.
mens_mixed_raw = pd.read_csv(mens_mixed_url)
elos_start_raw = pd.read_csv(elos_workbook)
In [5]:
# Only keep results that are validated.
# reset_index() returns a new, independent frame (not a view of
# mens_mixed_raw), so the column assignments in later cells do not trigger
# SettingWithCopyWarning. The previous row labels are kept in an 'index'
# column, exactly as the in-place reset did; a later cell drops that column.
results = mens_mixed_raw[mens_mixed_raw['Validate'] == 'Y'].reset_index()
In [6]:
# Turn each ROUND label into a number via parse_round and store it alongside
# the original label, then preview the frame.
results['Round_number'] = results['ROUND'].map(parse_round)
results.head()
Out[6]:
In [7]:
# Normalize both team-name columns with removepunct so they can be used as
# dictionary keys later on.
for _side in ('HOME', 'AWAY'):
    results[_side] = results[_side].apply(removepunct)
In [ ]:
In [8]:
def getKfactor(x, newteam='New Team', newteamK=75, oldteamK=50):
    """Return the Elo K factor for one value of the 'New Team' column.

    Parameters
    ----------
    x : object
        The cell value to classify.
    newteam : object, optional
        Value that marks a team as new (default ``'New Team'``).
    newteamK : number, optional
        K factor returned when ``x == newteam`` (default 75).
    oldteamK : number, optional
        K factor returned for every other value, including NaN (default 50).

    Returns
    -------
    ``newteamK`` or ``oldteamK``.
    """
    if x == newteam:
        return newteamK
    return oldteamK
# Create a dict of team starting elos.
# Work on a copy: a plain assignment would only alias elos_start_raw, and the
# derived columns below would then be written into the raw frame.
team_elos = elos_start_raw.copy()
# Normalized names are the lookup keys shared with results['HOME'/'AWAY'].
team_elos['2016b Teams lower'] = team_elos['2016b Teams'].apply(removepunct)
elos_dict = dict(zip(team_elos['2016b Teams lower'], team_elos['Starting Elo']))
# One K factor per team, chosen from the 'New Team' column by getKfactor.
team_elos['K Factor'] = team_elos['New Team'].apply(getKfactor)
team_K_factors = dict(zip(team_elos['2016b Teams lower'], team_elos['K Factor']))
In [9]:
# Create the two rating columns up front, filled with None; the Elo loop
# writes the real values row by row.
for _elo_col in ('Home Elo', 'Away Elo'):
    results.loc[:, _elo_col] = None
In [10]:
# Preview the first rows, including the newly added 'Home Elo' / 'Away Elo' columns.
results.head()
Out[10]:
In [39]:
# Rebuild the ratings from the starting Elos so that re-running this cell
# starts from scratch instead of compounding a previous run's updates.
elos_dict = dict(zip(team_elos['2016b Teams lower'], team_elos['Starting Elo']))

# Walk the games in row order; each game is rated with the Elos the two teams
# hold *before* it, then both ratings are updated for later rows.
for row in range(0, len(results)):
    # Get the home and away teams
    hometeam = results['HOME'][row]
    awayteam = results['AWAY'][row]

    # Get the elos from the elo dictionary.
    # float() matters: with integer starting Elos, Python 2's `/` would floor
    # the exponent below and flatten every expectancy towards 0.5.
    awayElo = float(elos_dict[awayteam])
    homeElo = float(elos_dict[hometeam])

    # Write the pre-game ratings to the df
    results.loc[row, 'Home Elo'] = homeElo
    results.loc[row, 'Away Elo'] = awayElo

    # Get the team K factors
    homeK = team_K_factors[hometeam]
    awayK = team_K_factors[awayteam]

    # Calculate the Score Expectancies (logistic curve on a 400-point scale)
    eloDiff = (homeElo - awayElo) / 400.0
    homeSE = 1 / (1 + 10 ** -eloDiff)
    awaySE = 1 / (1 + 10 ** eloDiff)

    # Write to DF
    results.loc[row, 'Home Predicted Result'] = homeSE
    results.loc[row, 'Away Predicted Result'] = awaySE

    # Get home, away and total scores (as floats, so the shares below are
    # true ratios rather than floored integer divisions on Python 2)
    homescore = float(results['SCORE'][row])
    awayscore = float(results['SCORE.1'][row])
    totalscore = homescore + awayscore

    # Calculate home and away score percentages
    if totalscore == 0:
        # A 0-0 game has no goals to share out; treat it as an even split
        # instead of dividing by zero.
        homeScorePerc = 0.5
        awayScorePerc = 0.5
    else:
        homeScorePerc = homescore / totalscore
        awayScorePerc = awayscore / totalscore

    # Write to DF
    results.loc[row, 'Home Actual Result'] = homeScorePerc
    results.loc[row, 'Away Actual Result'] = awayScorePerc

    # Find Elo Changes
    homeNewElo = homeElo + homeK * (homeScorePerc - homeSE)
    awayNewElo = awayElo + awayK * (awayScorePerc - awaySE)

    # Check to ensure winning teams don't lose Elo: a narrow win over a much
    # weaker side can fall short of the expectancy and yield a negative change.
    if homescore > awayscore:
        homeNewElo = max(homeNewElo, homeElo)
    if awayscore > homescore:
        awayNewElo = max(awayNewElo, awayElo)

    newElos = {hometeam: homeNewElo, awayteam: awayNewElo}
    elos_dict.update(newElos)
In [12]:
# Distinct team names seen on each side of a fixture, and their union.
hometeams, awayteams = (set(results[_side].unique()) for _side in ('HOME', 'AWAY'))
allteams = hometeams.union(awayteams)
In [13]:
# NOTE(review): allteams is built as hometeams | awayteams, so both checks are
# true by construction. If the intent was to verify that every team has a
# starting Elo, compare against the keys of elos_dict instead -- confirm.
# Single-argument print(...) with %-formatting produces the same output on
# Python 2 and Python 3.
print("all hometeams in teams? %s" % hometeams.issubset(allteams))
print("all awayteams in teams? %s" % awayteams.issubset(allteams))
In [14]:
def getscoreprogression(team):
    """Return a team's rating history as ``(round, elo)`` pairs.

    Reads the notebook-level ``results`` frame and ``allteams`` set.

    Parameters
    ----------
    team : str
        Normalized team name, in the form produced by ``removepunct``.

    Returns
    -------
    list of tuple or None
        One ``(Round_number, Elo)`` pair per game the team played, ordered by
        round. The Elo is taken from 'Home Elo' or 'Away Elo' depending on the
        side the team played on, i.e. the rating held going into that game.
        None when the team is unknown or has no games.
    """
    # `unicode` only exists on Python 2; use `str` on Python 3.
    try:
        text_type = unicode
    except NameError:
        text_type = str
    string_types = (str, text_type)

    team = text_type(team)
    if team not in allteams:
        return None

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    progress = pd.concat([results[results['HOME'] == team],
                          results[results['AWAY'] == team]])
    # A stable sort keeps tied rounds in a deterministic order; drop=True
    # avoids inserting the old index as a column that is never read here.
    progress = progress.sort_values('Round_number', kind='mergesort')
    progress = progress.reset_index(drop=True)
    if len(progress) == 0:
        return None

    out = []
    for home, away, roundno, home_elo, away_elo in zip(
            progress['HOME'], progress['AWAY'], progress['Round_number'],
            progress['Home Elo'], progress['Away Elo']):
        if isinstance(roundno, string_types):
            roundno = parse_round(roundno)
        # Both branches record the parsed round; previously the home branch
        # appended the raw 'Round_number' value instead.
        if home == team:
            out.append((roundno, home_elo))
        if away == team:
            out.append((roundno, away_elo))
    return out
In [15]:
# Spot-check the rating history for one team (name already in normalized form).
getscoreprogression('harambaeyswater')
Out[15]:
In [16]:
def getresults(team):
    """Return every game a team played, home games and away games combined.

    Reads the notebook-level ``results`` frame and ``allteams`` set.

    Parameters
    ----------
    team : str
        Normalized team name, in the form produced by ``removepunct``.

    Returns
    -------
    pandas.DataFrame or None
        The matching rows of ``results`` sorted by 'Round_number', with a
        fresh 0..n-1 index and the previous row labels kept as an extra
        column. None when the team is not in ``allteams``.
    """
    # `unicode` only exists on Python 2; use `str` on Python 3.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    team = text_type(team)
    if team not in allteams:
        return None

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # Each step returns a new frame, so nothing is mutated in place on a slice.
    progress = pd.concat([results[results['HOME'] == team],
                          results[results['AWAY'] == team]])
    progress = progress.sort_values('Round_number')
    return progress.reset_index()
In [17]:
# Smoke-test getresults for one team; the trailing ';' suppresses the output.
getresults('harambaeyswater');
In [18]:
# Preview the first rows of the results frame.
results.head()
Out[18]:
In [19]:
# Spot-check parse_round on a label that contains two numbers; the regex
# search takes the first run of digits.
parse_round('9-10 BYE')
Out[19]:
In [20]:
# Hand-picked subset of teams. It is immediately replaced by the full list on
# the next statement; comment that statement out to plot only the subset.
teams = ['harambaeyswater','get bentley','city beach dont kill my vibe',
         'south perth major blazers','fremantle metropolis double blacks',
         'booragoonbagz','mount hawtnhorny']
teams = list(allteams)

plt.figure(figsize=(15,10))
axes = plt.gca()
#axes.set_xlim([0,7])
#axes.set_ylim([500,700])
plt.ylabel("Elo Rating")
plt.xlabel("Round Number")
plt.title(" Elo vs Round for various SRHL Teams")

any_plotted = False
for team in teams:
    # Compute the history once per team (it was previously computed twice).
    s = getscoreprogression(team)
    if s is None:
        continue
    x, y = zip(*s)
    # Only plot teams whose latest recorded round is before round 8.
    if max(x) < 8:
        plt.scatter(x, y)
        # Label the line itself so the legend matches the plotted teams even
        # when some teams are skipped by the checks above.
        plt.plot(x, y, label=team)
        any_plotted = True

# A legend is only readable for a short team list.
if any_plotted and len(teams) < 10:
    plt.legend(loc='lower center')
In [21]:
# Preview the first rows of the results frame.
results.head()
Out[21]:
In [46]:
# Distribution of the home team's share of the total score in 2016b.
homeresults = results['Home Actual Result']
# Ten equal-width bins covering the full 0..1 range of a score share.
b = [0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1]
#plt.figure(figsize=(15,10))
axes = plt.gca()
#axes.set_xlim([0,7])
#axes.set_ylim([500,700])
plt.ylabel("Frequency")
plt.xlabel("Home Score Outcome")
#plt.title("Histogram of Home Score Outcomes up to SRHL Round 7")
plt.hist(homeresults, bins = b)
# Single-argument print(...) with %-formatting produces the same text on
# Python 2 and Python 3.
print("2016b Score Histogram")
print("Mean Home Score Outcome: %s" % np.mean(homeresults))
print("Standard Deviation of score outcomes: %s" % np.std(homeresults))
print("%s game results" % len(homeresults))
In [23]:
# 2016a results come from a local CSV expected in the notebook's working
# directory.
results_2016a_file = "2016a SRHL Ladders - Mens-Mixed Game Results.csv"
results_2016a_raw = pd.read_csv(results_2016a_file)
# Keep validated games only and renumber the rows 0..n-1, discarding the old
# index. reset_index(drop=True) returns a new frame, so nothing is mutated in
# place on a filtered slice (the source of SettingWithCopyWarning).
results_2016a = results_2016a_raw[results_2016a_raw['Validate']=="Y"].reset_index(drop=True)
In [24]:
# Home team's share of the total score for every 2016a game, computed for the
# whole column at once. Casting to float keeps the division a true ratio on
# Python 2, where integer `/` floors.
home_scores_2016a = results_2016a['SCORE'].astype(float)
total_scores_2016a = home_scores_2016a + results_2016a['SCORE.1'].astype(float)
home_outcomes_2016a = home_scores_2016a / total_scores_2016a
# A 0-0 game would otherwise be 0/0 (NaN); count it as an even split.
home_outcomes_2016a[total_scores_2016a == 0] = 0.5
results_2016a['Home Actual Result'] = home_outcomes_2016a
In [45]:
# Distribution of the home team's share of the total score in 2016a.
homeresults = results_2016a['Home Actual Result']
# Ten equal-width bins covering the full 0..1 range of a score share.
b = [0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1]
#plt.figure(figsize=(15,10))
axes = plt.gca()
#axes.set_xlim([0,7])
#axes.set_ylim([500,700])
plt.ylabel("Frequency")
plt.xlabel("Home Score Outcome")
#plt.title("Histogram of Home Score Outcomes up to SRHL Round 7")
# Pass the bin edges defined above (they were defined but never used), so this
# histogram is binned the same way as the 2016b one.
plt.hist(homeresults, bins = b)
# Single-argument print(...) with %-formatting produces the same text on
# Python 2 and Python 3.
print("2016a Score Histogram")
print("Mean Home Score Outcome: %s" % np.mean(homeresults))
print("Standard Deviation of score outcomes: %s" % np.std(homeresults))
print("%s game results" % len(homeresults))
In [26]:
# Evaluate the set of all team names; the trailing ';' suppresses the output.
allteams;
In [ ]:
In [47]:
# Show the last (round, Elo) entry recorded for one team.
team = 'yokine drugs n crime'
team_progression = getscoreprogression(team)
if team_progression is None:
    # getscoreprogression returns None for a team with no games; report that
    # instead of failing on None[-1].
    print("No games found for %s" % team)
else:
    print(team_progression[-1])
In [ ]:
In [48]:
# Histogram of the ratings currently held in elos_dict, one row per team.
team_elos_df = pd.DataFrame.from_dict(elos_dict,orient='index')
# from_dict gives the single value column an automatic name; look it up
# rather than hard-coding it.
colname = team_elos_df.columns.values[0]
plt.hist(team_elos_df[colname])
# Single-argument print(...) with %-formatting produces the same text on
# Python 2 and Python 3.
print("Mean Elo: %s" % np.mean(team_elos_df[colname]))
print("Elo Rating Histogram as at Round 7 2016b")
print("For SRHL")
In [ ]:
In [33]:
# Remove the columns that are not needed for the error analysis.
# errors='ignore' makes the cell safe to re-run: a second run would otherwise
# raise KeyError because the columns are already gone.
results.drop(['index','OT','H_Win','A_Win'],axis=1,inplace=True,errors='ignore')
In [34]:
# Preview the frame after the column drop.
results.head()
Out[34]:
In [40]:
# Squared difference between the predicted and the actual home result for
# every game, computed for the whole column at once. The column is float,
# not an object column filled row by row.
results['Home Error'] = (
    results['Home Predicted Result'].astype(float)
    - results['Home Actual Result'].astype(float)
) ** 2
In [42]:
# Root-mean-square error of the home predictions: the square root of the mean
# of the squared errors in 'Home Error'.
RMSE = (np.mean(results['Home Error']))**0.5
# NOTE(review): the message says "Reducing Elo for wins", but the visible Elo
# loop clamps winners so they never lose rating -- confirm which variant this
# number was produced with.
print("Reducing Elo for wins gives an RMS Error of:")
print(RMSE)
In [ ]: