In [1]:
from lxml import html
import requests
from bs4 import BeautifulSoup
import urllib
import urllib2
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from scipy.interpolate import interp1d
import emcee
import seaborn as sns
import matplotlib.pyplot as plt
In [2]:
r = urllib.urlopen('http://www.buda.org/leagues/past-leagues')
soup = BeautifulSoup(r, 'html.parser')
In [3]:
iframe = soup.find_all('iframe')[0]
response = urllib2.urlopen(iframe.attrs['src'])
iframe_soup = BeautifulSoup(response, 'html.parser')
In [4]:
leaguelinks = [i.a['href'] for i in iframe_soup.find_all("td", class_="infobody")]
In [7]:
i.get_text()  # scratch cell: inspect the last infobody cell left over from the comprehension above
Out[7]:
In [85]:
# define the dictionary that will contain all player ratings
all_players = {}
# loop over all leagues in the BUDA database
for link in leaguelinks:
    # extract the league id for this league
    leagueid = link[link.index('league=') + 7:]
    # scrape the scores for this league
    leaguescoreurl = 'http://www.buda.org/hatleagues/scores.php?section=showLeagueSchedule&league=' + leagueid + '&byDivision=1&showGames=1'
    response = urllib2.urlopen(leaguescoreurl)
    leaguescore_soup = BeautifulSoup(response, 'html.parser')
    # assemble the data of team ratings for this league
    data = []
    data_opponent = []
    try:
        table = leaguescore_soup.find_all('table', attrs={'class': 'info'})[1]
    except IndexError:
        print("Unable to find a database of scores for league {}".format(leagueid))
        continue
    rows = table.find_all('tr')
    for row in rows:
        # team standings rows live in th cells; game rows live in td cells
        cols = row.find_all('th')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])  # get rid of empty values
        tdcols = row.find_all('td')
        tdcols = [ele.text.strip() for ele in tdcols]
        data_opponent.append([ele for ele in tdcols if ele])  # get rid of empty values
    # convert to dataframe and label the columns from the header row
    dfdata = pd.DataFrame(data)
    dfdata.columns = dfdata.ix[0, :]  # e.g. ['Team', 'Record', 'Plus/Minus', 'Tourney Qualifying games']
    dfdata = dfdata.drop(0).reset_index()
    # fill NaNs with -99 to flag the division divider rows
    dfdata = dfdata.fillna(-99)
    # get the list of divisions in this league
    divnames = dfdata.ix[dfdata['Record'] == -99, 'Team'].values
    if len(divnames) == 0:
        print("No divisions found, skipping league {}".format(leagueid))
        continue
    # define base ratings by division (arbitrarily assigned based on my experience)
    divratings = {'4/3 Div 1': 1800, '4/3 Div 2': 1400, '4/3 Div 3': 1000, '4/3 Div 4': 900,
                  '5/2 Div 1': 1700, '5/2 Div 2': 1300, '5/2 Div 3': 900, '5/2 Div 4': 800,
                  'Open Div 1': 1400, 'Open Div 2': 1200}
    dfdata['div'] = np.zeros(len(dfdata))
    for i in range(len(divnames) - 1):
        try:
            divstart = np.where(dfdata['Team'] == divnames[i])[0][0]
        except IndexError:
            print("{} not found, skipping league {}".format(divnames[i], leagueid))
            continue
        try:
            divend = np.where(dfdata['Team'] == divnames[i + 1])[0][0]
        except IndexError:
            print("{} not found, skipping league {}".format(divnames[i + 1], leagueid))
            continue
        try:
            dfdata.ix[divstart + 1:divend, 'div'] = divratings[divnames[i]]
        except KeyError:
            print("No base rating for {}, skipping league {}".format(divnames[i], leagueid))
            continue
    # the last division runs from its divider row to the end of the table
    try:
        dfdata.ix[divend + 1:, 'div'] = divratings[divnames[-1]]
    except KeyError:
        print("No base rating for {}, skipping league {}".format(divnames[-1], leagueid))
        continue
    # remove the division divider rows from the dataframe
    for i in range(len(divnames)):
        dfdata = dfdata.drop(dfdata.index[dfdata['Team'] == divnames[i]])
    # generate the average goal differential column
    dfdata['wins'] = dfdata['Record'].apply(lambda x: int(x.split('-')[0]))
    dfdata['losses'] = dfdata['Record'].apply(lambda x: int(x.split('-')[1]))
    dfdata['games'] = dfdata['wins'] + dfdata['losses']
    dfdata['avgplusminus'] = dfdata['Plus/Minus'].astype('float') / dfdata['games']
    # assert that an average goal differential per game of +5 gives +300 rating points
    dfdata['rating'] = dfdata['div'] + 60. * dfdata['avgplusminus']
    # build the dictionary of game scores from the game (td) rows
    dfdata_opponents = pd.DataFrame(data_opponent).dropna().reset_index(drop=True)
    dfdata_opponents['teamscore'] = dfdata_opponents.ix[:, 1].apply(lambda x: int(x.split('-')[0]))
    dfdata_opponents['opponentscore'] = dfdata_opponents.ix[:, 1].apply(lambda x: int(x.split('-')[1]))
    opponentcounter = 0
    game_scores = {}
    for idf in dfdata.index:
        teamname = dfdata.ix[idf, 'Team']
        ngames = dfdata.ix[idf, 'games']
        for igame in range(ngames):
            opponentname = dfdata_opponents.ix[opponentcounter, 0]
            teamscore = dfdata_opponents.ix[opponentcounter, 'teamscore']
            opponentscore = dfdata_opponents.ix[opponentcounter, 'opponentscore']
            adversary_key = (teamname, opponentname)
            game_scores[adversary_key] = [teamscore, opponentscore]
            opponentcounter += 1  # advance to the next game row
    # scrape the list of teams for this league
    teamsurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeams&league=' + leagueid
    response = urllib2.urlopen(teamsurl)
    teams_soup = BeautifulSoup(response, 'html.parser')
    # generate list of team ids and names for this league
    tdlist = teams_soup.find_all('td', class_='infobody')
    teamids = []
    teamnames = []
    for td in tdlist:
        try:
            url = td.a['href']
            idindex = url.index('team=')
            whichindex = url.index('which=')
            teamids.append(url[idindex + 5:whichindex - 1])
            teamnames.append(td.a.get_text())
        except (TypeError, AttributeError, ValueError):
            # skip cells that don't contain a team link
            continue
    # find all players associated with each team and
    # link the team rating to each player on that team
    for teamid, teamname in zip(teamids, teamnames):
        try:
            teamrating = dfdata.ix[dfdata['Team'] == teamname.strip(' '), 'rating'].values[0]
        except IndexError:
            print("Couldn't match {} to scores database, skipping this team.".format(teamname))
            continue
        teamurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeamRoster&team=' + teamid
        response = urllib2.urlopen(teamurl)
        roster_soup = BeautifulSoup(response, 'html.parser')
        players = [td.get_text() for td in roster_soup.find_all("td", class_="infobody")]
        for player in players:
            if player in all_players:
                all_players[player].append(teamrating)
            else:
                all_players[player] = [teamrating]
    print("Finished successfully with league {}".format(leagueid))
In [87]:
all_players.pop('')  # drop the empty player name picked up from blank roster cells
Out[87]:
In [90]:
import pickle
with open("all_players.p", "wb") as f:
    pickle.dump(all_players, f)
In [105]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
In [93]:
%matplotlib inline
# plot the rating trajectory of every player with more than 5 recorded team ratings
for player in all_players.keys():
    if len(all_players[player]) > 5:
        plt.plot(all_players[player], color='gray', lw=0.5, alpha=0.1)
In [124]:
pmean = []
players_means = {}
for player in all_players.keys():
    pratings = np.array(all_players[player])
    # clip negative ratings to zero
    toolow = np.where(pratings < 0)
    if toolow[0].size > 0:
        pratings[toolow[0]] = 0
    pmean.append(pratings.mean())
    if pratings.mean() < 0:  # sanity check; should never trigger after clipping
        print(pratings)
    players_means[player] = pratings.mean()
In [122]:
pdf = pd.DataFrame(pmean)
In [123]:
sns.distplot(pdf.dropna())
Out[123]:
In [130]:
# extract the league id for this league
springhat2016id = '40258'
leagueid = springhat2016id
# scrape the list of teams for this league
teamsurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeams&league=' + leagueid
response = urllib2.urlopen(teamsurl)
teams_soup = BeautifulSoup(response, 'html.parser')
# generate list of team ids and names for this league
tdlist = teams_soup.find_all('td', class_='infobody')
teamids = []
teamnames = []
for td in tdlist:
    try:
        url = td.a['href']
        idindex = url.index('team=')
        whichindex = url.index('which=')
        teamids.append(url[idindex + 5:whichindex - 1])
        teamnames.append(td.a.get_text())
    except (TypeError, AttributeError, ValueError):
        # skip cells that don't contain a team link
        continue
# find all players associated with each team
teamratings = {}
for teamid, teamname in zip(teamids, teamnames):
    teamurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeamRoster&team=' + teamid
    response = urllib2.urlopen(teamurl)
    roster_soup = BeautifulSoup(response, 'html.parser')
    playerratings = []
    players = [td.get_text() for td in roster_soup.find_all("td", class_="infobody")]
    for player in players:
        if player in all_players:
            playerratings.append(players_means[player])
        else:
            # if someone hasn't played club league, they probably aren't very good
            playerratings.append(800)
    # the team rating is the average of the player ratings for that team
    teamratings[teamname] = np.mean(playerratings)
print("Finished successfully with league {}".format(leagueid))
In [156]:
sns.distplot(pd.DataFrame(teamratings.values()).dropna(), kde=False, bins=10)
plt.axvline(teamratings['Team 20 (20)'], label='Team 20')
plt.legend(loc='best')
plt.ylabel('Number of Teams')
plt.xlabel('Team Rating')
plt.savefig('Team20Rating.png')
In [161]:
teamratings['Team 27 (27)'] = 1000  # manual override (presumably this team's scraped rating was missing or off)
In [162]:
keylist = []
valuelist = []
for key in teamratings.keys():
    keylist.append(key)
    valuelist.append(teamratings[key])
In [163]:
shl = pd.DataFrame({'team':keylist, 'rating':valuelist})
In [164]:
shl = shl.sort('rating', ascending=False)
In [166]:
shl.team
Out[166]:
In [180]:
5/25.  # point ratio of e.g. a 15-10 game: 0.2, which maps to a 400-point rating gap below
Out[180]:
In [179]:
2/28.  # point ratio of e.g. a 15-13 game: ~0.07, which maps to a 200-point rating gap below
Out[179]:
In [ ]:
def rating_to_point(rating1, rating2):
    # tune k so that a rating differential of ... corresponds to a point ratio of ...
    # 800 ... 0.5
    # 400 ... 0.15
    # 200 ... 0.07
    # 100 ... 0.035
    # k was never tuned; this logistic stub was abandoned in favor of the
    # interpolation in point_to_rating below. Note it also maps to (0, 1),
    # not the signed (-1, 1) ratio used elsewhere.
    k = 1e-3  # placeholder steepness
    x = rating1 - rating2
    point_ratio1 = 1 / (1 + np.exp(-k * x))
    return point_ratio1
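Rather than tuning a logistic, the forward map could reuse the same calibration table that point_to_rating (defined further down) interpolates, just with the axes swapped. A minimal sketch under that assumption; rating_to_point_interp is a hypothetical name, not part of the original notebook:
In [ ]:
from scipy.interpolate import interp1d

def rating_to_point_interp(delta_rating):
    # inverse of point_to_rating: interpolate rating differential -> point ratio
    base_rating = [-1200, -800, -400, -200, -100, 0, 100, 200, 400, 800, 1200]
    base_point_ratio = [-1, -0.5, -0.2, -0.07, -0.03, 0.0, 0.03, 0.07, 0.2, 0.5, 1]
    return interp1d(base_rating, base_point_ratio)(delta_rating)

print(rating_to_point_interp(400))  # 0.2, e.g. a 15-10 game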
In [33]:
# define the dictionary that will contain all player ratings
all_players = {}
# extract the league id for this league (hard-coded to a single league for testing)
leagueid = '39641'  # link[link.index('league=') + 7:]
# scrape the scores for this league
leaguescoreurl = 'http://www.buda.org/hatleagues/scores.php?section=showLeagueSchedule&league=' + leagueid + '&byDivision=1&showGames=1'
response = urllib2.urlopen(leaguescoreurl)
leaguescore_soup = BeautifulSoup(response, 'html.parser')
# assemble the data of team ratings for this league
data = []
try:
    table = leaguescore_soup.find_all('table', attrs={'class': 'info'})[1]
except IndexError:
    print("Unable to find a database of scores for league {}".format(leagueid))
rows = table.find_all('tr')
for row in rows:
    # cols = row.find_all('th')
    # cols = [ele.text.strip() for ele in cols]
    # if len(cols) == 0:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append([ele for ele in cols if ele])  # get rid of empty values
# convert to dataframe
dfdata = pd.DataFrame(data)
In [35]:
dfdata.dropna().ix[:, 0]
Out[35]:
Data format should be: a dictionary with key ('Team1', 'Team2') and value [score1, score2]. Then, to generate the lnprob, we loop over all keys in the dictionary, building a list of rating differentials (deltas).
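A minimal sketch of that structure, with hypothetical team names (point_to_rating is defined in the next cell):
In [ ]:
# keys are (team, opponent) tuples; values are [team_score, opponent_score]
game_scores = {('Team 1 (1)', 'Team 2 (2)'): [15, 10],
               ('Team 2 (2)', 'Team 3 (3)'): [13, 15]}
# the lnprob loop turns each game into an observed rating differential
observed_delta = [point_to_rating(s1, s2) for s1, s2 in game_scores.values()]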
In [1]:
from scipy.interpolate import interp1d

def point_to_rating(point1, point2):
    # piecewise-linear map from the signed point ratio to a rating differential
    base_rating = [-1200, -800, -400, -200, -100, 0, 100, 200, 400, 800, 1200]
    base_point_ratio = [-1, -0.5, -0.2, -0.07, -0.03, 0.0, 0.03, 0.07, 0.2, 0.5, 1]
    interpfunc = interp1d(base_point_ratio, base_rating)
    # float() guards against Python 2 integer division on integer scores
    point_ratio = float(point1 - point2) / (point1 + point2)
    delta_rating = interpfunc(point_ratio)
    return delta_rating
# plt.plot(outputs, indices, '-o')
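As a quick sanity check, a 15-10 game has point ratio 5/25 = 0.2, which the table maps to a 400-point rating differential (and the mirrored score to -400):
In [ ]:
print(point_to_rating(15, 10))  # 400.0
print(point_to_rating(10, 15))  # -400.0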
In [2]:
import numpy as np
from sklearn.metrics import mean_absolute_error

def lnprob(param):
    # bounds: walkers outside [0, 10000] get log-probability -inf
    if (param < 0).any():
        return -np.inf
    if (param > 10000).any():
        return -np.inf
    # populate the team ratings according to the current model
    model_ratings = {}
    for iteam, teamname in enumerate(teamnames):
        model_ratings[teamname] = param[iteam]
    # compute the rating delta for both model and data
    model_delta = []
    observed_delta = []
    for gamekey in game_scores.keys():
        key0 = gamekey[0]
        key1 = gamekey[1]
        model_delta.append(model_ratings[key0] - model_ratings[key1])
        observed_delta.append(point_to_rating(game_scores[gamekey][0], game_scores[gamekey][1]))
    # lnprob is the negative mean absolute error between model and observed deltas
    probln = -mean_absolute_error(observed_delta, model_delta)
    return probln
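lnprob reads teamnames, game_scores, and point_to_rating from the enclosing scope, so a toy check needs those set up first. A sketch with two hypothetical teams:
In [ ]:
teamnames = ['Team A', 'Team B']
game_scores = {('Team A', 'Team B'): [15, 10]}  # observed gap: 400 rating points
# a model gap of 500 points gives a mean absolute error of 100, so lnprob = -100
print(lnprob(np.array([1500., 1000.])))         # -100.0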
In [ ]:
# define the dictionary that will contain all player ratings
all_players = {}
# loop over the first league only while testing the MCMC machinery
for link in leaguelinks[:1]:
    # extract the league id for this league (hard-coded for now)
    leagueid = '39641'  # link[link.index('league=') + 7:]
    # scrape the scores for this league
    leaguescoreurl = 'http://www.buda.org/hatleagues/scores.php?section=showLeagueSchedule&league=' + leagueid + '&byDivision=1&showGames=1'
    response = urllib2.urlopen(leaguescoreurl)
    leaguescore_soup = BeautifulSoup(response, 'html.parser')
    # assemble the data of team ratings for this league
    data = []
    data_opponent = []
    try:
        table = leaguescore_soup.find_all('table', attrs={'class': 'info'})[1]
    except IndexError:
        print("Unable to find a database of scores for league {}".format(leagueid))
        continue
    rows = table.find_all('tr')
    for row in rows:
        cols = row.find_all('th')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])  # get rid of empty values
        tdcols = row.find_all('td')
        tdcols = [ele.text.strip() for ele in tdcols]
        data_opponent.append([ele for ele in tdcols if ele])  # get rid of empty values
    # convert to dataframe and drop irrelevant columns
    dfdata = pd.DataFrame(data)
    dfdata.columns = dfdata.ix[0, :]  # ['Team', 'Record', 'Plus/Minus', 'Tourney Qualifying games*']
    dfdata = dfdata.dropna(how='all')
    dfdata = dfdata.drop(0).reset_index()
    dfdata = dfdata.drop(['index', 'Tourney Qualifying games*'], axis=1)
    # fill NaNs with -99 to flag the division divider rows
    dfdata = dfdata.fillna(-99)
    # get the list of divisions in this league
    divnames = dfdata.ix[dfdata['Record'] == -99, 'Team'].values
    if len(divnames) == 0:
        print("No divisions found, skipping league {}".format(leagueid))
        continue
    # define base ratings by division (arbitrarily assigned based on my experience)
    divratings = {'4/3 Div 1': 1800, '4/3 Div 2': 1400, '4/3 Div 3': 1000, '4/3 Div 4': 900,
                  '5/2 Div 1': 1700, '5/2 Div 2': 1300, '5/2 Div 3': 900, '5/2 Div 4': 800,
                  'Open Div 1': 1400, 'Open Div 2': 1200}
    dfdata['div'] = np.zeros(len(dfdata))
    for i in range(len(divnames) - 1):
        try:
            divstart = np.where(dfdata['Team'] == divnames[i])[0][0]
        except IndexError:
            print("{} not found, skipping league {}".format(divnames[i], leagueid))
            continue
        try:
            divend = np.where(dfdata['Team'] == divnames[i + 1])[0][0]
        except IndexError:
            print("{} not found, skipping league {}".format(divnames[i + 1], leagueid))
            continue
        try:
            dfdata.ix[divstart + 1:divend, 'div'] = divratings[divnames[i]]
        except KeyError:
            print("No base rating for {}, skipping league {}".format(divnames[i], leagueid))
            import pdb; pdb.set_trace()
            continue
    # the last division runs from its divider row to the end of the table
    try:
        dfdata.ix[divend + 1:, 'div'] = divratings[divnames[-1]]
    except KeyError:
        print("No base rating for {}, skipping league {}".format(divnames[-1], leagueid))
        import pdb; pdb.set_trace()
        continue
    # remove the division divider rows from the dataframe
    for i in range(len(divnames)):
        dfdata = dfdata.drop(dfdata.index[dfdata['Team'] == divnames[i]])
    # generate the average goal differential column
    dfdata['wins'] = dfdata['Record'].apply(lambda x: int(x.split('-')[0]))
    dfdata['losses'] = dfdata['Record'].apply(lambda x: int(x.split('-')[1]))
    dfdata['games'] = dfdata['wins'] + dfdata['losses']
    dfdata['avgplusminus'] = dfdata['Plus/Minus'].astype('float') / dfdata['games']
    # assert that an average goal differential per game of +5 gives +300 rating points
    dfdata['rating'] = dfdata['div'] + 60. * dfdata['avgplusminus']
    # build the dictionary of game scores
    dfdata_opponents = pd.DataFrame(data_opponent).dropna().reset_index().drop('index', axis=1)
    dfdata_opponents.columns = ['Opponent', 'Record']
    dfdata_opponents['teamscore'] = dfdata_opponents.ix[:, 'Record'].apply(lambda x: int(x.split('-')[0]))
    dfdata_opponents['opponentscore'] = dfdata_opponents.ix[:, 'Record'].apply(lambda x: int(x.split('-')[1]))
    opponentcounter = 0
    game_scores = {}
    for idf in dfdata.index:
        teamname = dfdata.ix[idf, 'Team']
        ngames = dfdata.ix[idf, 'games']
        for igame in range(ngames):
            opponentname = dfdata_opponents.ix[opponentcounter, 'Opponent']
            teamscore = dfdata_opponents.ix[opponentcounter, 'teamscore']
            opponentscore = dfdata_opponents.ix[opponentcounter, 'opponentscore']
            adversary_key = (teamname, opponentname)
            game_scores[adversary_key] = [teamscore, opponentscore]
            opponentcounter += 1  # advance to the next game row
    # fit team ratings with emcee, initializing walkers around the heuristic ratings
    teamnames = dfdata['Team']
    ndim = len(dfdata)
    nwalkers = ndim * 2 + 2
    p0 = [np.random.normal(irating, 200, nwalkers) for irating in dfdata['rating']]
    p0 = np.array(p0).transpose()  # shape (nwalkers, ndim)
    # p0 = [np.random.rand(ndim) for i in range(nwalkers)]
    sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, threads=4)
    sampler.run_mcmc(p0, 200)
    plt.plot(sampler.flatchain[:, 0])
    plt.show()
    import pdb; pdb.set_trace()
    # scrape the list of teams for this league
    teamsurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeams&league=' + leagueid
    response = urllib2.urlopen(teamsurl)
    teams_soup = BeautifulSoup(response, 'html.parser')
    # generate list of team ids and names for this league
    tdlist = teams_soup.find_all('td', class_='infobody')
    teamids = []
    teamnames = []
    for td in tdlist:
        try:
            url = td.a['href']
            idindex = url.index('team=')
            whichindex = url.index('which=')
            teamids.append(url[idindex + 5:whichindex - 1])
            teamnames.append(td.a.get_text())
        except (TypeError, AttributeError, ValueError):
            # skip cells that don't contain a team link
            continue
    # find all players associated with each team and
    # link the team rating to each player on that team
    for teamid, teamname in zip(teamids, teamnames):
        try:
            teamrating = dfdata.ix[dfdata['Team'] == teamname.strip(' '), 'rating'].values[0]
        except IndexError:
            print("Couldn't match {} to scores database, skipping this team.".format(teamname))
            continue
        teamurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeamRoster&team=' + teamid
        response = urllib2.urlopen(teamurl)
        roster_soup = BeautifulSoup(response, 'html.parser')
        players = [td.get_text() for td in roster_soup.find_all("td", class_="infobody")]
        for player in players:
            if player in all_players:
                all_players[player].append(teamrating)
            else:
                all_players[player] = [teamrating]
    print("Finished successfully with league {}".format(leagueid))
One possible approach would be to say the expected point differential is the rating differential divided by 100.
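Under that rule of thumb, a hypothetical 300-point rating edge translates to an expected winning margin of about 3 points:
In [ ]:
# expected point differential = rating differential / 100 (proposed rule of thumb)
rating_a, rating_b = 1700., 1400.
print((rating_a - rating_b) / 100.)  # 3.0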
In [14]:
dfdata
Out[14]:
Everything is set up except the dictionary of game scores: "game_scores".
In [12]:
np.array(p0).shape
Out[12]:
In [13]:
nwalkers
Out[13]:
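For reference, emcee's EnsembleSampler expects p0 with shape (nwalkers, ndim), and the number of walkers must be at least twice the number of dimensions, which nwalkers = ndim * 2 + 2 satisfies. A sketch with a hypothetical ndim:
In [ ]:
ndim = 12                  # hypothetical number of teams
nwalkers = ndim * 2 + 2    # comfortably above emcee's 2 * ndim minimum
p0 = np.random.normal(1200., 200., size=(nwalkers, ndim))
print(p0.shape)            # (26, 12)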
In [ ]: