In [1]:
from lxml import html
import requests
from bs4 import BeautifulSoup
import urllib2  # Python 2 only; under Python 3 use urllib.request instead
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
%matplotlib inline
In [121]:
r = urllib2.urlopen('http://www.buda.org/leagues/past-leagues')
soup = BeautifulSoup(r, 'html.parser')
iframe = soup.find_all('iframe')[0]
response = urllib2.urlopen(iframe.attrs['src'])
iframe_soup = BeautifulSoup(response, 'html.parser')
# collect the link to every past league listed in the iframe
leaguelinks = [i.a['href'] for i in iframe_soup.find_all("td", class_="infobody")]
In [2]:
picklefile = '../all_players.p'
In [3]:
# all_players maps each player name ('Last, First') to a list of historical BUDA ratings
all_players = pickle.load(open(picklefile, "rb"))
In [4]:
# compute each player's mean rating, clipping negative ratings to zero
pmean = []
players_means = {}
for player in all_players.keys():
    pratings = np.array(all_players[player])
    toolow = np.where(pratings < 0)
    if toolow[0].size > 0:
        pratings[toolow[0]] = 0
    pmean.append(pratings.mean())
    # sanity check: after clipping, no mean should be negative
    if pratings.mean() < 0:
        print(pratings)
    players_means[player] = pratings.mean()
In [127]:
# teamratings = {}
# playerlist = {}
teamavgratings = {}
# teamself = {}
# teamcaptain = {}
# teamcombined = {}
truedifferential = {}
# loop over all leagues in the BUDA database
for link in leaguelinks:
    # extract the league id for this league
    leagueid = link[link.index('league=') + 7:]
    print("Working on league {}".format(leagueid))
# scrape the list of teams for this league
teamsurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeams&league=' + leagueid
response = urllib2.urlopen(teamsurl)
    teams_soup = BeautifulSoup(response, 'html.parser')
# generate list of team ids and names for this league
tdlist = teams_soup.find_all('td', class_='infobody')
teamids = []
teamnames = []
    for td in tdlist:
        try:
            url = td.a['href']
            idindex = url.index('team=')
            whichindex = url.index('which=')
            teamids.append(url[idindex+5:whichindex-1])
            teamnames.append(td.a.get_text())
        except (TypeError, ValueError):
            # skip table cells that don't contain a team link
            continue
# if this league is complete, get the true score differential for each team
# scrape the scores for this league
leaguescoreurl = 'http://www.buda.org/hatleagues/scores.php?section=showLeagueSchedule&league=' + leagueid + '&byDivision=1&showGames=0'
response = urllib2.urlopen(leaguescoreurl)
    leaguescore_soup = BeautifulSoup(response, 'html.parser')
# assemble the data of team ratings for this league
print("Building database of team ratings")
data = []
data_opponent = []
try:
table = leaguescore_soup.find_all('table', attrs={'class':'info'})[1]
except IndexError:
print("Unable to find a database of scores for league {}".format(leagueid))
continue
rows = table.find_all('tr')
for row in rows:
cols = row.find_all('th')
cols = [ele.text.strip() for ele in cols]
data.append([ele for ele in cols if ele]) # Get rid of empty values
# convert to dataframe and drop irrelevant columns
dfdata = pd.DataFrame(data)
    # use the scraped header row as column names: ['Team', 'Record', 'Plus/Minus', 'Tourney Qualifying games']
    dfdata.columns = dfdata.ix[0, :]
    dfdata = dfdata.drop(0).reset_index()
    # fill NaNs with -99 so the division divider rows can be identified below
dfdata = dfdata.fillna(-99)
# get the list of divisions in this league
divnames = dfdata.ix[dfdata['Record'] == -99, 'Team'].values
if len(divnames) == 0:
print("No divisions found, skipping league {}".format(leagueid))
# continue
dfdata['div'] = np.zeros(len(dfdata))
for i in range(len(divnames)-1):
try:
divstart = np.where(dfdata['Team'] == divnames[i])[0][0]
except IndexError:
print("{} not found, skipping league {}".format(divnames[i], leagueid))
# continue
try:
divend = np.where(dfdata['Team'] == divnames[i + 1])[0][0]
except IndexError:
print("{} not found, skipping league {}".format(divnames[i + 1], leagueid))
# continue
try:
dfdata.ix[divstart + 1: divend, 'div'] = divnames[i]
except KeyError:
print("No base rating for {}, skipping league {}".format(divnames[i], leagueid))
# continue
try:
dfdata.ix[divend + 1:, 'div'] = divnames[-1]
except KeyError:
print("No base rating for {}, skipping league {}".format(divnames[-1], leagueid))
# continue
# remove the division dividers from the dataframe
for i in range(len(divnames)):
dfdata = dfdata.drop(dfdata.index[dfdata['Team'] == divnames[i]])
# generate the average goal differential column
dfdata['wins'] = dfdata['Record'].apply(lambda x: int(x.split('-')[0]))
dfdata['losses'] = dfdata['Record'].apply(lambda x: int(x.split('-')[1]))
dfdata['games'] = dfdata['wins'] + dfdata['losses']
dfdata['avgplusminus'] = dfdata['Plus/Minus'].astype('float') / dfdata['games']
# find all players associated with each team
print("Finding all players associated with each team")
for teamid, teamname in zip(teamids, teamnames):
teamurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeamRoster&team=' + teamid
response = urllib2.urlopen(teamurl)
        roster_soup = BeautifulSoup(response, 'html.parser')
playerratings = []
selfrating = []
captainrating = []
combinedrating = []
players = [td.get_text() for td in roster_soup.find_all("td", class_="infobody")]
for player in players:
if player == '':
continue
if player in all_players:
playerratings.append(players_means[player])
else:
# if someone hasn't played club league, they probably aren't very good
playerratings.append(700)
            PLAYER = player.upper()
            # note: sph2011 (self and captain's ratings) is loaded in a later cell; run that cell first
            if PLAYER in sph2011['full name'].values:
                sph2011row = sph2011['full name'] == PLAYER
selfrating.append(sph2011.ix[sph2011row, 'Self'].values[0])
captainrating.append(sph2011.ix[sph2011row, 'CR'].values[0])
combinedrating.append(sph2011.ix[sph2011row, 'Rating'].values[0])
# the team rating is the average of the player ratings for that team
# teamratings[teamname] = playerratings
# playerlist[teamname] = players
teamavgratings[teamname] = np.mean(playerratings)
# teamself[teamname] = np.mean(selfrating)
# teamcaptain[teamname] = np.mean(captainrating)
# teamcombined[teamname] = np.mean(combinedrating)
# print("Finished with team {}".format(teamname))
dfdatarow = dfdata['Team'] == teamname
truedifferential[teamname] = dfdata.ix[dfdatarow, 'avgplusminus'].values[0]
    # the per-division mean rating is computed later, once allteamdf exists (see In [115] below);
    # dfdata has no 'Historical Rating' column and allteamdf is not defined yet, so this block is disabled here
    # dfdata['divmean'] = np.zeros(len(dfdata))
    # for div in divnames:
    #     divmean = np.mean(dfdata.ix[dfdata['div'] == div, 'Historical Rating'])
    #     dfdata.ix[dfdata['div'] == div, 'divmean'] = divmean
print("Finished successfully with league {}".format(leagueid))
print("")
# import pdb; pdb.set_trace()
In [146]:
tdiff[0]
Out[146]:
In [144]:
tag = np.array(teamavgratings.values())
tdiff = np.array(truedifferential.values())
ok = np.where(np.abs(tdiff) < 1e4)
tag = tag[ok]
plt.plot(tdiff[ok], tag, 'o')
In [141]:
tag
Out[141]:
In [5]:
allratings = pd.DataFrame({'Historical Rating': teamavgratings.values(),
# 'Self Rating': teamself.values(),
# 'Captain Rating': teamcaptain.values(),
# 'Combined Rating': teamcombined.values(),
'Team': teamavgratings.keys()})
alltruediff = pd.DataFrame({'Team': truedifferential.keys(), 'avgplusminus': truedifferential.values()})
alltrueratings = allratings.merge(alltruediff, on='Team', how='inner')
alltrueratings = alltrueratings.set_index('Team')
In [123]:
alltrueratings
Out[123]:
In [112]:
allteamdf = teamdf.join(allratings)  # note: teamdf (the per-team division table) comes from an earlier run and is not defined in this notebook
In [115]:
allteamdf['divmean'] = np.zeros(len(allteamdf))
for div in divnames:
divmean = np.mean(allteamdf.ix[allteamdf['div'] == div, 'Historical Rating'])
allteamdf.ix[allteamdf['div'] == div, 'divmean'] = divmean
In [117]:
allteamdf['Relative Historical Rating'] = allteamdf['Historical Rating'] - allteamdf['divmean']
In [59]:
def teamnumber(x):
i0 = x.index('(')
i1 = x.index(')')
return x[i0+1:i1]
In [82]:
allteamdf.index
Out[82]:
In [104]:
allteamdf['teamnumber'] = allteamdf.index.str.extract(r'\((.*)\)').values.astype('int')
In [52]:
sns.jointplot(allteamdf['avgplusminus'], allteamdf['Historical Rating'])
Out[52]:
In [105]:
jpmixed = allteamdf[allteamdf['teamnumber'] < 25]
In [106]:
allteamdf['teamnumber'].values.astype('int')
Out[106]:
In [119]:
sns.jointplot(allteamdf['avgplusminus'], allteamdf['Relative Historical Rating'])
Out[119]:
In [45]:
# note: 'Self Rating', 'Captain Rating', and 'Combined Rating' are commented out in the
# allratings cell above; uncomment those columns there before running these cells
draftteams = allteamdf.dropna(subset=['Self Rating'])
draftteams = draftteams[draftteams['games'] > 5]
In [47]:
sns.jointplot(draftteams['avgplusminus'], draftteams['Historical Rating'])
Out[47]:
In [46]:
sns.jointplot(draftteams['avgplusminus'], draftteams['Self Rating'])
Out[46]:
In [50]:
sns.jointplot(draftteams['avgplusminus'], draftteams['Combined Rating'])
Out[50]:
In [1]:
def printteam(teamratings, playerlist, team):
    """Print each player on the given team alongside their historical rating."""
    print(team)
    for i in range(len(teamratings[team])):
        print("{:20} {:5.0f}".format(playerlist[team][i], teamratings[team][i]))
    print("")
In [2]:
def plot2teams(teamratings, team1, team2):
    """Compare two teams by plotting the cumulative sum of their sorted player ratings."""
    plt.plot(np.cumsum(np.sort(teamratings[team1])), 'o-', label=team1)
    plt.plot(np.cumsum(np.sort(teamratings[team2])), 'o-', label=team2)
    plt.legend()
    # printteam(teamratings, playerlist, team1)
    # printteam(teamratings, playerlist, team2)
I need a function that takes a league id as input, scrapes that league's teams, rosters, and scores, and returns the predicted rating and true point differential for each team.

For Spring Hat League 2011, I have captain's ratings and self ratings for every player in the Monday night hat league. The money plot will compare predicted and true point differential for three possible metrics: the rating system I've devised here (call it the "historical experience rating", or HER), self rating, and captain's rating. My bet is that true point differential correlates more strongly with HER than with either self rating or captain's rating. A sketch of that comparison follows.
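Here is a minimal sketch of that comparison, assuming a dataframe like draftteams above with an 'avgplusminus' column and the three rating columns; the helper name compare_ratings is hypothetical, not part of the cells above.

def compare_ratings(df, metrics=('Historical Rating', 'Self Rating', 'Captain Rating')):
    """Return the Pearson correlation of each rating metric with the true point differential."""
    correlations = {}
    for metric in metrics:
        # drop teams missing either the metric or the observed differential
        valid = df[[metric, 'avgplusminus']].dropna()
        correlations[metric] = valid[metric].corr(valid['avgplusminus'])
    return correlations

# usage: compare_ratings(draftteams)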
In [7]:
# load spring hat league self ratings and captain's ratings
sph2011file = '/Users/rbussman/Documents/ultimate/sphl11_mon_draft.csv'
sph2011 = pd.read_csv(sph2011file)
In [8]:
sph2011['full name'] = sph2011['LAST'] + ', ' + sph2011['FIRST']
In [9]:
teamratings = {}
playerlist = {}
teamavgratings = {}
# teamself = {}
# teamcaptain = {}
# teamcombined = {}
truedifferential = {}
# set the league id for the league to analyze
leagueid = '40258'
print("Working on league {}".format(leagueid))
# scrape the list of teams for this league
teamsurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeams&league=' + leagueid
response = urllib2.urlopen(teamsurl)
teams_soup = BeautifulSoup(response, 'html.parser')
# generate list of team ids and names for this league
tdlist = teams_soup.find_all('td', class_='infobody')
teamids = []
teamnames = []
for td in tdlist:
    try:
        url = td.a['href']
        idindex = url.index('team=')
        whichindex = url.index('which=')
        teamids.append(url[idindex+5:whichindex-1])
        teamnames.append(td.a.get_text())
    except (TypeError, ValueError):
        # skip table cells that don't contain a team link
        continue
# if this league is complete, get the true score differential for each team
# scrape the scores for this league
leaguescoreurl = 'http://www.buda.org/hatleagues/scores.php?section=showLeagueSchedule&league=' + leagueid + '&byDivision=1&showGames=0'
response = urllib2.urlopen(leaguescoreurl)
leaguescore_soup = BeautifulSoup(response, 'html.parser')
# assemble the data of team ratings for this league
print("Building database of team ratings")
data = []
data_opponent = []
try:
table = leaguescore_soup.find_all('table', attrs={'class':'info'})[1]
except IndexError:
print("Unable to find a database of scores for league {}".format(leagueid))
# continue
rows = table.find_all('tr')
for row in rows:
cols = row.find_all('th')
cols = [ele.text.strip() for ele in cols]
data.append([ele for ele in cols if ele]) # Get rid of empty values
# convert to dataframe and drop irrelevant columns
dfdata = pd.DataFrame(data)
# use the scraped header row as column names: ['Team', 'Record', 'Plus/Minus', 'Tourney Qualifying games']
dfdata.columns = dfdata.ix[0, :]
dfdata = dfdata.drop(0).reset_index()
# fill NaNs with -99 so the division divider rows can be identified below
dfdata = dfdata.fillna(-99)
# get the list of divisions in this league
divnames = dfdata.ix[dfdata['Record'] == -99, 'Team'].values
if len(divnames) == 0:
print("No divisions found, skipping league {}".format(leagueid))
# continue
dfdata['div'] = np.zeros(len(dfdata))
for i in range(len(divnames)-1):
try:
divstart = np.where(dfdata['Team'] == divnames[i])[0][0]
except IndexError:
print("{} not found, skipping league {}".format(divnames[i], leagueid))
# continue
try:
divend = np.where(dfdata['Team'] == divnames[i + 1])[0][0]
except IndexError:
print("{} not found, skipping league {}".format(divnames[i + 1], leagueid))
# continue
try:
dfdata.ix[divstart + 1: divend, 'div'] = divnames[i]
except KeyError:
print("No base rating for {}, skipping league {}".format(divnames[i], leagueid))
# continue
try:
dfdata.ix[divend + 1:, 'div'] = divnames[-1]
except KeyError:
print("No base rating for {}, skipping league {}".format(divnames[-1], leagueid))
# continue
# remove the division dividers from the dataframe
for i in range(len(divnames)):
dfdata = dfdata.drop(dfdata.index[dfdata['Team'] == divnames[i]])
# generate the average goal differential column
dfdata['wins'] = dfdata['Record'].apply(lambda x: int(x.split('-')[0]))
dfdata['losses'] = dfdata['Record'].apply(lambda x: int(x.split('-')[1]))
dfdata['games'] = dfdata['wins'] + dfdata['losses']
dfdata['avgplusminus'] = dfdata['Plus/Minus'].astype('float') / dfdata['games']
# find all players associated with each team
print("Finding all players associated with each team")
for teamid, teamname in zip(teamids, teamnames):
teamurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeamRoster&team=' + teamid
response = urllib2.urlopen(teamurl)
    roster_soup = BeautifulSoup(response, 'html.parser')
playerratings = []
selfrating = []
captainrating = []
combinedrating = []
players = [td.get_text() for td in roster_soup.find_all("td", class_="infobody")]
for player in players:
if player == '':
continue
if player in all_players:
playerratings.append(players_means[player])
else:
# if someone hasn't played club league, they probably aren't very good
playerratings.append(700)
        PLAYER = player.upper()
        if PLAYER in sph2011['full name'].values:
            sph2011row = sph2011['full name'] == PLAYER
selfrating.append(sph2011.ix[sph2011row, 'Self'].values[0])
captainrating.append(sph2011.ix[sph2011row, 'CR'].values[0])
combinedrating.append(sph2011.ix[sph2011row, 'Rating'].values[0])
# the team rating is the average of the player ratings for that team
teamratings[teamname] = playerratings
playerlist[teamname] = players
teamavgratings[teamname] = np.mean(playerratings)
# teamself[teamname] = np.mean(selfrating)
# teamcaptain[teamname] = np.mean(captainrating)
# teamcombined[teamname] = np.mean(combinedrating)
# print("Finished with team {}".format(teamname))
# dfdatarow = dfdata['Team'] == teamname
# truedifferential[teamname] = dfdata.ix[dfdatarow, 'avgplusminus'].values[0]
# dfdata['divmean'] = np.zeros(len(dfdata))
# for div in divnames:
# divmean = np.mean(dfdata.ix[dfdata['div'] == div, 'Historical Rating'])
# allteamdf.ix[allteamdf['div'] == div, 'divmean'] = divmean
print("Finished successfully with league {}".format(leagueid))
print("")
# import pdb; pdb.set_trace()
In [10]:
teamavgratings
Out[10]:
In [11]:
names = []
predrating = []
for key in teamavgratings.keys():
names.append(key)
predrating.append(teamavgratings[key])
# print("{:38} {:.0f}".format(key, teamavgratings[key]) )
dfpred = pd.DataFrame({'teamname': names, 'predictedrating': predrating})
In [13]:
dfpred.sort('predictedrating')
Out[13]:
In [35]:
(1077 - 1900/16. - 1400/16.) * 16/14.
Out[35]:
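For reference, the expression above equals (16 × 1077 − 1900 − 1400) / 14 ≈ 995. It looks like it removes two player ratings (1900 and 1400) from a 16-player team average of 1077 and renormalizes over the remaining 14 players, but that interpretation is inferred from the arithmetic rather than stated in the notebook.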
In [3]:
plot2teams(teamratings, 'R2Defense (20)', 'Poe Hammeron (15)')
In [4]:
plot2teams(teamratings, 'R2Defense (20)', 'Team 14 (14)')
In [5]:
plot2teams(teamratings, 'R2Defense (20)', 'Team 24 (24)')
In [6]:
plot2teams(teamratings, 'R2Defense (20)', 'Team 22 (22)')
In [7]:
plot2teams(teamratings, 'R2Defense (20)', 'Winning the Pooh (13)')
In [8]:
plot2teams(teamratings, 'R2Defense (20)', 'Team 16 (16)')
In [9]:
plot2teams(teamratings, 'R2Defense (20)', 'Team 18 (18)')
In [10]:
np.sort(teamratings['R2Defense (20)'])
In [ ]: