from lxml import html
import requests
from bs4 import BeautifulSoup
import urllib2
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
%matplotlib inline
# Fetch the landing page, follow its first <iframe>, and collect the league links found there.
# NOTE(review): the URL literal below is an empty string in this copy of the file — confirm the intended address.
r = urllib2.urlopen('')
soup = BeautifulSoup(r, 'html.parser')
# Only the first <iframe> on the page is used; an IndexError is raised if the page has none.
iframe = soup.find_all('iframe')[0]
response = urllib2.urlopen(iframe.attrs['src'])
# NOTE(review): unlike the call above, no parser is named here, so bs4 chooses one itself — confirm that is intended.
iframe_soup = BeautifulSoup(response)
# href of the first <a> inside every <td class="infobody"> cell of the iframe document.
leaguelinks = [i.a['href'] for i in iframe_soup.find_all("td", class_="infobody")]
# Load the pickled player -> ratings mapping and compute a mean rating per player.
# NOTE(review): indentation was lost in this copy of the file, so the nesting of the
# statements below is not recoverable from here; the `In [n]:` lines are notebook cell markers.
picklefile = '../all_players.p'
In [3]:
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
# The file handle opened here is never explicitly closed.
all_players = pickle.load( open( picklefile, "rb" ) )
In [4]:
# NOTE(review): pmean is assigned but not used anywhere in the visible code.
pmean = []
# player -> mean rating
players_means = {}
for player in all_players.keys():
pratings = np.array(all_players[player])
# Clamp negative ratings to zero before averaging.
toolow = np.where(pratings < 0)
if toolow[0].size > 0:
pratings[toolow[0]] = 0
# NOTE(review): after the clamp above the mean should never be negative, so this `< 0`
# test looks inverted or stale — if the assignment below is guarded by it, players_means
# would stay empty. Confirm the intended comparison.
if pratings.mean() < 0:
players_means[player] = pratings.mean()
# For every league link: scrape the team list and the scores table, label each team with
# its division, compute the average plus/minus per game, then scrape each roster and record
# a per-team average rating and true differential. Ends with a scatter plot of the two.
# NOTE(review): indentation and the `try:` lines that should pair with the `except`
# clauses below are missing from this copy of the file, so the exact nesting (and which
# statements the handlers protect) cannot be determined from here.
# teamratings = {}
# playerlist = {}
# team name -> mean of that team's player ratings
teamavgratings = {}
# teamself = {}
# teamcaptain = {}
# teamcombined = {}
# team name -> average plus/minus per game
truedifferential = {}
# loop over all leagues in the BUDA database
for link in leaguelinks:
# extract the league id for this league
# (everything after 'league=' in the link; 7 == len('league='))
leagueid = link[link.index('league=') + 7:]
# extract the league id for this league
# springhat2016id = '30924'
# leagueid = springhat2016id
print("Working on league {}".format(leagueid))
# scrape the list of teams for this league
# NOTE(review): the URL prefix is an empty string in this copy — confirm the intended base URL.
teamsurl = '' + leagueid
response = urllib2.urlopen(teamsurl)
teams_soup = BeautifulSoup(response)
# generate list of team ids and names for this league
tdlist = teams_soup.find_all('td', class_='infobody')
teamids = []
teamnames = []
for td in tdlist:
url = td.a['href']
idindex = url.index('team=')
whichindex = url.index('which=')
# NOTE(review): idindex/whichindex are computed but nothing visible appends to
# teamids/teamnames, so the roster loop further down would have nothing to iterate —
# the append statements appear to be missing.
# if this league is complete, get the true score differential for each team
# scrape the scores for this league
leaguescoreurl = '' + leagueid + '&byDivision=1&showGames=0'
response = urllib2.urlopen(leaguescoreurl)
leaguescore_soup = BeautifulSoup(response)
# assemble the data of team ratings for this league
print("Building database of team ratings")
data = []
# NOTE(review): data_opponent is assigned but not used in the visible code.
data_opponent = []
# The second <table class="info"> on the page is used as the scores table.
table = leaguescore_soup.find_all('table', attrs={'class':'info'})[1]
except IndexError:
print("Unable to find a database of scores for league {}".format(leagueid))
rows = table.find_all('tr')
for row in rows:
# NOTE(review): only <th> cells are collected from each row — confirm <td> cells are not needed.
cols = row.find_all('th')
cols = [ele.text.strip() for ele in cols]
data.append([ele for ele in cols if ele]) # Get rid of empty values
# convert to dataframe and drop irrelevant columns
dfdata = pd.DataFrame(data)
# print(leagueid, dfdata.columns)
# The first scraped row supplies the column names.
# NOTE(review): `.ix` was removed in pandas 1.0, so this code requires an older pandas.
dfdata.columns = dfdata.ix[0, :]#['Team', 'Record', 'Plus/Minus', 'Tourney Qualifying games']
# print(leagueid, dfdata.columns)
dfdata = dfdata.drop(0).reset_index()
# fill na's with -99 to facilitate division dividers
dfdata = dfdata.fillna(-99)
# get the list of divisions in this league
# (rows whose 'Record' is the -99 filler; their 'Team' cell holds the division name)
divnames = dfdata.ix[dfdata['Record'] == -99, 'Team'].values
if len(divnames) == 0:
print("No divisions found, skipping league {}".format(leagueid))
# NOTE(review): the `continue` below is commented out, so despite the message the league is not skipped.
# continue
dfdata['div'] = np.zeros(len(dfdata))
# Rows between two consecutive division header rows are tagged with the earlier division's name.
for i in range(len(divnames)-1):
divstart = np.where(dfdata['Team'] == divnames[i])[0][0]
except IndexError:
print("{} not found, skipping league {}".format(divnames[i], leagueid))
# continue
divend = np.where(dfdata['Team'] == divnames[i + 1])[0][0]
except IndexError:
print("{} not found, skipping league {}".format(divnames[i + 1], leagueid))
# continue
dfdata.ix[divstart + 1: divend, 'div'] = divnames[i]
except KeyError:
print("No base rating for {}, skipping league {}".format(divnames[i], leagueid))
# continue
# Everything after the last division header row belongs to the last division.
dfdata.ix[divend + 1:, 'div'] = divnames[-1]
except KeyError:
print("No base rating for {}, skipping league {}".format(divnames[-1], leagueid))
# continue
# remove the division dividers from the dataframe
for i in range(len(divnames)):
dfdata = dfdata.drop(dfdata.index[dfdata['Team'] == divnames[i]])
# generate the average goal differential column
# 'Record' is split on '-' into wins (first field) and losses (second field).
dfdata['wins'] = dfdata['Record'].apply(lambda x: int(x.split('-')[0]))
dfdata['losses'] = dfdata['Record'].apply(lambda x: int(x.split('-')[1]))
dfdata['games'] = dfdata['wins'] + dfdata['losses']
# NOTE(review): a team with zero games produces a non-finite value here.
dfdata['avgplusminus'] = dfdata['Plus/Minus'].astype('float') / dfdata['games']
# find all players associated with each team
print("Finding all players associated with each team")
for teamid, teamname in zip(teamids, teamnames):
teamurl = '' + teamid
response = urllib2.urlopen(teamurl)
roster_soup = BeautifulSoup(response)
playerratings = []
selfrating = []
captainrating = []
combinedrating = []
# Text of every <td class="infobody"> cell on the roster page.
players = [td.get_text() for td in roster_soup.find_all("td", class_="infobody")]
for player in players:
# NOTE(review): the bodies of the two `if` statements below are missing from this copy;
# nothing visible appends to playerratings, so the np.mean(playerratings) further down
# would be taken over an empty list.
if player == '':
if player in all_players:
# if someone hasn't played club league, they probably aren't very good
PLAYER = player.upper()
# if player == 'Bussmann, Shane':
# import pdb; pdb.set_trace()
# NOTE(review): sph2011 is only assigned further down the file; this relies on notebook cell execution order.
if PLAYER in sph2011['full name'].values:
# import pdb; pdb.set_trace()
sph2011row = sph2011['full name'] == PLAYER
selfrating.append(sph2011.ix[sph2011row, 'Self'].values[0])
captainrating.append(sph2011.ix[sph2011row, 'CR'].values[0])
combinedrating.append(sph2011.ix[sph2011row, 'Rating'].values[0])
# the team rating is the average of the player ratings for that team
# teamratings[teamname] = playerratings
# playerlist[teamname] = players
teamavgratings[teamname] = np.mean(playerratings)
# teamself[teamname] = np.mean(selfrating)
# teamcaptain[teamname] = np.mean(captainrating)
# teamcombined[teamname] = np.mean(combinedrating)
# print("Finished with team {}".format(teamname))
# Look up this team's row in the scores frame; .values[0] raises IndexError if the name is absent.
dfdatarow = dfdata['Team'] == teamname
truedifferential[teamname] = dfdata.ix[dfdatarow, 'avgplusminus'].values[0]
dfdata['divmean'] = np.zeros(len(dfdata))
for div in divnames:
# NOTE(review): no visible code adds a 'Historical Rating' column to dfdata, and allteamdf
# is only assigned later in the file — confirm these two lines belong in this loop.
divmean = np.mean(dfdata.ix[dfdata['div'] == div, 'Historical Rating'])
allteamdf.ix[allteamdf['div'] == div, 'divmean'] = divmean
print("Finished successfully with league {}".format(leagueid))
# import pdb; pdb.set_trace()
# NOTE(review): np.array(dict.values()) yields a numeric array only where values() returns a
# list (Python 2); the two arrays also line up only if both dicts hold the same teams in the same order.
tag = np.array(teamavgratings.values())
tdiff = np.array(truedifferential.values())
# Keep only teams whose differential magnitude is below 1e4.
ok = np.where(np.abs(tdiff) < 1e4)
tag = tag[ok]
plt.plot(tdiff[ok], tag, 'o')
In [5]:
# Build one frame of per-team ratings and one of per-team true differentials, inner-join
# them on 'Team', and attach the ratings to teamdf.
# NOTE(review): passing dict.values()/dict.keys() directly relies on them being lists (Python 2).
allratings = pd.DataFrame({'Historical Rating': teamavgratings.values(),
# 'Self Rating': teamself.values(),
# 'Captain Rating': teamcaptain.values(),
# 'Combined Rating': teamcombined.values(),
'Team': teamavgratings.keys()})
alltruediff = pd.DataFrame({'Team': truedifferential.keys(), 'avgplusminus': truedifferential.values()})
# Inner join: only teams present in both frames survive.
alltrueratings = allratings.merge(alltruediff, on='Team', how='inner')
alltrueratings = alltrueratings.set_index('Team')
# NOTE(review): teamdf is not defined anywhere in the visible code, and the join uses
# allratings (default integer index) rather than alltrueratings (indexed by 'Team') —
# confirm which frame was intended.
allteamdf = teamdf.join(allratings)
In [115]:
# Compute each division's mean 'Historical Rating' and express every team's rating
# relative to its own division mean.
allteamdf['divmean'] = np.zeros(len(allteamdf))
# NOTE(review): divnames is whatever the most recent league-processing cell left behind — confirm it matches allteamdf.
for div in divnames:
divmean = np.mean(allteamdf.ix[allteamdf['div'] == div, 'Historical Rating'])
allteamdf.ix[allteamdf['div'] == div, 'divmean'] = divmean
# Teams whose 'div' matches none of divnames keep divmean == 0, so their relative rating equals the raw rating.
allteamdf['Relative Historical Rating'] = allteamdf['Historical Rating'] - allteamdf['divmean']
def teamnumber(x):
    """Return the text inside the trailing parenthesised group of a team label.

    Team labels in this file look like ``'R2Defense (20)'``; the number is
    carried in the last pair of parentheses.  The last pair is used (rather
    than the first) so that a team name which itself contains parentheses,
    e.g. ``'Huck (and) Run (7)'``, still yields ``'7'``.

    Parameters
    ----------
    x : str
        Team label.

    Returns
    -------
    str
        The characters between the last ``'('`` and the last ``')'``.

    Raises
    ------
    ValueError
        If ``x`` has no ``')'`` or no ``'('`` before it.
    """
    close = x.rindex(')')
    # Search for the opening parenthesis only to the left of the closing one.
    start = x.rindex('(', 0, close)
    return x[start + 1:close]
# Derive a numeric team number from the index labels and draw joint plots of
# average plus/minus against the various ratings. `In [n]:` lines are notebook cell markers.
# The greedy pattern captures everything between the first '(' and the last ')' of each
# index label, then casts it to int.
# NOTE(review): the pattern is a non-raw string containing '\(' — newer Python versions warn about that escape.
allteamdf['teamnumber'] = allteamdf.index.str.extract('\((.*)\)').values.astype('int')
sns.jointplot(allteamdf['avgplusminus'], allteamdf['Historical Rating'])
In [105]:
# Subset of teams whose number is below 25.
# NOTE(review): jpmixed is not used again in the visible code.
jpmixed = allteamdf[allteamdf['teamnumber'] < 25]
In [106]:
In [119]:
sns.jointplot(allteamdf['avgplusminus'], allteamdf['Relative Historical Rating'])
# NOTE(review): the 'Self Rating' / 'Combined Rating' columns are commented out where
# allratings is built, and 'games' is only created on dfdata — confirm allteamdf actually has these columns.
draftteams = allteamdf.dropna(subset=['Self Rating'])
# Keep only teams that played more than 5 games.
draftteams = draftteams[draftteams['games'] > 5]
sns.jointplot(draftteams['avgplusminus'], draftteams['Historical Rating'])
sns.jointplot(draftteams['avgplusminus'], draftteams['Self Rating'])
sns.jointplot(draftteams['avgplusminus'], draftteams['Combined Rating'])
def printteam(teamratings, playerlist, team):
    """Print one ``name  rating`` line for each rated player on ``team``.

    Names and ratings are paired by position.  The name is left-aligned in a
    20-character field and the rating is printed as a 5-wide number with no
    decimals.

    Parameters
    ----------
    teamratings : dict
        Maps team name to a sequence of numeric player ratings.
    playerlist : dict
        Maps team name to a sequence of player names.
    team : str
        Key to look up in both mappings.

    Raises
    ------
    KeyError
        If ``team`` is missing from either mapping.

    Notes
    -----
    Pairing stops at the shorter of the two sequences, so extra names
    without a rating are simply not printed.
    """
    for name, rating in zip(playerlist[team], teamratings[team]):
        print("{:20} {:5.0f}".format(name, rating))
def plot2teams(teamratings, team1, team2):
    """Plot the cumulative sorted player ratings of two teams on the current axes.

    For each team the ratings are sorted ascending and cumulatively summed,
    then drawn as a connected line with circle markers, labelled with the
    team name.  A legend is drawn so the two curves can be told apart.

    Parameters
    ----------
    teamratings : dict
        Maps team name to a sequence of numeric player ratings.
    team1, team2 : str
        Keys of the two teams to compare.

    Raises
    ------
    KeyError
        If either team is missing from ``teamratings``.
    """
    for team in (team1, team2):
        plt.plot(np.cumsum(np.sort(teamratings[team])), 'o-', label=team)
    # The label= arguments above are only visible once a legend is drawn.
    plt.legend()
I need a function that takes a league id as input, and then:
For spring hat league 2011, I have captains' ratings and self-ratings for every player in the Monday night hat league. The money plot will compare predicted and true point differential for three candidate metrics: the rating system I've devised here (call it the "historical experience rating", or HER), self-rating, and captain's rating. I'll bet that true point differential correlates more strongly with HER than with either self-rating or captain's rating.
# Single-league variant of the scraping pipeline, pinned to league id '40258':
# load the 2011 draft ratings CSV, scrape teams and scores, label divisions, compute
# average plus/minus per game, then scrape rosters and store per-team ratings and player lists.
# NOTE(review): indentation and the `try:` lines that should pair with the `except`
# clauses below are missing from this copy of the file, so the exact nesting cannot be
# determined from here.
# load spring hat league self ratings and captain's ratings
# NOTE(review): absolute path on one user's machine — will not resolve elsewhere.
sph2011file = '/Users/rbussman/Documents/ultimate/sphl11_mon_draft.csv'
sph2011 = pd.read_csv(sph2011file)
# 'LAST, FIRST' key; compared further down against player.upper().
sph2011['full name'] = sph2011['LAST'] + ', ' + sph2011['FIRST']
# team name -> list of player ratings
teamratings = {}
# team name -> list of roster cell texts
playerlist = {}
# team name -> mean of that team's player ratings
teamavgratings = {}
# teamself = {}
# teamcaptain = {}
# teamcombined = {}
# NOTE(review): truedifferential is reset here but the code that fills it is commented out in this cell.
truedifferential = {}
# extract the league id for this league
leagueid = '40258'
# extract the league id for this league
# springhat2016id = '30924'
# leagueid = springhat2016id
print("Working on league {}".format(leagueid))
# scrape the list of teams for this league
# NOTE(review): the URL prefix is an empty string in this copy — confirm the intended base URL.
teamsurl = '' + leagueid
response = urllib2.urlopen(teamsurl)
teams_soup = BeautifulSoup(response)
# generate list of team ids and names for this league
tdlist = teams_soup.find_all('td', class_='infobody')
teamids = []
teamnames = []
for td in tdlist:
url = td.a['href']
idindex = url.index('team=')
whichindex = url.index('which=')
# NOTE(review): nothing visible appends to teamids/teamnames — the append statements appear to be missing.
# if this league is complete, get the true score differential for each team
# scrape the scores for this league
leaguescoreurl = '' + leagueid + '&byDivision=1&showGames=0'
response = urllib2.urlopen(leaguescoreurl)
leaguescore_soup = BeautifulSoup(response)
# assemble the data of team ratings for this league
print("Building database of team ratings")
data = []
# NOTE(review): data_opponent is assigned but not used in the visible code.
data_opponent = []
# The second <table class="info"> on the page is used as the scores table.
table = leaguescore_soup.find_all('table', attrs={'class':'info'})[1]
except IndexError:
print("Unable to find a database of scores for league {}".format(leagueid))
# continue
rows = table.find_all('tr')
for row in rows:
# NOTE(review): only <th> cells are collected from each row — confirm <td> cells are not needed.
cols = row.find_all('th')
cols = [ele.text.strip() for ele in cols]
data.append([ele for ele in cols if ele]) # Get rid of empty values
# convert to dataframe and drop irrelevant columns
dfdata = pd.DataFrame(data)
# print(leagueid, dfdata.columns)
# The first scraped row supplies the column names.
# NOTE(review): `.ix` was removed in pandas 1.0, so this code requires an older pandas.
dfdata.columns = dfdata.ix[0, :]#['Team', 'Record', 'Plus/Minus', 'Tourney Qualifying games']
# print(leagueid, dfdata.columns)
dfdata = dfdata.drop(0).reset_index()
# fill na's with -99 to facilitate division dividers
dfdata = dfdata.fillna(-99)
# get the list of divisions in this league
# (rows whose 'Record' is the -99 filler; their 'Team' cell holds the division name)
divnames = dfdata.ix[dfdata['Record'] == -99, 'Team'].values
if len(divnames) == 0:
print("No divisions found, skipping league {}".format(leagueid))
# continue
dfdata['div'] = np.zeros(len(dfdata))
# Rows between two consecutive division header rows are tagged with the earlier division's name.
for i in range(len(divnames)-1):
divstart = np.where(dfdata['Team'] == divnames[i])[0][0]
except IndexError:
print("{} not found, skipping league {}".format(divnames[i], leagueid))
# continue
divend = np.where(dfdata['Team'] == divnames[i + 1])[0][0]
except IndexError:
print("{} not found, skipping league {}".format(divnames[i + 1], leagueid))
# continue
dfdata.ix[divstart + 1: divend, 'div'] = divnames[i]
except KeyError:
print("No base rating for {}, skipping league {}".format(divnames[i], leagueid))
# continue
# Everything after the last division header row belongs to the last division.
dfdata.ix[divend + 1:, 'div'] = divnames[-1]
except KeyError:
print("No base rating for {}, skipping league {}".format(divnames[-1], leagueid))
# continue
# remove the division dividers from the dataframe
for i in range(len(divnames)):
dfdata = dfdata.drop(dfdata.index[dfdata['Team'] == divnames[i]])
# generate the average goal differential column
# 'Record' is split on '-' into wins (first field) and losses (second field).
dfdata['wins'] = dfdata['Record'].apply(lambda x: int(x.split('-')[0]))
dfdata['losses'] = dfdata['Record'].apply(lambda x: int(x.split('-')[1]))
dfdata['games'] = dfdata['wins'] + dfdata['losses']
# NOTE(review): a team with zero games produces a non-finite value here.
dfdata['avgplusminus'] = dfdata['Plus/Minus'].astype('float') / dfdata['games']
# find all players associated with each team
print("Finding all players associated with each team")
for teamid, teamname in zip(teamids, teamnames):
teamurl = '' + teamid
response = urllib2.urlopen(teamurl)
roster_soup = BeautifulSoup(response)
playerratings = []
selfrating = []
captainrating = []
combinedrating = []
# Text of every <td class="infobody"> cell on the roster page.
players = [td.get_text() for td in roster_soup.find_all("td", class_="infobody")]
for player in players:
# NOTE(review): the bodies of the two `if` statements below are missing from this copy;
# nothing visible appends to playerratings.
if player == '':
if player in all_players:
# if someone hasn't played club league, they probably aren't very good
PLAYER = player.upper()
# if player == 'Bussmann, Shane':
# import pdb; pdb.set_trace()
if PLAYER in sph2011['full name'].values:
# import pdb; pdb.set_trace()
sph2011row = sph2011['full name'] == PLAYER
selfrating.append(sph2011.ix[sph2011row, 'Self'].values[0])
captainrating.append(sph2011.ix[sph2011row, 'CR'].values[0])
combinedrating.append(sph2011.ix[sph2011row, 'Rating'].values[0])
# the team rating is the average of the player ratings for that team
# NOTE(review): players holds every roster cell while playerratings holds only the ratings
# that were appended, so the two lists are not guaranteed to line up by position.
teamratings[teamname] = playerratings
playerlist[teamname] = players
teamavgratings[teamname] = np.mean(playerratings)
# teamself[teamname] = np.mean(selfrating)
# teamcaptain[teamname] = np.mean(captainrating)
# teamcombined[teamname] = np.mean(combinedrating)
# print("Finished with team {}".format(teamname))
# dfdatarow = dfdata['Team'] == teamname
# truedifferential[teamname] = dfdata.ix[dfdatarow, 'avgplusminus'].values[0]
# dfdata['divmean'] = np.zeros(len(dfdata))
# for div in divnames:
# divmean = np.mean(dfdata.ix[dfdata['div'] == div, 'Historical Rating'])
# allteamdf.ix[allteamdf['div'] == div, 'divmean'] = divmean
print("Finished successfully with league {}".format(leagueid))
# import pdb; pdb.set_trace()
# Build a (teamname, predictedrating) frame and compare 'R2Defense (20)' against
# several other teams with plot2teams.
names = []
predrating = []
# NOTE(review): the only statement under this loop is commented out, so names and
# predrating are never filled in the visible code and dfpred would be empty.
for key in teamavgratings.keys():
# print("{:38} {:.0f}".format(key, teamavgratings[key]) )
dfpred = pd.DataFrame({'teamname': names, 'predictedrating': predrating})
# Bare expression: only has an effect as notebook cell output.
# NOTE(review): the constants are not explained anywhere in the visible code.
(1077 - 1900/16. - 1400/16.) * 16/14.
# Each call overlays the cumulative sorted ratings of the two named teams.
plot2teams(teamratings, 'R2Defense (20)', 'Poe Hammeron (15)')
plot2teams(teamratings, 'R2Defense (20)', 'Team 14 (14)')
plot2teams(teamratings, 'R2Defense (20)', 'Team 24 (24)')
plot2teams(teamratings, 'R2Defense (20)', 'Team 22 (22)')
plot2teams(teamratings, 'R2Defense (20)', 'Winning the Pooh (13)')
plot2teams(teamratings, 'R2Defense (20)', 'Team 16 (16)')
plot2teams(teamratings, 'R2Defense (20)', 'Team 18 (18)')
# Bare expression: displays the sorted ratings for this team when run as a notebook cell.
np.sort(teamratings['R2Defense (20)'])
