In [1]:
from lxml import html
import requests
from bs4 import BeautifulSoup
import urllib
import urllib2
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from scipy.interpolate import interp1d
import emcee
import seaborn as sns
import matplotlib.pyplot as plt


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-1-ba3bfb218f69> in <module>()
----> 1 from lxml import html
      2 import requests
      3 from bs4 import BeautifulSoup
      4 import urllib
      5 import urllib2

ImportError: No module named lxml

In [2]:
# Download the BUDA "past leagues" index page and parse the raw HTML.
past_leagues_url = 'http://www.buda.org/leagues/past-leagues'
r = urllib.urlopen(past_leagues_url)
soup = BeautifulSoup(r, 'html.parser')

In [3]:
# The league listing is embedded in the first <iframe> of the page;
# fetch that embedded document and parse it separately.
embedded_frames = soup.find_all('iframe')
iframe = embedded_frames[0]
iframe_src = iframe.attrs['src']
response = urllib2.urlopen(iframe_src)
iframe_soup = BeautifulSoup(response)

In [4]:
leaguelinks = [i.a['href'] for i in iframe_soup.find_all("td", class_="infobody")]

In [7]:
i.get_text()


Out[7]:
u'Summer Hat League- 1999'

In [85]:
# all_players maps player name -> list of team ratings, one entry per
# (league, team) the player appeared on.
all_players = {}

# loop over all leagues in the BUDA database
for link in leaguelinks:

    # extract the league id from the query string of the league link
    leagueid = link[link.index('league=') + 7:]

    # scrape the scores page for this league
    leaguescoreurl = 'http://www.buda.org/hatleagues/scores.php?section=showLeagueSchedule&league=' + leagueid + '&byDivision=1&showGames=1'
    response = urllib2.urlopen(leaguescoreurl)
    leaguescore_soup = BeautifulSoup(response)

    # assemble the data of team ratings for this league:
    # header cells (<th>) hold team-summary rows, data cells (<td>) hold the
    # per-game opponent/score rows
    data = []
    data_opponent = []
    try:
        table = leaguescore_soup.find_all('table', attrs={'class': 'info'})[1]
    except IndexError:
        print("Unable to find a database of scores for league {}".format(leagueid))
        continue
    rows = table.find_all('tr')
    for row in rows:
        cols = row.find_all('th')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])  # get rid of empty values
        tdcols = row.find_all('td')
        tdcols = [ele.text.strip() for ele in tdcols]
        data_opponent.append([ele for ele in tdcols if ele])  # get rid of empty values

    # convert to dataframe; the first row holds the column names
    dfdata = pd.DataFrame(data)
    dfdata.columns = dfdata.ix[0, :]
    dfdata = dfdata.drop(0).reset_index()

    # fill na's with -99 so division divider rows can be detected below
    dfdata = dfdata.fillna(-99)

    # division divider rows have a name in 'Team' but no 'Record'
    divnames = dfdata.ix[dfdata['Record'] == -99, 'Team'].values
    if len(divnames) == 0:
        print("No divisions found, skipping league {}".format(leagueid))
        continue

    # define base ratings by division (arbitrarily assigned based on my experience)
    divratings = {'4/3 Div 1': 1800, '4/3 Div 2': 1400, '4/3 Div 3': 1000, '4/3 Div 4': 900,
                  '5/2 Div 1': 1700, '5/2 Div 2': 1300, '5/2 Div 3': 900, '5/2 Div 4': 800,
                  'Open Div 1': 1400, 'Open Div 2': 1200}

    # locate the divider row of each division; divisions whose divider row
    # cannot be found are reported and left out of the boundary list
    divbounds = []
    for divname in divnames:
        matches = np.where(dfdata['Team'] == divname)[0]
        if len(matches) == 0:
            print("{} not found, skipping league {}".format(divname, leagueid))
            continue
        divbounds.append((matches[0], divname))

    # assign each team row the base rating of its division (the rows between
    # one divider and the next).
    # BUG FIX: the original pairwise loop left `divend` undefined on the first
    # league (NameError) or stale from the previous league whenever a league
    # had a single division divider; boundaries are now derived per division.
    dfdata['div'] = np.zeros(len(dfdata))
    skip_league = False
    for idiv, (divstart, divname) in enumerate(divbounds):
        if idiv + 1 < len(divbounds):
            divend = divbounds[idiv + 1][0]
        else:
            divend = len(dfdata)
        if divname not in divratings:
            print("No base rating for {}, skipping league {}".format(divname, leagueid))
            # matching the original behavior: only a missing rating for the
            # LAST division aborts the league; earlier divisions just keep a
            # base rating of 0
            if idiv == len(divbounds) - 1:
                skip_league = True
            continue
        dfdata.ix[divstart + 1: divend, 'div'] = divratings[divname]
    if skip_league:
        continue

    # remove the division divider rows from the dataframe.
    # BUG FIX: re-label the rows afterwards — dropping rows leaves gaps in the
    # index, and the game-score loop below addresses rows by position.
    for divname in divnames:
        dfdata = dfdata.drop(dfdata.index[dfdata['Team'] == divname])
    dfdata = dfdata.reset_index(drop=True)

    # parse the "W-L" record into wins / losses / games played
    dfdata['wins'] = dfdata['Record'].apply(lambda x: int(x.split('-')[0]))
    dfdata['losses'] = dfdata['Record'].apply(lambda x: int(x.split('-')[1]))
    dfdata['games'] = dfdata['wins'] + dfdata['losses']
    dfdata['avgplusminus'] = dfdata['Plus/Minus'].astype('float') / dfdata['games']

    # assert that an average goal differential per game of +5 gives +300 rating points.
    dfdata['rating'] = dfdata['div'] + 60. * dfdata['avgplusminus']

    # build the dictionary of game scores.
    # BUG FIX: the "a-b" score strings live in the per-game opponent table,
    # not in the team summary table (the original indexed dfdata here).
    # NOTE(review): assumes column 1 of the opponent table is the score and
    # column 0 the opponent name — verify against a live scores page.
    dfdata_opponents = pd.DataFrame(data_opponent)
    dfdata_opponents['teamscore'] = dfdata_opponents.ix[:, 1].apply(lambda x: int(x.split('-')[0]))
    dfdata_opponents['opponentscore'] = dfdata_opponents.ix[:, 1].apply(lambda x: int(x.split('-')[1]))

    opponentcounter = 0
    game_scores = {}
    for idf in range(len(dfdata)):
        teamname = dfdata.ix[idf, 'Team']
        ngames = dfdata.ix[idf, 'games']
        for igame in range(ngames):
            opponentname = dfdata_opponents.ix[opponentcounter, 0]
            teamscore = dfdata_opponents.ix[opponentcounter, 'teamscore']
            opponentscore = dfdata_opponents.ix[opponentcounter, 'opponentscore']
            game_scores[(teamname, opponentname)] = [teamscore, opponentscore]
            # BUG FIX: the original never advanced this index, so every game
            # re-read row 0 of the opponent table
            opponentcounter += 1

    # scrape the list of teams for this league
    teamsurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeams&league=' + leagueid
    response = urllib2.urlopen(teamsurl)
    teams_soup = BeautifulSoup(response)

    # generate list of team ids and names for this league
    tdlist = teams_soup.find_all('td', class_='infobody')
    teamids = []
    teamnames = []
    for td in tdlist:
        try:
            url = td.a['href']
            idindex = url.index('team=')
            whichindex = url.index('which=')
            teamids.append(url[idindex + 5:whichindex - 1])
            teamnames.append(td.a.get_text())
        except (TypeError, KeyError, ValueError):
            # cells without a link (td.a is None -> TypeError), links without
            # an href (KeyError), or links lacking the team=/which= markers
            # (ValueError from str.index) are not team cells
            continue

    # find all players associated with each team and append the team's rating
    # to each player's rating history
    for teamid, teamname in zip(teamids, teamnames):
        try:
            teamrating = dfdata.ix[dfdata['Team'] == teamname.strip(' '), 'rating'].values[0]
        except IndexError:
            print("Couldn't match {} to scores database, skipping this team.".format(teamname))
            continue

        teamurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeamRoster&team=' + teamid
        response = urllib2.urlopen(teamurl)
        roster_soup = BeautifulSoup(response)

        players = [td.get_text() for td in roster_soup.find_all("td", class_="infobody")]
        for player in players:
            all_players.setdefault(player, []).append(teamrating)
    print("Finished successfully with league {}".format(leagueid))


Unable to find a database of scores for league 40491
Unable to find a database of scores for league 40278
Unable to find a database of scores for league 40273
Unable to find a database of scores for league 40268
Unable to find a database of scores for league 40264
Unable to find a database of scores for league 40258
Unable to find a database of scores for league 40253
Unable to find a database of scores for league 40249
Unable to find a database of scores for league 40245
No base rating for Northborough Open, skipping league 39633
No base rating for Danvers Weeknight, skipping league 39633
No base rating for Danvers Weekend, skipping league 39633
Unable to find a database of scores for league 39960
Unable to find a database of scores for league 39939
Unable to find a database of scores for league 39904
Unable to find a database of scores for league 39678
Unable to find a database of scores for league 39673
Finished successfully with leage 39641
No base rating for Northborough Open, skipping league 39628
No base rating for Danvers Weeknight, skipping league 39628
No base rating for Danvers Weekend, skipping league 39628
No base rating for JP Mixed (4/3), skipping league 39622
No base rating for Lexington Mixed (4/3) Weekend, skipping league 39622
Couldn't match Kung Fu Fighting Monks to scores database, skipping this team.
Couldn't match Lexington 300 to scores database, skipping this team.
Couldn't match Huck and Play D to scores database, skipping this team.
Couldn't match Just Fall Down to scores database, skipping this team.
Couldn't match Old Spam to scores database, skipping this team.
Couldn't match Natick Ultimate to scores database, skipping this team.
Couldn't match Not Dead Yet to scores database, skipping this team.
Couldn't match Electric Mayhem to scores database, skipping this team.
Couldn't match Huckagenerians to scores database, skipping this team.
Couldn't match Fossil Fuel to scores database, skipping this team.
Couldn't match Wednesday Warriors to scores database, skipping this team.
Couldn't match Turn and Burn to scores database, skipping this team.
Finished successfully with leage 39616
Unable to find a database of scores for league 39611
No base rating for Worcester Mixed, skipping league 39602
No base rating for Metro Mixed (4/3), skipping league 39344
No base rating for Metro Coed, skipping league 39344
No base rating for Lexington Mixed (4/3) Weekend, skipping league 39344
Unable to find a database of scores for league 38871
Unable to find a database of scores for league 38552
Unable to find a database of scores for league 38548
Unable to find a database of scores for league 38516
Couldn't match Not Dead Yet to scores database, skipping this team.
Couldn't match Only Mostly Dead to scores database, skipping this team.
Couldn't match Natick Ultimate to scores database, skipping this team.
Couldn't match Old Spam to scores database, skipping this team.
Couldn't match The Expendables to scores database, skipping this team.
Couldn't match The Seven Samurai to scores database, skipping this team.
Finished successfully with leage 38506
Unable to find a database of scores for league 38502
Unable to find a database of scores for league 38498
No base rating for Worcester Mixed, skipping league 38493
No base rating for Worcester Mixed, skipping league 38490
No base rating for JP Mixed (4/3), skipping league 38484
No base rating for Lexington Mixed (4/3) Weekend, skipping league 38484
Finished successfully with leage 38480
Unable to find a database of scores for league 38475
No base rating for Northborough Open, skipping league 37668
No base rating for Danvers Weeknight, skipping league 37668
No base rating for Danvers Weekend, skipping league 37668
Unable to find a database of scores for league 37959
Unable to find a database of scores for league 37690
Unable to find a database of scores for league 37686
No base rating for Open Div 3, skipping league 37674
No base rating for Northborough Open, skipping league 37663
No base rating for Danvers Weeknight, skipping league 37663
No base rating for Danvers Weekend, skipping league 37663
Finished successfully with leage 37659
No base rating for Worcester Mixed, skipping league 37653
No base rating for JP Mixed (4/3), skipping league 37648
No base rating for Lexington Mixed (4/3) Weekend, skipping league 37648
Unable to find a database of scores for league 37634
Unable to find a database of scores for league 37376
Unable to find a database of scores for league 37370
No base rating for Lexington Coed (5/2) Weekend, skipping league 36881
No base rating for Metro Mixed (4/3), skipping league 36881
No base rating for Metro Coed, skipping league 36881
Unable to find a database of scores for league 36852
Unable to find a database of scores for league 36841
Unable to find a database of scores for league 36824
Unable to find a database of scores for league 36530
Unable to find a database of scores for league 36486
No base rating for Grand Masters, skipping league 36481
No base rating for JP Fridays, skipping league 36481
Unable to find a database of scores for league 36469
Finished successfully with leage 36465
No base rating for JP Mixed (4/3), skipping league 36459
No base rating for Lexington Mixed (4/3) Weekend, skipping league 36459
No base rating for Revere Open, skipping league 35836
No base rating for Northborough Open, skipping league 35836
No base rating for Danvers Weeknight, skipping league 35836
No base rating for Northborough Open, skipping league 35831
No base rating for Danvers Weeknight, skipping league 35831
Unable to find a database of scores for league 35821
Unable to find a database of scores for league 35817
No base rating for Grand Masters Div 1, skipping league 35487
Finished successfully with leage 35481
No base rating for Grand Masters, skipping league 35476
No base rating for Lexington Coed (5/2) Weekend, skipping league 35470
No base rating for JP Mixed (4/3), skipping league 35470
Unable to find a database of scores for league 35462
Unable to find a database of scores for league 35358
No base rating for Lexington Coed (5/2) Weekend, skipping league 34896
No base rating for Metro Mixed (4/3), skipping league 34896
No base rating for Metro Coed, skipping league 34896
Unable to find a database of scores for league 34841
Unable to find a database of scores for league 34579
Finished successfully with leage 34550
No base rating for Grand Masters, skipping league 34524
No base rating for JP Mixed (4/3), skipping league 34405
No base rating for Lexington Mixed (4/3) Weekend, skipping league 34405
Unable to find a database of scores for league 34400
No base rating for Revere Open, skipping league 34395
No base rating for Northborough Open, skipping league 34395
No base rating for Revere Open, skipping league 34330
No base rating for Northborough Open, skipping league 34330
Unable to find a database of scores for league 34170
No base rating for Grand Masters Div 1, skipping league 34023
Finished successfully with leage 33993
No base rating for Lexington Coed (5/2) Weeknight, skipping league 33985
No base rating for Lexington Coed (5/2) Weekend, skipping league 33985
No base rating for JP Mixed (4/3), skipping league 33985
No base rating for Grand Masters, skipping league 33980
No base rating for Lexington Coed (5/2) Weekend, skipping league 33233
No base rating for JP Mixed (4/3), skipping league 33233
Finished successfully with leage 32945
Unable to find a database of scores for league 32941
No base rating for JP Coed, skipping league 32936
No base rating for Lexington Coed (5/2) Weekend, skipping league 32746
No base rating for JP Coed, skipping league 32746
No base rating for JP Mixed (4/3), skipping league 32746
No base rating for Revere Open, skipping league 32741
No base rating for Northborough Open, skipping league 32741
Unable to find a database of scores for league 32566
Unable to find a database of scores for league 32563
No base rating for Revere Open, skipping league 32420
No base rating for Northborough Open, skipping league 32420
Unable to find a database of scores for league 32268
Finished successfully with leage 32262
No base rating for Lexington Coed (5/2) Weeknight, skipping league 31034
No base rating for JP Mixed (4/3), skipping league 31034
No base rating for Lexington Coed (5/2) Weeknight, skipping league 31029
No base rating for Metro Mixed (4/3), skipping league 31029
Unable to find a database of scores for league 31023
No base rating for Waitlist, skipping league 31018
No base rating for Grand Masters, skipping league 31018
Unable to find a database of scores for league 31014
Unable to find a database of scores for league 31007
Finished successfully with leage 31002
No base rating for Lexington Coed (5/2) Weeknight, skipping league 30924
No base rating for JP Coed, skipping league 30924
No base rating for JP Mixed (4/3), skipping league 30924
No base rating for Milford Open, skipping league 30919
No base rating for Revere Open, skipping league 30919
No base rating for Revere Open, skipping league 30844
No base rating for JP Open, skipping league 29449
No base rating for Grand Masters Div 1, skipping league 29432
No base rating for Grand Masters Div 2, skipping league 29432
No base rating for JP Coed, skipping league 29390
No base rating for Grand Masters, skipping league 29390
Finished successfully with leage 29389
No base rating for JP Coed, skipping league 29388
No base rating for JP Mixed (4/3), skipping league 29388
No base rating for Waltham Coed, skipping league 29388
No base rating for Newton Coed, skipping league 29387
No base rating for Metro Mixed (4/3), skipping league 29387
No base rating for Metro Coed, skipping league 29387
Unable to find a database of scores for league 29386
No base rating for JP Mixed (4/3), skipping league 29385
No base rating for Waltham Coed, skipping league 29385
No base rating for Girls, skipping league 29237
Finished successfully with leage 29235
No base rating for Milford Open, skipping league 29192
No base rating for Revere Open, skipping league 29192
No base rating for Revere Open, skipping league 29187
No base rating for Grand Masters, skipping league 28912
No base rating for Girls, skipping league 26998
Finished successfully with leage 26909
No base rating for Lexington Coed (5/2) Weeknight, skipping league 26908
No base rating for JP Coed, skipping league 26908
No base rating for JP Mixed (4/3), skipping league 26908
No base rating for Waltham Coed, skipping league 26908
Finished successfully with leage 26907
No base rating for Newton Coed, skipping league 26906
No base rating for Metro Mixed (4/3), skipping league 26906
No base rating for Metro Coed, skipping league 26906
No base rating for JP Mixed (4/3), skipping league 26905
No base rating for Waltham Coed, skipping league 26905
Unable to find a database of scores for league 26904
No base rating for Milford Open, skipping league 26903
No base rating for Revere Open, skipping league 26903
No base rating for Girls, skipping league 23714
No base rating for Grand Masters, skipping league 23696
No base rating for Lexington Coed (5/2) Weeknight, skipping league 23695
No base rating for JP Coed, skipping league 23695
No base rating for JP Mixed (4/3), skipping league 23695
No base rating for Waltham Coed, skipping league 23695
Finished successfully with leage 23694
No base rating for Newton Coed, skipping league 23693
No base rating for Metro Mixed (4/3), skipping league 23693
No base rating for Metro Coed, skipping league 23693
Unable to find a database of scores for league 23692
No base rating for JP Mixed (4/3), skipping league 23691
No base rating for Waltham Coed, skipping league 23691
No base rating for Milford Open, skipping league 23626
Finished successfully with leage 20439
No base rating for Lexington Coed (5/2) Weeknight, skipping league 20438
No base rating for JP Coed, skipping league 20438
No base rating for JP Mixed (4/3), skipping league 20438
No base rating for Waltham Coed, skipping league 20438
Finished successfully with leage 20436
No base rating for Newton Coed, skipping league 20435
No base rating for Metro Mixed (4/3), skipping league 20435
No base rating for Metro Coed, skipping league 20435
Unable to find a database of scores for league 20434
No base rating for JP Mixed (4/3), skipping league 20433
No base rating for Waltham Coed, skipping league 20433
No base rating for Milford Open, skipping league 20175
No base rating for Lexington Coed (5/2) Weeknight, skipping league 18047
No base rating for JP Coed, skipping league 18047
No base rating for JP Mixed (4/3), skipping league 18047
No base rating for Waltham Coed, skipping league 18047
No base rating for Open, skipping league 17692
Finished successfully with leage 17692
No base rating for OPEN, skipping league 17691
Finished successfully with leage 17691
Unable to find a database of scores for league 17604
No base rating for Metro Open, skipping league 17603
No base rating for Newton Coed, skipping league 17603
No base rating for Metro Mixed (4/3), skipping league 17603
No base rating for JP Coed, skipping league 17602
No base rating for JP Mixed (4/3), skipping league 17602
No base rating for Waltham Coed, skipping league 17602
No base rating for Milford Open, skipping league 17499
Finished successfully with leage 16881
No base rating for Lexington Coed (5/2) Weeknight, skipping league 16880
No base rating for JP Coed, skipping league 16880
No base rating for JP Mixed (4/3), skipping league 16880
No base rating for Waltham Coed, skipping league 16880
No base rating for OPEN, skipping league 13647
Finished successfully with leage 13647
Unable to find a database of scores for league 13447
No base rating for Newton Coed, skipping league 13446
No base rating for Metro Mixed (4/3), skipping league 13446
No base rating for Metro Coed, skipping league 13446
No base rating for JP Coed, skipping league 13445
No base rating for JP Mixed (4/3), skipping league 13445
No base rating for Waltham Coed, skipping league 13445
No base rating for Milford Open, skipping league 13215
No base rating for Open, skipping league 12807
No base rating for Mixed A, skipping league 12807
No base rating for Mixed B, skipping league 12807
No base rating for JP Coed, skipping league 12769
No base rating for Waltham Open, skipping league 12769
No base rating for Lexington Open, skipping league 12769
Unable to find a database of scores for league 11632
Unable to find a database of scores for league 11631
No base rating for JP Coed, skipping league 11604
No base rating for Metro Open, skipping league 11604
No base rating for Newton Open, skipping league 11604
No base rating for JP Coed, skipping league 11402
No base rating for JP Open, skipping league 11402
No base rating for Waltham Open, skipping league 11402
No base rating for Milford Open, skipping league 11401
No base rating for JP Coed, skipping league 10918
No base rating for JP Open, skipping league 10918
No base rating for Lexington Open, skipping league 10918
No base rating for Northwest Open, skipping league 10918
No base rating for Open, skipping league 10917
No base rating for Mixed A, skipping league 10917
No base rating for Mixed B, skipping league 10917
Unable to find a database of scores for league 10309
Unable to find a database of scores for league 10217
No base rating for West Open, skipping league 6340
No base rating for Metro Coed, skipping league 6340
No base rating for Metro Open, skipping league 6340
No base rating for Newton Open, skipping league 6340
No base rating for Metro Coed, skipping league 6326
No base rating for Northwest Open, skipping league 6326
No base rating for Milford Open, skipping league 6235
No base rating for OPEN, skipping league 5629
No base rating for Mixed A, skipping league 5629
No base rating for Mixed B, skipping league 5629
No base rating for Lexington Open, skipping league 5130
No base rating for Metro Coed, skipping league 5130
No base rating for Northwest Open, skipping league 5130
No base rating for West Open, skipping league 5129
No base rating for Metro Coed, skipping league 5129
No base rating for Metro Open, skipping league 5129
No base rating for Newton Open, skipping league 5129
Unable to find a database of scores for league 4919
Unable to find a database of scores for league 4886
No base rating for JP Coed, skipping league 3804
No base rating for JP Open, skipping league 3804
No base rating for Waltham Open, skipping league 3804
No base rating for Milford Open, skipping league 3669
No base rating for Mixed, skipping league 3162
No base rating for JP Coed, skipping league 3144
No base rating for Waltham Open, skipping league 3144
No base rating for Lexington Open, skipping league 3144
Unable to find a database of scores for league 2408
No base rating for West Open, skipping league 2407
No base rating for Metro Coed, skipping league 2407
No base rating for Metro Open, skipping league 2407
No base rating for JP Coed, skipping league 1080
No base rating for JP Open, skipping league 1080
No base rating for Waltham Open, skipping league 1080
Unable to find a database of scores for league 1083
Unable to find a database of scores for league 1082
Unable to find a database of scores for league 1081
Unable to find a database of scores for league 1095
Unable to find a database of scores for league 1094
Unable to find a database of scores for league 1093

In [87]:
all_players.pop('')


Out[87]:
[1635.0,
 1500.0,
 1455.0,
 1940.0,
 1908.0,
 2020.0,
 1944.0,
 2115.0,
 1740.0,
 1688.5714285714287,
 1662.8571428571429,
 1785.0,
 1750.0,
 1025.0,
 1205.0,
 800.0,
 1408.5714285714287,
 1254.2857142857142,
 1845.7142857142858,
 1040.0,
 1592.0,
 1415.0,
 1220.0,
 1751.4285714285713,
 1208.0,
 1280.0,
 1412.0,
 1340.0,
 1230.0,
 1460.0,
 1357.1428571428571,
 1580.0,
 1648.5714285714287,
 1520.0,
 1500.0,
 1452.5,
 922.85714285714289,
 1168.0,
 1390.0,
 620.0,
 616.0,
 890.0,
 905.71428571428578,
 957.14285714285711,
 1248.5714285714287,
 1135.0,
 725.71428571428578,
 1408.0,
 900.0,
 1317.1428571428571,
 1328.0,
 1004.0,
 1440.0,
 1797.5,
 1380.0,
 680.0,
 1920.0,
 1424.0,
 1632.5,
 390.0,
 1813.3333333333333,
 1868.5714285714287,
 2005.7142857142858,
 2061.1764705882351,
 1812.0,
 1860.0,
 1687.5,
 1626.0,
 2070.0,
 1202.0,
 1516.25,
 1657.1428571428571,
 1463.3333333333333,
 1526.3157894736842,
 1262.0,
 1373.3333333333333,
 1271.4285714285716,
 1386.1538461538462,
 1164.2857142857142,
 1318.8235294117646,
 1538.0,
 1460.0,
 1430.0,
 1486.4000000000001,
 946.66666666666674,
 1445.8823529411766,
 1490.0,
 670.0,
 929.09090909090912,
 1123.3333333333333,
 718.0,
 426.66666666666663,
 922.5,
 808.0,
 1032.7272727272727,
 1133.8461538461538,
 1580.0,
 2040.0,
 2128.5714285714284,
 2004.0,
 1689.090909090909,
 1442.5,
 1426.6666666666667,
 1000.0,
 1186.6666666666667,
 1334.2857142857142,
 1188.5714285714287,
 1233.3333333333333,
 1365.0,
 993.33333333333337,
 1280.0,
 1420.0,
 1535.7142857142858,
 1265.0,
 1609.2307692307693,
 1082.5,
 960.0,
 1065.0,
 1008.5714285714286,
 1020.0,
 652.5,
 973.84615384615381,
 735.0,
 977.14285714285711,
 1052.7272727272727,
 731.25,
 893.33333333333337,
 466.15384615384613,
 740.0,
 480.0,
 848.18181818181813,
 1475.0,
 1588.5714285714284,
 1233.3333333333333,
 1390.7692307692307,
 1166.6666666666667,
 1542.5,
 1130.0,
 1325.0,
 1514.0,
 1322.8571428571429,
 1020.0,
 1980.0,
 1692.0,
 1740.0,
 1640.0,
 1740.0,
 1825.7142857142858,
 1660.0,
 2242.5,
 1740.0,
 1665.0,
 1774.2857142857142,
 1320.0,
 988.57142857142867,
 1430.0,
 1520.0,
 1348.5714285714287,
 1460.0,
 1480.0,
 1220.0,
 1190.0,
 1268.0,
 1412.0,
 1365.7142857142858,
 1592.0,
 1588.5714285714284,
 1460.0,
 1184.0,
 1820.0,
 1090.0,
 1240.0,
 870.0,
 508.00000000000006,
 1025.7142857142858,
 1068.5714285714287,
 1420.0,
 960.0,
 700.0,
 1010.0,
 805.0,
 1160.0,
 1248.5714285714287,
 794.28571428571433,
 nan,
 1126.6666666666667,
 1091.4285714285713,
 1580.0,
 1820.0,
 1520.0,
 1526.6666666666667,
 795.0,
 1963.6363636363635,
 1620.0,
 1635.0,
 1770.0,
 1664.2105263157896,
 2076.0,
 2020.0,
 1977.1428571428571,
 1794.0,
 1410.0,
 1975.3846153846155,
 1965.0,
 1640.0,
 2020.0,
 1487.2727272727273,
 1250.0,
 1175.0,
 1070.0,
 1409.4736842105262,
 1583.75,
 1312.3076923076924,
 1352.6315789473683,
 1364.0,
 1164.0,
 1349.2307692307693,
 1645.4545454545455,
 1400.0,
 1480.0,
 1444.3478260869565,
 1150.0,
 1460.0,
 1534.4000000000001,
 798.18181818181824,
 805.0,
 855.0,
 873.33333333333337,
 1280.0,
 300.0,
 1094.2857142857142,
 1163.2,
 820.0,
 940.0,
 1164.0,
 1616.0,
 2078.0,
 1785.0,
 1522.0,
 1015.0,
 1212.3076923076924,
 1245.0,
 1327.2727272727273,
 1330.0,
 1444.0,
 1136.0,
 1427.5,
 1462.8571428571429,
 1504.0,
 1132.5,
 856.0,
 969.0,
 1172.3076923076924,
 1131.4285714285716,
 715.38461538461536,
 1024.2857142857142,
 690.0,
 405.0,
 1046.6666666666667,
 575.29411764705878,
 923.07692307692309,
 485.0,
 770.0,
 675.0,
 924.0,
 1634.5454545454545,
 1385.0,
 1207.3684210526317,
 1477.1428571428571,
 1250.0,
 1450.0,
 1588.0,
 1320.0,
 1390.0,
 1020.0,
 771.42857142857133,
 924.0,
 1400.0,
 696.0,
 1560.0,
 2090.0,
 1872.0,
 1524.0,
 1812.0,
 1645.7142857142858,
 2000.0,
 2010.0,
 1680.0,
 1988.5714285714284,
 2091.4285714285716,
 1880.0,
 1702.5,
 1800.0,
 1436.0,
 1314.2857142857142,
 992.0,
 1091.4285714285713,
 1597.1428571428571,
 1490.0,
 830.0,
 1400.0,
 1322.8571428571429,
 1797.5,
 1620.0,
 1340.0,
 1200.0,
 1305.7142857142858,
 1452.5,
 1316.0,
 1437.5,
 1542.5,
 1385.7142857142858,
 1310.0,
 1128.5714285714284,
 410.0,
 1162.8571428571429,
 1205.7142857142858,
 871.42857142857144,
 670.0,
 724.0,
 805.0,
 1295.0,
 1854.2857142857142,
 1326.6666666666667,
 1376.0,
 1400.0,
 540.0,
 2000.0,
 1806.6666666666667,
 1923.3333333333333,
 1827.2727272727273,
 2008.421052631579,
 1728.0,
 2065.7142857142858,
 1778.5714285714287,
 2063.0769230769229,
 1312.7272727272727,
 1400.0,
 1067.2727272727273,
 1543.3333333333333,
 1130.0,
 1421.1764705882354,
 1586.0,
 1130.0,
 1420.0,
 1467.0588235294117,
 1487.6923076923076,
 1420.0,
 1295.0,
 1503.6363636363637,
 1312.3076923076924,
 1330.0,
 1569.4117647058824,
 793.33333333333326,
 1146.6666666666667,
 1201.8181818181818,
 357.14285714285722,
 953.33333333333337,
 869.41176470588243,
 646.66666666666663,
 1827.5,
 2045.0,
 1867.1428571428571,
 1670.0,
 1475.7142857142858,
 1053.3333333333335,
 1444.0,
 1020.0,
 1428.0,
 1468.0,
 1270.0,
 1228.0,
 1303.3333333333333,
 1278.1818181818182,
 1042.8571428571429,
 1695.0,
 1096.0,
 1101.5384615384614,
 1047.1428571428571,
 1090.0,
 1533.3333333333333,
 372.0,
 553.33333333333326,
 1020.0,
 1044.0,
 900.0,
 870.0,
 680.0,
 720.0,
 788.57142857142856,
 1104.0,
 972.0,
 700.0,
 886.66666666666663,
 770.76923076923072,
 892.5,
 975.0,
 1544.0,
 1583.1578947368421,
 1357.1428571428571,
 1505.0,
 1378.5714285714287,
 1440.0,
 1115.0,
 1486.6666666666667,
 1435.0,
 1222.5,
 1240.0,
 992.72727272727275,
 1131.4285714285713,
 1980.0,
 1980.0,
 1930.0,
 1524.0,
 1764.0,
 1688.5714285714287,
 1580.0,
 1870.0,
 1740.0,
 1385.0,
 1752.5,
 1482.5,
 1742.8571428571429,
 1400.0,
 980.0,
 1545.7142857142858,
 1262.8571428571429,
 1390.0,
 1496.0,
 1250.0,
 1828.5714285714287,
 930.0,
 1220.0,
 1314.2857142857142,
 1000.0,
 712.0,
 1042.8571428571429,
 605.71428571428578,
 1565.7142857142858,
 502.85714285714283,
 1080.0,
 1030.0,
 820.0,
 1100.0,
 580.0,
 1142.5,
 897.14285714285711,
 1008.5714285714286,
 737.5,
 1345.0,
 1090.0,
 1397.5,
 1100.0,
 1880.0,
 1310.0,
 1680.0,
 1530.0,
 1120.0,
 1565.4545454545455,
 1980.0,
 1475.0,
 1930.0,
 1951.578947368421,
 1972.0,
 2042.7272727272727,
 1960.0,
 2082.8571428571431,
 1300.0,
 1450.5263157894738,
 1220.0,
 1504.2105263157896,
 1332.5,
 1256.9230769230769,
 1361.4285714285713,
 1400.0,
 1528.5714285714284,
 1244.0,
 1409.0,
 1355.0,
 1343.1578947368421,
 1366.25,
 1456.8421052631579,
 1430.0,
 1585.0,
 1568.75,
 1272.5,
 628.0,
 1012.0,
 1124.0,
 610.0,
 655.0,
 841.81818181818176,
 650.90909090909088,
 1186.6666666666667,
 983.33333333333337,
 1910.0,
 1880.0,
 2000.0,
 1896.3636363636365,
 1792.7272727272727,
 1614.2857142857142,
 1172.5,
 1150.0,
 1480.0,
 1096.0,
 1229.090909090909,
 1450.0,
 1545.4545454545455,
 1582.0,
 1780.0,
 991.42857142857144,
 904.61538461538464,
 882.0,
 1156.0,
 1002.8571428571429,
 456.0,
 876.0,
 484.0,
 1032.0,
 934.28571428571433,
 382.5,
 863.07692307692309,
 555.0,
 926.66666666666663,
 906.31578947368416,
 250.0,
 480.0,
 1508.0,
 826.66666666666663,
 1592.0,
 1504.2105263157896,
 1545.7142857142858,
 1563.6363636363635,
 1700.0,
 1497.5,
 1345.0,
 1335.7142857142858,
 1200.0,
 1315.3846153846155,
 978.0,
 1300.0,
 830.0,
 1997.1428571428571,
 1860.0,
 1890.0,
 1752.0,
 1455.0,
 1812.0,
 1930.0,
 1752.0,
 1205.0,
 1305.7142857142858,
 1424.0,
 1436.0,
 1760.0,
 1194.2857142857142,
 1693.3333333333335,
 1400.0,
 1297.1428571428571,
 1554.2857142857142,
 1340.0,
 1190.0,
 1250.0,
 1430.0,
 1250.0,
 1037.5,
 1270.0,
 640.0,
 620.0,
 1162.8571428571429,
 1187.5,
 1120.0,
 1082.5,
 1127.5,
 657.14285714285711,
 725.71428571428578,
 1262.5,
 1202.5,
 760.0,
 1674.2857142857142,
 1472.0,
 1170.0,
 1500.0,
 1477.1428571428571,
 1670.0,
 590.0,
 1590.0,
 1090.0,
 756.0,
 1671.4285714285716,
 1912.0,
 1650.0,
 1818.75,
 1820.0,
 1935.0,
 1884.0,
 1720.0,
 2040.0,
 1381.5384615384614,
 1380.0,
 1837.1428571428571,
 1415.0,
 1379.0,
 1400.0,
 1512.5,
 1532.6315789473683,
 1297.1428571428571,
 1390.5263157894738,
 1430.0,
 1288.5714285714287,
 1293.3333333333333,
 1488.5714285714287,
 1252.3076923076924,
 1515.0,
 1460.0,
 730.0,
 940.0,
 1041.5384615384614,
 964.70588235294122,
 1013.3333333333334,
 760.0,
 406.0,
 724.0,
 840.0,
 1261.0,
 1004.2857142857143,
 2032.7272727272727,
 1962.5,
 1752.5,
 1863.6363636363635,
 1802.0,
 1985.0,
 1475.7142857142858,
 1087.2727272727273,
 1111.4285714285716,
 1574.2857142857142,
 1310.909090909091,
 1300.0,
 1201.8181818181818,
 1428.5714285714284,
 636.0,
 1064.0,
 921.42857142857144,
 988.0,
 744.0,
 760.0,
 641.53846153846155,
 1308.0,
 1220.0,
 822.85714285714289,
 474.54545454545456,
 1182.0,
 916.36363636363637,
 1069.4117647058824,
 600.0,
 696.0,
 801.0,
 945.88235294117646,
 528.0,
 477.5,
 1868.75,
 1568.0,
 1446.6666666666667,
 1438.1818181818182,
 1624.2105263157896,
 1513.3333333333333,
 1268.0,
 1507.1428571428571,
 1502.8571428571429,
 1557.5,
 1089.090909090909,
 1325.0,
 1348.0,
 1234.2857142857142,
 918.46153846153845,
 977.14285714285711,
 932.30769230769238,
 1095.0,
 1680.0,
 1145.0,
 936.0,
 1006.1538461538462,
 1987.5,
 1957.5,
 1932.0,
 1705.7142857142858,
 1792.5,
 1900.0,
 1362.8571428571429,
 1980.0,
 1894.2857142857142,
 1851.4285714285713,
 1697.1428571428571,
 1548.0,
 1030.0,
 1520.0,
 1520.0,
 1460.0,
 1170.0,
 1730.0,
 1560.0,
 1391.4285714285713,
 1550.0,
 1352.5,
 1025.7142857142858,
 1140.0,
 573.33333333333337,
 1420.0,
 860.0,
 490.0,
 592.0,
 1170.0,
 1040.0,
 1024.0,
 880.0,
 1340.0,
 1490.0,
 1904.0,
 1480.0,
 790.0,
 1164.0,
 600.0,
 1337.1428571428571,
 1926.0,
 1782.8571428571429,
 1936.3636363636365,
 1897.5,
 1816.0,
 1785.0,
 2040.0,
 1668.0,
 1968.0,
 1674.5454545454545,
 1976.8421052631579,
 1156.4705882352941,
 1265.8823529411766,
 1400.0,
 1289.4736842105262,
 1384.2105263157894,
 1050.0,
 1532.0,
 1576.0,
 1505.8823529411766,
 1258.1818181818182,
 1450.0,
 1650.909090909091,
 1537.1428571428571,
 1220.0,
 1287.5,
 1180.0,
 791.20000000000005,
 1213.75,
 652.0,
 753.33333333333337,
 310.0,
 720.0,
 885.4545454545455,
 896.36363636363637,
 951.25,
 1213.3333333333333,
 1814.5454545454545,
 2193.333333333333,
 1348.5714285714287,
 2067.5,
 1644.6153846153845,
 1866.1538461538462,
 1994.0,
 1634.5454545454545,
 1985.8823529411766,
 991.42857142857133,
 790.0,
 1345.0,
 1372.0,
 1450.0,
 1425.0,
 1150.0,
 1258.0,
 1543.1578947368421,
 1354.5454545454545,
 730.0,
 593.33333333333337,
 1002.0,
 1144.0,
 888.57142857142856,
 1046.6666666666667,
 1312.5,
 844.61538461538464,
 900.0,
 845.0,
 769.09090909090912,
 1237.5,
 885.0,
 969.0,
 832.5,
 380.0,
 220.0,
 658.18181818181824,
 267.5,
 830.0,
 1704.0,
 1441.0526315789473,
 1745.0,
 1266.6666666666667,
 1691.4285714285713,
 776.0,
 1305.0,
 1500.0,
 1773.8461538461538,
 1284.2857142857142,
 1314.0,
 672.0,
 1097.1428571428571,
 1233.3333333333333,
 612.0,
 830.76923076923072,
 1020.0,
 1240.0,
 1160.0,
 1527.2727272727273,
 549.23076923076917,
 1390.909090909091,
 1435.0,
 1007.3684210526316,
 2040.0,
 1560.0,
 1452.0,
 2030.0,
 1700.0,
 1774.2857142857142,
 1750.0,
 1910.0,
 1940.0,
 1970.0,
 1670.0,
 1340.0,
 1400.0,
 800.0,
 1400.0,
 1400.0,
 1196.0,
 1440.0,
 1842.5,
 1340.0,
 1597.1428571428571,
 1280.0,
 1440.0,
 1680.0,
 220.0,
 1070.0,
 700.0,
 700.0,
 1330.0,
 430.0,
 1137.1428571428571,
 1650.0,
 1102.8571428571429,
 948.57142857142856,
 1168.0,
 1811.4285714285713,
 1382.8571428571429,
 1620.0,
 1400.0,
 1532.0,
 930.0,
 1500.0,
 891.42857142857133,
 760.0,
 1620.0,
 1770.0,
 1845.0,
 1968.0,
 1853.3333333333333,
 1907.3684210526317,
 2105.0,
 2165.4545454545455,
 1785.0,
 1260.0,
 1417.1428571428571,
 1320.0,
 1559.1304347826087,
 1250.0,
 1493.0,
 1345.4545454545455,
 1268.0,
 1717.6470588235293,
 1246.25,
 1362.5,
 1270.0,
 827.82608695652175,
 1037.5,
 868.0,
 965.71428571428567,
 841.81818181818176,
 905.71428571428578,
 720.0,
 1730.0,
 1947.0588235294117,
 1798.1818181818182,
 1748.0,
 2006.6666666666665,
 1340.0,
 1492.7272727272727,
 1912.7272727272727,
 1526.0,
 2005.4545454545455,
 1749.090909090909,
 1898.0,
 1000.0,
 1120.0,
 1465.0,
 1461.5384615384614,
 1425.4545454545455,
 955.0,
 985.0,
 1317.1428571428571,
 964.0,
 1200.0,
 1493.8461538461538,
 922.5,
 962.85714285714289,
 1091.25,
 916.0,
 881.53846153846155,
 1016.6666666666666,
 872.30769230769226,
 1010.7692307692307,
 868.0,
 950.0,
 1183.3333333333333,
 692.30769230769226,
 864.0,
 836.0,
 702.35294117647061,
 990.0,
 1121.5384615384614,
 627.27272727272725,
 366.66666666666669,
 218.0,
 1400.0,
 1572.0,
 1645.0,
 1673.3333333333333,
 1685.0,
 1496.0,
 1362.5,
 1358.75,
 526.66666666666674,
 694.28571428571422,
 1029.2307692307693,
 1508.0,
 640.0,
 1140.0,
 940.0,
 1161.8181818181818,
 1389.2307692307693,
 1592.3076923076924,
 967.5,
 1014.0,
 1897.5,
 1946.6666666666667,
 1726.6666666666667,
 2148.0,
 1948.421052631579,
 1697.1428571428571,
 1640.0,
 1515.0,
 1866.0,
 2036.25,
 1680.0,
 1250.0,
 1404.6153846153845,
 1313.75,
 1400.0,
 1517.1428571428571,
 1323.3333333333333,
 1534.1176470588234,
 1150.0,
 1575.7142857142858,
 1657.1428571428571,
 1435.0,
 1391.4285714285713,
 1496.6666666666667,
 876.0,
 820.0,
 622.0,
 974.28571428571433,
 558.57142857142867,
 923.63636363636363,
 1652.0,
 1980.0,
 1754.0,
 1925.0,
 2004.6153846153845,
 1526.6666666666667,
 1708.5714285714287,
 860.0,
 1516.9230769230769,
 1334.2857142857142,
 1285.0,
 1312.0,
 ...]

In [90]:
import pickle

# Persist the scraped player-rating history so later sessions can reload it
# without re-scraping the BUDA site.  A context manager guarantees the file
# handle is flushed and closed (the original `open(...)` inline in the
# pickle.dump call was never closed).
with open("all_players.p", "wb") as handle:
    pickle.dump(all_players, handle)

In [105]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

In [93]:
%matplotlib inline
# Overlay every player's rating trajectory on one axis; only players with a
# meaningful history (more than five recorded team ratings) are drawn, in a
# faint gray so the density of lines reads as a distribution.
for rating_history in all_players.values():
    if len(rating_history) > 5:
        plt.plot(rating_history, color='gray', lw=0.5, alpha=0.1)



In [124]:
# Compute each player's career-average team rating.
#
# Negative ratings (artifacts of the plus/minus formula in weak divisions)
# are clipped to zero before averaging.  Builds:
#   pmean         : list of per-player mean ratings (for the histogram below)
#   players_means : dict mapping player name -> mean rating
pmean = []
players_means = {}
for player, ratings in all_players.items():
    # np.clip replaces the original np.where + index-assignment clip;
    # the original's `if pratings.mean() < 0: print(...)` debug branch was
    # unreachable after clipping and has been removed.
    pratings = np.clip(np.array(ratings), 0, None)
    pmean.append(pratings.mean())
    players_means[player] = pratings.mean()

In [122]:
# Wrap the per-player mean ratings in a DataFrame for plotting below.
pdf = pd.DataFrame(pmean)

In [123]:
# Histogram + KDE of player mean ratings.
# NOTE(review): sns.distplot is deprecated (removed in modern seaborn);
# prefer sns.histplot(..., kde=True) when upgrading the environment.
sns.distplot(pdf.dropna())


Out[123]:
<matplotlib.axes._subplots.AxesSubplot at 0x12ad5a810>

In [125]:


In [130]:
# Build a roster-based rating for every team in Spring Hat League 2016:
# scrape the team list, then average each roster's historical player means.

# extract the league id for this league
springhat2016id = '40258'
leagueid = springhat2016id

# scrape the list of teams for this league
teamsurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeams&league=' + leagueid
response = urllib2.urlopen(teamsurl)
teams_soup = BeautifulSoup(response)

# generate list of team ids and names for this league
tdlist = teams_soup.find_all('td', class_='infobody')
teamids = []
teamnames = []
for td in tdlist:
    try:
        url = td.a['href']
        idindex = url.index('team=')
        whichindex = url.index('which=')
        teamids.append(url[idindex+5:whichindex-1])
        teamnames.append(td.a.get_text())
    except (TypeError, AttributeError, ValueError):
        # Cells without a link (td.a is None) raise TypeError/AttributeError,
        # and hrefs lacking 'team='/'which=' raise ValueError from str.index;
        # those cells are not team entries.  (Was a bare `except:`, which also
        # silently swallowed KeyboardInterrupt and genuine bugs.)
        continue

# find all players associated with each team
teamratings = {}
for teamid, teamname in zip(teamids, teamnames):

    teamurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeamRoster&team=' + teamid
    response = urllib2.urlopen(teamurl)
    roster_soup = BeautifulSoup(response)

    playerratings = []
    players = [td.get_text() for td in roster_soup.find_all("td", class_="infobody")]
    for player in players:
        # check the dict we actually read from (players_means) rather than
        # the parallel all_players dict; their key sets coincide, but this
        # makes the lookup self-consistent
        if player in players_means:
            playerratings.append(players_means[player])
        else:
            # if someone hasn't played club league, they probably aren't very good
            playerratings.append(800)
    # the team rating is the average of the player ratings for that team
    teamratings[teamname] = np.mean(playerratings)
print("Finished successfully with league {}".format(leagueid))


Finished successfully with league 40258

In [156]:
# Histogram of all team ratings with Team 20's rating marked by a vertical line.
sns.distplot(pd.DataFrame(teamratings.values()).dropna(), kde=False, bins=10)
plt.axvline(teamratings['Team 20 (20)'], label='Team 20')
# 'auto' is not a valid matplotlib legend location (raises/warns depending on
# version); 'best' lets matplotlib pick the least-occluded placement.
plt.legend(loc='best')
plt.ylabel('Number of Teams')
plt.xlabel('Team Rating')
plt.savefig('Team20Rating.png')



In [161]:
# Manual override: Team 27's rating is pinned to a flat 1000 — presumably
# because its roster-derived value was missing or unreliable; TODO confirm why.
teamratings['Team 27 (27)'] = 1000

In [162]:
# Flatten the teamratings dict into two parallel lists (team names and
# ratings) so it can be loaded into a DataFrame in the next cell.
keylist = []
valuelist = []
for team, rating in teamratings.items():
    keylist.append(team)
    valuelist.append(rating)

In [163]:
# One row per team: its name and roster-based rating.
shl = pd.DataFrame({'team':keylist, 'rating':valuelist})

In [164]:
# Rank teams strongest-first.  DataFrame.sort was deprecated in pandas 0.17
# and removed in 0.20; sort_values is the drop-in replacement with identical
# semantics for a single-column sort.
shl = shl.sort_values('rating', ascending=False)

In [166]:
# Display the league's teams from strongest to weakest predicted rating.
shl.team


Out[166]:
19         Team 2 (2)
1          Team 3 (3)
21       Team 11 (11)
26       Team 14 (14)
13       Team 23 (23)
23       Team 10 (10)
10       Team 13 (13)
20       Team 19 (19)
2        Team 24 (24)
0        Team 21 (21)
9        Team 22 (22)
17       Team 30 (30)
24         Team 6 (6)
7        Team 12 (12)
22       Team 17 (17)
8          Team 8 (8)
5        Team 18 (18)
3          Team 9 (9)
11       Team 16 (16)
28       Team 15 (15)
29       Team 26 (26)
4        Team 25 (25)
12         Team 7 (7)
18    Mark Hammer (1)
15       Team 20 (20)
27         Team 4 (4)
25       Team 29 (29)
14         Team 5 (5)
16       Team 27 (27)
6        Team 28 (28)
Name: team, dtype: object

In [180]:
5/25.


Out[180]:
0.2

In [179]:
2/28.


Out[179]:
0.07142857142857142

In [ ]:
def rating_to_point(rating1, rating2):
    """Convert a rating differential into an expected point ratio.

    Inverse of ``point_to_rating``: linearly interpolates the same anchor
    table, so e.g. a +800 rating edge maps to a +0.5 point ratio and +200
    maps to +0.07.  (The original body referenced undefined names ``k`` and
    ``x`` and raised NameError on any call.)

    Parameters
    ----------
    rating1, rating2 : float
        Ratings of team 1 and team 2.

    Returns
    -------
    float
        Expected (points1 - points2) / (points1 + points2), in [-1, 1].
    """
    base_rating = [-1200, -800, -400, -200, -100, 0, 100, 200, 400, 800, 1200]
    base_point_ratio = [-1, -0.5, -0.2, -0.07, -0.03, 0.0, 0.03, 0.07, 0.2, 0.5, 1]
    delta_rating = rating1 - rating2
    # np.interp requires the x-anchors to be increasing, which base_rating is
    point_ratio1 = float(np.interp(delta_rating, base_rating, base_point_ratio))
    return point_ratio1

In [33]:
# Scrape one league's schedule page and flatten its scores table into a
# DataFrame (one row per <tr>, empty cells dropped).

# define the dictionary that will contain all player ratings
all_players = {}

# extract the league id for this league (hard-coded to one league for debugging)
leagueid = '39641'#link[link.index('league=') + 7:]

# scrape the scores for this league
leaguescoreurl = 'http://www.buda.org/hatleagues/scores.php?section=showLeagueSchedule&league=' + leagueid + '&byDivision=1&showGames=1'
response = urllib2.urlopen(leaguescoreurl)
leaguescore_soup = BeautifulSoup(response)

# assemble the data of team ratings for this league
data = []
try:
    table = leaguescore_soup.find_all('table', attrs={'class':'info'})[1]
except IndexError:
    # BUG FIX: the original printed this message and then fell through to
    # `table.find_all(...)` with `table` undefined, raising NameError.
    # Parsing is now skipped when the scores table is absent.
    print("Unable to find a database of scores for league {}".format(leagueid))
    table = None

if table is not None:
    for row in table.find_all('tr'):
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele]) # Get rid of empty values

# convert to dataframe and drop irrelevant columns
dfdata = pd.DataFrame(data)

In [35]:
# Show the first column (team names) of the cleaned scores table.
# .ix was deprecated in pandas 0.20 and later removed; the columns here are a
# default integer RangeIndex, so positional .iloc selects the same column.
dfdata.dropna().iloc[:, 0]


Out[35]:
3                 Nutgetters
4             Stormageddon V
5                  FallChart
6         Supermona Reloaded
8                The Animals
9                 BUDA U19 A
10     SnakeCountry Bromance
11        Pickle Bush (LPFK)
12          Somerville Youth
13        Supermona Reloaded
15          Somerville Youth
16      Topless Pillow Fight
17        Supermona Reloaded
18                 FallChart
19              Bear Cavalry
21            Stormageddon V
22     SnakeCountry Bromance
23        Supermona Reloaded
25      Topless Pillow Fight
26                Downstream
27               The Animals
28             Burnt Pudding
29        Supermona Reloaded
31      Topless Pillow Fight
32                 FallChart
33              Haunted HAOS
34       GrassBurner Harvest
35             Burnt Pudding
37        Pickle Bush (LPFK)
38              Bear Cavalry
               ...          
378          Lesley Ultimate
379              Squid Squad
380               Crossroads
382         Buttonwood Bears
383          Concrete Jungle
384              Spirit Fowl
385               Crossroads
386              Squid Squad
387          Concrete Jungle
389                 Hammered
390       Merrimack Ultimate
391          Lesley Ultimate
392               Crossroads
393              Squid Squad
395               Crossroads
396                 Hammered
397              Spirit Fowl
398       Merrimack Ultimate
399                 Hammered
401       Merrimack Ultimate
402      Concrete Schoolyard
403         Buttonwood Bears
404                 FastCAPs
405              Spirit Fowl
406       Merrimack Ultimate
407              Squid Squad
408               Crossroads
411       Merrimack Ultimate
412          Lesley Ultimate
413              Squid Squad
Name: 0, dtype: object

Data format should be: a dictionary with key ('Team1', 'Team2') — a tuple of the two team names — and value (score1, score2). Then, to generate the lnprob, we loop over all keys in the dictionary, building a list of difference delta-ratings.


In [1]:
def point_to_rating(point1, point2):
    """Convert a game score into an implied rating differential.

    The point ratio (point1 - point2) / (point1 + point2) is mapped onto the
    rating scale by piecewise-linear interpolation of hand-tuned anchors
    (e.g. a +0.5 point ratio corresponds to a +800 rating edge).

    Parameters
    ----------
    point1, point2 : int or float
        Points scored by each team; must not both be zero.

    Returns
    -------
    numpy scalar
        Rating differential implied by the score, in [-1200, 1200].
    """
    base_rating = [-1200, -800,-400,-200,-100,0,100,200,400,800, 1200]
    base_point_ratio = [-1, -0.5, -0.2, -0.07, -0.03, 0.0, 0.03, 0.07, 0.2, 0.5, 1]
    interpfunc = interp1d(base_point_ratio, base_rating)
    # float() guards against integer floor division under Python 2 (this
    # notebook's interpreter): e.g. (3-1)/(3+1) would evaluate to 0, not 0.5,
    # collapsing nearly every game to a 0 or -1 point ratio.
    point_ratio = float(point1 - point2) / (point1 + point2)
    delta_rating = interpfunc(point_ratio)
    return delta_rating
# plt.plot(outputs, indices,'-o')

In [2]:
def lnprob(param):
    """Log-probability of a candidate set of team ratings for emcee.

    `param` holds one rating per team, ordered like the global `teamnames`.
    The log-probability is the negative mean absolute error between the
    model-implied rating deltas and the deltas implied by the observed
    scores in the global `game_scores` dict (via `point_to_rating`).

    Returns -inf for out-of-bounds ratings so the sampler rejects them.
    """
    # Bounds check: every rating must lie in [0, 10000].
    # BUG FIX: the original tested `param.any() < 0` / `param.any() > 10000`,
    # which compares a boolean to a number and never triggers, and returned
    # +np.inf — which would *attract* the sampler to invalid regions since
    # emcee maximizes lnprob.  Invalid parameters must return -inf.
    if (param < 0).any() or (param > 10000).any():
        return -np.inf

    # populate the team ratings according to the current model
    model_ratings = {}
    for iteam, teamname in enumerate(teamnames):
        model_ratings[teamname] = param[iteam]

    # compute the rating delta for both model and data
    model_delta = []
    observed_delta = []
    for gamekey in game_scores.keys():
        key0, key1 = gamekey
        model_delta.append(model_ratings[key0] - model_ratings[key1])
        observed_delta.append(point_to_rating(game_scores[gamekey][0], game_scores[gamekey][1]))

    # lnprob is defined as (negative) mean absolute error between model and true deltas
    probln = -mean_absolute_error(observed_delta, model_delta)

    return probln

In [ ]:
# Main pipeline: for each league, scrape scores, assign base ratings by
# division, adjust by average goal differential, fit team ratings with emcee,
# then propagate each team's rating onto its roster in `all_players`.

# define the dictionary that will contain all player ratings
all_players = {}

# loop over all leagues in the BUDA database
# NOTE(review): `leaguelinks[0]` is a single URL string, so this loop
# iterates over its *characters*; combined with the hard-coded leagueid
# below, the body effectively runs repeatedly for one league.  Probably
# intended: `for link in leaguelinks:`.
for link in leaguelinks[0]:

    # extract the league id for this league
    leagueid = '39641'#link[link.index('league=') + 7:]

    # scrape the scores for this league
    leaguescoreurl = 'http://www.buda.org/hatleagues/scores.php?section=showLeagueSchedule&league=' + leagueid + '&byDivision=1&showGames=1'
    response = urllib2.urlopen(leaguescoreurl)
    leaguescore_soup = BeautifulSoup(response)

    # assemble the data of team ratings for this league
    # `data` collects header (<th>) rows, `data_opponent` the per-game (<td>) rows
    data = []
    data_opponent = []
    try:
        table = leaguescore_soup.find_all('table', attrs={'class':'info'})[1]
    except IndexError:
        print("Unable to find a database of scores for league {}".format(leagueid))
        continue
    rows = table.find_all('tr')
    for row in rows:
        cols = row.find_all('th')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele]) # Get rid of empty values
        tdcols = row.find_all('td')
        tdcols = [ele.text.strip() for ele in tdcols]
        data_opponent.append([ele for ele in tdcols if ele]) # Get rid of empty values

    # convert to dataframe and drop irrelevant columns
    dfdata = pd.DataFrame(data)
#     print(leagueid, dfdata.columns)
    # first scraped row holds the column headers
    dfdata.columns = dfdata.ix[0, :]#['Team', 'Record', 'Plus/Minus', 'Tourney Qualifying games']
    dfdata = dfdata.dropna(how='all')
#     print(leagueid, dfdata.columns)
    dfdata = dfdata.drop(0).reset_index()

    dfdata = dfdata.drop(['index', 'Tourney Qualifying games*'], axis=1)

    # fill na's with -99 to facilitate division dividers
    # (divider rows have only a Team cell, so Record == -99 marks them)
    dfdata = dfdata.fillna(-99)

    # get the list of divisions in this league
    divnames = dfdata.ix[dfdata['Record'] == -99, 'Team'].values
    if len(divnames) == 0:
        print("No divisions found, skipping league {}".format(leagueid))
        continue

    # define base ratings by division (arbitrarily assigned based on my experience)
    divratings = {'4/3 Div 1': 1800, '4/3 Div 2': 1400, '4/3 Div 3': 1000, '4/3 Div 4': 900, 
                '5/2 Div 1': 1700, '5/2 Div 2': 1300, '5/2 Div 3': 900, '5/2 Div 4': 800,
                'Open Div 1': 1400, 'Open Div 2': 1200}
    # assign each team the base rating of the division block it falls in
    dfdata['div'] = np.zeros(len(dfdata))
    for i in range(len(divnames)-1):
        try:
            divstart = np.where(dfdata['Team'] == divnames[i])[0][0]
        except IndexError:
            print("{} not found, skipping league {}".format(divnames[i], leagueid))
            continue
        try:
            divend = np.where(dfdata['Team'] == divnames[i + 1])[0][0]
        except IndexError:
            print("{} not found, skipping league {}".format(divnames[i + 1], leagueid))
            continue
        try:
            dfdata.ix[divstart + 1: divend, 'div'] = divratings[divnames[i]]
        except KeyError:
            print("No base rating for {}, skipping league {}".format(divnames[i], leagueid))
            # NOTE(review): leftover interactive breakpoint — remove for batch runs
            import pdb; pdb.set_trace()

            continue
    # last division runs from its divider to the end of the table
    try:
        dfdata.ix[divend + 1:, 'div'] = divratings[divnames[-1]]
    except KeyError:
        print("No base rating for {}, skipping league {}".format(divnames[-1], leagueid))
        # NOTE(review): leftover interactive breakpoint — remove for batch runs
        import pdb; pdb.set_trace()
        continue        

    # remove the division dividers from the dataframe
    for i in range(len(divnames)):
        dfdata = dfdata.drop(dfdata.index[dfdata['Team'] == divnames[i]])

    # generate the average goal differential column
    dfdata['wins'] = dfdata['Record'].apply(lambda x: int(x.split('-')[0]))
    dfdata['losses'] = dfdata['Record'].apply(lambda x: int(x.split('-')[1]))
    dfdata['games'] = dfdata['wins'] + dfdata['losses']
    dfdata['avgplusminus'] = dfdata['Plus/Minus'].astype('float') / dfdata['games']

    # assert that an average goal differential per game of +5 gives +300 rating points.
    dfdata['rating'] = dfdata['div'] + 60. * dfdata['avgplusminus']

    # build the dictionary of game scores
    dfdata_opponents = pd.DataFrame(data_opponent).dropna().reset_index().drop('index', axis=1)
    dfdata_opponents.columns = ['Opponent', 'Record']
    dfdata_opponents['teamscore'] = dfdata_opponents.ix[:, 'Record'].apply(lambda x: int(x.split('-')[0]))
    dfdata_opponents['opponentscore'] = dfdata_opponents.ix[:, 'Record'].apply(lambda x: int(x.split('-')[1]))

    # BUG(review): opponentcounter is never incremented inside the loops
    # below, so every (team, game) pair reads row 0 of dfdata_opponents.
    # game_scores is almost certainly wrong as written — verify before use.
    opponentcounter = 0
    game_scores = {}
    for idf in dfdata.index:
        teamname = dfdata.ix[idf, 'Team']
        ngames = dfdata.ix[idf, 'games']
        for igame in range(ngames):
            opponentname = dfdata_opponents.ix[opponentcounter, 'Opponent']
            teamscore = dfdata_opponents.ix[opponentcounter, 'teamscore']
            opponentscore = dfdata_opponents.ix[opponentcounter, 'opponentscore']
            adversary_key = (teamname, opponentname)
            game_scores[adversary_key] = [teamscore, opponentscore]


    teamnames = dfdata['Team']

    # MCMC setup: one dimension per team; walkers start in a Gaussian ball
    # (sigma=200) around the plus/minus-derived ratings
    ndim = len(dfdata)
    nwalkers = ndim * 2 + 2
    p0 = [np.random.normal(irating, 200, nwalkers) for irating in dfdata['rating']]
    p0 = np.array(p0).transpose()
#     p0 = [np.random.rand(ndim) for i in range(nwalkers)]

    sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, threads=4)
    sampler.run_mcmc(p0, 200)

    # eyeball convergence of the first team's chain
    plt.plot(sampler.flatchain[:, 0])
    plt.show()
    # NOTE(review): leftover interactive breakpoint — remove for batch runs
    import pdb; pdb.set_trace()

    # scrape the list of teams for this league
    teamsurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeams&league=' + leagueid
    response = urllib2.urlopen(teamsurl)
    teams_soup = BeautifulSoup(response)

    # generate list of team ids and names for this league
    tdlist = teams_soup.find_all('td', class_='infobody')
    teamids = []
    teamnames = []
    for td in tdlist:
        try:
            url = td.a['href']
            idindex = url.index('team=')
            whichindex = url.index('which=')
            teamids.append(url[idindex+5:whichindex-1])
            teamnames.append(td.a.get_text())
        # NOTE(review): bare except silently skips non-link cells but also
        # swallows KeyboardInterrupt and real bugs; narrow it when revisiting
        except:
            continue

    # find all players associated with each team
    # link the team rating to each player on that team
    for teamid, teamname in zip(teamids, teamnames):
        try:
            teamrating = dfdata.ix[dfdata['Team'] == teamname.strip(' '), 'rating'].values[0]
        except IndexError:
            print("Couldn't match {} to scores database, skipping this team.".format(teamname))
            continue

        teamurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeamRoster&team=' + teamid
        response = urllib2.urlopen(teamurl)
        roster_soup = BeautifulSoup(response)

        players = [td.get_text() for td in roster_soup.find_all("td", class_="infobody")]
        for player in players:
            if player in all_players:
                all_players[player].append(teamrating)
            else:
                all_players[player] = [teamrating]
    print("Finished successfully with league {}".format(leagueid))


> <ipython-input-15-7d9972975925>(122)<module>()
-> sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, threads=4)
(Pdb) p0.shape
(122, 60)
(Pdb) c
> <ipython-input-15-7d9972975925>(130)<module>()
-> teamsurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeams&league=' + leagueid
(Pdb) plt.plot(sampler.flatchain[:, 1])
[<matplotlib.lines.Line2D object at 0x1249316d0>]
(Pdb) plt.show()
(Pdb) plt.plot(sampler.flatchain[:, 2])
[<matplotlib.lines.Line2D object at 0x1263354d0>]
(Pdb) plt.show()

One possible approach would be to say expected point differential is rating differential divided by 100.


In [14]:
dfdata


Out[14]:
Team Record Plus/Minus div wins losses games avgplusminus rating
1 SnakeCountry Bromance 4-0-0 21 1800.0 4 0 4 5.250000 2115.000000
2 FallChart 5-1-0 14 1800.0 5 1 6 2.333333 1940.000000
3 Pickle Bush (LPFK) 4-1-0 12 1800.0 4 1 5 2.400000 1944.000000
4 Nutgetters 2-1-0 11 1800.0 2 1 3 3.666667 2020.000000
5 GrassBurner Harvest 3-2-0 9 1800.0 3 2 5 1.800000 1908.000000
6 The Animals 2-2-1 -1 1800.0 2 2 4 -0.250000 1785.000000
7 Somerville Youth 1-2-0 -3 1800.0 1 2 3 -1.000000 1740.000000
8 Topless Pillow Fight 2-4-0 -5 1800.0 2 4 6 -0.833333 1750.000000
9 BUDA U19 A 0-2-0 -10 1800.0 0 2 2 -5.000000 1500.000000
10 Stormageddon V 3-4-0 -13 1800.0 3 4 7 -1.857143 1688.571429
11 Supermona Reloaded 3-4-0 -16 1800.0 3 4 7 -2.285714 1662.857143
12 Bear Cavalry 0-4-1 -11 1800.0 0 4 4 -2.750000 1635.000000
13 Burnt Pudding 0-4-2 -23 1800.0 0 4 4 -5.750000 1455.000000
15 Downstream 7-0-0 52 1400.0 7 0 7 7.428571 1845.714286
16 Injustice League 6-1-0 41 1400.0 6 1 7 5.857143 1751.428571
17 Scoobers in Scotland 6-1-0 29 1400.0 6 1 7 4.142857 1648.571429
18 Furtle Boy 3-2-2 16 1400.0 3 2 5 3.200000 1592.000000
19 Rubs the Duckie 3-0-1 9 1400.0 3 0 3 3.000000 1580.000000
20 Toads 4-2-0 10 1400.0 4 2 6 1.666667 1500.000000
21 The BUT 2-1-0 6 1400.0 2 1 3 2.000000 1520.000000
22 Upstream 4-4-0 7 1400.0 4 4 8 0.875000 1452.500000
23 Haunted HAOS 5-3-0 2 1400.0 5 3 8 0.250000 1415.000000
24 Reading Rainbow 3-3-0 6 1400.0 3 3 6 1.000000 1460.000000
25 Nerd Alert 3-2-0 1 1400.0 3 2 5 0.200000 1412.000000
26 Dartmouth Women's Lacrosse 3-4-0 1 1400.0 3 4 7 0.142857 1408.571429
27 Return of the Jedi 3-4-0 -5 1400.0 3 4 7 -0.714286 1357.142857
28 O'Rhinos 2-4-0 -6 1400.0 2 4 6 -1.000000 1340.000000
29 Hippos 2-2-1 -12 1400.0 2 2 4 -3.000000 1220.000000
30 Moose Lightning 1-5-1 -12 1400.0 1 5 6 -2.000000 1280.000000
31 Disc Envy 3-4-0 -17 1400.0 3 4 7 -2.428571 1254.285714
32 Apocalypse Meow 0-4-1 -13 1400.0 0 4 4 -3.250000 1205.000000
33 OctoberFetch 2-4-0 -17 1400.0 2 4 6 -2.833333 1230.000000
34 Lady and the BAMF 1-4-0 -16 1400.0 1 4 5 -3.200000 1208.000000
35 BUDA U19 B 0-2-0 -20 1400.0 0 2 2 -10.000000 800.000000
36 A Lil Bit Sticky 1-3-0 -25 1400.0 1 3 4 -6.250000 1025.000000
37 FallMinion 0-6-0 -36 1400.0 0 6 6 -6.000000 1040.000000
39 Flaming Croissants 6-0-0 39 1000.0 6 0 6 6.500000 1390.000000
40 Underwater Monkey Cowboys 7-0-0 37 1000.0 7 0 7 5.285714 1317.142857
41 Too Drunk to Fail 4-1-2 34 1000.0 4 1 5 6.800000 1408.000000
42 Stack to the Future 5-2-0 29 1000.0 5 2 7 4.142857 1248.571429
43 Fall Dirt 3-2-1 14 1000.0 3 2 5 2.800000 1168.000000
44 THEM! 3-1-1 9 1000.0 3 1 4 2.250000 1135.000000
45 Spam 4-3-0 -5 1000.0 4 3 7 -0.714286 957.142857
46 Oddjob 3-3-0 -11 1000.0 3 3 6 -1.833333 890.000000
47 Batman and the Robins 3-4-0 -9 1000.0 3 4 7 -1.285714 922.857143
48 Olin College 3-4-0 -11 1000.0 3 4 7 -1.571429 905.714286
49 Top Shelf 2-4-1 -10 1000.0 2 4 6 -1.666667 900.000000
50 Tofu Wolf 1-6-0 -32 1000.0 1 6 7 -4.571429 725.714286
51 Moosehead Cowpokes 0-5-0 -32 1000.0 0 5 5 -6.400000 616.000000
52 It's a Trap 0-6-1 -38 1000.0 0 6 6 -6.333333 620.000000
54 Crossroads 8-0-0 53 1400.0 8 0 8 6.625000 1797.500000
55 Merrimack Ultimate 5-1-1 52 1400.0 5 1 6 8.666667 1920.000000
56 Squid Squad 6-2-0 31 1400.0 6 2 8 3.875000 1632.500000
57 Spirit Fowl 3-2-0 2 1400.0 3 2 5 0.400000 1424.000000
58 Concrete Schoolyard 1-2-0 2 1400.0 1 2 3 0.666667 1440.000000
59 Hammered 3-3-0 -2 1400.0 3 3 6 -0.333333 1380.000000
60 Buttonwood Bears 1-4-0 -6 1400.0 1 4 5 -1.200000 1328.000000
61 Concrete Jungle 0-5-0 -33 1400.0 0 5 5 -6.600000 1004.000000
62 Lesley Ultimate 0-6-2 -72 1400.0 0 6 6 -12.000000 680.000000
64 FastCAPs 0-2-1 -27 1200.0 0 2 2 -13.500000 390.000000

Everything is set up except the dictionary of game scores: "game_scores".


In [12]:
np.array(p0).shape


Out[12]:
(122, 60)

In [13]:
nwalkers


Out[13]:
122

In [ ]: