In [1]:
from lxml import html
import requests
from bs4 import BeautifulSoup
import urllib
import urllib2
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from scipy.interpolate import interp1d
import emcee
import seaborn as sns
import matplotlib.pyplot as plt
In [2]:
r = urllib.urlopen('http://www.buda.org/leagues/past-leagues')
soup = BeautifulSoup(r, 'html.parser')
In [3]:
iframe = soup.find_all('iframe')[0]
response = urllib2.urlopen(iframe.attrs['src'])
iframe_soup = BeautifulSoup(response, 'html.parser')
In [4]:
leaguelinks = [i.a['href'] for i in iframe_soup.find_all("td", class_="infobody")]
In [7]:
i.get_text()  # scratch cell: inspect the last infobody cell left over from the comprehension above
Out[7]:
In [85]:
# define the dictionary that will contain all player ratings
all_players = {}
# loop over all leagues in the BUDA database
for link in leaguelinks:
    # extract the league id for this league
    leagueid = link[link.index('league=') + 7:]
    # scrape the scores for this league
    leaguescoreurl = 'http://www.buda.org/hatleagues/scores.php?section=showLeagueSchedule&league=' + leagueid + '&byDivision=1&showGames=1'
    response = urllib2.urlopen(leaguescoreurl)
    leaguescore_soup = BeautifulSoup(response, 'html.parser')
    # assemble the data of team ratings for this league
    data = []
    data_opponent = []
    try:
        table = leaguescore_soup.find_all('table', attrs={'class': 'info'})[1]
    except IndexError:
        print("Unable to find a database of scores for league {}".format(leagueid))
        continue
    rows = table.find_all('tr')
    for row in rows:
        # team standings rows live in th cells; game rows live in td cells
        cols = row.find_all('th')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])  # get rid of empty values
        tdcols = row.find_all('td')
        tdcols = [ele.text.strip() for ele in tdcols]
        data_opponent.append([ele for ele in tdcols if ele])  # get rid of empty values
    # convert to dataframe and label the columns from the header row
    dfdata = pd.DataFrame(data)
    dfdata.columns = dfdata.ix[0, :]  # e.g. ['Team', 'Record', 'Plus/Minus', 'Tourney Qualifying games']
    dfdata = dfdata.drop(0).reset_index()
    # fill NaNs with -99 to flag the division divider rows
    dfdata = dfdata.fillna(-99)
    # get the list of divisions in this league
    divnames = dfdata.ix[dfdata['Record'] == -99, 'Team'].values
    if len(divnames) == 0:
        print("No divisions found, skipping league {}".format(leagueid))
        continue
    # define base ratings by division (arbitrarily assigned based on my experience)
    divratings = {'4/3 Div 1': 1800, '4/3 Div 2': 1400, '4/3 Div 3': 1000, '4/3 Div 4': 900,
                  '5/2 Div 1': 1700, '5/2 Div 2': 1300, '5/2 Div 3': 900, '5/2 Div 4': 800,
                  'Open Div 1': 1400, 'Open Div 2': 1200}
    dfdata['div'] = np.zeros(len(dfdata))
    for i in range(len(divnames) - 1):
        try:
            divstart = np.where(dfdata['Team'] == divnames[i])[0][0]
        except IndexError:
            print("{} not found, skipping league {}".format(divnames[i], leagueid))
            continue
        try:
            divend = np.where(dfdata['Team'] == divnames[i + 1])[0][0]
        except IndexError:
            print("{} not found, skipping league {}".format(divnames[i + 1], leagueid))
            continue
        try:
            dfdata.ix[divstart + 1:divend, 'div'] = divratings[divnames[i]]
        except KeyError:
            print("No base rating for {}, skipping league {}".format(divnames[i], leagueid))
            continue
    # the last division runs from its divider row to the end of the table
    try:
        dfdata.ix[divend + 1:, 'div'] = divratings[divnames[-1]]
    except KeyError:
        print("No base rating for {}, skipping league {}".format(divnames[-1], leagueid))
        continue
    # remove the division divider rows from the dataframe
    for i in range(len(divnames)):
        dfdata = dfdata.drop(dfdata.index[dfdata['Team'] == divnames[i]])
    # generate the average goal differential column
    dfdata['wins'] = dfdata['Record'].apply(lambda x: int(x.split('-')[0]))
    dfdata['losses'] = dfdata['Record'].apply(lambda x: int(x.split('-')[1]))
    dfdata['games'] = dfdata['wins'] + dfdata['losses']
    dfdata['avgplusminus'] = dfdata['Plus/Minus'].astype('float') / dfdata['games']
    # assert that an average goal differential per game of +5 gives +300 rating points
    dfdata['rating'] = dfdata['div'] + 60. * dfdata['avgplusminus']
    # build the dictionary of game scores from the game (td) rows
    dfdata_opponents = pd.DataFrame(data_opponent).dropna().reset_index(drop=True)
    dfdata_opponents['teamscore'] = dfdata_opponents.ix[:, 1].apply(lambda x: int(x.split('-')[0]))
    dfdata_opponents['opponentscore'] = dfdata_opponents.ix[:, 1].apply(lambda x: int(x.split('-')[1]))
    opponentcounter = 0
    game_scores = {}
    for idf in dfdata.index:
        teamname = dfdata.ix[idf, 'Team']
        ngames = dfdata.ix[idf, 'games']
        for igame in range(ngames):
            opponentname = dfdata_opponents.ix[opponentcounter, 0]
            teamscore = dfdata_opponents.ix[opponentcounter, 'teamscore']
            opponentscore = dfdata_opponents.ix[opponentcounter, 'opponentscore']
            adversary_key = (teamname, opponentname)
            game_scores[adversary_key] = [teamscore, opponentscore]
            opponentcounter += 1  # advance to the next game row
    # scrape the list of teams for this league
    teamsurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeams&league=' + leagueid
    response = urllib2.urlopen(teamsurl)
    teams_soup = BeautifulSoup(response, 'html.parser')
    # generate list of team ids and names for this league
    tdlist = teams_soup.find_all('td', class_='infobody')
    teamids = []
    teamnames = []
    for td in tdlist:
        try:
            url = td.a['href']
            idindex = url.index('team=')
            whichindex = url.index('which=')
            teamids.append(url[idindex + 5:whichindex - 1])
            teamnames.append(td.a.get_text())
        except (TypeError, AttributeError, ValueError):
            # skip cells that don't contain a team link
            continue
    # find all players associated with each team and
    # link the team rating to each player on that team
    for teamid, teamname in zip(teamids, teamnames):
        try:
            teamrating = dfdata.ix[dfdata['Team'] == teamname.strip(' '), 'rating'].values[0]
        except IndexError:
            print("Couldn't match {} to scores database, skipping this team.".format(teamname))
            continue
        teamurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeamRoster&team=' + teamid
        response = urllib2.urlopen(teamurl)
        roster_soup = BeautifulSoup(response, 'html.parser')
        players = [td.get_text() for td in roster_soup.find_all("td", class_="infobody")]
        for player in players:
            if player in all_players:
                all_players[player].append(teamrating)
            else:
                all_players[player] = [teamrating]
    print("Finished successfully with league {}".format(leagueid))
In [87]:
all_players.pop('')  # drop the empty player name picked up from blank roster cells
Out[87]:
In [90]:
import pickle
with open("all_players.p", "wb") as f:
    pickle.dump(all_players, f)
In [105]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
In [93]:
%matplotlib inline
# plot the rating trajectory of every player with more than 5 recorded team ratings
for player in all_players.keys():
    if len(all_players[player]) > 5:
        plt.plot(all_players[player], color='gray', lw=0.5, alpha=0.1)
In [124]:
pmean = []
players_means = {}
for player in all_players.keys():
    pratings = np.array(all_players[player])
    # clip negative ratings to zero
    toolow = np.where(pratings < 0)
    if toolow[0].size > 0:
        pratings[toolow[0]] = 0
    pmean.append(pratings.mean())
    if pratings.mean() < 0:  # sanity check; should never trigger after clipping
        print(pratings)
    players_means[player] = pratings.mean()
In [122]:
pdf = pd.DataFrame(pmean)
In [123]:
sns.distplot(pdf.dropna())
Out[123]:
In [130]:
# extract the league id for this league
springhat2016id = '40258'
leagueid = springhat2016id
# scrape the list of teams for this league
teamsurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeams&league=' + leagueid
response = urllib2.urlopen(teamsurl)
teams_soup = BeautifulSoup(response, 'html.parser')
# generate list of team ids and names for this league
tdlist = teams_soup.find_all('td', class_='infobody')
teamids = []
teamnames = []
for td in tdlist:
    try:
        url = td.a['href']
        idindex = url.index('team=')
        whichindex = url.index('which=')
        teamids.append(url[idindex + 5:whichindex - 1])
        teamnames.append(td.a.get_text())
    except (TypeError, AttributeError, ValueError):
        # skip cells that don't contain a team link
        continue
# find all players associated with each team
teamratings = {}
for teamid, teamname in zip(teamids, teamnames):
    teamurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeamRoster&team=' + teamid
    response = urllib2.urlopen(teamurl)
    roster_soup = BeautifulSoup(response, 'html.parser')
    playerratings = []
    players = [td.get_text() for td in roster_soup.find_all("td", class_="infobody")]
    for player in players:
        if player in all_players:
            playerratings.append(players_means[player])
        else:
            # if someone hasn't played club league, they probably aren't very good
            playerratings.append(800)
    # the team rating is the average of the player ratings for that team
    teamratings[teamname] = np.mean(playerratings)
print("Finished successfully with league {}".format(leagueid))
In [156]:
sns.distplot(pd.DataFrame(teamratings.values()).dropna(), kde=False, bins=10)
plt.axvline(teamratings['Team 20 (20)'], label='Team 20')
plt.legend(loc='best')
plt.ylabel('Number of Teams')
plt.xlabel('Team Rating')
plt.savefig('Team20Rating.png')
In [161]:
teamratings['Team 27 (27)'] = 1000  # manual override (presumably this team's scraped rating was missing or off)
In [162]:
keylist = []
valuelist = []
for key in teamratings.keys():
    keylist.append(key)
    valuelist.append(teamratings[key])
In [163]:
shl = pd.DataFrame({'team':keylist, 'rating':valuelist})
In [164]:
shl = shl.sort('rating', ascending=False)
In [166]:
shl.team
Out[166]:
In [180]:
5/25.  # point ratio of e.g. a 15-10 game: 0.2, which maps to a 400-point rating gap below
Out[180]:
In [179]:
2/28.  # point ratio of e.g. a 15-13 game: ~0.07, which maps to a 200-point rating gap below
Out[179]:
In [ ]:
def rating_to_point(rating1, rating2):
    # tune k so that a rating differential of ... corresponds to a point ratio of ...
    # 800 ... 0.5
    # 400 ... 0.15
    # 200 ... 0.07
    # 100 ... 0.035
    # k was never tuned; this logistic stub was abandoned in favor of the
    # interpolation in point_to_rating below. Note it also maps to (0, 1),
    # not the signed (-1, 1) ratio used elsewhere.
    k = 1e-3  # placeholder steepness
    x = rating1 - rating2
    point_ratio1 = 1 / (1 + np.exp(-k * x))
    return point_ratio1
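Rather than tuning a logistic, the forward map could reuse the same calibration table that point_to_rating (defined further down) interpolates, just with the axes swapped. A minimal sketch under that assumption; rating_to_point_interp is a hypothetical name, not part of the original notebook:
In [ ]:
from scipy.interpolate import interp1d

def rating_to_point_interp(delta_rating):
    # inverse of point_to_rating: interpolate rating differential -> point ratio
    base_rating = [-1200, -800, -400, -200, -100, 0, 100, 200, 400, 800, 1200]
    base_point_ratio = [-1, -0.5, -0.2, -0.07, -0.03, 0.0, 0.03, 0.07, 0.2, 0.5, 1]
    return interp1d(base_rating, base_point_ratio)(delta_rating)

print(rating_to_point_interp(400))  # 0.2, e.g. a 15-10 game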
In [33]:
# define the dictionary that will contain all player ratings
all_players = {}
# extract the league id for this league (hard-coded to a single league for testing)
leagueid = '39641'  # link[link.index('league=') + 7:]
# scrape the scores for this league
leaguescoreurl = 'http://www.buda.org/hatleagues/scores.php?section=showLeagueSchedule&league=' + leagueid + '&byDivision=1&showGames=1'
response = urllib2.urlopen(leaguescoreurl)
leaguescore_soup = BeautifulSoup(response, 'html.parser')
# assemble the data of team ratings for this league
data = []
try:
    table = leaguescore_soup.find_all('table', attrs={'class': 'info'})[1]
except IndexError:
    print("Unable to find a database of scores for league {}".format(leagueid))
rows = table.find_all('tr')
for row in rows:
    # cols = row.find_all('th')
    # cols = [ele.text.strip() for ele in cols]
    # if len(cols) == 0:
    cols = row.find_all('td')
    cols = [ele.text.strip() for ele in cols]
    data.append([ele for ele in cols if ele])  # get rid of empty values
# convert to dataframe
dfdata = pd.DataFrame(data)
In [35]:
dfdata.dropna().ix[:, 0]
Out[35]:
Data format should be: a dictionary with key ('Team1', 'Team2') and value [score1, score2]. Then, to generate the lnprob, we loop over all keys in the dictionary, building a list of rating differentials (deltas).
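A minimal sketch of that structure, with hypothetical team names (point_to_rating is defined in the next cell):
In [ ]:
# keys are (team, opponent) tuples; values are [team_score, opponent_score]
game_scores = {('Team 1 (1)', 'Team 2 (2)'): [15, 10],
               ('Team 2 (2)', 'Team 3 (3)'): [13, 15]}
# the lnprob loop turns each game into an observed rating differential
observed_delta = [point_to_rating(s1, s2) for s1, s2 in game_scores.values()]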
In [1]:
from scipy.interpolate import interp1d

def point_to_rating(point1, point2):
    # piecewise-linear map from the signed point ratio to a rating differential
    base_rating = [-1200, -800, -400, -200, -100, 0, 100, 200, 400, 800, 1200]
    base_point_ratio = [-1, -0.5, -0.2, -0.07, -0.03, 0.0, 0.03, 0.07, 0.2, 0.5, 1]
    interpfunc = interp1d(base_point_ratio, base_rating)
    # float() guards against Python 2 integer division on integer scores
    point_ratio = float(point1 - point2) / (point1 + point2)
    delta_rating = interpfunc(point_ratio)
    return delta_rating
# plt.plot(outputs, indices, '-o')
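As a quick sanity check, a 15-10 game has point ratio 5/25 = 0.2, which the table maps to a 400-point rating differential (and the mirrored score to -400):
In [ ]:
print(point_to_rating(15, 10))  # 400.0
print(point_to_rating(10, 15))  # -400.0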
In [2]:
import numpy as np
from sklearn.metrics import mean_absolute_error

def lnprob(param):
    # bounds: walkers outside [0, 10000] get log-probability -inf
    if (param < 0).any():
        return -np.inf
    if (param > 10000).any():
        return -np.inf
    # populate the team ratings according to the current model
    model_ratings = {}
    for iteam, teamname in enumerate(teamnames):
        model_ratings[teamname] = param[iteam]
    # compute the rating delta for both model and data
    model_delta = []
    observed_delta = []
    for gamekey in game_scores.keys():
        key0 = gamekey[0]
        key1 = gamekey[1]
        model_delta.append(model_ratings[key0] - model_ratings[key1])
        observed_delta.append(point_to_rating(game_scores[gamekey][0], game_scores[gamekey][1]))
    # lnprob is the negative mean absolute error between model and observed deltas
    probln = -mean_absolute_error(observed_delta, model_delta)
    return probln
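lnprob reads teamnames, game_scores, and point_to_rating from the enclosing scope, so a toy check needs those set up first. A sketch with two hypothetical teams:
In [ ]:
teamnames = ['Team A', 'Team B']
game_scores = {('Team A', 'Team B'): [15, 10]}  # observed gap: 400 rating points
# a model gap of 500 points gives a mean absolute error of 100, so lnprob = -100
print(lnprob(np.array([1500., 1000.])))         # -100.0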
In [ ]:
# define the dictionary that will contain all player ratings
all_players = {}
# loop over the first league only while testing the MCMC machinery
for link in leaguelinks[:1]:
    # extract the league id for this league (hard-coded for now)
    leagueid = '39641'  # link[link.index('league=') + 7:]
    # scrape the scores for this league
    leaguescoreurl = 'http://www.buda.org/hatleagues/scores.php?section=showLeagueSchedule&league=' + leagueid + '&byDivision=1&showGames=1'
    response = urllib2.urlopen(leaguescoreurl)
    leaguescore_soup = BeautifulSoup(response, 'html.parser')
    # assemble the data of team ratings for this league
    data = []
    data_opponent = []
    try:
        table = leaguescore_soup.find_all('table', attrs={'class': 'info'})[1]
    except IndexError:
        print("Unable to find a database of scores for league {}".format(leagueid))
        continue
    rows = table.find_all('tr')
    for row in rows:
        cols = row.find_all('th')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols if ele])  # get rid of empty values
        tdcols = row.find_all('td')
        tdcols = [ele.text.strip() for ele in tdcols]
        data_opponent.append([ele for ele in tdcols if ele])  # get rid of empty values
    # convert to dataframe and drop irrelevant columns
    dfdata = pd.DataFrame(data)
    dfdata.columns = dfdata.ix[0, :]  # ['Team', 'Record', 'Plus/Minus', 'Tourney Qualifying games*']
    dfdata = dfdata.dropna(how='all')
    dfdata = dfdata.drop(0).reset_index()
    dfdata = dfdata.drop(['index', 'Tourney Qualifying games*'], axis=1)
    # fill NaNs with -99 to flag the division divider rows
    dfdata = dfdata.fillna(-99)
    # get the list of divisions in this league
    divnames = dfdata.ix[dfdata['Record'] == -99, 'Team'].values
    if len(divnames) == 0:
        print("No divisions found, skipping league {}".format(leagueid))
        continue
    # define base ratings by division (arbitrarily assigned based on my experience)
    divratings = {'4/3 Div 1': 1800, '4/3 Div 2': 1400, '4/3 Div 3': 1000, '4/3 Div 4': 900,
                  '5/2 Div 1': 1700, '5/2 Div 2': 1300, '5/2 Div 3': 900, '5/2 Div 4': 800,
                  'Open Div 1': 1400, 'Open Div 2': 1200}
    dfdata['div'] = np.zeros(len(dfdata))
    for i in range(len(divnames) - 1):
        try:
            divstart = np.where(dfdata['Team'] == divnames[i])[0][0]
        except IndexError:
            print("{} not found, skipping league {}".format(divnames[i], leagueid))
            continue
        try:
            divend = np.where(dfdata['Team'] == divnames[i + 1])[0][0]
        except IndexError:
            print("{} not found, skipping league {}".format(divnames[i + 1], leagueid))
            continue
        try:
            dfdata.ix[divstart + 1:divend, 'div'] = divratings[divnames[i]]
        except KeyError:
            print("No base rating for {}, skipping league {}".format(divnames[i], leagueid))
            import pdb; pdb.set_trace()
            continue
    # the last division runs from its divider row to the end of the table
    try:
        dfdata.ix[divend + 1:, 'div'] = divratings[divnames[-1]]
    except KeyError:
        print("No base rating for {}, skipping league {}".format(divnames[-1], leagueid))
        import pdb; pdb.set_trace()
        continue
    # remove the division divider rows from the dataframe
    for i in range(len(divnames)):
        dfdata = dfdata.drop(dfdata.index[dfdata['Team'] == divnames[i]])
    # generate the average goal differential column
    dfdata['wins'] = dfdata['Record'].apply(lambda x: int(x.split('-')[0]))
    dfdata['losses'] = dfdata['Record'].apply(lambda x: int(x.split('-')[1]))
    dfdata['games'] = dfdata['wins'] + dfdata['losses']
    dfdata['avgplusminus'] = dfdata['Plus/Minus'].astype('float') / dfdata['games']
    # assert that an average goal differential per game of +5 gives +300 rating points
    dfdata['rating'] = dfdata['div'] + 60. * dfdata['avgplusminus']
    # build the dictionary of game scores
    dfdata_opponents = pd.DataFrame(data_opponent).dropna().reset_index().drop('index', axis=1)
    dfdata_opponents.columns = ['Opponent', 'Record']
    dfdata_opponents['teamscore'] = dfdata_opponents.ix[:, 'Record'].apply(lambda x: int(x.split('-')[0]))
    dfdata_opponents['opponentscore'] = dfdata_opponents.ix[:, 'Record'].apply(lambda x: int(x.split('-')[1]))
    opponentcounter = 0
    game_scores = {}
    for idf in dfdata.index:
        teamname = dfdata.ix[idf, 'Team']
        ngames = dfdata.ix[idf, 'games']
        for igame in range(ngames):
            opponentname = dfdata_opponents.ix[opponentcounter, 'Opponent']
            teamscore = dfdata_opponents.ix[opponentcounter, 'teamscore']
            opponentscore = dfdata_opponents.ix[opponentcounter, 'opponentscore']
            adversary_key = (teamname, opponentname)
            game_scores[adversary_key] = [teamscore, opponentscore]
            opponentcounter += 1  # advance to the next game row
    # fit team ratings with emcee, initializing walkers around the heuristic ratings
    teamnames = dfdata['Team']
    ndim = len(dfdata)
    nwalkers = ndim * 2 + 2
    p0 = [np.random.normal(irating, 200, nwalkers) for irating in dfdata['rating']]
    p0 = np.array(p0).transpose()  # shape (nwalkers, ndim)
    # p0 = [np.random.rand(ndim) for i in range(nwalkers)]
    sampler = emcee.EnsembleSampler(nwalkers, ndim, lnprob, threads=4)
    sampler.run_mcmc(p0, 200)
    plt.plot(sampler.flatchain[:, 0])
    plt.show()
    import pdb; pdb.set_trace()
    # scrape the list of teams for this league
    teamsurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeams&league=' + leagueid
    response = urllib2.urlopen(teamsurl)
    teams_soup = BeautifulSoup(response, 'html.parser')
    # generate list of team ids and names for this league
    tdlist = teams_soup.find_all('td', class_='infobody')
    teamids = []
    teamnames = []
    for td in tdlist:
        try:
            url = td.a['href']
            idindex = url.index('team=')
            whichindex = url.index('which=')
            teamids.append(url[idindex + 5:whichindex - 1])
            teamnames.append(td.a.get_text())
        except (TypeError, AttributeError, ValueError):
            # skip cells that don't contain a team link
            continue
    # find all players associated with each team and
    # link the team rating to each player on that team
    for teamid, teamname in zip(teamids, teamnames):
        try:
            teamrating = dfdata.ix[dfdata['Team'] == teamname.strip(' '), 'rating'].values[0]
        except IndexError:
            print("Couldn't match {} to scores database, skipping this team.".format(teamname))
            continue
        teamurl = 'http://www.buda.org/hatleagues/rosters.php?section=showTeamRoster&team=' + teamid
        response = urllib2.urlopen(teamurl)
        roster_soup = BeautifulSoup(response, 'html.parser')
        players = [td.get_text() for td in roster_soup.find_all("td", class_="infobody")]
        for player in players:
            if player in all_players:
                all_players[player].append(teamrating)
            else:
                all_players[player] = [teamrating]
    print("Finished successfully with league {}".format(leagueid))
One possible approach would be to say the expected point differential is the rating differential divided by 100.
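Under that rule of thumb, a hypothetical 300-point rating edge translates to an expected winning margin of about 3 points:
In [ ]:
# expected point differential = rating differential / 100 (proposed rule of thumb)
rating_a, rating_b = 1700., 1400.
print((rating_a - rating_b) / 100.)  # 3.0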
In [14]:
dfdata
Out[14]:
Everything is set up except the dictionary of game scores: "game_scores".
In [12]:
np.array(p0).shape
Out[12]:
In [13]:
nwalkers
Out[13]:
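For reference, emcee's EnsembleSampler expects p0 with shape (nwalkers, ndim), and the number of walkers must be at least twice the number of dimensions, which nwalkers = ndim * 2 + 2 satisfies. A sketch with a hypothetical ndim:
In [ ]:
ndim = 12                  # hypothetical number of teams
nwalkers = ndim * 2 + 2    # comfortably above emcee's 2 * ndim minimum
p0 = np.random.normal(1200., 200., size=(nwalkers, ndim))
print(p0.shape)            # (26, 12)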
In [ ]: