In [4]:
from bs4 import BeautifulSoup
import urllib3
import requests
import pandas as pd
import io
import sys
import re
import json
from pprint import pprint
In [63]:
def grabKenPomCSV(url):
    """Log in to kenpom.com and fetch ``url`` with the authenticated session.

    Credentials are read from ``data/kenpomcred.json``, which must provide
    the keys ``username`` and ``password``.

    Args:
        url: Address of the resource to download once logged in.

    Returns:
        The ``requests.Response`` of the GET for ``url``. The response is
        returned whatever its status code is; a non-200 status is only
        reported on stderr, so callers that care must check ``status_code``.
    """
    with open('data/kenpomcred.json') as data_file:
        credentials = json.load(data_file)
    payload = {
        'email': credentials['username'],
        'password': credentials['password'],
        'submit': 'Login!',
    }
    # NOTE(review): the login form is posted over plain http://, so the
    # credentials are sent unencrypted -- confirm whether an https://
    # endpoint can be used instead.
    # The context manager releases the session's pooled connections; the
    # response body has already been downloaded by then, so ``result.text``
    # stays usable after the session is closed.
    with requests.Session() as s:
        login = s.post('http://kenpom.com/handlers/login_handler.php', data=payload)
        if login.status_code != 200:
            # Only an HTTP-level check; whether the site accepted the
            # credentials cannot be told from the status code alone.
            print('error logging in to kenpom.com (HTTP ' + str(login.status_code) + ')',
                  file=sys.stderr)
        result = s.get(url)
    if result.status_code == 200:
        print('successfully read ' + url)
    else:
        print('error reading ' + url, file=sys.stderr)
    return result
def kenPomColNorm(df, year, url):
    """Return a copy of ``df`` with column names normalized across seasons.

    Steps, in order:
      1. add a ``Season`` column holding ``year``;
      2. prefix every column with ``def`` / ``off`` when ``url`` contains
         ``defense`` / ``offense``;
      3. strip spaces and underscores from the column names;
      4. rename the older numbered position columns (``Hgt5``, ``OR1``, ...)
         and ``Size`` to the position-named standard (``HgtC``, ``ORPG``,
         ..., ``Height``);
      5. drop the columns listed in ``colsToDrop``.

    Note that for offense/defense files the prefix is applied to *every*
    column, including ``TeamName`` and ``Season``.

    Args:
        df: Raw table as parsed from a KenPom CSV. It is not modified.
        year: Season value stored in the ``Season`` column.
        url: URL the table came from; only inspected for the substrings
            ``defense`` and ``offense``.

    Returns:
        A new DataFrame with the normalized columns.

    Raises:
        KeyError: If ``df`` has no ``TeamName`` column.
    """
    # Normalize to the same standard across years -- the files use
    # different column formats from season to season.
    remap = {
        'Size': 'Height',
        'Hgt5': 'HgtC', 'Hgt4': 'HgtPF', 'Hgt3': 'HgtSF', 'Hgt2': 'HgtSG', 'Hgt1': 'HgtPG',
        'OR5': 'ORC', 'OR4': 'ORPF', 'OR3': 'ORSF', 'OR2': 'ORSG', 'OR1': 'ORPG',
        'DR5': 'DRC', 'DR4': 'DRPF', 'DR3': 'DRSF', 'DR2': 'DRSG', 'DR1': 'DRPG',
        'Pts5': 'PtsC', 'Pts4': 'PtsPF', 'Pts3': 'PtsSF', 'Pts2': 'PtsSG', 'Pts1': 'PtsPG',
    }
    colsToDrop = {
        'EM', 'RankEM', 'F3GRate', 'RankF3GRate', 'OppF3GRate', 'RankOppF3GRate',
        'DefensiveFingerprint', 'AdjEM', 'RankAdjEM',
    }

    # The previous code called df.set_index(['TeamName', 'Season']) and threw
    # the result away; its only observable effect was this KeyError, which is
    # kept as an explicit check. The frame stays un-indexed so that callers
    # can still read 'TeamName' as a regular column.
    if 'TeamName' not in df.columns:
        raise KeyError("'TeamName' column is required but missing")

    # Work on a copy so the caller's frame is left untouched.
    out = df.copy()
    out['Season'] = year

    if 'defense' in url:
        prefix = 'def'
    elif 'offense' in url:
        prefix = 'off'
    else:
        prefix = ''

    names = []
    for col in out.columns:
        cleaned = re.sub(r'[ _]', '', prefix + col)
        names.append(remap.get(cleaned, cleaned))
    out.columns = names

    # A boolean mask (rather than selecting by label) keeps each column
    # exactly once even if two columns end up with the same name.
    keep = [name not in colsToDrop for name in names]
    return out.loc[:, keep]
def allKenPomCSVs(year, csvsToGrab=('summary',)):
    """Download and normalize the requested KenPom CSV exports for one season.

    Args:
        year: Season as a four-digit integer; its last two digits are
            appended to each file name in the URL.
        csvsToGrab: Names of the exports to fetch (a single name may be
            given as a plain string). Defaults to ``('summary',)``, the
            only export previously fetched. Other names that had been
            tried here are 'offense', 'defense', 'height' and 'misc';
            'pointdist' was reported to have a troublesome format.

    Returns:
        A list with one normalized DataFrame per requested export, in the
        order given.
    """
    baseurl = 'http://kenpom.com/getdata.php?file='
    if isinstance(csvsToGrab, str):
        # Guard against iterating over the characters of a lone name.
        csvsToGrab = (csvsToGrab,)

    yearSuffix = str(year)[-2:]
    frames = []
    for name in csvsToGrab:
        url = baseurl + name + yearSuffix
        # The summary export for seasons before 2017 is requested with a
        # '_pt' suffix on the file name.
        if year < 2017 and name == 'summary':
            url += '_pt'
        raw = pd.read_csv(io.StringIO(grabKenPomCSV(url).text))
        frames.append(kenPomColNorm(raw, year, url))
    return frames
In [77]:
# Seasons 2003 through 2017 (the upper bound of range is exclusive).
yearsToGrab = range(2003, 2018)
# One frame per season: that season's downloaded tables placed side by side.
kpDataEachYear = [
    pd.concat(allKenPomCSVs(season), axis='columns')
    for season in yearsToGrab
]
In [78]:
# Stack the per-season frames row-wise into a single table; each frame keeps
# its own row labels because ignore_index is left at its default.
kpData = pd.concat(kpDataEachYear, axis='index')
In [79]:
# Attach a Team_Id to every row by looking up the lower-cased KenPom team
# name in the spellings table (name_spelling -> team_id).
teamsInDB = pd.read_csv('data/TeamSpellings.csv')
teamMap = teamsInDB.set_index('name_spelling')['team_id'].to_dict()
loweredNames = kpData['TeamName'].str.lower()
unmatchedMask = ~loweredNames.isin(list(teamMap))
if unmatchedMask.any():
    # Report every unknown spelling at once instead of stopping at the first
    # one; the exception type stays KeyError, as with a failed dict lookup.
    unmatchedNames = sorted(loweredNames[unmatchedMask].dropna().unique())
    raise KeyError(
        'no team_id in data/TeamSpellings.csv for %d row(s); unmatched names: %s'
        % (int(unmatchedMask.sum()), unmatchedNames)
    )
kpData['Team_Id'] = loweredNames.map(teamMap)
In [80]:
# Persist the combined table; the row index is written as the first column
# (to_csv's default, spelled out here).
kpData.to_csv('data/kenPomTeamData.csv', index=True)