In [1]:
import urllib.request as request
from bs4 import BeautifulSoup
import re
# ESPN NBA landing page; the teams index lives under nba/teams.
base_url = 'http://www.espn.com/'
teams_url = base_url + 'nba/teams'
# Network I/O: download the teams index page (requires connectivity).
html_teams = request.urlopen(teams_url)
# Parse with html5lib for lenient handling of ESPN's markup.
soup_teams = BeautifulSoup(html_teams, 'html5lib')
Get the links for all teams and build a dictionary that maps each team's abbreviated name to its full team name. This will make it easier to loop over every team when scraping more data later.
Note: an alternative would be to store all of this information in a pandas DataFrame; we will save that for later.
In [2]:
# Collect every stats-page link (hrefs containing '/nba/teams/stats').
urls = soup_teams.find_all(href = re.compile('/nba/teams/stats'))
# NOTE(review): hrefs here appear to be site-relative; if they start with '/',
# base_url + href yields a '//' in the URL — ESPN tolerates it, but verify.
team_urls = [base_url + url['href'] for url in urls]
# Collect the team profile links; the last path segment is the slugified name.
urls = soup_teams.find_all(href = re.compile('http://www.espn.com/nba/team/_/name/'))
team_name = [url['href'].split('/')[-1] for url in urls]
# The stats URL ends with '?team=<abbrev>', so the text after '=' is the code.
team_name_abbrv = [url.split('=')[-1] for url in team_urls]
# NOTE(review): zipping assumes both find_all calls return teams in the same
# order and count — confirm against the live page before relying on this.
dict_team_name = dict(zip(team_name_abbrv, team_name))
dict_team_url = dict(zip(team_name_abbrv, team_urls))
In [3]:
def chunks(l, n):
    """Yield successive pieces of `l`, each of length `n`.

    The final piece may be shorter when len(l) is not a multiple of n.
    `l` must support len() and slicing (list, str, tuple, ...).
    """
    start = 0
    while start < len(l):
        yield l[start:start + n]
        start += n
In [4]:
def get_player_stats(team_code):
    """Scrape per-player season stats for one team from ESPN.

    Parameters
    ----------
    team_code : str
        ESPN team abbreviation (e.g. 'bkn'); must be a key of the
        module-level `dict_team_name`.

    Returns
    -------
    (list of list, list of list)
        Game-stats rows and shooting-stats rows.  Each row is the 15 cell
        texts with the player id inserted at index 0 and the team name at
        index 2 (17 entries total).
    """
    STATS_BASE_URL = 'http://espn.go.com/nba/teams/stats?team={0}'
    # Network I/O: fetch the team stats page.
    html_team = request.urlopen(STATS_BASE_URL.format(team_code))
    soup_team = BeautifulSoup(html_team, 'html5lib')
    # Player rows appear twice on the page: first the game-stats table,
    # then the shooting-stats table.
    roster = soup_team.find_all('tr', class_=re.compile('player'))
    roster_len = len(roster)
    # NOTE(review): assumes both tables contain the same player rows; for an
    # odd roster_len the two halves overlap by one row — confirm on live data.
    roster_game_stats, roster_shooting_stats = roster[:int(roster_len/2)], roster[-int(roster_len/2):]
    # The player id is the 8th path segment of each player's profile link.
    players_id = [player.a['href'].split('/')[7] for player in roster_game_stats]

    def _annotated_rows(table_rows):
        """Flatten cell text into 15-column rows, then prepend the player id
        (index 0) and insert the full team name (index 2)."""
        cells = [data.get_text() for row in table_rows for data in row]
        rows = [chunk for chunk in chunks(cells, 15)]
        for i in range(len(players_id)):
            rows[i].insert(0, players_id[i])
            rows[i].insert(2, dict_team_name[team_code])
        return rows

    # Previously this annotation loop was duplicated for each table; the
    # shared helper keeps both outputs consistent.
    players_array_game = _annotated_rows(roster_game_stats)
    players_array_shooting = _annotated_rows(roster_shooting_stats)
    return players_array_game, players_array_shooting
In [5]:
# now we can pull the stats for each team by looping through the team code in the dictionary above
# (here we fetch a single team, Brooklyn, as a demonstration; network call)
players_game_stats, players_shooting_stats = get_player_stats('bkn')
In [6]:
# Display the scraped game-stats rows for inspection (notebook rich output).
players_game_stats
Out[6]:
In [108]:
# Example gamelog URL this cell targets:
#http://www.espn.com/nba/player/gamelog/_/id/4299
#todo: get the column name for this and above function
# NOTE(review): this module-level player_id is shadowed by the function
# parameter below and never read — the call later passes the literal 4299.
player_id = 4299 #jeremy-lin
def get_match_stats(player_id):
    """Scrape a player's game-by-game log from ESPN.

    Parameters
    ----------
    player_id : int or str
        ESPN numeric player id (e.g. 4299 for Jeremy Lin).

    Returns
    -------
    list of list of str
        One inner list per game row.  Each cell's text is joined with a
        trailing ';' before splitting, so every inner list ends with an
        empty string — preserved for downstream compatibility.
    """
    stats_base_url = "http://www.espn.com/nba/player/gamelog/_/id/{0}"
    # Network I/O: fetch the player's gamelog page.
    html_stats = request.urlopen(stats_base_url.format(player_id))
    soup_stats = BeautifulSoup(html_stats, 'html5lib')
    # Game rows carry a CSS class containing 'row' in ESPN's gamelog table.
    game_stats = soup_stats.find_all('tr', class_ = re.compile('row'))
    # Previously each joined string was wrapped in a one-element list and
    # re-flattened, and an unused `row_count` variable was kept; build the
    # result directly instead.
    player_match_stats = []
    for game_row in game_stats:
        joined = ''.join(cell.get_text() + ';' for cell in game_row.find_all('td'))
        player_match_stats.append(joined.split(';'))
    return player_match_stats
In [109]:
# Fetch Jeremy Lin's (id 4299) game log; network call.
jeremy_lin_stats = get_match_stats(4299)
In [111]:
# NOTE(review): best practice is to place this import with the others at the
# top of the notebook so a fresh-kernel Run-All behaves predictably.
import pandas as pd
# Load the raw game-log rows into a DataFrame (columns are still unnamed;
# see the TODO above about recovering header names).
jeremy_lin_df = pd.DataFrame(jeremy_lin_stats)