In [1]:
import re
import urllib.request as request
from urllib.parse import urljoin

from bs4 import BeautifulSoup

# Root of the ESPN site and the NBA teams index page that lists every team.
base_url = 'http://www.espn.com/'
teams_url = base_url + 'nba/teams'

# Fetch and parse the teams page. The response is used as a context manager
# so the underlying connection is released once the page has been parsed
# instead of staying open for the life of the kernel. `html_teams` remains
# bound (to the now-closed response) after the block.
with request.urlopen(teams_url) as html_teams:
    soup_teams = BeautifulSoup(html_teams, 'html5lib')

Get the link for every team and build dictionaries keyed by the team's abbreviated name: one mapping to the full team name and one mapping to the team's stats URL. This makes it easier to loop over all the teams and scrape more data later.

Note: an alternative would be to store all of this information in a pandas DataFrame; we will save that for later.


In [2]:
# Links to each team's stats page; the team code is the value after '=' in
# the link's query string.
urls = soup_teams.find_all(href = re.compile('/nba/teams/stats'))
# Resolve each href against base_url with urljoin rather than plain string
# concatenation: base_url ends with '/' and the matched href path starts
# with '/', so `base_url + href` would yield 'http://www.espn.com//nba/...'
# (and a nonsense URL if the href were already absolute).
team_urls = [urljoin(base_url, url['href']) for url in urls]
team_name_abbrv = [team_url.split('=')[-1] for team_url in team_urls]

# Links to each team's home page; the last path segment is used as the
# team name.
urls = soup_teams.find_all(href = re.compile('http://www.espn.com/nba/team/_/name/'))
team_name = [url['href'].split('/')[-1] for url in urls]

# NOTE(review): both mappings pair the lists positionally, which assumes the
# two link lists come back in the same team order and with the same length
# (zip silently truncates to the shorter list) -- confirm against the page.
dict_team_name = dict(zip(team_name_abbrv, team_name))
dict_team_url = dict(zip(team_name_abbrv, team_urls))

Create a list of players with their stats.


In [3]:
def chunks(l, n):
    """Lazily split ``l`` into consecutive slices of length ``n``.

    ``l`` must support ``len()`` and slicing. The final slice is shorter
    when ``len(l)`` is not a multiple of ``n``; an empty ``l`` yields
    nothing.
    """
    total = len(l)
    yield from (l[start:start + n] for start in range(0, total, n))

In [4]:
def _rows_to_records(rows, team_name):
    """Turn player table rows into flat ``[id, name cell, team, stats...]`` lists."""
    records = []
    for row in rows:
        # One text value per direct child of the row, so a record always has
        # exactly as many cells as its own row (no fixed column count).
        record = [cell.get_text() for cell in row]
        # The player id is read from this row's own first link (8th
        # '/'-separated piece of the href), so it cannot be paired with the
        # wrong row even if the tables are ordered differently.
        record.insert(0, row.a['href'].split('/')[7])
        record.insert(2, team_name)
        records.append(record)
    return records


def get_player_stats(team_code):
    """Scrape the player stat tables from a team's ESPN stats page.

    The page's ``<tr>`` rows whose class matches ``player`` are split in two:
    the first half is treated as the game-stats table and the last half as
    the shooting-stats table (this assumes every player appears once in each
    table -- verify if the page layout changes).

    Parameters
    ----------
    team_code : str
        Team abbreviation substituted into the stats URL and used as the key
        into the module-level ``dict_team_name`` (e.g. ``'bkn'``).

    Returns
    -------
    tuple of (list of list of str, list of list of str)
        ``(game_records, shooting_records)``. Each record is the player id,
        the first cell's text, the team name from ``dict_team_name`` and then
        the remaining cell texts. Both lists are empty when fewer than two
        player rows are found.

    Raises
    ------
    KeyError
        If player rows were found but ``team_code`` is not in
        ``dict_team_name``.
    """
    STATS_BASE_URL = 'http://espn.go.com/nba/teams/stats?team={0}'
    # Close the HTTP response as soon as the page has been parsed.
    with request.urlopen(STATS_BASE_URL.format(team_code)) as html_team:
        soup_team = BeautifulSoup(html_team, 'html5lib')

    roster = soup_team.find_all('tr', class_=re.compile('player'))
    half = len(roster) // 2
    if half == 0:
        # Guard: with half == 0 a slice like roster[-half:] is roster[0:],
        # i.e. the whole list rather than an empty one.
        return [], []

    team_name = dict_team_name[team_code]
    players_array_game = _rows_to_records(roster[:half], team_name)
    # Last `half` rows; with an odd row count the middle row is skipped.
    players_array_shooting = _rows_to_records(roster[len(roster) - half:], team_name)

    return players_array_game, players_array_shooting

In [5]:
# Example for a single team: fetch both stat tables for 'bkn'.
# The same call can be repeated for every team code, i.e. every key of
# dict_team_name built above.
players_game_stats, players_shooting_stats = get_player_stats('bkn')

In [6]:
# Display the game-stats records: each is
# [player id, "name, position", team name, <stat cell texts>...].
players_game_stats


Out[6]:
[['3448',
  'Brook Lopez, C',
  'brooklyn-nets',
  '75',
  '75',
  '29.6',
  '20.5',
  '1.6',
  '3.8',
  '5.4',
  '2.3',
  '0.51',
  '1.65',
  '2.5',
  '2.6',
  '1.0',
  '20.5'],
 ['4299',
  'Jeremy Lin, PG',
  'brooklyn-nets',
  '36',
  '33',
  '24.5',
  '14.5',
  '0.3',
  '3.4',
  '3.8',
  '5.1',
  '1.17',
  '0.42',
  '2.4',
  '2.2',
  '2.1',
  '19.3'],
 ['3593',
  'Bojan Bogdanovic, SF†',
  'brooklyn-nets',
  '55',
  '54',
  '26.9',
  '14.2',
  '0.4',
  '3.2',
  '3.6',
  '1.6',
  '0.44',
  '0.05',
  '1.7',
  '1.8',
  '1.0',
  '13.1'],
 ['2488689',
  'Sean Kilpatrick, SG',
  'brooklyn-nets',
  '70',
  '24',
  '25.1',
  '13.1',
  '0.3',
  '3.7',
  '4.0',
  '2.2',
  '0.64',
  '0.09',
  '1.9',
  '1.7',
  '1.2',
  '13.2'],
 ['4270',
  'Trevor Booker, PF',
  'brooklyn-nets',
  '71',
  '43',
  '24.7',
  '10.0',
  '2.0',
  '6.0',
  '8.0',
  '1.9',
  '1.06',
  '0.39',
  '1.8',
  '2.1',
  '1.1',
  '15.7'],
 ['3064291',
  'Rondae Hollis-Jefferson, SF',
  'brooklyn-nets',
  '78',
  '50',
  '22.6',
  '8.7',
  '1.2',
  '4.6',
  '5.8',
  '2.0',
  '1.05',
  '0.56',
  '1.5',
  '2.3',
  '1.3',
  '13.7'],
 ['2528794',
  'Joe Harris, SG',
  'brooklyn-nets',
  '52',
  '11',
  '21.9',
  '8.2',
  '0.3',
  '2.5',
  '2.8',
  '1.0',
  '0.58',
  '0.15',
  '1.1',
  '2.3',
  '1.0',
  '9.0'],
 ['2991043',
  'Caris LeVert, SG',
  'brooklyn-nets',
  '57',
  '26',
  '21.7',
  '8.2',
  '0.4',
  '2.9',
  '3.3',
  '1.9',
  '0.86',
  '0.14',
  '1.0',
  '1.6',
  '1.9',
  '12.3'],
 ['2991281',
  'Archie Goodwin, SG†',
  'brooklyn-nets',
  '12',
  '0',
  '15.3',
  '7.9',
  '0.6',
  '1.8',
  '2.3',
  '1.9',
  '0.33',
  '0.33',
  '1.2',
  '0.6',
  '1.6',
  '18.6'],
 ['3136477',
  'Isaiah Whitehead, G',
  'brooklyn-nets',
  '73',
  '26',
  '22.5',
  '7.4',
  '0.4',
  '2.1',
  '2.5',
  '2.6',
  '0.58',
  '0.49',
  '1.9',
  '2.4',
  '1.4',
  '7.6'],
 ['2580782',
  'Spencer Dinwiddie, PG',
  'brooklyn-nets',
  '59',
  '18',
  '22.6',
  '7.3',
  '0.5',
  '2.3',
  '2.8',
  '3.1',
  '0.75',
  '0.39',
  '1.1',
  '2.0',
  '2.8',
  '12.7'],
 ['6590',
  'Justin Hamilton, C',
  'brooklyn-nets',
  '64',
  '7',
  '18.4',
  '6.9',
  '1.1',
  '3.0',
  '4.1',
  '0.9',
  '0.45',
  '0.67',
  '0.7',
  '1.4',
  '1.3',
  '13.6'],
 ['6576',
  'Quincy Acy, SF†',
  'brooklyn-nets',
  '32',
  '1',
  '15.9',
  '6.5',
  '0.6',
  '2.8',
  '3.3',
  '0.6',
  '0.44',
  '0.47',
  '0.6',
  '1.8',
  '0.9',
  '13.1'],
 ['2566741',
  'KJ McDaniels, SG†',
  'brooklyn-nets',
  '20',
  '0',
  '14.7',
  '6.3',
  '0.5',
  '2.2',
  '2.6',
  '0.5',
  '0.60',
  '0.50',
  '1.0',
  '1.3',
  '0.5',
  '12.5'],
 ['2991018',
  'Yogi Ferrell, PG†',
  'brooklyn-nets',
  '10',
  '0',
  '15.1',
  '5.4',
  '0.4',
  '0.8',
  '1.2',
  '1.7',
  '0.20',
  '0.20',
  '1.4',
  '1.1',
  '1.2',
  '6.6'],
 ['3003',
  'Randy Foye, SG',
  'brooklyn-nets',
  '69',
  '40',
  '18.6',
  '5.2',
  '0.1',
  '2.1',
  '2.2',
  '2.0',
  '0.51',
  '0.13',
  '1.2',
  '1.4',
  '1.7',
  '7.3'],
 ['1781',
  'Luis Scola, PF',
  'brooklyn-nets',
  '36',
  '1',
  '12.8',
  '5.1',
  '1.4',
  '2.4',
  '3.9',
  '1.0',
  '0.39',
  '0.11',
  '0.9',
  '1.8',
  '1.1',
  '14.0'],
 ['2991473',
  'Anthony Bennett, PF',
  'brooklyn-nets',
  '23',
  '1',
  '11.5',
  '5.0',
  '1.1',
  '2.3',
  '3.4',
  '0.5',
  '0.22',
  '0.13',
  '0.5',
  '0.8',
  '1.0',
  '14.7'],
 ['6614',
  'Andrew Nicholson, PF†',
  'brooklyn-nets',
  '10',
  '0',
  '11.1',
  '3.0',
  '0.4',
  '2.3',
  '2.7',
  '0.3',
  '0.50',
  '0.00',
  '0.6',
  '1.8',
  '0.5',
  '5.1'],
 ['3153165',
  'Chris McCullough, PF†',
  'brooklyn-nets',
  '14',
  '0',
  '5.1',
  '2.5',
  '0.6',
  '0.6',
  '1.2',
  '0.1',
  '0.07',
  '0.14',
  '0.1',
  '0.5',
  '1.0',
  '16.9'],
 ['4291',
  'Greivis Vasquez, PG',
  'brooklyn-nets',
  '3',
  '0',
  '13.0',
  '2.3',
  '0.0',
  '0.7',
  '0.7',
  '1.7',
  '0.33',
  '0.33',
  '0.3',
  '2.0',
  '5.0',
  '4.1']]

Get the historical per-game (match) stats for each player.


In [108]:
# Example game-log page: http://www.espn.com/nba/player/gamelog/_/id/4299
# TODO: capture the column names for this function and for get_player_stats above
# NOTE(review): this module-level value is not read by get_match_stats (its
# parameter has the same name); the call in the next cell passes 4299 literally.
player_id = 4299 # Jeremy Lin's ESPN player id
def get_match_stats(player_id):
    """Scrape a player's ESPN game-log page into a list of text rows.

    Parameters
    ----------
    player_id : int or str
        ESPN player id, substituted into the game-log URL.

    Returns
    -------
    list of list of str
        One inner list per ``<tr>`` whose class matches ``row``: the text of
        each ``<td>`` in document order, followed by one trailing empty
        string (see the comment in the body).
    """
    stats_base_url = "http://www.espn.com/nba/player/gamelog/_/id/{0}"
    # Close the HTTP response as soon as the page has been parsed.
    with request.urlopen(stats_base_url.format(player_id)) as html_stats:
        soup_stats = BeautifulSoup(html_stats, 'html5lib')

    game_stats = soup_stats.find_all('tr', class_ = re.compile('row'))

    # Collect the cell texts directly instead of joining them with ';' and
    # splitting again, which would break any cell whose text contains ';'.
    # The trailing '' is kept on purpose: rows have always ended with one
    # empty field, and dropping it would change the width of anything built
    # from the result (e.g. a DataFrame would lose its last column).
    player_match_stats = [
        [cell.get_text() for cell in row.find_all('td')] + ['']
        for row in game_stats
    ]
    return player_match_stats

In [109]:
# Game log for player id 4299 (Jeremy Lin): one list of cell texts per table row.
jeremy_lin_stats = get_match_stats(4299)

In [111]:
# NOTE(review): notebook convention is to keep all imports in the first cell;
# consider moving this one there.
import pandas as pd
# No column names are passed, so the columns get default integer labels.
# TODO: supply real column names once they are scraped.
jeremy_lin_df = pd.DataFrame(jeremy_lin_stats)