In [3]:
import numpy as np
import pandas as pd
import requests
import keras
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import random
import calendar

%matplotlib inline


Using TensorFlow backend.

In [6]:
# Build the `teams` DataFrame (indexed by team name) from ESPN's team list.
# Scraping approach adapted from:
# http://danielfrg.com/blog/2013/04/01/nba-scraping-data/
url = 'http://espn.go.com/nba/teams'
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page into an
# empty team table.
r.raise_for_status()

soup = BeautifulSoup(r.text, 'html5lib')
tables = soup.find_all('ul', class_='medium-logos')

team_names = []
prefix_1 = []
prefix_2 = []
teams_urls = []
for table in tables:
    for li in table.find_all('li'):
        info = li.h5.a
        # Use a distinct name so the page-level `url` above is not clobbered.
        team_url = info['href']
        url_parts = team_url.split('/')
        team_names.append(info.text)
        teams_urls.append(team_url)
        # The last two path segments of the team link are kept as prefixes;
        # prefix_1 is what the stats URL template is later filled with.
        prefix_1.append(url_parts[-2])
        prefix_2.append(url_parts[-1])

if not team_names:
    # An empty result means the expected markup was not found; stop here
    # rather than letting later cells run on an empty frame.
    raise RuntimeError("No teams found at " + url)

dic = {'url': teams_urls, 'prefix_2': prefix_2, 'prefix_1': prefix_1}
teams = pd.DataFrame(dic, index=team_names)

In [8]:
# URL template for a team's per-player stats page.
# {0} is filled with the team's 'prefix_1' value and {1} with the season year.
BASE_URL = 'http://www.espn.com/nba/team/stats/_/name/{0}/year/{1}'

# Column names, in order, for each scraped player row: the player name
# followed by fourteen numeric stat fields.
column_headers = [
    u'PLAYER',
    u'GP', u'GS', u'MIN', u'PPG',
    u'OFFR', u'DEFR', u'RPG',
    u'APG', u'SPG', u'BPG', u'TPG', u'FPG',
    u'A/TO', u'PER',
]

def normalizeColumn(column):
    """Min-max scale a numeric column into the range [0, 1].

    Parameters
    ----------
    column : pandas.Series (or any iterable exposing ``.min()`` / ``.max()``)
        The numeric values to rescale.

    Returns
    -------
    list
        One scaled value per input record, in the original order, computed
        as ``(record - min) / (max - min)``. If every value in the column is
        identical (zero range) each record maps to ``0.0`` rather than
        producing a 0/0 result.
    """
    theMin = column.min()
    valueRange = column.max() - theMin
    if valueRange == 0:
        # Constant column: dividing would yield NaN (or raise), which would
        # then contaminate any sum taken over the normalized columns.
        return [0.0 for _ in column]
    return [(record - theMin) / valueRange for record in column]

# For each season: scrape every team's per-player stats table, add a min-max
# normalized copy of each stat column plus the row-wise sum of those
# normalized columns, and write the result to player_stats_<year>.csv.
for year in range(2003, 2018):
    player_data = []
    for index, row in teams.iterrows():
        stats_url = BASE_URL.format(row['prefix_1'], year)
        r = requests.get(stats_url, timeout=30)
        # Surface HTTP failures directly instead of parsing an error page.
        r.raise_for_status()
        table = BeautifulSoup(r.text, 'html5lib').table
        if table is None:
            raise RuntimeError("No stats table found at " + stats_url)
        # Skip the first two rows (headers) and the final row
        # (presumably a totals row -- confirm against the page).
        # The loop variable is `tr` so it does not shadow the team `row`.
        for tr in table.find_all('tr')[2:-1]:
            cells = [td.getText() for td in tr.findAll('td')]
            # First cell is the player name (kept UTF-8 encoded as before);
            # every other cell is converted to float so we can do math.
            formattedData = [cells[0].encode('utf-8')] + [float(x) for x in cells[1:]]
            player_data.append(formattedData)
    df = pd.DataFrame(player_data, columns=column_headers)
    # Snapshot the stat column names first, because new columns are appended
    # to df inside the loop.
    stat_columns = list(df.columns[1:])
    for column in stat_columns:
        df[column + "_Normalized"] = normalizeColumn(df[column])
    # Everything after the original headers is a *_Normalized column.
    df['scores_Normalized'] = df.iloc[:, len(column_headers):].apply(sum, axis = 1)
    df.to_csv("player_stats_" + str(year) + ".csv")

Two more datasets are needed for this training:

  1. The game schedule for each team, from 'http://espn.go.com/nba/team/schedule/_/name/{0}/year/{1}/{2}'

  2. The stats of each player for every game they have played, e.g. "http://www.espn.com/nba/player/gamelog/_/id/1975/carmelo-anthony"