In [3]:
import numpy as np
import pandas as pd
import requests
import keras
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import random
import calendar
%matplotlib inline
In [6]:
# Scrape the team index page and build a lookup table with one row per team.
# NOTE(review): `url` is empty in this notebook; it must point at the team
# index page before this cell can run (requests rejects an empty URL).
url = ''
r = requests.get(url)
r.raise_for_status()  # fail loudly instead of parsing an HTTP error page
soup = BeautifulSoup(r.text, 'html5lib')
tables = soup.find_all('ul', class_='medium-logos')

# Parallel accumulators: entry i of each list describes the same team.
teams = []
prefix_1 = []
prefix_2 = []
teams_urls = []
for table in tables:
    lis = table.find_all('li')
    for li in lis:
        info = li.h5.a
        # Use a separate name so the page-level `url` above is not clobbered.
        team_url = info['href']
        # Assumes each team href ends in ".../<prefix_1>/<prefix_2>"
        # (e.g. short code, then slug) -- TODO confirm against the live page.
        url_parts = team_url.rstrip('/').split('/')
        teams.append(info.get_text())
        teams_urls.append(team_url)
        prefix_1.append(url_parts[-2])
        prefix_2.append(url_parts[-1])

# The list of team names becomes the index; `teams` is then rebound to the frame.
dic = {'url': teams_urls, 'prefix_2': prefix_2, 'prefix_1': prefix_1}
teams = pd.DataFrame(dic, index=teams)
In [8]:
# URL template requested once per team and season:
# {0} is the team's `prefix_1`, {1} is the year.
BASE_URL = '{0}/year/{1}'

# Column names for the scraped rows: the player name first, then the
# numeric per-player statistics in table order.
column_headers = [
    u'PLAYER',
    u'GP', u'GS', u'MIN', u'PPG',
    u'OFFR', u'DEFR', u'RPG',
    u'APG', u'SPG', u'BPG', u'TPG', u'FPG',
    u'A/TO', u'PER',
]
def normalizeColumn(column):
    """Min-max scale the records of ``column`` into the range [0, 1].

    Parameters
    ----------
    column
        An iterable of numbers that also provides ``max()`` and ``min()``
        (this notebook passes a pandas Series).

    Returns
    -------
    list
        One scaled value per record, in the original order. When every
        record is identical the range is zero; each record then maps to
        0.0 instead of dividing by zero.
    """
    theMin = column.min()
    # Compute the range once instead of once per record.
    span = column.max() - theMin
    if span == 0:
        # Constant column: avoid 0/0 (NaN for numpy floats, an exception for
        # plain Python numbers).
        return [0.0 for _ in column]
    return [(record - theMin) / span for record in column]
# For every season: scrape each team's stats table, min-max normalise every
# numeric column, add a summed score, and write one CSV per season.
for year in range(2003, 2018):
    player_data = []
    for index, row in teams.iterrows():
        r = requests.get(BASE_URL.format(row['prefix_1'], year))
        table = BeautifulSoup(r.text, 'html5lib').table
        if table is None:
            # The page has no <table> for this team/season; nothing to parse.
            continue
        # Drop the two leading header rows and the final row
        # (presumably a totals row -- verify against the page).
        for tr in table.find_all('tr')[2:-1]:
            data = [td.getText() for td in tr.find_all('td')]
            # Keep the name as text (encoding it would yield bytes on
            # Python 3 and leak "b'...'" into the CSV); convert the other
            # fields to floats so we can do math on them.
            formattedData = [data[0]] + [float(x) for x in data[1:]]
            player_data.append(formattedData)

    df = pd.DataFrame(player_data, columns=column_headers)
    # Every column except the leading player name is numeric.
    for column in column_headers[1:]:
        df[column + "_Normalized"] = normalizeColumn(df[column])
    # The normalised columns sit directly after the original headers.
    # skipna=False keeps NaN propagation identical to a plain row-wise sum.
    df['scores_Normalized'] = df.iloc[:, len(column_headers):].sum(axis=1, skipna=False)
    df.to_csv("player_stats_" + str(year) + ".csv")
Two more datasets are needed for this training:
1. The schedule of each game: '{0}/year/{1}/{2}'
2. The stats of each player for every game they have played: ""