In [1]:
import datetime
import time
import functools
import pandas as pd
import numpy as np
import nba_py
import nba_py.game
import nba_py.player
import nba_py.team
import pymysql
from sqlalchemy import create_engine
import sys
sys.path.append('/Users/wonderui/OneDrive/6_Module_Package')
sys.path.append('/Users/WangRui/OneDrive/6_Module_Package')
import hoop_pwd
pwd = hoop_pwd.password
In [7]:
conn = create_engine('mysql+pymysql://root:%s@118.190.202.87:3306/nba_stats' % pwd)
try:
# read sql table of game header
game_header = pd.read_sql_table('game_header', conn)
length_1 = len(game_header)
print(str(length_1) + ' games loaded.')
# set begin date to the newest date in sql table
begin = datetime.datetime.strptime(game_header.iloc[-1]['GAME_DATE_EST'][:10], "%Y-%m-%d").date()
except ValueError:
print('no table yet!')
length_1 = 0
# if no table yet, set begin date to 2012-10-29
begin = datetime.date(2012, 10, 29)
# grab game headers of begining date
game_header = nba_py.Scoreboard(month = begin.month,
day = begin.day,
year = begin.year, league_id = '00', offset = 0).game_header()
# set end date to today
end = datetime.date.today()
for i in range((end - begin).days + 1):
# grab game headers from begin date to end date
day = begin + datetime.timedelta(days = i)
game_header = game_header.append(nba_py.Scoreboard(month = day.month,
day = day.day,
year = day.year,
league_id = '00',
offset = 0).game_header())
print(str(day) + ' finished! ' + str(datetime.datetime.now().time())[:8])
length_2 = len(game_header)
# drop the duplicate by game id
game_header = game_header.drop_duplicates('GAME_ID')
length_3 = len(game_header)
print(str(length_2 - length_3) + ' duplicates droped.')
print(str(length_3 - length_1) + ' games added.')
# sort game headers by game id ascending
# game_header = game_header.sort_values('GAME_ID')
# commit new game headers to sql table
game_header.to_sql('game_header', conn, index = False, if_exists = 'replace')
print(str(length_3) + ' game headers commit complete!')
In [28]:
conn = create_engine('mysql+pymysql://root:%s@118.190.202.87:3306/nba_stats' % pwd)
game_stats_logs = pd.DataFrame()
try:
# read sql table of game stats logs id
game_stats_logs_id = pd.read_sql_table('game_stats_logs', conn, columns = ['GAME_ID'])
length_1 = len(game_stats_logs_id)
print(str(length_1) + ' player stats loaded.')
except ValueError:
print('no table yet!')
length_1 = 0
# create table and commit it to sql
game_stats_logs.to_sql('game_stats_logs', conn, index = False, if_exists = 'append')
print('game stats logs initialized!')
# define game types by the head of game id
game_type = {'001': 'pre_season', '002': 'regular_season', '003': 'all_star', '004': 'play_offs'}
# ------method 1------for game id in game headers from the max one in sql table
# for i in game_header[game_header['GAME_ID'] >= game_stats_logs['GAME_ID'].max()]['GAME_ID']:
# ------method 2------for game id in game header but not in game stats logs
for i in game_header['GAME_ID'][game_header['GAME_ID'].isin(game_stats_logs_id['GAME_ID'].drop_duplicates()) == False]:
# get game player stats of i
game_stats = nba_py.game.Boxscore(i).player_stats()
# create home team player stats
home_team_id = int(game_header[game_header['GAME_ID'] == i]['HOME_TEAM_ID'])
home_stats_logs = game_stats[game_stats['TEAM_ID'] == int(home_team_id)].copy()
home_stats_logs['LOCATION'] = 'HOME'
home_stats_logs['AGAINST_TEAM_ID'] = int(game_header[game_header['GAME_ID'] == i]['VISITOR_TEAM_ID'])
# create away team player stats
away_team_id = int(game_header[game_header['GAME_ID'] == i]['VISITOR_TEAM_ID'])
away_stats_logs = game_stats[game_stats['TEAM_ID'] == int(away_team_id)].copy()
away_stats_logs['LOCATION'] = 'AWAY'
away_stats_logs['AGAINST_TEAM_ID'] = int(game_header[game_header['GAME_ID'] == i]['HOME_TEAM_ID'])
# combine home and away team player stats and append to game stats logs
game_stats_logs = game_stats_logs.append(home_stats_logs)
game_stats_logs = game_stats_logs.append(away_stats_logs)
print('game ' + i + ' added! ' + str(datetime.datetime.now().time())[:8])
def min_convert(m):
'''
convert mm:ss to float
'''
try:
if ':' in m:
return float(m[:-3]) + round(float(m[-2:])/60, 2)
else:
return float(m)
except TypeError:
return None
# create float time
game_stats_logs['MINS'] = game_stats_logs['MIN'].apply(min_convert)
# add game type
game_stats_logs['GAME_TYPE'] = game_stats_logs['GAME_ID'].apply(lambda x: x[:3]).map(game_type)
# add game date and game sequence
game_stats_logs = game_stats_logs.merge(game_header[['GAME_DATE_EST', 'GAME_SEQUENCE', 'GAME_ID']],
how = 'left', on = 'GAME_ID')
# add new ordered game_id
game_stats_logs['GAME_ID_O'] = game_stats_logs['GAME_ID'].apply(lambda x: x[3:5] + x[:3] + x[-5:])
length_2 = len(game_stats_logs)
# drop duplicate game stats by game id and player id
game_stats_logs = game_stats_logs.drop_duplicates(['GAME_ID', 'PLAYER_ID'])
length_3 = len(game_stats_logs)
print(str(length_2 - length_3) + ' duplicates droped.')
print(str(length_3) + ' player stats added.')
# commit new game stats logs to sql table
game_stats_logs.to_sql('game_stats_logs', conn, index = False, if_exists = 'append')
print(str(length_3) + ' player stats commit complete!')
In [ ]:
In [2]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [6]:
In [ ]:
In [ ]: