In [180]:
import datetime
import random
from urllib.parse import quote_plus

import pandas as pd
import pytz
from sqlalchemy import create_engine
from password import hoop_pwd
pwd = hoop_pwd.password
%matplotlib inline
from nba_seer import *
In [181]:
# Create the SQL engine and load the game stats logs ----------------------
# SQLAlchemy parses the connection string as a URL, so the password is URL-quoted:
# characters such as '@', '/', ':' or '%' in a raw password would otherwise be
# misread as URL delimiters and the connection would fail or hit the wrong host.
conn = create_engine(
    'mysql+pymysql://root:%s@118.190.202.87:3306/nba_stats' % quote_plus(pwd)
)
game_stats_logs = pd.read_sql_table('game_stats_logs', conn)
# Exclude rows whose GAME_TYPE is 'all_star' from everything that follows.
game_stats_logs = game_stats_logs[game_stats_logs['GAME_TYPE'] != 'all_star']
print(str(len(game_stats_logs)) + ' player stats loaded.')
In [182]:
# Fetch the 2017-18 player list ----------------------
# NOTE(review): `nba_py` is never imported explicitly in this notebook; presumably it
# is re-exported by `from nba_seer import *` -- confirm.
all_players = (
    nba_py.player
    .PlayerList(season='2017-18')
    .info()
)
print('players list loaded')
In [183]:
def get_score(row):
    """Return the weighted box-score total (SCO) for one player in one game.

    The rows of the module-level ``game_stats_logs`` frame whose ``PLAYER_ID``
    equals ``row['PERSON_ID']`` and whose ``GAME_ID`` equals ``row['GAME_ID']``
    are scored as::

        PTS + 1.5*AST + OREB + 0.7*DREB + 2*STL + 1.8*BLK - TO
            + 0.4*FGM - (FGA - FGM) + 0.5*FG3M

    Returns the maximum score over the matching rows, which is NaN when no
    row matches.
    """
    same_player = game_stats_logs['PLAYER_ID'] == row['PERSON_ID']
    same_game = game_stats_logs['GAME_ID'] == row['GAME_ID']
    box = game_stats_logs[same_player & same_game]

    missed_shots = box['FGA'] - box['FGM']
    # Terms are added in the same left-to-right order as the formula above.
    score = (
        box['PTS'] * 1 + box['AST'] * 1.5
        + box['OREB'] * 1 + box['DREB'] * 0.7
        + box['STL'] * 2 + box['BLK'] * 1.8 + box['TO'] * -1
        + box['FGM'] * 0.4 + missed_shots * -1 + box['FG3M'] * 0.5
    )
    return score.max()
In [184]:
def test_result(players):
diff_pct = players['SCO_DIFF_PCT'].describe().to_frame()
diff_sco = players['SCO_DIFF'].describe().to_frame()
ans = pd.concat([diff_sco, diff_pct], axis=1)
return ans
In [185]:
def factor_cov(players):
    """Scatter-plot |MIN_COV_20 * SCO_COV_20| against |SCO_DIFF_PCT| and print their correlation.

    ``players`` must provide the ``MIN_COV_20``, ``SCO_COV_20`` and
    ``SCO_DIFF_PCT`` columns. Nothing is returned; the function draws the
    scatter plot and prints the correlation between the two derived series.
    """
    combined_cov = players['MIN_COV_20'].mul(players['SCO_COV_20']).abs()
    abs_diff_pct = players['SCO_DIFF_PCT'].abs()
    cov_diff = pd.DataFrame({'cov': combined_cov, 'diff_pct': abs_diff_pct})

    cov_diff.plot(x='cov', y='diff_pct', kind='scatter', grid=True, figsize=(10, 7))

    cov_diff_pct_corr = cov_diff['cov'].corr(cov_diff['diff_pct'])
    print('the corr between cov and different percantage is: ', cov_diff_pct_corr)
In [186]:
def diff_distribution(players, rg, interval):
    """Bar-plot the distribution of ``|SCO_DIFF_PCT|`` over ``[0, rg]``.

    Parameters
    ----------
    players : DataFrame with a ``SCO_DIFF_PCT`` column.
    rg : upper bound of the plotted range; rows with ``|SCO_DIFF_PCT| > rg``
        (and NaN rows) are left out of the histogram.
    interval : width of each bin.

    Raises
    ------
    ValueError
        If ``rg`` or ``interval`` is not positive.

    Side effects: draws a bar chart of the per-bin counts and prints the share
    of ``players`` rows that fall inside the range.
    """
    if rg <= 0 or interval <= 0:
        raise ValueError('rg and interval must both be positive')

    abs_pct = players['SCO_DIFF_PCT'].abs()
    diff_pct = abs_pct[abs_pct <= rg]

    # Bin edges 0, interval, 2*interval, ... and always ending exactly at rg.
    # Without the closing edge the top bucket is missing, so values that count
    # towards the printed "in range" share would silently vanish from the chart.
    # round() keeps float noise such as 0.30000000000000004 out of the edges.
    n_steps = int(rg / interval + 1e-9)
    bins = [round(step * interval, 10) for step in range(n_steps + 1)]
    if rg - bins[-1] > 1e-9:
        bins.append(rg)
    else:
        bins[-1] = rg

    # include_lowest=True so that an exact 0 lands in the first bin; bins are
    # otherwise right-closed and would drop it.
    pct_cut = pd.cut(diff_pct, bins, include_lowest=True)
    # Series.value_counts replaces the deprecated top-level pd.value_counts, and
    # to_frame(name=...) yields a 'count' column regardless of pandas version.
    pct_cut_df = pct_cut.value_counts().sort_index().to_frame(name='count')
    pct_cut_df.plot(kind='bar', figsize=(15, 7))

    in_range_share = len(diff_pct) / len(players) if len(players) else float('nan')
    print('data in range:', in_range_share)
In [187]:
# Distinct game dates: the first 10 characters of GAME_DATE_EST (parsed later with
# "%Y-%m-%d"). Truncate BEFORE de-duplicating: if two raw strings for the same day
# differ after the 10th character, de-duplicating first would leave that day in the
# list twice, letting it be sampled twice downstream. Missing values are dropped so
# they cannot break the slicing.
game_date = game_stats_logs['GAME_DATE_EST'].dropna().str[:10].drop_duplicates()
In [220]:
# Pick five game dates at random and gather the player rows for the games on each.
# The per-date frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop copies
# the accumulated frame on every iteration.
# NOTE(review): random.sample is unseeded, so each run evaluates a different sample.
sampled_frames = []
for date_str in random.sample(list(game_date), 5):
    game_day = datetime.datetime.strptime(date_str, "%Y-%m-%d").date()
    games = get_games(game_day)
    sampled_frames.append(get_players_p(games, game_stats_logs))
    print(str(game_day) + ' complete!')
players = pd.concat(sampled_frames)
# The original `for ... else` had no `break`, so its else-branch always ran;
# a plain statement after the loop says the same thing without the puzzle.
print(len(players), ' players total!')
In [221]:
# Rebind `players` to the frame returned by get_exp_sco. That helper is not defined
# in this notebook (presumably it comes from `from nba_seer import *`). Later cells
# read an EXP_SCO column from `players`, so it is assumed to be added here -- TODO confirm.
players = get_exp_sco(players, game_stats_logs)
In [222]:
# Score every sampled row with get_score, then derive the prediction error:
# SCO_DIFF is expected minus actual, SCO_DIFF_PCT is that difference relative to actual.
players['ACT_SCO'] = players.apply(get_score, axis=1)
players['SCO_DIFF'] = players['EXP_SCO'].sub(players['ACT_SCO'])
players['SCO_DIFF_PCT'] = players['SCO_DIFF'].div(players['ACT_SCO'])
print('test data loaded!')
In [223]:
# Keep only rows with a usable actual score: not missing and not zero
# (ACT_SCO is the denominator of SCO_DIFF_PCT).
players = players[players['ACT_SCO'].notnull() & players['ACT_SCO'].ne(0)]
In [224]:
# Show the evaluated rows ordered by actual score, lowest first.
players.sort_values(by='ACT_SCO')
Out[224]:
In [247]:
# Restrict the sample to rows below all three thresholds at once:
# 5_g_d < 20, SCO_COV_20 < 0.6 and MIN_COV_20 < 0.25.
players_t = players[
    (players['5_g_d'] < 20)
    & (players['SCO_COV_20'] < 0.6)
    & (players['MIN_COV_20'] < 0.25)
]
players_t.sort_values(by='ACT_SCO')
Out[247]:
In [237]:
def location_eff(game_stats_logs, row):
    """Return ``(home_ratio, away_ratio)`` for the player and game described by ``row``.

    Only the player's log rows with a non-null ``MINS`` and a ``GAME_ID_O``
    that sorts before this game are considered. Each ratio divides the first
    element of ``get_score_36`` over the last 20 HOME (resp. AWAY) rows by the
    first element of ``get_score_36`` over the last 40 rows of any location.

    NOTE(review): the denominator is not guarded; what happens when it is zero
    or missing depends on what ``get_score_36`` returns -- confirm.
    """
    player_id = row['PERSON_ID']
    raw_game_id = row['GAME_ID']
    # Rebuild the id as chars 3-4 + chars 0-2 + last five chars so it can be
    # compared against the GAME_ID_O column.
    game_id_o = raw_game_id[3:5] + raw_game_id[:3] + raw_game_id[-5:]

    history = game_stats_logs[game_stats_logs['PLAYER_ID'] == player_id].sort_values('GAME_ID_O')
    # The "played, and before this game" filter is shared by all three windows.
    earlier = history[history['MINS'].notnull() & (history['GAME_ID_O'] < game_id_o)]

    def window_score(frame, n_games):
        """First element of get_score_36 over the last ``n_games`` rows of ``frame``."""
        return get_score_36(frame.tail(n_games))[0]

    home_score_20 = window_score(earlier[earlier['LOCATION'] == 'HOME'], 20)
    away_score_20 = window_score(earlier[earlier['LOCATION'] == 'AWAY'], 20)
    recent_score_40 = window_score(earlier, 40)
    return home_score_20 / recent_score_40, away_score_20 / recent_score_40
In [238]:
# Re-evaluate the filtered sample with a home/away adjustment applied to EXP_SCO.
players_t_1 = players_t.copy()
# location_eff filters the whole log frame for each row and returns a (home, away)
# pair, so call it once per row and split the pair afterwards instead of running
# the same work twice per row just to read the two halves.
location_ratios = [location_eff(game_stats_logs, row) for _, row in players_t_1.iterrows()]
players_t_1['home_eff'] = [pair[0] for pair in location_ratios]
players_t_1['away_eff'] = [pair[1] for pair in location_ratios]
# Scale the expected score by the ratio matching where the game is played.
# NOTE(review): this frame is read with key 'Location' while the stats log uses
# 'LOCATION' -- confirm the column name on the frame built by get_players_p.
players_t_1['EXP_SCO'] = players_t_1.apply(lambda x: x['EXP_SCO'] * x['home_eff'] if x['Location'] == 'HOME'
                                           else x['EXP_SCO'] * x['away_eff'],
                                           axis=1)
# Recompute the actual score and the error columns against the adjusted expectation.
players_t_1['ACT_SCO'] = players_t_1.apply(get_score, axis=1)
players_t_1['SCO_DIFF'] = players_t_1['EXP_SCO'] - players_t_1['ACT_SCO']
players_t_1['SCO_DIFF_PCT'] = players_t_1['SCO_DIFF'] / players_t_1['ACT_SCO']
print('test data loaded!')
In [239]:
# Error summary (describe() of SCO_DIFF and SCO_DIFF_PCT) for the full evaluated sample.
test_result(players)
Out[239]:
In [240]:
# Same error summary for the threshold-filtered subset `players_t`.
test_result(players_t)
Out[240]:
In [241]:
# Same error summary for `players_t_1`, whose EXP_SCO was scaled by the home/away ratio.
test_result(players_t_1)
Out[241]:
In [242]:
# Scatter of |MIN_COV_20 * SCO_COV_20| against |SCO_DIFF_PCT|, plus their correlation, for the full sample.
factor_cov(players)
In [243]:
# Same scatter and correlation for the threshold-filtered subset `players_t`.
factor_cov(players_t)
In [244]:
# Distribution of |SCO_DIFF_PCT| up to 5 in bins of width 0.1 for the full sample.
diff_distribution(players, 5, 0.1)
In [245]:
# Same distribution for the threshold-filtered subset `players_t`.
diff_distribution(players_t, 5, 0.1)
In [246]:
# Same distribution for `players_t_1`, whose EXP_SCO was scaled by the home/away ratio.
diff_distribution(players_t_1, 5, 0.1)
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]: