In [ ]:
import gc
import os
import pickle

import category_encoders as ce
import numpy as np
import pandas as pd
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from keras.layers import ActivityRegularization, Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, RobustScaler, StandardScaler

from read_data import read_data
from enriching_data import enrich_all_data
from scrap_ref import getSchedule, scrap_all_data, get_first_year_for_player, scrap_new_data_only
#getSchedule('https://www.basketball-reference.com/leagues/NBA_')
nba_base_link = "https://www.basketball-reference.com/players/"
nba_base_url = "https://www.basketball-reference.com/"
scrap_new_data_only(nba_base_link, nba_base_url, 'nba')
enrich_all_data()
cat_features = ["name", "name2", "name3", "day_of_the_week", "location", "opponent", "opponent2",
"opponent3", "opp-team-coach", "opp-team-executive"]
# Custom Keras metrics, computed on backend tensors.
def rmse(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1))

def mse(y_true, y_pred):
    return K.mean(K.square(y_pred - y_true), axis=-1)

def sse(y_true, y_pred):
    return K.sum(K.square(y_pred - y_true), axis=-1)

def mae(y_true, y_pred):
    return K.mean(K.abs(y_pred - y_true), axis=-1)
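In [ ]:
# A quick sanity check of the custom metrics above, evaluated with K.eval on
# constant tensors (illustrative values only): a uniform 0.5 error should give
# rmse == mae == 0.5 and mse == 0.25.
y_true = K.constant([1.0, 2.0, 3.0])
y_pred = K.constant([1.5, 2.5, 3.5])
print(K.eval(rmse(y_true, y_pred)))  # ~0.5
print(K.eval(mse(y_true, y_pred)))   # ~0.25
print(K.eval(mae(y_true, y_pred)))   # ~0.5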
In [ ]:
epochs_size = 350
batch_size = 64
learning_rate = 0.001
dropout = 0.15
l2 = 0.05            # not currently wired into the model below
poly_features = 2    # degree for the (currently disabled) PolynomialFeatures step
from_year = 2005
to_year = 2200       # effectively "through the latest available season"
# Same-game box-score stats and identifiers that would leak the target, plus unused metadata.
dropping_columns = ['Unnamed: 0', 'fantasy_points', 'ast', 'blk', 'drb', 'fg', 'fg3', 'fg3_pct', 'fg3a', 'fg_pct',
                    'fga', 'ft', 'ft_pct', 'fta', 'minutes', 'orb', 'pf', 'plus', 'pts', 'result', 'score', 'stl',
                    'tov', 'trb', 'seconds', 'age', "team", "team2", "team3", "team-coach", "team-executive",
                    'year', 'age1', 'date', 'efg']
In [ ]:
print("reading data")
# all_data = read_data_for_player('jamesle01')
all_data = read_data(from_year, to_year)
points_ = all_data['fantasy_points']
all_data = all_data.drop(dropping_columns, axis=1)
# points_.reset_index(drop=True, inplace=True)
final = pd.concat([all_data, points_], axis=1)
non_categorical = all_data.drop(cat_features, axis=1)
print(list(all_data.columns.values))
print(list(all_data.dtypes))
print(all_data.head(10))
print(all_data.shape)
gc.collect()
print("normalizing non categorical features")
res = []
for name in non_categorical:
print(name)
# additional stuff
imp = Imputer(missing_values='NaN', strategy='median', axis=0)
robust = RobustScaler()
scaler = StandardScaler()
pipe = Pipeline([
('imp', imp),
('scaler', scaler)
#,
# ('poly', poly),
#('robust', robust)
])
dt = all_data[[name]]
pipe.fit(dt)
transformed = pipe.transform(dt)
file_pi = open(os.path.join('pipes', str(name)), 'wb')
pickle.dump(pipe, file_pi)
file_pi.close()
cols = [name]
frame = pd.DataFrame(transformed, columns=cols)
frame.reset_index(drop=True, inplace=True)
print(frame.shape)
res.append(frame)
print("starting convert cat features")
for name in cat_features:
print(name)
encoder = ce.BaseNEncoder()
dt = all_data[[name]]
encoder.fit(dt, points_)
name_cleaned = encoder.transform(dt)
file_pi = open(os.path.join('pipes', str(name)), 'wb')
pickle.dump(encoder, file_pi)
file_pi.close()
name_cleaned.reset_index(drop=True, inplace=True)
res.append(name_cleaned)
print(name_cleaned.shape)
print("finished converting cat features")
print("joining stuff")
all_features = pd.concat(res, axis=1)  # avoid shadowing the built-in all()
gc.collect()
print(all_features.head(10))
input_shape = len(all_features.columns)
print(input_shape)
points_.reset_index(drop=True, inplace=True)
final = pd.concat([all_features, points_], axis=1)
# test_size must be non-zero: x_test feeds the SHAP cells below.
x_train, x_test, y_train, y_test = train_test_split(all_features, points_, test_size=0.1,
                                                    random_state=1)
model = Sequential()
# Funnel architecture: a wide first layer narrowing down to a single regression output.
model.add(Dense(2500, activation='relu', input_shape=(input_shape,), bias_initializer='glorot_normal'))
model.add(Dropout(dropout))
model.add(Dense(1500, activation='relu', bias_initializer='glorot_normal'))
model.add(Dropout(dropout))
model.add(Dense(1000, activation='relu', bias_initializer='glorot_normal'))
model.add(Dropout(dropout))
model.add(Dense(500, activation='relu', bias_initializer='glorot_normal'))
model.add(Dropout(dropout))
model.add(Dense(250, activation='relu', bias_initializer='glorot_normal'))
model.add(Dropout(dropout))
model.add(Dense(100, activation='relu', bias_initializer='glorot_normal'))
model.add(Dense(1))
model.compile(optimizer=Adam(learning_rate),
              loss='mean_squared_error',
              metrics=[rmse, mae])
model.fit(x_train, y_train, epochs=epochs_size, batch_size=batch_size)
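In [ ]:
# The fitted pipelines/encoders pickled into 'pipes' above have to be re-applied
# to any new rows at prediction time; a minimal sketch for one numeric column,
# reusing the data already in memory (any column of non_categorical works).
col = non_categorical.columns[0]
with open(os.path.join('pipes', str(col)), 'rb') as f:
    saved_pipe = pickle.load(f)
print(saved_pipe.transform(all_data[[col]]).shape)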
In [ ]:
model.save('fantasy.h5')
#print(model.evaluate(x_test, y_test, batch_size=batch_size))
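In [ ]:
# Reloading 'fantasy.h5' in a fresh session requires registering the custom
# metric functions; a minimal sketch using Keras' custom_objects argument.
from keras.models import load_model
model = load_model('fantasy.h5', custom_objects={'rmse': rmse, 'mae': mae})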
In [ ]:
from utils import team_letters
from predict import predict_next_game_for_team

current_week = '51'
last_date = '2018-12-16'
predictions = []
for team in team_letters:
    # Assumes predict_next_game_for_team returns a DataFrame of predictions for the team.
    predictions.append(predict_next_game_for_team(team, current_week, last_date))
if len(predictions) > 0:
    preds = pd.concat(predictions)
    preds.to_csv("{}.csv".format(current_week))
In [ ]:
import shap
shap.initjs()
# KernelExplainer treats the model as a black box; using all of x_test as the
# background set can be very slow (see the sampled variant in the next cell).
explainer = shap.KernelExplainer(model.predict, x_test)
shap_values = explainer.shap_values(x_test)
# Plot the SHAP values for the single regression output of the first instance.
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], x_test.iloc[0,:])
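In [ ]:
# KernelExplainer runtime grows quickly with the size of the background set and
# the number of rows explained; a common mitigation (sketch) is to summarize the
# background with shap.sample or shap.kmeans and explain a subset of rows.
background = shap.sample(x_test, 100)  # or: shap.kmeans(x_test, 50)
fast_explainer = shap.KernelExplainer(model.predict, background)
fast_shap_values = fast_explainer.shap_values(x_test.iloc[:100, :])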
In [ ]:
# The feature matrix must match the rows explained above (x_test, not the full dataset).
shap.summary_plot(shap_values, x_test, plot_type="bar", max_display=200)