In [ ]:
import gc
import os
import pickle

import category_encoders as ce
import numpy as np
import pandas as pd
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from keras.layers import ActivityRegularization, Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures, RobustScaler, StandardScaler

from read_data import read_data
from enriching_data import enrich_all_data
from scrap_ref import getSchedule, scrap_all_data, get_first_year_for_player, scrap_new_data_only
#getSchedule('https://www.basketball-reference.com/leagues/NBA_')
nba_base_link = "https://www.basketball-reference.com/players/"
nba_base_url = "https://www.basketball-reference.com/"
scrap_new_data_only(nba_base_link, nba_base_url, 'nba')
enrich_all_data()
cat_features = ["name", "name2", "name3", "day_of_the_week", "location", "opponent", "opponent2",
"opponent3", "opp-team-coach", "opp-team-executive"]
# Custom Keras metrics, computed on backend tensors.
def rmse(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1))

def mse(y_true, y_pred):
    return K.mean(K.square(y_pred - y_true), axis=-1)

def sse(y_true, y_pred):
    return K.sum(K.square(y_pred - y_true), axis=-1)

def mae(y_true, y_pred):
    return K.mean(K.abs(y_pred - y_true), axis=-1)
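In [ ]:
# A quick sanity check of the custom metrics above, evaluated with K.eval on
# constant tensors (illustrative values only): a uniform 0.5 error should give
# rmse == mae == 0.5 and mse == 0.25.
y_true = K.constant([1.0, 2.0, 3.0])
y_pred = K.constant([1.5, 2.5, 3.5])
print(K.eval(rmse(y_true, y_pred)))  # ~0.5
print(K.eval(mse(y_true, y_pred)))   # ~0.25
print(K.eval(mae(y_true, y_pred)))   # ~0.5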
In [ ]:
epochs_size = 350
batch_size = 64
learning_rate = 0.001
dropout = 0.15
l2 = 0.05            # not currently wired into the model below
poly_features = 2    # degree for the (currently disabled) PolynomialFeatures step
from_year = 2005
to_year = 2200       # effectively "through the latest available season"
# Same-game box-score stats and identifiers that would leak the target, plus unused metadata.
dropping_columns = ['Unnamed: 0', 'fantasy_points', 'ast', 'blk', 'drb', 'fg', 'fg3', 'fg3_pct', 'fg3a', 'fg_pct',
                    'fga', 'ft', 'ft_pct', 'fta', 'minutes', 'orb', 'pf', 'plus', 'pts', 'result', 'score', 'stl',
                    'tov', 'trb', 'seconds', 'age', "team", "team2", "team3", "team-coach", "team-executive",
                    'year', 'age1', 'date', 'efg']
In [ ]:
print("reading data")
# all_data = read_data_for_player('jamesle01')
all_data = read_data(from_year, to_year)
points_ = all_data['fantasy_points']
all_data = all_data.drop(dropping_columns, axis=1)
# points_.reset_index(drop=True, inplace=True)
final = pd.concat([all_data, points_], axis=1)
non_categorical = all_data.drop(cat_features, axis=1)
print(list(all_data.columns.values))
print(list(all_data.dtypes))
print(all_data.head(10))
print(all_data.shape)
gc.collect()
print("normalizing non categorical features")
res = []
for name in non_categorical:
print(name)
# additional stuff
imp = Imputer(missing_values='NaN', strategy='median', axis=0)
robust = RobustScaler()
scaler = StandardScaler()
pipe = Pipeline([
('imp', imp),
('scaler', scaler)
#,
# ('poly', poly),
#('robust', robust)
])
dt = all_data[[name]]
pipe.fit(dt)
transformed = pipe.transform(dt)
file_pi = open(os.path.join('pipes', str(name)), 'wb')
pickle.dump(pipe, file_pi)
file_pi.close()
cols = [name]
frame = pd.DataFrame(transformed, columns=cols)
frame.reset_index(drop=True, inplace=True)
print(frame.shape)
res.append(frame)
print("starting convert cat features")
for name in cat_features:
print(name)
encoder = ce.BaseNEncoder()
dt = all_data[[name]]
encoder.fit(dt, points_)
name_cleaned = encoder.transform(dt)
file_pi = open(os.path.join('pipes', str(name)), 'wb')
pickle.dump(encoder, file_pi)
file_pi.close()
name_cleaned.reset_index(drop=True, inplace=True)
res.append(name_cleaned)
print(name_cleaned.shape)
print("finished converting cat features")
print("joining stuff")
all_features = pd.concat(res, axis=1)  # avoid shadowing the built-in all()
gc.collect()
print(all_features.head(10))
input_shape = len(all_features.columns)
print(input_shape)
points_.reset_index(drop=True, inplace=True)
final = pd.concat([all_features, points_], axis=1)
# test_size must be non-zero: x_test feeds the SHAP cells below.
x_train, x_test, y_train, y_test = train_test_split(all_features, points_, test_size=0.1,
                                                    random_state=1)
model = Sequential()
# Funnel architecture: a wide first layer narrowing down to a single regression output.
model.add(Dense(2500, activation='relu', input_shape=(input_shape,), bias_initializer='glorot_normal'))
model.add(Dropout(dropout))
model.add(Dense(1500, activation='relu', bias_initializer='glorot_normal'))
model.add(Dropout(dropout))
model.add(Dense(1000, activation='relu', bias_initializer='glorot_normal'))
model.add(Dropout(dropout))
model.add(Dense(500, activation='relu', bias_initializer='glorot_normal'))
model.add(Dropout(dropout))
model.add(Dense(250, activation='relu', bias_initializer='glorot_normal'))
model.add(Dropout(dropout))
model.add(Dense(100, activation='relu', bias_initializer='glorot_normal'))
model.add(Dense(1))
model.compile(optimizer=Adam(learning_rate),
              loss='mean_squared_error',
              metrics=[rmse, mae])
model.fit(x_train, y_train, epochs=epochs_size, batch_size=batch_size)
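In [ ]:
# The fitted pipelines/encoders pickled into 'pipes' above have to be re-applied
# to any new rows at prediction time; a minimal sketch for one numeric column,
# reusing the data already in memory (any column of non_categorical works).
col = non_categorical.columns[0]
with open(os.path.join('pipes', str(col)), 'rb') as f:
    saved_pipe = pickle.load(f)
print(saved_pipe.transform(all_data[[col]]).shape)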
In [ ]:
model.save('fantasy.h5')
#print(model.evaluate(x_test, y_test, batch_size=batch_size))
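In [ ]:
# Reloading 'fantasy.h5' in a fresh session requires registering the custom
# metric functions; a minimal sketch using Keras' custom_objects argument.
from keras.models import load_model
model = load_model('fantasy.h5', custom_objects={'rmse': rmse, 'mae': mae})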
In [ ]:
from utils import team_letters
from predict import predict_next_game_for_team

current_week = '51'
last_date = '2018-12-16'
predictions = []
for team in team_letters:
    # Assumes predict_next_game_for_team returns a DataFrame of predictions for the team.
    predictions.append(predict_next_game_for_team(team, current_week, last_date))
if len(predictions) > 0:
    preds = pd.concat(predictions)
    preds.to_csv("{}.csv".format(current_week))
In [ ]:
import shap
shap.initjs()
# KernelExplainer treats the model as a black box; using all of x_test as the
# background set can be very slow (see the sampled variant in the next cell).
explainer = shap.KernelExplainer(model.predict, x_test)
shap_values = explainer.shap_values(x_test)
# Plot the SHAP values for the single regression output of the first instance.
shap.force_plot(explainer.expected_value[0], shap_values[0][0,:], x_test.iloc[0,:])
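In [ ]:
# KernelExplainer runtime grows quickly with the size of the background set and
# the number of rows explained; a common mitigation (sketch) is to summarize the
# background with shap.sample or shap.kmeans and explain a subset of rows.
background = shap.sample(x_test, 100)  # or: shap.kmeans(x_test, 50)
fast_explainer = shap.KernelExplainer(model.predict, background)
fast_shap_values = fast_explainer.shap_values(x_test.iloc[:100, :])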
In [ ]:
# The feature matrix must match the rows explained above (x_test, not the full dataset).
shap.summary_plot(shap_values, x_test, plot_type="bar", max_display=200)