In [ ]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
In [ ]:
# Load the raw shot log, drop incomplete rows, and remove the columns that
# are not used further on in this notebook.
filename = "../data/kobe/kobe_bryant_shot_data.csv.gz"

# All columns discarded before modelling, gathered in one place.
unused_columns = [
    'game_event_id', 'game_id', 'combined_shot_type',
    'lat', 'lon', 'team_id', 'team_name', 'game_date',
    'shot_id',
    'loc_x', 'loc_y', 'shot_type', 'shot_zone_basic', 'shot_zone_range',
]

df = (
    pd.read_csv(filename, na_values={'shot_made_flag': ''})
    .dropna()  # removes every row that has at least one missing value
    .drop(unused_columns, axis=1)
)
In [ ]:
# Binary venue flag: 0 when the matchup string contains '@', otherwise 1.
df['home'] = df['matchup'].map(lambda text: int('@' not in text))
df = df.drop(['matchup'], axis=1)

# Merge the two clock columns into one value expressed in seconds.
df['time_remaining'] = df['minutes_remaining'] * 60 + df['seconds_remaining']
df = df.drop(['minutes_remaining', 'seconds_remaining'], axis=1)

# Reorder the columns so that the target, shot_made_flag, comes last.
cols = list(df.columns)
cols.append(cols.pop(cols.index('shot_made_flag')))
df = df[cols]
In [ ]:
# Write the cleaned table to disk without the index column.
filename = "../data/kobe/kobe_bryant_shot_data_refined.csv"
df.to_csv(path_or_buf=filename, index=False)
In [ ]:
In [ ]:
# Read the refined CSV back into ``df``; the notebook can be resumed from
# this cell once the refined file exists.
filename = "../data/kobe/kobe_bryant_shot_data_refined.csv"
df = pd.read_csv(filepath_or_buffer=filename)
In [ ]:
In [ ]:
# Show the reloaded DataFrame as this cell's output.
df
In [ ]:
In [ ]:
# Keep an independent copy of the table before the categorical columns are
# replaced by dummy variables; the predictions are joined onto this copy
# later in the notebook.
original_df = df.copy()
In [ ]:
In [ ]:
# turn categorical variables into dummy variables
categorical_vars = ['season', 'period', 'shot_zone_area', 'opponent', 'action_type']
for var in categorical_vars:
df = pd.concat([df, pd.get_dummies(df[var], prefix=var)], 1)
df = df.drop(var, 1)
In [ ]:
In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, log_loss

# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# the old sklearn.cross_validation module was removed in 0.20.  Fall back to
# the legacy location only when running on a very old installation.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
In [ ]:
# Separate the predictors from the target, then hold out 33% of the rows
# for evaluation.  The fixed random_state makes the split repeatable.
features = df.drop('shot_made_flag', axis=1)
target = df['shot_made_flag']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.33,
    random_state=42,
)
In [ ]:
In [ ]:
# Random forest of 45 trees, each limited to depth 14, split on entropy.
# random_state fixes the seed; n_jobs=-1 uses every available core.
model = RandomForestClassifier(
    n_estimators=45,
    criterion='entropy',
    max_depth=14,
    n_jobs=-1,
    random_state=42,
)
In [ ]:
# Train the forest on the training split only.
model.fit(X_train, y_train)
In [ ]:
# Predicted class label for every row of the held-out split.
y_pred = model.predict(X_test)
In [ ]:
# Per-class probabilities for the held-out split, one column per class in
# the order given by model.classes_.
y_pred_proba = model.predict_proba(X_test)
In [ ]:
# Score the held-out predictions: the confusion matrix uses the hard labels,
# the log loss uses the probability column at index 1.
test_confusion = confusion_matrix(y_test, y_pred)
test_log_loss = log_loss(y_test, y_pred_proba[:, 1])
test_confusion, test_log_loss
In [ ]:
In [ ]:
# Pair every training column with the importance the fitted forest gave it,
# then show the five largest.
importance_table = pd.DataFrame({
    'feature': X_train.columns,
    'importance': model.feature_importances_,
})
importance_table.sort_values('importance', ascending=False).head()
In [ ]:
In [ ]:
In [ ]:
In [ ]:
# Attach the predicted labels to the unencoded table, aligned on the test
# rows' index; rows outside the test split receive NaN in shot_made_pred.
test_predictions = pd.DataFrame({'shot_made_pred': y_pred}, index=X_test.index)
pred_df = original_df.join(test_predictions)
In [ ]:
# Keep only the rows that actually have a prediction.
pred_df = pred_df[pred_df['shot_made_pred'].notnull()]
In [ ]:
# Preview the first rows of the table with predictions attached.
pred_df.head()
In [ ]:
In [ ]:
# Rows where the predicted label disagrees with the recorded outcome.
pred_df[pred_df['shot_made_flag'] != pred_df['shot_made_pred']]
In [ ]:
# http://scikit-learn.org/stable/modules/ensemble.html#forests-of-randomized-trees
In [ ]:
# http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
In [ ]:
# http://scikit-learn.org/stable/tutorial/statistical_inference/supervised_learning.html
In [ ]:
# http://scikit-learn.org/stable/auto_examples/svm/plot_iris.html#example-svm-plot-iris-py
In [ ]: