In [1]:
%matplotlib inline

from matplotlib import pyplot as plt

import pandas as pd
import numpy as np

from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

pd.set_option('max_columns', 100)

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

models = {'svm': LinearSVC(), 
          'log_reg': LogisticRegression(), 
          'naive_baives': MultinomialNB(), 
          'knn': KNeighborsClassifier(),
          'dec_tree': DecisionTreeClassifier()}

In [3]:
kobe = pd.read_csv('../data/kobe.csv')
kobe.dropna(inplace=True)

kobe['reverse'] = kobe.action_type.map(lambda x: 1 if "Reverse" in x else 0)
kobe['running'] = kobe.action_type.map(lambda x: 1 if "Running" in x else 0)
kobe['driving'] = kobe.action_type.map(lambda x: 1 if "Driving" in x else 0)
kobe['floating'] = kobe.action_type.map(lambda x: 1 if "Floating" in x else 0)

combined_shot_type_dummies = pd.get_dummies(kobe.combined_shot_type)
shot_type_dummies = pd.get_dummies(kobe.shot_type)
shot_zone_range_dummies = pd.get_dummies(kobe.shot_zone_range)
opponent_dummies = pd.get_dummies(kobe.opponent)

kobe = pd.concat([
        kobe, 
        combined_shot_type_dummies, 
        shot_type_dummies, 
        shot_zone_range_dummies,
        opponent_dummies
    ], axis=1)

kobe.head(2)


Out[3]:
action_type combined_shot_type game_event_id game_id lat loc_x loc_y lon minutes_remaining period playoffs season seconds_remaining shot_distance shot_made_flag shot_type shot_zone_area shot_zone_basic shot_zone_range team_id team_name game_date matchup opponent shot_id reverse running driving floating Bank Shot Dunk Hook Shot Jump Shot Layup Tip Shot 2PT Field Goal 3PT Field Goal 16-24 ft. 24+ ft. 8-16 ft. Back Court Shot Less Than 8 ft. ATL BKN BOS CHA CHI CLE DAL DEN DET GSW HOU IND LAC MEM MIA MIL MIN NJN NOH NOP NYK OKC ORL PHI PHX POR SAC SAS SEA TOR UTA VAN WAS
1 Jump Shot Jump Shot 12 20000012 34.0443 -157 0 -118.4268 10 1 0 2000-01 22 15 0.0 2PT Field Goal Left Side(L) Mid-Range 8-16 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 2 0 0 0 0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 Jump Shot Jump Shot 35 20000012 33.9093 -101 135 -118.3708 7 1 0 2000-01 45 16 1.0 2PT Field Goal Left Side Center(LC) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 3 0 0 0 0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

In [4]:
feature_columns = [
    'shot_distance',
    'minutes_remaining',
    'period',
    'reverse',
] + list(combined_shot_type_dummies.columns) + list(shot_type_dummies.columns) + list(shot_zone_range_dummies)

print(feature_columns)


['shot_distance', 'minutes_remaining', 'period', 'reverse', 'Bank Shot', 'Dunk', 'Hook Shot', 'Jump Shot', 'Layup', 'Tip Shot', '2PT Field Goal', '3PT Field Goal', '16-24 ft.', '24+ ft.', '8-16 ft.', 'Back Court Shot', 'Less Than 8 ft.']

In [5]:
X = kobe[feature_columns]
y = kobe.shot_made_flag

logit = LogisticRegression()
cross_val_score(logit, X, y, 'accuracy', cv=10)


Out[5]:
array([ 0.6176585 ,  0.6176585 ,  0.60350195,  0.60350195,  0.62256809,
        0.61074348,  0.62281043,  0.62242118,  0.62358895,  0.60412612])

In [6]:
kobe_train, kobe_test = train_test_split(kobe, test_size=0.2, stratify=kobe.shot_made_flag)

X_train = kobe_train[feature_columns]
y_train = kobe_train.shot_made_flag

model = LogisticRegression()
model.fit(X_train, y_train)

X_test = kobe_test[feature_columns]
y_test = kobe_test.shot_made_flag # y_true

kobe_test['pred'] = model.predict(X_test) # throws an warning / error

accuracy_score(y_test, kobe_test.pred) # out of sample accuracy


/Users/johria/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:12: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
Out[6]:
0.61536964980544751