Loading necessary library


In [40]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb
import numpy as np

Loading data

deleting irrelevant features


In [43]:
kobe = pd.read_csv('data.csv', sep=',') 
kobe= kobe[np.isfinite(kobe['shot_made_flag'])]
del kobe['lat']
del kobe['lon']
del kobe['game_id']
del kobe['team_id']
del kobe['team_name']

kobe_2 = pd.read_csv('data.csv', sep=',') 
kobe_2= kobe_2[np.isfinite(kobe_2['shot_made_flag'])]
del kobe_2['lat']
del kobe_2['lon']
del kobe_2['game_id']
del kobe_2['team_id']
del kobe_2['team_name']

encoding catagorical features


In [26]:
mt_up = preprocessing.LabelEncoder()
kobe.matchup = mt_up.fit_transform(kobe.matchup )
#kobe_2.matchup = mt_up.fit_transform(kobe.matchup )

opp = preprocessing.LabelEncoder()
kobe.opponent = opp.fit_transform(kobe.opponent )
#kobe_2.opponent = opp.fit_transform(kobe.opponent )

dt = preprocessing.LabelEncoder()
kobe.game_date = dt.fit_transform(kobe.game_date )
#kobe_2.game_date = dt.fit_transform(kobe.game_date )

at = preprocessing.LabelEncoder()
kobe.action_type = at.fit_transform(kobe.action_type )
#kobe_2.action_type = at.fit_transform(kobe.action_type )

cst = preprocessing.LabelEncoder()
kobe.combined_shot_type = cst.fit_transform(kobe.combined_shot_type )
#kobe_2.combined_shot_type = cst.fit_transform(kobe.combined_shot_type )

seson = preprocessing.LabelEncoder()
kobe.season = seson.fit_transform(kobe.season )
#kobe_2.season = seson.fit_transform(kobe.season )

st = preprocessing.LabelEncoder()
kobe.shot_type = st.fit_transform(kobe.shot_type )
#kobe_2.shot_type = st.fit_transform(kobe.shot_type )

sza = preprocessing.LabelEncoder()
kobe.shot_zone_area = sza.fit_transform(kobe.shot_zone_area )
#kobe_2.shot_zone_area = sza.fit_transform(kobe.shot_zone_area )

szb = preprocessing.LabelEncoder()
kobe.shot_zone_basic = szb.fit_transform(kobe.shot_zone_basic )
#kobe_2.shot_zone_basic = szb.fit_transform(kobe.shot_zone_basic )

szr = preprocessing.LabelEncoder()
kobe.shot_zone_range = szr.fit_transform(kobe.shot_zone_range )
#kobe_2.shot_zone_range = szr.fit_transform(kobe.shot_zone_range )


Out[26]:
action_type combined_shot_type game_event_id loc_x loc_y minutes_remaining period playoffs season seconds_remaining shot_distance shot_made_flag shot_type shot_zone_area shot_zone_basic shot_zone_range game_date matchup opponent
2 25 3 35 -101 135 7 1 0 4 45 16 1.0 0 2 4 0 250 28 25
11 25 3 4 121 127 11 1 0 4 0 17 1.0 0 4 4 0 254 71 30
12 40 3 27 -67 110 7 1 0 4 9 12 1.0 0 3 2 2 254 71 30
17 25 3 138 -117 226 8 2 0 4 50 25 1.0 1 2 0 1 254 71 30
18 25 3 244 -132 97 11 3 0 4 29 16 0.0 0 2 4 0 254 71 30

splitting data into test and train


In [44]:
from sklearn.cross_validation import train_test_split
# Generate the training set.  Set random_state to be able to replicate results.
train = kobe.sample(frac=0.6, random_state=1)
train_2 = kobe_2.sample(frac=0.6, random_state=1)
# Select anything not in the training set and put it in the testing set.
test = kobe.loc[~kobe.index.isin(train.index)] 
test_2 = kobe_2.loc[~kobe_2.index.isin(train_2.index)]

seperating features and class in both test and train sets


In [45]:
columns = kobe.columns.tolist()
columns = [c for c in columns if c not in ["shot_made_flag","team_id","team_name"]]
kobe_train_x =train[columns]
kobe_test_x =test[columns]
kobe_train_y=train['shot_made_flag']
kobe_test_y=test['shot_made_flag']
print(kobe_train_x.shape)
print(kobe_test_x.shape)
print(kobe_train_y.shape)
print(kobe_test_y.shape)


(15418, 18)
(10279, 18)
(15418L,)
(10279L,)

getting best parameters

do not run this section as the best set of parameters is already found

In [77]:
def optimization(depth, n_est,l_r):
    maxacc=0
    best_depth=0
    best_n_est=0
    best_l_r=0
    for i in range(1,depth):
        for j in n_est:
            for k in l_r: 
                gbm = xgb.XGBClassifier(max_depth=i, n_estimators=j, learning_rate=k).fit(kobe_train_x, kobe_train_y)
                predicted = gbm.predict(kobe_test_x)
                key=str(i)+"_"+str(j)+"_"+str(k)
                accu=accuracy_score(kobe_test_y, predicted)
                if(accu>maxacc):
                    maxacc=accu
                    best_depth=i
                    best_n_est=j
                    best_l_r=k
                    print(maxkey+" "+str(maxacc))
    return(best_depth,best_n_est,best_l_r)

n_est=[5,10,20,50,100,150,200,250,300,350,400,450,500,550,600,650,700,750,800,850,900,950,1000]
depth=10
l_r = [0.0001, 0.001, 0.01,0.05, 0.1, 0.2, 0.3]
best_depth,best_n_est,best_l_r=optimization(depth,n_est,l_r)

creating model with best parameter combination and reporting metrics


In [5]:
#hard coded the best features
gbm = xgb.XGBClassifier(max_depth=4, n_estimators=600, learning_rate=0.01).fit(kobe_train_x, kobe_train_y) 
predicted = gbm.predict(kobe_test_x)
# summarize the fit of the model
print(metrics.classification_report(kobe_test_y, predicted))
print("Confusion Matrix")
print(metrics.confusion_matrix(kobe_test_y, predicted))
accuracy=accuracy_score(kobe_test_y, predicted)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


             precision    recall  f1-score   support

        0.0       0.67      0.86      0.75      5717
        1.0       0.73      0.46      0.56      4562

avg / total       0.69      0.68      0.67     10279

Confusion Matrix
[[4938  779]
 [2482 2080]]
Accuracy: 68.28%

creating a test file with predicted results to visualize


In [47]:
test_2['predicted']=predicted
test_2.to_csv(path_or_buf='test_with_predictions.csv', sep=',')
test_2.head(10)


C:\Users\Narmi\Anaconda2\lib\site-packages\ipykernel\__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
Out[47]:
action_type combined_shot_type game_event_id loc_x loc_y minutes_remaining period playoffs season seconds_remaining shot_distance shot_made_flag shot_type shot_zone_area shot_zone_basic shot_zone_range game_date matchup opponent predicted
2 Jump Shot Jump Shot 35 -101 135 7 1 0 2000-01 45 16 1.0 2PT Field Goal Left Side Center(LC) Mid-Range 16-24 ft. 10/31/2000 LAL @ POR POR 0.0
11 Jump Shot Jump Shot 4 121 127 11 1 0 2000-01 0 17 1.0 2PT Field Goal Right Side Center(RC) Mid-Range 16-24 ft. 11/1/2000 LAL vs. UTA UTA 0.0
12 Running Jump Shot Jump Shot 27 -67 110 7 1 0 2000-01 9 12 1.0 2PT Field Goal Left Side(L) In The Paint (Non-RA) 8-16 ft. 11/1/2000 LAL vs. UTA UTA 1.0
17 Jump Shot Jump Shot 138 -117 226 8 2 0 2000-01 50 25 1.0 3PT Field Goal Left Side Center(LC) Above the Break 3 24+ ft. 11/1/2000 LAL vs. UTA UTA 0.0
18 Jump Shot Jump Shot 244 -132 97 11 3 0 2000-01 29 16 0.0 2PT Field Goal Left Side Center(LC) Mid-Range 16-24 ft. 11/1/2000 LAL vs. UTA UTA 0.0
22 Running Jump Shot Jump Shot 274 -16 110 7 3 0 2000-01 57 11 1.0 2PT Field Goal Center(C) In The Paint (Non-RA) 8-16 ft. 11/1/2000 LAL vs. UTA UTA 1.0
24 Running Jump Shot Jump Shot 307 -46 63 5 3 0 2000-01 11 7 1.0 2PT Field Goal Center(C) In The Paint (Non-RA) Less Than 8 ft. 11/1/2000 LAL vs. UTA UTA 1.0
25 Layup Shot Layup 332 0 0 2 3 0 2000-01 36 0 0.0 2PT Field Goal Center(C) Restricted Area Less Than 8 ft. 11/1/2000 LAL vs. UTA UTA 0.0
29 Jump Shot Jump Shot 429 3 87 6 4 0 2000-01 22 8 0.0 2PT Field Goal Center(C) In The Paint (Non-RA) 8-16 ft. 11/1/2000 LAL vs. UTA UTA 0.0
31 Jump Shot Jump Shot 499 127 34 0 4 0 2000-01 30 13 0.0 2PT Field Goal Right Side(R) Mid-Range 8-16 ft. 11/1/2000 LAL vs. UTA UTA 0.0