In [43]:
import numpy as np
import pandas as pd

from xgboost.sklearn import XGBClassifier
from sklearn.cross_validation import cross_val_score

# Load the raw shot log. Rows whose `shot_made_flag` is missing are the ones
# to predict, so remember their ids before `shot_id` is dropped.
df = pd.read_csv("data.csv")
mask = df['shot_made_flag'].isnull()
target_id = df[mask]["shot_id"]
print(df['shot_zone_basic'])

# Drop the columns that are not used as features, then order the shots by
# game date. `mask` and `target_id` keep the pre-sort row labels, so any later
# use of them has to go through the index, not through row position.
df = df.drop(['game_event_id', 'game_id', 'lat', 'lon', 'team_id', 'team_name', "shot_id", 'shot_zone_basic'], axis=1)
df = df.sort_values('game_date')

# Clean data
# Keep the detailed action type only when it occurs more than 20 times;
# otherwise fall back to the coarser combined shot type.
actiontypes = dict(df.action_type.value_counts())
action_freq = df['action_type'].map(actiontypes)
df['type'] = df['action_type'].where(action_freq > 20, df['combined_shot_type'])
df = df.drop(['action_type', 'combined_shot_type'], axis=1)

# A matchup string containing '@' is flagged as an away game.
df['away'] = df.matchup.str.contains('@')
df = df.drop(['matchup', 'game_date'], axis=1)

# Cap the shot distance at 45.
df['distance'] = df['shot_distance'].clip(upper=45)

# Seconds left on the clock, and a flag for shots taken with fewer than
# 3 seconds remaining.
df['time_remaining'] = df['minutes_remaining'] * 60 + df['seconds_remaining']
df['last_moments'] = (df['time_remaining'] < 3).astype(int)


0                    Mid-Range
1                    Mid-Range
2                    Mid-Range
3                    Mid-Range
4              Restricted Area
5                    Mid-Range
6              Restricted Area
7              Restricted Area
8        In The Paint (Non-RA)
9        In The Paint (Non-RA)
10           Above the Break 3
11                   Mid-Range
12       In The Paint (Non-RA)
13                   Mid-Range
14       In The Paint (Non-RA)
15                   Mid-Range
16             Restricted Area
17           Above the Break 3
18                   Mid-Range
19             Restricted Area
20                   Mid-Range
21                   Mid-Range
22       In The Paint (Non-RA)
23                   Mid-Range
24       In The Paint (Non-RA)
25             Restricted Area
26                   Mid-Range
27           Above the Break 3
28                   Mid-Range
29       In The Paint (Non-RA)
                 ...          
30667        Above the Break 3
30668                Mid-Range
30669                Mid-Range
30670    In The Paint (Non-RA)
30671          Restricted Area
30672                Mid-Range
30673                Mid-Range
30674        Above the Break 3
30675        Above the Break 3
30676                Mid-Range
30677                Mid-Range
30678          Restricted Area
30679          Restricted Area
30680          Restricted Area
30681        Above the Break 3
30682    In The Paint (Non-RA)
30683                Mid-Range
30684                Mid-Range
30685        Above the Break 3
30686    In The Paint (Non-RA)
30687    In The Paint (Non-RA)
30688                Mid-Range
30689                Backcourt
30690                Mid-Range
30691          Restricted Area
30692    In The Paint (Non-RA)
30693          Restricted Area
30694                Mid-Range
30695        Above the Break 3
30696    In The Paint (Non-RA)
Name: shot_zone_basic, dtype: object

In [37]:
# Squared court offsets next to the squared `shot_distance` for the two rows
# displayed by the following cell (presumably a check of how shot_distance
# relates to loc_x / loc_y -- TODO confirm).
print(140**2 + 116**2)
print(18**2)

print(131**2 + 97**2)
print(16**2)


33056
324
26570
256

In [35]:
# Show the first two rows of the raw location columns next to shot_distance.
location_columns = ['loc_x', 'loc_y', 'shot_distance']
df[location_columns].head(2)


Out[35]:
loc_x loc_y shot_distance
22901 -140 116 18
22902 -131 97 16

In [2]:
from math import atan  # no longer used below; kept in case other cells rely on the name

# Shot angle in degrees, computed as atan(loc_y / loc_x), with 90 used when
# loc_x == 0. The coordinates are cast to float first: the previous row-wise
# version divided integers, which is floor division on Python 2 and collapsed
# most angles to a handful of values.
loc_x = df['loc_x'].astype(float)
loc_y = df['loc_y'].astype(float)
# Replace zero denominators with 1.0 so the division never hits zero; those
# rows are overwritten with 90 by the np.where below.
nonzero_x = loc_x.where(loc_x != 0, 1.0)
df['degree_shoot'] = np.where(loc_x == 0, 90.0, np.degrees(np.arctan(loc_y / nonzero_x)))
df.drop(['loc_x', 'loc_y'], axis=1, inplace=True)

In [3]:
# Frequency encoding: for each source column, add a column holding how many
# rows share that row's category.
for source_col, freq_col in (('shot_zone_range', '_shotZoneFreq'), ('type', '_typeFreq')):
    values = dict(df[source_col].value_counts())
    df[freq_col] = df[source_col].map(values)

In [4]:
# Number of distinct values in the consolidated `type` column, followed by the
# values themselves.
shot_types = pd.Series.unique(df.type)
print(shot_types.shape[0])
shot_types


36
Out[4]:
array(['Jump Shot', 'Layup Shot', 'Driving Layup Shot', 'Dunk Shot',
       'Driving Dunk Shot', 'Slam Dunk Shot', 'Running Jump Shot',
       'Tip Shot', 'Hook Shot', 'Reverse Dunk Shot', 'Reverse Layup Shot',
       'Turnaround Jump Shot', 'Running Hook Shot', 'Alley Oop Dunk Shot',
       'Alley Oop Layup shot', 'Dunk', 'Driving Finger Roll Shot',
       'Running Layup Shot', 'Finger Roll Shot', 'Fadeaway Jump Shot',
       'Jump Hook Shot', 'Layup', 'Jump Bank Shot', 'Bank Shot',
       'Driving Finger Roll Layup Shot', 'Fadeaway Bank shot',
       'Pullup Jump shot', 'Finger Roll Layup Shot',
       'Driving Reverse Layup Shot', 'Turnaround Fadeaway shot',
       'Driving Slam Dunk Shot', 'Step Back Jump shot',
       'Turnaround Bank shot', 'Floating Jump shot', 'Running Bank shot',
       'Driving Jump shot'], dtype=object)

In [5]:
from sklearn import preprocessing

# One-hot encode the categorical columns. For each column the first value
# encountered is used as the baseline: it is replaced by None so that
# get_dummies emits no indicator column for it.
featuresToOneHotEncoding=['shot_type', 'shot_zone_area', 'opponent',  'away','type']
encoded_parts = [df]
for f in featuresToOneHotEncoding:
    baseline = df[f].unique()[0]
    # Wrap the masked values in a Series that carries df's own index.
    # get_dummies on a bare ndarray returns a fresh 0..n-1 index, and
    # pd.concat(axis=1) aligns on index labels; because df was re-ordered by
    # sort_values, that silently attached every indicator row to the wrong shot.
    masked = pd.Series(np.where(df[f] == baseline, None, df[f]), index=df.index)
    encoded_parts.append(pd.get_dummies(masked, prefix=f))

df = pd.concat(encoded_parts, axis=1)
df = df.drop(featuresToOneHotEncoding,axis=1)

In [6]:
# Replace the remaining string columns with integer codes, using a fresh
# LabelEncoder for each column.
featuresToLabel = ["season",'shot_zone_range']
print(df.columns)
for f in featuresToLabel:
    column_values = list(df[f].values)
    lbl = preprocessing.LabelEncoder()
    df[f] = lbl.fit(column_values).transform(column_values)


Index([u'minutes_remaining', u'period', u'playoffs', u'season',
       u'seconds_remaining', u'shot_distance', u'shot_made_flag',
       u'shot_zone_range', u'distance', u'time_remaining', u'last_moments',
       u'degree_shoot', u'_shotZoneFreq', u'_typeFreq',
       u'shot_type_3PT Field Goal', u'shot_zone_area_Back Court(BC)',
       u'shot_zone_area_Center(C)', u'shot_zone_area_Left Side Center(LC)',
       u'shot_zone_area_Left Side(L)', u'shot_zone_area_Right Side Center(RC)',
       u'opponent_ATL', u'opponent_BKN', u'opponent_BOS', u'opponent_CHA',
       u'opponent_CHI', u'opponent_CLE', u'opponent_DAL', u'opponent_DEN',
       u'opponent_DET', u'opponent_GSW', u'opponent_HOU', u'opponent_IND',
       u'opponent_LAC', u'opponent_MEM', u'opponent_MIA', u'opponent_MIL',
       u'opponent_MIN', u'opponent_NJN', u'opponent_NOH', u'opponent_NOP',
       u'opponent_NYK', u'opponent_OKC', u'opponent_ORL', u'opponent_PHI',
       u'opponent_PHX', u'opponent_SAC', u'opponent_SAS', u'opponent_SEA',
       u'opponent_TOR', u'opponent_UTA', u'opponent_VAN', u'opponent_WAS',
       u'away_False', u'type_Alley Oop Dunk Shot',
       u'type_Alley Oop Layup shot', u'type_Bank Shot',
       u'type_Driving Dunk Shot', u'type_Driving Finger Roll Layup Shot',
       u'type_Driving Finger Roll Shot', u'type_Driving Jump shot',
       u'type_Driving Layup Shot', u'type_Driving Reverse Layup Shot',
       u'type_Driving Slam Dunk Shot', u'type_Dunk', u'type_Dunk Shot',
       u'type_Fadeaway Bank shot', u'type_Fadeaway Jump Shot',
       u'type_Finger Roll Layup Shot', u'type_Finger Roll Shot',
       u'type_Floating Jump shot', u'type_Hook Shot', u'type_Jump Bank Shot',
       u'type_Jump Hook Shot', u'type_Layup', u'type_Layup Shot',
       u'type_Pullup Jump shot', u'type_Reverse Dunk Shot',
       u'type_Reverse Layup Shot', u'type_Running Bank shot',
       u'type_Running Hook Shot', u'type_Running Jump Shot',
       u'type_Running Layup Shot', u'type_Slam Dunk Shot',
       u'type_Step Back Jump shot', u'type_Tip Shot',
       u'type_Turnaround Bank shot', u'type_Turnaround Fadeaway shot',
       u'type_Turnaround Jump Shot'],
      dtype='object')

In [7]:
# Training data = rows whose outcome is known (mask is False for them).
# The label is taken first, then removed from the feature matrix.
y = df['shot_made_flag'][~mask]
X = df[~mask].drop(['shot_made_flag'], axis=1)

In [8]:
# Summary statistics for every feature column of the training matrix.
X.describe()


Out[8]:
minutes_remaining period playoffs season seconds_remaining shot_distance shot_zone_range distance time_remaining last_moments ... type_Running Bank shot type_Running Hook Shot type_Running Jump Shot type_Running Layup Shot type_Slam Dunk Shot type_Step Back Jump shot type_Tip Shot type_Turnaround Bank shot type_Turnaround Fadeaway shot type_Turnaround Jump Shot
count 25697.000000 25697.000000 25697.000000 25697.000000 25697.000000 25697.000000 25697.000000 25697.000000 25697.000000 25697.000000 ... 25697.000000 25697.000000 25697.000000 25697.000000 25697.000000 25697.000000 25697.000000 25697.000000 25697.000000 25697.000000
mean 4.886796 2.520800 0.146243 9.431879 28.311554 13.457096 1.871230 13.426392 321.519321 0.028174 ... 0.001673 0.001284 0.030315 0.001985 0.012998 0.004125 0.005915 0.002257 0.014243 0.034673
std 3.452475 1.151626 0.353356 4.855144 17.523392 9.388725 1.577972 9.253031 208.311076 0.165474 ... 0.040873 0.035813 0.171455 0.044506 0.113266 0.064095 0.076683 0.047456 0.118493 0.182955
min 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 2.000000 1.000000 0.000000 5.000000 13.000000 5.000000 0.000000 5.000000 141.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 5.000000 3.000000 0.000000 10.000000 28.000000 15.000000 2.000000 15.000000 304.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
75% 8.000000 3.000000 0.000000 13.000000 43.000000 21.000000 4.000000 21.000000 499.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
max 11.000000 7.000000 1.000000 19.000000 59.000000 79.000000 4.000000 45.000000 714.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 87 columns


In [9]:
# Feature-matrix shape followed by the number of labels; the row counts
# should match.
print("%s %s" % (X.shape, len(y)))


(25697, 87) 25697

In [10]:
# Summary statistics of the 0/1 target; the mean is the share of made shots.
y.describe()


Out[10]:
count    25697.000000
mean         0.446161
std          0.497103
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          1.000000
Name: shot_made_flag, dtype: float64

In [14]:
from matplotlib import pyplot  as plt
import seaborn
%matplotlib inline

In [28]:
# Split the learning set: the last 10% of rows (by position) are held out for
# evaluation, the rest are used for fitting.
n_eval = int(y.shape[0]*0.10)

X_fit, X_eval = X.iloc[:-n_eval], X.iloc[-n_eval:]
y_fit, y_eval = y.iloc[:-n_eval], y.iloc[-n_eval:]

In [19]:
# List the parameter names exposed by the grid-search object.
# NOTE(review): `gsearch1` is only assigned in a later cell of this notebook,
# so this cell raises NameError on a fresh top-to-bottom run.
gsearch1.get_params().keys()


Out[19]:
['n_jobs',
 'verbose',
 'estimator__gamma',
 'estimator__reg_alpha',
 'estimator__nthread',
 'estimator__silent',
 'estimator__min_child_weight',
 'estimator__max_depth',
 'estimator__base_score',
 'param_grid',
 'cv',
 'scoring',
 'estimator__reg_lambda',
 'estimator__scale_pos_weight',
 'estimator__seed',
 'estimator__colsample_bylevel',
 'estimator__max_delta_step',
 'pre_dispatch',
 'estimator__missing',
 'fit_params',
 'estimator__objective',
 'refit',
 'iid',
 'estimator__learning_rate',
 'estimator__n_estimators',
 'estimator__colsample_bytree',
 'estimator',
 'error_score',
 'estimator__subsample']

In [ ]:
#tuning hyperparameters
from sklearn.grid_search import GridSearchCV   # Performing grid search

# Candidate values for each tuned XGBoost parameter; every combination is
# evaluated.
param_test1 = {
    'max_depth': [4, 5, 6, 7, 8],
    'min_child_weight': [1, 2, 3, 4],
    'gamma': [0.0, 0.1, 0.2, 0.3],
    'subsample': [0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.6, 0.7, 0.8, 0.9],
}

# Base estimator whose remaining settings stay fixed during the search.
base_model = XGBClassifier(
    learning_rate=0.3,
    n_estimators=100,
    objective='binary:logistic',
    scale_pos_weight=1,
    seed=2100,
)

# 3-fold cross-validated search scored by log loss, using all cores.
gsearch1 = GridSearchCV(
    estimator=base_model,
    param_grid=param_test1,
    scoring='log_loss',
    iid=False,
    cv=3,
    verbose=10,
    n_jobs=-1,
)
gsearch1.fit(X, y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [ ]:
import xgboost as xgb
from  sklearn.metrics import log_loss

# Final model: many trees with a small learning rate; early stopping on the
# hold-out set decides how many of them are actually used.
clf = xgb.XGBClassifier(missing=np.nan, max_depth=6,
                        n_estimators=2000, learning_rate=0.02,
                        subsample=0.75, colsample_bytree=0.8, seed=2100,objective= 'binary:logistic')
# clf = xgb.XGBClassifier(missing=np.nan, subsample=0.9, colsample_bytree=0.6, max_depth=4, n_estimators=2000,
#                         learning_rate=0.02,
#                         gamma=0.0, min_child_weight=4, seed=2100,objective= 'binary:logistic')

# Results logged from earlier runs (computed with all fitted trees):
#  logloss train: 0.52577
#   logloss valid: 0.60572
#  logloss train: 0.53614
#   logloss valid: 0.60463
#  logloss train: 0.55263
#   logloss valid: 0.60435
#
#  logloss train: 0.54577
#   logloss valid: 0.60223
#
#  logloss train: 0.55223 subsample = 0.55
#   logloss valid: 0.60279
# -----------------------
#  logloss train: 0.55020
#   logloss valid: 0.60258

# fitting
clf.fit(X_fit, y_fit, early_stopping_rounds=250,  eval_metric="logloss", eval_set=[(X_eval, y_eval)])
#clf.fit(X,y)

# `best_iteration` is a 0-based round index whereas `ntree_limit` is a number
# of trees, so the best model consists of best_iteration + 1 trees. The same
# limit is used for the reported scores and for the submission so that the
# numbers describe the model that is actually submitted.
best_ntrees = clf.best_iteration + 1

# scores
log_train = log_loss(y_fit, clf.predict_proba(X_fit, ntree_limit=best_ntrees)[:,1])
log_valid = log_loss(y_eval, clf.predict_proba(X_eval, ntree_limit=best_ntrees)[:,1])


print('\n-----------------------')
print('  logloss train: %.5f'%log_train)
print('  logloss valid: %.5f'%log_valid)
print('-----------------------')

print('\nModel parameters...')
print(clf.get_params())


# Predict the rows whose outcome is unknown.
target_x = df[mask]
target_x = target_x.drop(['shot_made_flag'],axis=1)
target_y = clf.predict_proba(target_x, ntree_limit=best_ntrees)[:,1]

# `target_id` was captured before df was re-sorted by game date, so it is in
# the original file order while `target_y` follows df's current row order.
# Look the ids up by row label; pairing the two by position would write each
# prediction against the wrong shot_id.
target_shot_ids = target_id.reindex(target_x.index).values
submission = pd.DataFrame({"shot_id":target_shot_ids, "shot_made_flag":target_y})
submission.sort_values('shot_id',  inplace=True)
submission.to_csv("submissson.csv",index=False)

print("Success")

In [ ]:
# Shapes of the prediction matrix and the training matrix; the column counts
# should agree.
print("%s %s" % (target_x.shape, X.shape))