In [43]:
import numpy as np
import pandas as pd
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in 0.20
# Load the data, roughly clean it, then sort by game date
df = pd.read_csv("data.csv")
mask = df['shot_made_flag'].isnull()
target_id = df[mask]["shot_id"]
print(df['shot_zone_basic'])
df.drop(['game_event_id', 'game_id', 'lat', 'lon', 'team_id', 'team_name',"shot_id",'shot_zone_basic'], axis=1, inplace=True)
df.sort_values('game_date', inplace=True)
# Merge rare action types (<= 20 occurrences) into the coarser combined_shot_type
actiontypes = dict(df.action_type.value_counts())
df['type'] = df.apply(lambda row: row['action_type'] if actiontypes[row['action_type']] > 20
                      else row['combined_shot_type'], axis=1)
df.drop(['action_type', 'combined_shot_type'], axis=1, inplace=True)
df['away'] = df.matchup.str.contains('@')
df.drop(['matchup','game_date'], axis=1, inplace=True)
df['distance'] = df.apply(lambda row: row['shot_distance'] if row['shot_distance'] <45 else 45, axis=1)
df['time_remaining'] = df.apply(lambda row: row['minutes_remaining'] * 60 + row['seconds_remaining'], axis=1)
df['last_moments'] = df.apply(lambda row: 1 if row['time_remaining'] < 3 else 0, axis=1)
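In [ ]:
# A minimal vectorized sketch of the three engineered features above; the semantics
# match the row-wise apply calls, so re-running it leaves df unchanged
# (assumes pandas >= 0.21 for clip(upper=...))
df['distance'] = df['shot_distance'].clip(upper=45)
df['time_remaining'] = df['minutes_remaining'] * 60 + df['seconds_remaining']
df['last_moments'] = (df['time_remaining'] < 3).astype(int)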
In [37]:
# Spot-check the geometry of two rows: sqrt(loc_x^2 + loc_y^2) / 10 should
# round down to the recorded shot_distance
print(140*140 + 116*116)  # 33056 -> sqrt ~ 181.8 -> distance ~ 18.2
print(18*18)              # recorded shot_distance is 18: consistent
print(131*131 + 97*97)    # 26570 -> sqrt ~ 163.0 -> distance ~ 16.3
print(16*16)              # recorded shot_distance is 16: consistent
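In [ ]:
# Hedged sketch generalizing the spot-check above: if the relation holds everywhere,
# floor(sqrt(loc_x^2 + loc_y^2) / 10) reproduces shot_distance on every row
recovered = np.floor(np.sqrt(df['loc_x']**2 + df['loc_y']**2) / 10)
print((recovered == df['shot_distance']).mean())  # fraction of rows where it holds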
In [35]:
df[['loc_x','loc_y','shot_distance']].head(2)
Out[35]:
In [2]:
from math import atan, degrees
# Shot angle in degrees; shots straight on (loc_x == 0) get 90
df['degree_shoot'] = df.apply(lambda row: 90 if row['loc_x'] == 0
                              else degrees(atan(row['loc_y'] / row['loc_x'])), axis=1)
df.drop(['loc_x', 'loc_y'], axis=1, inplace=True)
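In [ ]:
# Sketch of the same angle with np.arctan2, shown on sample coordinates since
# loc_x/loc_y were just dropped: arctan2 needs no special case for loc_x == 0,
# but it distinguishes left/right of the hoop (-180..180), while atan(y/x) above
# folds the two sides together
demo_x, demo_y = np.array([0, 140, -140]), np.array([50, 116, 116])
print(np.degrees(np.arctan2(demo_y, demo_x)))  # [90.0, ~39.6, ~140.4]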
In [3]:
# Frequency-encode two categoricals: replace each value with its occurrence count
values = dict(df['shot_zone_range'].value_counts())
df['_shotZoneFreq'] = df['shot_zone_range'].map(values)
values = dict(df['type'].value_counts())
df['_typeFreq'] = df['type'].map(values)
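In [ ]:
# Equivalent sketch: groupby + transform('size') builds the same frequency feature
# in one step, without the intermediate dict; re-running leaves df unchanged
df['_typeFreq'] = df.groupby('type')['type'].transform('size')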
In [4]:
print(df['type'].nunique())
df['type'].unique()
Out[4]:
In [5]:
from sklearn import preprocessing
featuresToOneHotEncoding = ['shot_type', 'shot_zone_area', 'opponent', 'away', 'type']
for f in featuresToOneHotEncoding:
    # Map the first level to None so get_dummies skips it: dropping one reference
    # level per feature avoids perfectly collinear dummy columns
    df = pd.concat([df, pd.get_dummies(np.where(df[f] == df[f].unique()[0], None, df[f]), prefix=f)], axis=1)
df = df.drop(featuresToOneHotEncoding, axis=1)
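In [ ]:
# Sketch of the same reference-level dropping with pandas' built-in flag
# (drop_first=True, pandas >= 0.18), shown on a toy frame because the original
# columns are already dropped; which level gets dropped may differ from the
# np.where trick above, but the encoded information is the same
demo = pd.DataFrame({'shot_type': ['2PT Field Goal', '3PT Field Goal', '2PT Field Goal']})
print(pd.get_dummies(demo, columns=['shot_type'], drop_first=True))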
In [6]:
featuresToLabel = ['season', 'shot_zone_range']
print(df.columns)
for f in featuresToLabel:
    lbl = preprocessing.LabelEncoder()
    df[f] = lbl.fit_transform(df[f].values)
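In [ ]:
# Equivalent sketch: pandas categorical codes give the same sorted-label ordinal
# encoding as LabelEncoder; for 'season' the lexicographic order is also chronological
demo = pd.Series(['2000-01', '1996-97', '2015-16'])
print(demo.astype('category').cat.codes.values)  # [1, 0, 2]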
In [7]:
X = df[~mask]
y = df.shot_made_flag[~mask]
X = X.drop(['shot_made_flag'],axis=1)
In [8]:
X.describe()
Out[8]:
In [9]:
print(X.shape, len(y))
#(25697, 180)
In [10]:
y.describe()
Out[10]:
In [14]:
from matplotlib import pyplot as plt
import seaborn
%matplotlib inline
In [28]:
# Split off the last 10% of rows (chronological, since df was sorted by game_date)
# as an evaluation set
n_eval = int(y.shape[0] * 0.10)
X_fit, X_eval = X.iloc[:-n_eval], X.iloc[-n_eval:]
y_fit, y_eval = y.iloc[:-n_eval], y.iloc[-n_eval:]
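In [ ]:
# A minimal sketch of the same split via sklearn (assumes sklearn >= 0.19 for
# shuffle=False, which preserves the chronological order from the game_date sort);
# results match the manual slicing above up to rounding of the 10% boundary
from sklearn.model_selection import train_test_split
X_fit2, X_eval2, y_fit2, y_eval2 = train_test_split(X, y, test_size=0.10, shuffle=False)
print(X_fit2.shape, X_eval2.shape)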
In [19]:
# (Run after the grid-search cell below) list the tunable parameter names
gsearch1.get_params().keys()
Out[19]:
In [ ]:
# Tune hyperparameters with a grid search
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed in 0.20
param_test1 = {
    'max_depth': range(4, 9),
    'min_child_weight': range(1, 5),
    'gamma': [i/10.0 for i in range(0, 4)],
    'subsample': [i/10.0 for i in range(6, 10)],
    'colsample_bytree': [i/10.0 for i in range(6, 10)],
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.3, n_estimators=100,
                                                objective='binary:logistic',
                                                scale_pos_weight=1, seed=2100),
                        param_grid=param_test1, scoring='neg_log_loss',
                        cv=3, verbose=10, n_jobs=-1)
gsearch1.fit(X, y)
gsearch1.best_params_, gsearch1.best_score_
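In [ ]:
# Sketch: rank the fitted grid-search candidates by mean CV score; cv_results_
# replaces the grid_scores_ attribute removed in newer sklearn
results = pd.DataFrame(gsearch1.cv_results_)
results.sort_values('rank_test_score')[['params', 'mean_test_score', 'std_test_score']].head()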
In [ ]:
import xgboost as xgb
clf = xgb.XGBClassifier(missing=np.nan, max_depth=6,
                        n_estimators=2000, learning_rate=0.02,
                        subsample=0.75, colsample_bytree=0.8,
                        seed=2100, objective='binary:logistic')
# clf = xgb.XGBClassifier(missing=np.nan, subsample=0.9, colsample_bytree=0.6,
#                         max_depth=4, n_estimators=2000, learning_rate=0.02,
#                         gamma=0.0, min_child_weight=4, seed=2100, objective='binary:logistic')
# Experiment log:
# logloss train: 0.52577
# logloss valid: 0.60572
# logloss train: 0.53614
# logloss valid: 0.60463
# logloss train: 0.55263
# logloss valid: 0.60435
# logloss train: 0.54577
# logloss valid: 0.60223
# logloss train: 0.55223  (subsample = 0.55)
# logloss valid: 0.60279
# -----------------------
# logloss train: 0.55020
# logloss valid: 0.60258
# Fit with early stopping on the held-out evaluation set
clf.fit(X_fit, y_fit, early_stopping_rounds=250, eval_metric="logloss", eval_set=[(X_eval, y_eval)])
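# Note (older xgboost sklearn API, < 2.0): after early stopping, clf.best_ntree_limit
# records the best boosting round on the eval set; predict_proba below passes it
# so trees grown past the early-stop point are ignored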
#clf.fit(X,y)
# scores
from sklearn.metrics import log_loss
log_train = log_loss(y_fit, clf.predict_proba(X_fit)[:,1])
log_valid = log_loss(y_eval, clf.predict_proba(X_eval)[:,1])
print('\n-----------------------')
print(' logloss train: %.5f'%log_train)
print(' logloss valid: %.5f'%log_valid)
print('-----------------------')
print('\nModel parameters...')
print(clf.get_params())
# Predict the missing shot_made_flag values for the submission
target_x = df[mask].drop(['shot_made_flag'], axis=1)
target_y = clf.predict_proba(target_x, ntree_limit=clf.best_ntree_limit)[:, 1]
submission = pd.DataFrame({"shot_id":target_id, "shot_made_flag":target_y})
submission.sort_values('shot_id', inplace=True)
submission.to_csv("submission.csv", index=False)
print("Success")
In [ ]:
print(target_x.shape, X.shape)