In [43]:
import numpy as np
import pandas as pd
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in 0.20
# Load the data, roughly clean it, then sort by game date
df = pd.read_csv("data.csv")
mask = df['shot_made_flag'].isnull()
target_id = df[mask]["shot_id"]
print(df['shot_zone_basic'])
df.drop(['game_event_id', 'game_id', 'lat', 'lon', 'team_id', 'team_name',"shot_id",'shot_zone_basic'], axis=1, inplace=True)
df.sort_values('game_date', inplace=True)
# Merge rare action types (<= 20 occurrences) into the coarser combined_shot_type
actiontypes = dict(df.action_type.value_counts())
df['type'] = df.apply(lambda row: row['action_type'] if actiontypes[row['action_type']] > 20
                      else row['combined_shot_type'], axis=1)
df.drop(['action_type', 'combined_shot_type'], axis=1, inplace=True)
df['away'] = df.matchup.str.contains('@')
df.drop(['matchup','game_date'], axis=1, inplace=True)
df['distance'] = df.apply(lambda row: row['shot_distance'] if row['shot_distance'] <45 else 45, axis=1)
df['time_remaining'] = df.apply(lambda row: row['minutes_remaining'] * 60 + row['seconds_remaining'], axis=1)
df['last_moments'] = df.apply(lambda row: 1 if row['time_remaining'] < 3 else 0, axis=1)
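In [ ]:
# A minimal vectorized sketch of the three engineered features above; the semantics
# match the row-wise apply calls, so re-running it leaves df unchanged
# (assumes pandas >= 0.21 for clip(upper=...))
df['distance'] = df['shot_distance'].clip(upper=45)
df['time_remaining'] = df['minutes_remaining'] * 60 + df['seconds_remaining']
df['last_moments'] = (df['time_remaining'] < 3).astype(int)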
In [37]:
# Spot-check the geometry of two rows: sqrt(loc_x^2 + loc_y^2) / 10 should
# round down to the recorded shot_distance
print(140*140 + 116*116)  # 33056 -> sqrt ~ 181.8 -> distance ~ 18.2
print(18*18)              # recorded shot_distance is 18: consistent
print(131*131 + 97*97)    # 26570 -> sqrt ~ 163.0 -> distance ~ 16.3
print(16*16)              # recorded shot_distance is 16: consistent
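In [ ]:
# Hedged sketch generalizing the spot-check above: if the relation holds everywhere,
# floor(sqrt(loc_x^2 + loc_y^2) / 10) reproduces shot_distance on every row
recovered = np.floor(np.sqrt(df['loc_x']**2 + df['loc_y']**2) / 10)
print((recovered == df['shot_distance']).mean())  # fraction of rows where it holds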
In [35]:
df[['loc_x','loc_y','shot_distance']].head(2)
Out[35]:
In [2]:
from math import atan, degrees
# Shot angle in degrees; shots straight on (loc_x == 0) get 90
df['degree_shoot'] = df.apply(lambda row: 90 if row['loc_x'] == 0
                              else degrees(atan(row['loc_y'] / row['loc_x'])), axis=1)
df.drop(['loc_x', 'loc_y'], axis=1, inplace=True)
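In [ ]:
# Sketch of the same angle with np.arctan2, shown on sample coordinates since
# loc_x/loc_y were just dropped: arctan2 needs no special case for loc_x == 0,
# but it distinguishes left/right of the hoop (-180..180), while atan(y/x) above
# folds the two sides together
demo_x, demo_y = np.array([0, 140, -140]), np.array([50, 116, 116])
print(np.degrees(np.arctan2(demo_y, demo_x)))  # [90.0, ~39.6, ~140.4]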
In [3]:
# Frequency-encode two categoricals: replace each value with its occurrence count
values = dict(df['shot_zone_range'].value_counts())
df['_shotZoneFreq'] = df['shot_zone_range'].map(values)
values = dict(df['type'].value_counts())
df['_typeFreq'] = df['type'].map(values)
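In [ ]:
# Equivalent sketch: groupby + transform('size') builds the same frequency feature
# in one step, without the intermediate dict; re-running leaves df unchanged
df['_typeFreq'] = df.groupby('type')['type'].transform('size')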
In [4]:
print(df['type'].nunique())
df['type'].unique()
Out[4]:
In [5]:
from sklearn import preprocessing
featuresToOneHotEncoding = ['shot_type', 'shot_zone_area', 'opponent', 'away', 'type']
for f in featuresToOneHotEncoding:
    # Map the first level to None so get_dummies skips it: dropping one reference
    # level per feature avoids perfectly collinear dummy columns
    df = pd.concat([df, pd.get_dummies(np.where(df[f] == df[f].unique()[0], None, df[f]), prefix=f)], axis=1)
df = df.drop(featuresToOneHotEncoding, axis=1)
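In [ ]:
# Sketch of the same reference-level dropping with pandas' built-in flag
# (drop_first=True, pandas >= 0.18), shown on a toy frame because the original
# columns are already dropped; which level gets dropped may differ from the
# np.where trick above, but the encoded information is the same
demo = pd.DataFrame({'shot_type': ['2PT Field Goal', '3PT Field Goal', '2PT Field Goal']})
print(pd.get_dummies(demo, columns=['shot_type'], drop_first=True))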
In [6]:
featuresToLabel = ['season', 'shot_zone_range']
print(df.columns)
for f in featuresToLabel:
    lbl = preprocessing.LabelEncoder()
    df[f] = lbl.fit_transform(df[f].values)
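In [ ]:
# Equivalent sketch: pandas categorical codes give the same sorted-label ordinal
# encoding as LabelEncoder; for 'season' the lexicographic order is also chronological
demo = pd.Series(['2000-01', '1996-97', '2015-16'])
print(demo.astype('category').cat.codes.values)  # [1, 0, 2]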
In [7]:
X = df[~mask]
y = df.shot_made_flag[~mask]
X = X.drop(['shot_made_flag'],axis=1)
In [8]:
X.describe()
Out[8]:
In [9]:
print(X.shape, len(y))
#(25697, 180)
In [10]:
y.describe()
Out[10]:
In [14]:
from matplotlib import pyplot as plt
import seaborn
%matplotlib inline
In [28]:
# Split off the last 10% of rows (chronological, since df was sorted by game_date)
# as an evaluation set
n_eval = int(y.shape[0] * 0.10)
X_fit, X_eval = X.iloc[:-n_eval], X.iloc[-n_eval:]
y_fit, y_eval = y.iloc[:-n_eval], y.iloc[-n_eval:]
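In [ ]:
# A minimal sketch of the same split via sklearn (assumes sklearn >= 0.19 for
# shuffle=False, which preserves the chronological order from the game_date sort);
# results match the manual slicing above up to rounding of the 10% boundary
from sklearn.model_selection import train_test_split
X_fit2, X_eval2, y_fit2, y_eval2 = train_test_split(X, y, test_size=0.10, shuffle=False)
print(X_fit2.shape, X_eval2.shape)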
In [19]:
# (Run after the grid-search cell below) list the tunable parameter names
gsearch1.get_params().keys()
Out[19]:
In [ ]:
# Tune hyperparameters with a grid search
from sklearn.model_selection import GridSearchCV  # sklearn.grid_search was removed in 0.20
param_test1 = {
    'max_depth': range(4, 9),
    'min_child_weight': range(1, 5),
    'gamma': [i/10.0 for i in range(0, 4)],
    'subsample': [i/10.0 for i in range(6, 10)],
    'colsample_bytree': [i/10.0 for i in range(6, 10)],
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.3, n_estimators=100,
                                                objective='binary:logistic',
                                                scale_pos_weight=1, seed=2100),
                        param_grid=param_test1, scoring='neg_log_loss',
                        cv=3, verbose=10, n_jobs=-1)
gsearch1.fit(X, y)
gsearch1.best_params_, gsearch1.best_score_
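In [ ]:
# Sketch: rank the fitted grid-search candidates by mean CV score; cv_results_
# replaces the grid_scores_ attribute removed in newer sklearn
results = pd.DataFrame(gsearch1.cv_results_)
results.sort_values('rank_test_score')[['params', 'mean_test_score', 'std_test_score']].head()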
In [ ]:
import xgboost as xgb
clf = xgb.XGBClassifier(missing=np.nan, max_depth=6,
                        n_estimators=2000, learning_rate=0.02,
                        subsample=0.75, colsample_bytree=0.8,
                        seed=2100, objective='binary:logistic')
# clf = xgb.XGBClassifier(missing=np.nan, subsample=0.9, colsample_bytree=0.6,
#                         max_depth=4, n_estimators=2000, learning_rate=0.02,
#                         gamma=0.0, min_child_weight=4, seed=2100, objective='binary:logistic')
# Experiment log:
# logloss train: 0.52577
# logloss valid: 0.60572
# logloss train: 0.53614
# logloss valid: 0.60463
# logloss train: 0.55263
# logloss valid: 0.60435
# logloss train: 0.54577
# logloss valid: 0.60223
# logloss train: 0.55223  (subsample = 0.55)
# logloss valid: 0.60279
# -----------------------
# logloss train: 0.55020
# logloss valid: 0.60258
# Fit with early stopping on the held-out evaluation set
clf.fit(X_fit, y_fit, early_stopping_rounds=250, eval_metric="logloss", eval_set=[(X_eval, y_eval)])
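# Note (older xgboost sklearn API, < 2.0): after early stopping, clf.best_ntree_limit
# records the best boosting round on the eval set; predict_proba below passes it
# so trees grown past the early-stop point are ignored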
#clf.fit(X,y)
# scores
from sklearn.metrics import log_loss
log_train = log_loss(y_fit, clf.predict_proba(X_fit)[:,1])
log_valid = log_loss(y_eval, clf.predict_proba(X_eval)[:,1])
print('\n-----------------------')
print(' logloss train: %.5f'%log_train)
print(' logloss valid: %.5f'%log_valid)
print('-----------------------')
print('\nModel parameters...')
print(clf.get_params())
# Predict the missing shot_made_flag values for the submission
target_x = df[mask].drop(['shot_made_flag'], axis=1)
target_y = clf.predict_proba(target_x, ntree_limit=clf.best_ntree_limit)[:, 1]
submission = pd.DataFrame({"shot_id":target_id, "shot_made_flag":target_y})
submission.sort_values('shot_id', inplace=True)
submission.to_csv("submission.csv", index=False)
print("Success")
In [ ]:
print(target_x.shape, X.shape)