In [97]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold

In [98]:
filename = 'data.csv'
raw = pd.read_csv(filename)
print raw.shape
raw.head()


(30697, 25)
Out[98]:
action_type combined_shot_type game_event_id game_id lat loc_x loc_y lon minutes_remaining period ... shot_type shot_zone_area shot_zone_basic shot_zone_range team_id team_name game_date matchup opponent shot_id
0 Jump Shot Jump Shot 10 20000012 33.9723 167 72 -118.1028 10 1 ... 2PT Field Goal Right Side(R) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 1
1 Jump Shot Jump Shot 12 20000012 34.0443 -157 0 -118.4268 10 1 ... 2PT Field Goal Left Side(L) Mid-Range 8-16 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 2
2 Jump Shot Jump Shot 35 20000012 33.9093 -101 135 -118.3708 7 1 ... 2PT Field Goal Left Side Center(LC) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 3
3 Jump Shot Jump Shot 43 20000012 33.8693 138 175 -118.1318 6 1 ... 2PT Field Goal Right Side Center(RC) Mid-Range 16-24 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 4
4 Driving Dunk Shot Dunk 155 20000012 34.0443 0 0 -118.2698 6 2 ... 2PT Field Goal Center(C) Restricted Area Less Than 8 ft. 1610612747 Los Angeles Lakers 2000-10-31 LAL @ POR POR 5

5 rows × 25 columns


In [74]:
kobe = raw[pd.notnull(raw['shot_made_flag'])]
print kobe.shape


(25697, 25)

In [75]:
alpha = 0.02 

plt.figure(figsize=(10,10))

plt.subplot(121)
plt.scatter(kobe.loc_x,kobe.loc_y,color='R',alpha=alpha )
plt.title('loc_x and loc_y')

plt.subplot(122)
plt.scatter(kobe.lon,kobe.lat,color='B',alpha=alpha)
plt.title('lat and lon')


Out[75]:
<matplotlib.text.Text at 0x119933750>

In [101]:
raw['dist'] = np.sqrt(np.power(raw['loc_x'],2) + np.power(raw['loc_y'],2))

loc_x_zero = raw['loc_x'] == 0 
#print loc_x_zero
raw['angle'] = np.array([0] * len(raw))
#print raw['angle']
raw['angle'][~loc_x_zero] = np.arctan(raw['loc_y'][~loc_x_zero] / raw['loc_x'][~loc_x_zero])
raw['angle'][loc_x_zero] = np.pi / 2
#print raw['angle'][loc_x_zero]


/Users/jark/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/jark/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy

In [88]:
raw['remaining_time'] = raw['minutes_remaining'] * 60 + raw['seconds_remaining']

In [89]:
print kobe.action_type.unique()
print kobe.combined_shot_type.unique()
print kobe.shot_type.unique()
print kobe.shot_type.value_counts()


['Jump Shot' 'Driving Dunk Shot' 'Layup Shot' 'Running Jump Shot'
 'Reverse Dunk Shot' 'Slam Dunk Shot' 'Driving Layup Shot'
 'Turnaround Jump Shot' 'Reverse Layup Shot' 'Tip Shot' 'Running Hook Shot'
 'Alley Oop Dunk Shot' 'Dunk Shot' 'Alley Oop Layup shot'
 'Running Dunk Shot' 'Driving Finger Roll Shot' 'Running Layup Shot'
 'Finger Roll Shot' 'Fadeaway Jump Shot' 'Follow Up Dunk Shot' 'Hook Shot'
 'Turnaround Hook Shot' 'Jump Hook Shot' 'Running Finger Roll Shot'
 'Jump Bank Shot' 'Turnaround Finger Roll Shot' 'Hook Bank Shot'
 'Driving Hook Shot' 'Running Tip Shot' 'Running Reverse Layup Shot'
 'Driving Finger Roll Layup Shot' 'Fadeaway Bank shot' 'Pullup Jump shot'
 'Finger Roll Layup Shot' 'Turnaround Fadeaway shot'
 'Driving Reverse Layup Shot' 'Driving Slam Dunk Shot'
 'Step Back Jump shot' 'Turnaround Bank shot' 'Reverse Slam Dunk Shot'
 'Floating Jump shot' 'Putback Slam Dunk Shot' 'Running Bank shot'
 'Driving Bank shot' 'Driving Jump shot' 'Putback Layup Shot'
 'Putback Dunk Shot' 'Running Finger Roll Layup Shot' 'Pullup Bank shot'
 'Running Slam Dunk Shot' 'Cutting Layup Shot' 'Driving Floating Jump Shot'
 'Running Pull-Up Jump Shot' 'Tip Layup Shot'
 'Driving Floating Bank Jump Shot']
['Jump Shot' 'Dunk' 'Layup' 'Tip Shot' 'Hook Shot' 'Bank Shot']
['2PT Field Goal' '3PT Field Goal']
2PT Field Goal    20285
3PT Field Goal     5412
Name: shot_type, dtype: int64

In [95]:
kobe['season'].unique()
#kobe.season.unique()


Out[95]:
array(['2000-01', '2001-02', '2002-03', '2003-04', '2004-05', '2005-06',
       '2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12',
       '2012-13', '2013-14', '2014-15', '2015-16', '1996-97', '1997-98',
       '1998-99', '1999-00'], dtype=object)

In [99]:
raw['season'] = raw['season'].apply(lambda x: int(x.split('-')[1]))
raw['season'].unique()


Out[99]:
array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 97,
       98, 99,  0])

In [92]:
print kobe['team_id'].unique()
print kobe['team_name'].unique()


[1610612747]
['Los Angeles Lakers']

In [93]:
pd.DataFrame({'matchup':kobe.matchup,'opponent':kobe.opponent})


Out[93]:
matchup opponent
1 LAL @ POR POR
2 LAL @ POR POR
3 LAL @ POR POR
4 LAL @ POR POR
5 LAL @ POR POR
6 LAL @ POR POR
8 LAL @ POR POR
9 LAL @ POR POR
10 LAL @ POR POR
11 LAL vs. UTA UTA
12 LAL vs. UTA UTA
13 LAL vs. UTA UTA
14 LAL vs. UTA UTA
15 LAL vs. UTA UTA
17 LAL vs. UTA UTA
18 LAL vs. UTA UTA
20 LAL vs. UTA UTA
21 LAL vs. UTA UTA
22 LAL vs. UTA UTA
23 LAL vs. UTA UTA
24 LAL vs. UTA UTA
25 LAL vs. UTA UTA
26 LAL vs. UTA UTA
27 LAL vs. UTA UTA
28 LAL vs. UTA UTA
29 LAL vs. UTA UTA
30 LAL vs. UTA UTA
31 LAL vs. UTA UTA
38 LAL @ VAN VAN
39 LAL @ VAN VAN
... ... ...
30661 LAL @ IND IND
30662 LAL @ IND IND
30663 LAL @ IND IND
30665 LAL @ IND IND
30666 LAL @ IND IND
30667 LAL @ IND IND
30669 LAL @ IND IND
30670 LAL vs. IND IND
30671 LAL vs. IND IND
30672 LAL vs. IND IND
30673 LAL vs. IND IND
30674 LAL vs. IND IND
30675 LAL vs. IND IND
30676 LAL vs. IND IND
30677 LAL vs. IND IND
30678 LAL vs. IND IND
30679 LAL vs. IND IND
30681 LAL vs. IND IND
30683 LAL vs. IND IND
30684 LAL vs. IND IND
30685 LAL vs. IND IND
30687 LAL vs. IND IND
30688 LAL vs. IND IND
30689 LAL vs. IND IND
30690 LAL vs. IND IND
30691 LAL vs. IND IND
30692 LAL vs. IND IND
30694 LAL vs. IND IND
30695 LAL vs. IND IND
30696 LAL vs. IND IND

25697 rows × 2 columns


In [102]:
plt.figure(figsize=(5,5))
plt.scatter(raw.dist,raw.shot_distance,color='R')
plt.title('dist and show_distance')


Out[102]:
<matplotlib.text.Text at 0x11d0cfd10>

In [132]:
gs = kobe.groupby('shot_zone_area')
print kobe['shot_zone_area'].value_counts()
print len(gs)
#print zip(gs)[0]


Center(C)                11289
Right Side Center(RC)     3981
Right Side(R)             3859
Left Side Center(LC)      3364
Left Side(L)              3132
Back Court(BC)              72
Name: shot_zone_area, dtype: int64
6

In [133]:
import matplotlib.cm as cm 
plt.figure(figsize=(20,10))

def scatter_plot_by_category(feat):
    alpha = 0.1
    gs = kobe.groupby(feat)
    cs = cm.rainbow(np.linspace(0,1,len(gs)))
    #print zip(gs,cs)[0]
    for g,c in zip(gs,cs): 
        plt.scatter(g[1].loc_x,g[1].loc_y,color=c,alpha=alpha)
    
plt.subplot(131)
scatter_plot_by_category('shot_zone_area')
plt.title('shot_zone_area')


plt.subplot(132)
scatter_plot_by_category('shot_zone_basic')
plt.title('shot_zone_basic')


plt.subplot(133)
scatter_plot_by_category('shot_zone_range')
plt.title('shot_zone_range')


Out[133]:
<matplotlib.text.Text at 0x120d451d0>

In [127]:
#print help(np.linspace)
x = [1, 2, 3]
y = [4, 5, 6, 7]
xy = zip(x, y)
print xy
print xy[0]


[(1, 4), (2, 5), (3, 6)]
(1, 4)

In [135]:
drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range', 'shot_zone_basic', \
         'matchup', 'lon', 'lat', 'seconds_remaining', 'minutes_remaining', \
         'shot_distance', 'loc_x', 'loc_y', 'game_event_id', 'game_id', 'game_date']

for drop in drops:
    raw = raw.drop(drop,1)

In [138]:
print raw['combined_shot_type'].value_counts()
pd.get_dummies(raw['combined_shot_type'],prefix='combined_shot_type')[0:2]


Jump Shot    23485
Layup         5448
Dunk          1286
Tip Shot       184
Hook Shot      153
Bank Shot      141
Name: combined_shot_type, dtype: int64
Out[138]:
combined_shot_type_Bank Shot combined_shot_type_Dunk combined_shot_type_Hook Shot combined_shot_type_Jump Shot combined_shot_type_Layup combined_shot_type_Tip Shot
0 0 0 0 1 0 0
1 0 0 0 1 0 0

In [139]:
categorical_vars = ['action_type', 'combined_shot_type', \
                    'shot_type', 'opponent', 'period', 'season']
for var in categorical_vars:
    raw = pd.concat([raw,pd.get_dummies(raw[var], prefix=var)],1)
    raw = raw.drop(var,1)

In [148]:
train_kobe = raw[pd.notnull(raw['shot_made_flag'])]
#print train_kobe
train_kobe_notnull = train_kobe.drop('shot_made_flag',1)
#print train_kobe_notnull
train_lable = train_kobe['shot_made_flag']
#print train_lable
test_kobe = raw[pd.isnull(raw['shot_made_flag'])]
test_kobe_isnull = test_kobe.drop('shot_made_flag',1)

In [149]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,log_loss
import time

In [151]:
import numpy as np 
range_m = np.logspace(0,2,5).astype(int)
range_m


Out[151]:
array([  1,   3,  10,  31, 100])

In [158]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold

#Finding best n_estimators for RandomForestClassifier
print 'Finding best n_estimators for RandomForestClassifier...'
min_score = 100000
best_n = 0
scores_n = []
range_n = np.logspace(0,2,5).astype(int)
for n in range_n:
    print 'the number of tree :{0}'.format(n)
    t1 = time.time()
    
    rfc_score = 0.
    rfc = RandomForestClassifier(n_estimators=n)
    for train_k,test_k in KFold(len(train_kobe_notnull),n_folds=10,shuffle=True):
        rfc.fit(train_kobe_notnull.iloc[train_k],train_lable.iloc[train_k])
        pred = rfc.predict(train_kobe_notnull.iloc[test_k])
        rfc_score += log_loss(train_lable.iloc[test_k],pred)/10
    scores_n.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n 
        
    t2 = time.time()
    print 'Done processing {0} trees ({1:.3f}sec)'.format(n,t2-t1)

print best_n,min_score


#find best max_depth for RandomForestClassifier
print 'Find best max_depth for RandomForestClassifier...'
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0,2,5).astype(int)
for m in range_m:
    print 'the max depth:{0}'.format(m)
    t1 = time.time()
    
    rfc_score = 0.
    rfc = RandomForestClassifier(max_depth=m,n_estimators=best_n)
    
    for train_k,test_k in KFold(len(train_kobe_notnull),n_folds=10,shuffle=True):
        rfc.fit(train_kobe_notnull.iloc[train_k],train_lable.iloc[train_k])
        pred = rfc.predict(train_kobe_notnull.iloc[test_k])
        rfc_score +=log_loss(train_lable.iloc[test_k],pred) /10
    
    scores_m.append(rfc_score)
    if rfc_score<min_score:
        min_score = rfc_score
        best_m = m
        
    t2 = time.time()
    print 'Done processing {0} trees ({1:.3f}sec)'.format(m,t2-t1)

print best_m,min_score


Finding best n_estimators for RandomForestClassifier...
the number of tree :1
Done processing 1 trees (1.581sec)
the number of tree :3
Done processing 3 trees (2.613sec)
the number of tree :10
Done processing 10 trees (5.947sec)
the number of tree :31
Done processing 31 trees (18.201sec)
the number of tree :100
Done processing 100 trees (64.410sec)
100 12.3360201857
Find best max_depth for RandomForestClassifier...
the max depth:1
Done processing 1 trees (9.264sec)
the max depth:3
Done processing 3 trees (11.372sec)
the max depth:10
Done processing 10 trees (20.075sec)
the max depth:31
Done processing 31 trees (44.780sec)
the max depth:100
Done processing 100 trees (54.753sec)
10 11.0107469944

In [159]:
plt.figure(figsize=(10,5))
plt.subplot(121)
plt.plot(range_n,scores_n)
plt.ylabel('score')
plt.xlabel('number of trees')

plt.subplot(122)
plt.plot(range_m,scores_m)
plt.ylabel('score')
plt.xlabel('max depth')


Out[159]:
<matplotlib.text.Text at 0x12fc55450>

In [157]:
#model = RandomForestClassifier(n_estimators=best_n,max_depth=best_m)
#model.fit(train_kobe_notnull,train_lable)


Out[157]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [ ]: