notebook.community

Edit and run



In [97]:

    
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold



In [98]:

    
filename = 'data.csv'
raw = pd.read_csv(filename)
print raw.shape
raw.head()









    



(30697, 25)






    Out[98]:






  
    
      
      action_type
      combined_shot_type
      game_event_id
      game_id
      lat
      loc_x
      loc_y
      lon
      minutes_remaining
      period
      ...
      shot_type
      shot_zone_area
      shot_zone_basic
      shot_zone_range
      team_id
      team_name
      game_date
      matchup
      opponent
      shot_id
    
  
  
    
      0
      Jump Shot
      Jump Shot
      10
      20000012
      33.9723
      167
      72
      -118.1028
      10
      1
      ...
      2PT Field Goal
      Right Side(R)
      Mid-Range
      16-24 ft.
      1610612747
      Los Angeles Lakers
      2000-10-31
      LAL @ POR
      POR
      1
    
    
      1
      Jump Shot
      Jump Shot
      12
      20000012
      34.0443
      -157
      0
      -118.4268
      10
      1
      ...
      2PT Field Goal
      Left Side(L)
      Mid-Range
      8-16 ft.
      1610612747
      Los Angeles Lakers
      2000-10-31
      LAL @ POR
      POR
      2
    
    
      2
      Jump Shot
      Jump Shot
      35
      20000012
      33.9093
      -101
      135
      -118.3708
      7
      1
      ...
      2PT Field Goal
      Left Side Center(LC)
      Mid-Range
      16-24 ft.
      1610612747
      Los Angeles Lakers
      2000-10-31
      LAL @ POR
      POR
      3
    
    
      3
      Jump Shot
      Jump Shot
      43
      20000012
      33.8693
      138
      175
      -118.1318
      6
      1
      ...
      2PT Field Goal
      Right Side Center(RC)
      Mid-Range
      16-24 ft.
      1610612747
      Los Angeles Lakers
      2000-10-31
      LAL @ POR
      POR
      4
    
    
      4
      Driving Dunk Shot
      Dunk
      155
      20000012
      34.0443
      0
      0
      -118.2698
      6
      2
      ...
      2PT Field Goal
      Center(C)
      Restricted Area
      Less Than 8 ft.
      1610612747
      Los Angeles Lakers
      2000-10-31
      LAL @ POR
      POR
      5
    
  

5 rows × 25 columns



In [74]:

    
kobe = raw[pd.notnull(raw['shot_made_flag'])]
print kobe.shape









    



(25697, 25)



In [75]:

    
alpha = 0.02 

plt.figure(figsize=(10,10))

plt.subplot(121)
plt.scatter(kobe.loc_x,kobe.loc_y,color='R',alpha=alpha )
plt.title('loc_x and loc_y')

plt.subplot(122)
plt.scatter(kobe.lon,kobe.lat,color='B',alpha=alpha)
plt.title('lat and lon')









    Out[75]:





<matplotlib.text.Text at 0x119933750>



In [101]:

    
raw['dist'] = np.sqrt(np.power(raw['loc_x'],2) + np.power(raw['loc_y'],2))

loc_x_zero = raw['loc_x'] == 0 
#print loc_x_zero
raw['angle'] = np.array([0] * len(raw))
#print raw['angle']
raw['angle'][~loc_x_zero] = np.arctan(raw['loc_y'][~loc_x_zero] / raw['loc_x'][~loc_x_zero])
raw['angle'][loc_x_zero] = np.pi / 2
#print raw['angle'][loc_x_zero]









    



/Users/jark/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/Users/jark/anaconda/lib/python2.7/site-packages/ipykernel/__main__.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy



In [88]:

    
raw['remaining_time'] = raw['minutes_remaining'] * 60 + raw['seconds_remaining']



In [89]:

    
print kobe.action_type.unique()
print kobe.combined_shot_type.unique()
print kobe.shot_type.unique()
print kobe.shot_type.value_counts()









    



['Jump Shot' 'Driving Dunk Shot' 'Layup Shot' 'Running Jump Shot'
 'Reverse Dunk Shot' 'Slam Dunk Shot' 'Driving Layup Shot'
 'Turnaround Jump Shot' 'Reverse Layup Shot' 'Tip Shot' 'Running Hook Shot'
 'Alley Oop Dunk Shot' 'Dunk Shot' 'Alley Oop Layup shot'
 'Running Dunk Shot' 'Driving Finger Roll Shot' 'Running Layup Shot'
 'Finger Roll Shot' 'Fadeaway Jump Shot' 'Follow Up Dunk Shot' 'Hook Shot'
 'Turnaround Hook Shot' 'Jump Hook Shot' 'Running Finger Roll Shot'
 'Jump Bank Shot' 'Turnaround Finger Roll Shot' 'Hook Bank Shot'
 'Driving Hook Shot' 'Running Tip Shot' 'Running Reverse Layup Shot'
 'Driving Finger Roll Layup Shot' 'Fadeaway Bank shot' 'Pullup Jump shot'
 'Finger Roll Layup Shot' 'Turnaround Fadeaway shot'
 'Driving Reverse Layup Shot' 'Driving Slam Dunk Shot'
 'Step Back Jump shot' 'Turnaround Bank shot' 'Reverse Slam Dunk Shot'
 'Floating Jump shot' 'Putback Slam Dunk Shot' 'Running Bank shot'
 'Driving Bank shot' 'Driving Jump shot' 'Putback Layup Shot'
 'Putback Dunk Shot' 'Running Finger Roll Layup Shot' 'Pullup Bank shot'
 'Running Slam Dunk Shot' 'Cutting Layup Shot' 'Driving Floating Jump Shot'
 'Running Pull-Up Jump Shot' 'Tip Layup Shot'
 'Driving Floating Bank Jump Shot']
['Jump Shot' 'Dunk' 'Layup' 'Tip Shot' 'Hook Shot' 'Bank Shot']
['2PT Field Goal' '3PT Field Goal']
2PT Field Goal    20285
3PT Field Goal     5412
Name: shot_type, dtype: int64



In [95]:

    
kobe['season'].unique()
#kobe.season.unique()









    Out[95]:





array(['2000-01', '2001-02', '2002-03', '2003-04', '2004-05', '2005-06',
       '2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12',
       '2012-13', '2013-14', '2014-15', '2015-16', '1996-97', '1997-98',
       '1998-99', '1999-00'], dtype=object)



In [99]:

    
raw['season'] = raw['season'].apply(lambda x: int(x.split('-')[1]))
raw['season'].unique()









    Out[99]:





array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 97,
       98, 99,  0])



In [92]:

    
print kobe['team_id'].unique()
print kobe['team_name'].unique()









    



[1610612747]
['Los Angeles Lakers']



In [93]:

    
pd.DataFrame({'matchup':kobe.matchup,'opponent':kobe.opponent})









    Out[93]:






  
    
      
      matchup
      opponent
    
  
  
    
      1
      LAL @ POR
      POR
    
    
      2
      LAL @ POR
      POR
    
    
      3
      LAL @ POR
      POR
    
    
      4
      LAL @ POR
      POR
    
    
      5
      LAL @ POR
      POR
    
    
      6
      LAL @ POR
      POR
    
    
      8
      LAL @ POR
      POR
    
    
      9
      LAL @ POR
      POR
    
    
      10
      LAL @ POR
      POR
    
    
      11
      LAL vs. UTA
      UTA
    
    
      12
      LAL vs. UTA
      UTA
    
    
      13
      LAL vs. UTA
      UTA
    
    
      14
      LAL vs. UTA
      UTA
    
    
      15
      LAL vs. UTA
      UTA
    
    
      17
      LAL vs. UTA
      UTA
    
    
      18
      LAL vs. UTA
      UTA
    
    
      20
      LAL vs. UTA
      UTA
    
    
      21
      LAL vs. UTA
      UTA
    
    
      22
      LAL vs. UTA
      UTA
    
    
      23
      LAL vs. UTA
      UTA
    
    
      24
      LAL vs. UTA
      UTA
    
    
      25
      LAL vs. UTA
      UTA
    
    
      26
      LAL vs. UTA
      UTA
    
    
      27
      LAL vs. UTA
      UTA
    
    
      28
      LAL vs. UTA
      UTA
    
    
      29
      LAL vs. UTA
      UTA
    
    
      30
      LAL vs. UTA
      UTA
    
    
      31
      LAL vs. UTA
      UTA
    
    
      38
      LAL @ VAN
      VAN
    
    
      39
      LAL @ VAN
      VAN
    
    
      ...
      ...
      ...
    
    
      30661
      LAL @ IND
      IND
    
    
      30662
      LAL @ IND
      IND
    
    
      30663
      LAL @ IND
      IND
    
    
      30665
      LAL @ IND
      IND
    
    
      30666
      LAL @ IND
      IND
    
    
      30667
      LAL @ IND
      IND
    
    
      30669
      LAL @ IND
      IND
    
    
      30670
      LAL vs. IND
      IND
    
    
      30671
      LAL vs. IND
      IND
    
    
      30672
      LAL vs. IND
      IND
    
    
      30673
      LAL vs. IND
      IND
    
    
      30674
      LAL vs. IND
      IND
    
    
      30675
      LAL vs. IND
      IND
    
    
      30676
      LAL vs. IND
      IND
    
    
      30677
      LAL vs. IND
      IND
    
    
      30678
      LAL vs. IND
      IND
    
    
      30679
      LAL vs. IND
      IND
    
    
      30681
      LAL vs. IND
      IND
    
    
      30683
      LAL vs. IND
      IND
    
    
      30684
      LAL vs. IND
      IND
    
    
      30685
      LAL vs. IND
      IND
    
    
      30687
      LAL vs. IND
      IND
    
    
      30688
      LAL vs. IND
      IND
    
    
      30689
      LAL vs. IND
      IND
    
    
      30690
      LAL vs. IND
      IND
    
    
      30691
      LAL vs. IND
      IND
    
    
      30692
      LAL vs. IND
      IND
    
    
      30694
      LAL vs. IND
      IND
    
    
      30695
      LAL vs. IND
      IND
    
    
      30696
      LAL vs. IND
      IND
    
  

25697 rows × 2 columns



In [102]:

    
plt.figure(figsize=(5,5))
plt.scatter(raw.dist,raw.shot_distance,color='R')
plt.title('dist and show_distance')









    Out[102]:





<matplotlib.text.Text at 0x11d0cfd10>



In [132]:

    
gs = kobe.groupby('shot_zone_area')
print kobe['shot_zone_area'].value_counts()
print len(gs)
#print zip(gs)[0]









    



Center(C)                11289
Right Side Center(RC)     3981
Right Side(R)             3859
Left Side Center(LC)      3364
Left Side(L)              3132
Back Court(BC)              72
Name: shot_zone_area, dtype: int64
6



In [133]:

    
import matplotlib.cm as cm 
plt.figure(figsize=(20,10))

def scatter_plot_by_category(feat):
    alpha = 0.1
    gs = kobe.groupby(feat)
    cs = cm.rainbow(np.linspace(0,1,len(gs)))
    #print zip(gs,cs)[0]
    for g,c in zip(gs,cs): 
        plt.scatter(g[1].loc_x,g[1].loc_y,color=c,alpha=alpha)
    
plt.subplot(131)
scatter_plot_by_category('shot_zone_area')
plt.title('shot_zone_area')


plt.subplot(132)
scatter_plot_by_category('shot_zone_basic')
plt.title('shot_zone_basic')


plt.subplot(133)
scatter_plot_by_category('shot_zone_range')
plt.title('shot_zone_range')









    Out[133]:





<matplotlib.text.Text at 0x120d451d0>



In [127]:

    
#print help(np.linspace)
x = [1, 2, 3]
y = [4, 5, 6, 7]
xy = zip(x, y)
print xy
print xy[0]









    



[(1, 4), (2, 5), (3, 6)]
(1, 4)



In [135]:

    
drops = ['shot_id', 'team_id', 'team_name', 'shot_zone_area', 'shot_zone_range', 'shot_zone_basic', \
         'matchup', 'lon', 'lat', 'seconds_remaining', 'minutes_remaining', \
         'shot_distance', 'loc_x', 'loc_y', 'game_event_id', 'game_id', 'game_date']

for drop in drops:
    raw = raw.drop(drop,1)



In [138]:

    
print raw['combined_shot_type'].value_counts()
pd.get_dummies(raw['combined_shot_type'],prefix='combined_shot_type')[0:2]









    



Jump Shot    23485
Layup         5448
Dunk          1286
Tip Shot       184
Hook Shot      153
Bank Shot      141
Name: combined_shot_type, dtype: int64






    Out[138]:






  
    
      
      combined_shot_type_Bank Shot
      combined_shot_type_Dunk
      combined_shot_type_Hook Shot
      combined_shot_type_Jump Shot
      combined_shot_type_Layup
      combined_shot_type_Tip Shot
    
  
  
    
      0
      0
      0
      0
      1
      0
      0
    
    
      1
      0
      0
      0
      1
      0
      0



In [139]:

    
categorical_vars = ['action_type', 'combined_shot_type', \
                    'shot_type', 'opponent', 'period', 'season']
for var in categorical_vars:
    raw = pd.concat([raw,pd.get_dummies(raw[var], prefix=var)],1)
    raw = raw.drop(var,1)



In [148]:

    
train_kobe = raw[pd.notnull(raw['shot_made_flag'])]
#print train_kobe
train_kobe_notnull = train_kobe.drop('shot_made_flag',1)
#print train_kobe_notnull
train_lable = train_kobe['shot_made_flag']
#print train_lable
test_kobe = raw[pd.isnull(raw['shot_made_flag'])]
test_kobe_isnull = test_kobe.drop('shot_made_flag',1)



In [149]:

    
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,log_loss
import time



In [151]:

    
import numpy as np 
range_m = np.logspace(0,2,5).astype(int)
range_m









    Out[151]:





array([  1,   3,  10,  31, 100])



In [158]:

    
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold

#Finding best n_estimators for RandomForestClassifier
print 'Finding best n_estimators for RandomForestClassifier...'
min_score = 100000
best_n = 0
scores_n = []
range_n = np.logspace(0,2,5).astype(int)
for n in range_n:
    print 'the number of tree :{0}'.format(n)
    t1 = time.time()
    
    rfc_score = 0.
    rfc = RandomForestClassifier(n_estimators=n)
    for train_k,test_k in KFold(len(train_kobe_notnull),n_folds=10,shuffle=True):
        rfc.fit(train_kobe_notnull.iloc[train_k],train_lable.iloc[train_k])
        pred = rfc.predict(train_kobe_notnull.iloc[test_k])
        rfc_score += log_loss(train_lable.iloc[test_k],pred)/10
    scores_n.append(rfc_score)
    if rfc_score < min_score:
        min_score = rfc_score
        best_n = n 
        
    t2 = time.time()
    print 'Done processing {0} trees ({1:.3f}sec)'.format(n,t2-t1)

print best_n,min_score


#find best max_depth for RandomForestClassifier
print 'Find best max_depth for RandomForestClassifier...'
min_score = 100000
best_m = 0
scores_m = []
range_m = np.logspace(0,2,5).astype(int)
for m in range_m:
    print 'the max depth:{0}'.format(m)
    t1 = time.time()
    
    rfc_score = 0.
    rfc = RandomForestClassifier(max_depth=m,n_estimators=best_n)
    
    for train_k,test_k in KFold(len(train_kobe_notnull),n_folds=10,shuffle=True):
        rfc.fit(train_kobe_notnull.iloc[train_k],train_lable.iloc[train_k])
        pred = rfc.predict(train_kobe_notnull.iloc[test_k])
        rfc_score +=log_loss(train_lable.iloc[test_k],pred) /10
    
    scores_m.append(rfc_score)
    if rfc_score<min_score:
        min_score = rfc_score
        best_m = m
        
    t2 = time.time()
    print 'Done processing {0} trees ({1:.3f}sec)'.format(m,t2-t1)

print best_m,min_score









    



Finding best n_estimators for RandomForestClassifier...
the number of tree :1
Done processing 1 trees (1.581sec)
the number of tree :3
Done processing 3 trees (2.613sec)
the number of tree :10
Done processing 10 trees (5.947sec)
the number of tree :31
Done processing 31 trees (18.201sec)
the number of tree :100
Done processing 100 trees (64.410sec)
100 12.3360201857
Find best max_depth for RandomForestClassifier...
the max depth:1
Done processing 1 trees (9.264sec)
the max depth:3
Done processing 3 trees (11.372sec)
the max depth:10
Done processing 10 trees (20.075sec)
the max depth:31
Done processing 31 trees (44.780sec)
the max depth:100
Done processing 100 trees (54.753sec)
10 11.0107469944



In [159]:

    
plt.figure(figsize=(10,5))
plt.subplot(121)
plt.plot(range_n,scores_n)
plt.ylabel('score')
plt.xlabel('number of trees')

plt.subplot(122)
plt.plot(range_m,scores_m)
plt.ylabel('score')
plt.xlabel('max depth')









    Out[159]:





<matplotlib.text.Text at 0x12fc55450>



In [157]:

    
#model = RandomForestClassifier(n_estimators=best_n,max_depth=best_m)
#model.fit(train_kobe_notnull,train_lable)









    Out[157]:





RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)



In [ ]:

	action_type	combined_shot_type	game_event_id	game_id	lat	loc_x	loc_y	lon	minutes_remaining	period	...	shot_type	shot_zone_area	shot_zone_basic	shot_zone_range	team_id	team_name	game_date	matchup	opponent	shot_id
0	Jump Shot	Jump Shot	10	20000012	33.9723	167	72	-118.1028	10	1	...	2PT Field Goal	Right Side(R)	Mid-Range	16-24 ft.	1610612747	Los Angeles Lakers	2000-10-31	LAL @ POR	POR	1
1	Jump Shot	Jump Shot	12	20000012	34.0443	-157	0	-118.4268	10	1	...	2PT Field Goal	Left Side(L)	Mid-Range	8-16 ft.	1610612747	Los Angeles Lakers	2000-10-31	LAL @ POR	POR	2
2	Jump Shot	Jump Shot	35	20000012	33.9093	-101	135	-118.3708	7	1	...	2PT Field Goal	Left Side Center(LC)	Mid-Range	16-24 ft.	1610612747	Los Angeles Lakers	2000-10-31	LAL @ POR	POR	3
3	Jump Shot	Jump Shot	43	20000012	33.8693	138	175	-118.1318	6	1	...	2PT Field Goal	Right Side Center(RC)	Mid-Range	16-24 ft.	1610612747	Los Angeles Lakers	2000-10-31	LAL @ POR	POR	4
4	Driving Dunk Shot	Dunk	155	20000012	34.0443	0	0	-118.2698	6	2	...	2PT Field Goal	Center(C)	Restricted Area	Less Than 8 ft.	1610612747	Los Angeles Lakers	2000-10-31	LAL @ POR	POR	5