notebook.community

Edit and run



In [1]:

    
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:

    
# Helper that builds the cumulative shot-distance profile of a set of shots.
def profile(dataframe, ratio=False):
    """Return the cumulative made-shot curve of ``dataframe`` ordered by distance.

    The shots are sorted from the largest to the smallest ``SHOT_DIST`` and the
    made shots are accumulated along that order.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Shot log with a ``SHOT_DIST`` column and a ``SHOT_RESULT`` column whose
        values are ``'made'`` / ``'missed'``. Values that are already numeric
        are passed through unchanged.
    ratio : bool, default False
        NOTE: the flag reads inverted with respect to its name; this is kept
        as-is so existing callers keep getting the same curves.
        ``False`` -> cumulative made count divided by the number of shots.
        ``True``  -> raw cumulative made count.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``(distances, cumulative_made)``, both in descending-distance order,
        so the result can be unpacked straight into ``plt.plot``.
    """
    encoding = {'made': 1, 'missed': 0}
    shots_by_distance = dataframe.sort_values('SHOT_DIST', ascending=False)
    # Encode only the SHOT_RESULT column: a frame-wide replace would also
    # rewrite matching values in unrelated columns and depends on pandas
    # implicitly downcasting the resulting object column to integers.
    shot_made = pd.to_numeric(
        shots_by_distance['SHOT_RESULT'].map(lambda result: encoding.get(result, result))
    )
    made_cumsum = shot_made.cumsum()
    distances = shots_by_distance['SHOT_DIST']
    if ratio:
        return (distances, made_cumsum)
    return (distances, made_cumsum / made_cumsum.count())



In [3]:

    
# Read the NBA shot-log dataset (source: https://www.kaggle.com/dansbecker/nba-shot-logs)
# from the CSV file that sits next to the notebook.
shot_logs = pd.read_csv(filepath_or_buffer='./shot_logs.csv')
# Preview the last five rows.
shot_logs.tail(n=5)









    Out[3]:






  
    
      
      GAME_ID
      MATCHUP
      LOCATION
      W
      FINAL_MARGIN
      SHOT_NUMBER
      PERIOD
      GAME_CLOCK
      SHOT_CLOCK
      DRIBBLES
      ...
      SHOT_DIST
      PTS_TYPE
      SHOT_RESULT
      CLOSEST_DEFENDER
      CLOSEST_DEFENDER_PLAYER_ID
      CLOSE_DEF_DIST
      FGM
      PTS
      player_name
      player_id
    
  
  
    
      128064
      21400006
      OCT 29, 2014 - BKN @ BOS
      A
      L
      -16
      5
      3
      1:52
      18.3
      5
      ...
      8.7
      2
      missed
      Smart, Marcus
      203935
      0.8
      0
      0
      jarrett jack
      101127
    
    
      128065
      21400006
      OCT 29, 2014 - BKN @ BOS
      A
      L
      -16
      6
      4
      11:28
      19.8
      4
      ...
      0.6
      2
      made
      Turner, Evan
      202323
      0.6
      1
      2
      jarrett jack
      101127
    
    
      128066
      21400006
      OCT 29, 2014 - BKN @ BOS
      A
      L
      -16
      7
      4
      11:10
      23.0
      2
      ...
      16.9
      2
      made
      Thornton, Marcus
      201977
      4.2
      1
      2
      jarrett jack
      101127
    
    
      128067
      21400006
      OCT 29, 2014 - BKN @ BOS
      A
      L
      -16
      8
      4
      2:37
      9.1
      4
      ...
      18.3
      2
      missed
      Bradley, Avery
      202340
      3.0
      0
      0
      jarrett jack
      101127
    
    
      128068
      21400006
      OCT 29, 2014 - BKN @ BOS
      A
      L
      -16
      9
      4
      0:12
      NaN
      5
      ...
      5.1
      2
      made
      Bradley, Avery
      202340
      2.3
      1
      2
      jarrett jack
      101127
    
  

5 rows × 21 columns



In [4]:

    
# Count every missing cell in the dataset: the inner sum yields the missing
# count of each column and the outer sum adds those up. (Counting rows with
# `.any(axis=1)` instead would report rows with at least one missing value,
# which is a different quantity from what the label announces.)
print("Total missing values: ")
shot_logs.isnull().sum().sum()









    



Total missing values: 






    Out[4]:





5567



In [5]:

    
# Break the missing-value count down by column.
print("Missing values per column")
missing_per_column = shot_logs.isnull().sum(axis=0)
missing_per_column









    



Missing values per column






    Out[5]:





GAME_ID                          0
MATCHUP                          0
LOCATION                         0
W                                0
FINAL_MARGIN                     0
SHOT_NUMBER                      0
PERIOD                           0
GAME_CLOCK                       0
SHOT_CLOCK                    5567
DRIBBLES                         0
TOUCH_TIME                       0
SHOT_DIST                        0
PTS_TYPE                         0
SHOT_RESULT                      0
CLOSEST_DEFENDER                 0
CLOSEST_DEFENDER_PLAYER_ID       0
CLOSE_DEF_DIST                   0
FGM                              0
PTS                              0
player_name                      0
player_id                        0
dtype: int64



In [6]:

    
# Overall conversion rate of each player: made shots over all (made + missed)
# shots taken. One groupby pass over the frame replaces the per-player loop
# that re-filtered the whole dataset for every name.
player_key = shot_logs["player_name"]
# sort=False keeps the players in order of first appearance, like .unique().
made_per_player = shot_logs["SHOT_RESULT"].eq("made").groupby(player_key, sort=False).sum()
missed_per_player = shot_logs["SHOT_RESULT"].eq("missed").groupby(player_key, sort=False).sum()
conversion_rate = made_per_player / (made_per_player + missed_per_player)
# Keep the original two-decimal rounding (format, then parse back to float).
players_ratios = {name: float("%.2f" % rate) for name, rate in conversion_rate.items()}



In [7]:

    
# Wrap the per-player ratios in a one-row frame (row label 'ratio', one column
# per player) so that a player can be looked up directly by name.
overall_shot_ratio = pd.DataFrame(data=players_ratios, index=['ratio'])
overall_shot_ratio.loc[:, 'kobe bryant']









    Out[7]:





ratio    0.37
Name: kobe bryant, dtype: float64



In [8]:

    
# Per-player ratio for 3-point shots: made three-pointers over the number of
# shots the player took.
# NOTE(review): the denominator counts ALL of the player's shots, not only the
# 3-point attempts, so this is the share of attempts that ended as a made
# three rather than a 3-point conversion rate. Kept unchanged to preserve the
# published numbers; confirm which metric is intended.
player_key = shot_logs["player_name"]
# Both conditions are combined into a single mask: chaining two boolean
# selections makes pandas reindex the second mask and emit a UserWarning.
made_three = shot_logs["SHOT_RESULT"].eq("made") & shot_logs["PTS_TYPE"].eq(3)
# sort=False keeps the players in order of first appearance, like .unique().
made_threes_per_player = made_three.groupby(player_key, sort=False).sum()
shots_per_player = shot_logs["PTS_TYPE"].groupby(player_key, sort=False).count()
three_point_share = made_threes_per_player / shots_per_player
# Keep the original two-decimal rounding (format, then parse back to float).
players_ratios_3 = {name: float("%.2f" % share) for name, share in three_point_share.items()}









    



/root/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:5: UserWarning: Boolean Series key will be reindexed to match DataFrame index.



In [9]:

    
# Same one-row layout as the overall ratios: row 'ratio', one column per player.
three_points_shot_ratios = pd.DataFrame(data=players_ratios_3, index=['ratio'])
three_points_shot_ratios.loc[:, 'kobe bryant']









    Out[9]:





ratio    0.08
Name: kobe bryant, dtype: float64



In [10]:

    
# Overlay the distributions of both per-player ratios on a single axes.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title("Avg. Shots Made Ratio Histogram")
ax.set_xlabel("Shots Made Ratio")
ax.set_ylabel("Frequency")
# Overall ratio first (default colour), then the 3-point ratio in green.
ax.hist(overall_shot_ratio.loc['ratio'], alpha=0.85)
ax.hist(three_points_shot_ratios.loc['ratio'], color='g', alpha=0.85)
ax.grid(axis='y')
ax.legend(['Overall', '3-pointers'])
plt.show()



In [11]:

    
# Narrow the analysis down to a single player: LeBron James.
is_lebron = shot_logs['player_name'] == 'lebron james'
lebron_shots = shot_logs[is_lebron]
lebron_shots.head(n=5)









    Out[11]:






  
    
      
      GAME_ID
      MATCHUP
      LOCATION
      W
      FINAL_MARGIN
      SHOT_NUMBER
      PERIOD
      GAME_CLOCK
      SHOT_CLOCK
      DRIBBLES
      ...
      SHOT_DIST
      PTS_TYPE
      SHOT_RESULT
      CLOSEST_DEFENDER
      CLOSEST_DEFENDER_PLAYER_ID
      CLOSE_DEF_DIST
      FGM
      PTS
      player_name
      player_id
    
  
  
    
      45834
      21400900
      MAR 04, 2015 - CLE @ TOR
      A
      W
      8
      1
      1
      9:09
      13.7
      9
      ...
      7.0
      2
      missed
      Johnson, James
      201949
      0.8
      0
      0
      lebron james
      2544
    
    
      45835
      21400900
      MAR 04, 2015 - CLE @ TOR
      A
      W
      8
      2
      1
      6:08
      15.2
      8
      ...
      5.4
      2
      missed
      Valanciunas, Jonas
      202685
      2.5
      0
      0
      lebron james
      2544
    
    
      45836
      21400900
      MAR 04, 2015 - CLE @ TOR
      A
      W
      8
      3
      1
      4:38
      12.3
      0
      ...
      23.2
      3
      made
      Johnson, James
      201949
      3.5
      1
      3
      lebron james
      2544
    
    
      45837
      21400900
      MAR 04, 2015 - CLE @ TOR
      A
      W
      8
      4
      1
      0:02
      NaN
      0
      ...
      27.1
      3
      missed
      Ross, Terrence
      203082
      3.9
      0
      0
      lebron james
      2544
    
    
      45838
      21400900
      MAR 04, 2015 - CLE @ TOR
      A
      W
      8
      5
      2
      10:17
      20.8
      2
      ...
      3.1
      2
      made
      Johnson, James
      201949
      3.5
      1
      2
      lebron james
      2544
    
  

5 rows × 21 columns



In [12]:

    
# Cumulative made-shot curve of LeBron James as a function of shot distance.
lebron_profile = profile(lebron_shots)
lebron_distances, lebron_made = lebron_profile
plt.plot(lebron_distances, lebron_made)
plt.grid()
plt.xlabel("Distance from basket (ft.)")
plt.ylabel("Made shots")
plt.title("Distance accuracy profile for L. James")
plt.show()



In [13]:

    
# Put LeBron's curve next to the one computed over every shot in the dataset.
avg_profile = profile(shot_logs)
# Draw the curves in legend order; each profile unpacks into (x, y).
for curve_label, curve in (('L. James', lebron_profile), ('Avg', avg_profile)):
    plt.plot(*curve, label=curve_label)
plt.grid()
plt.xlabel("Distance from basket (ft.)")
plt.ylabel("Made shots")
plt.title("Distance accuracy profile for L. James vs Avg. Player")
plt.legend()
plt.show()



In [15]:

    
# Compare the first five players that appear in the dataset with the average.
first_players = list(shot_logs['player_name'].unique()[:5])
legends = ['Avg'] + first_players
fig, ax = plt.subplots(figsize=(10, 7))
ax.set_title("Distance accuracy profile for some players")
ax.set_xlabel("Distance from basket (ft.)")
ax.set_ylabel("Made shots")
ax.grid()
# The dataset-wide curve is dashed so it stands out from the player curves.
ax.plot(*avg_profile, '--')
for player_name in first_players:
    player_profile = profile(shot_logs[shot_logs['player_name'] == player_name])
    ax.plot(*player_profile, '-')
ax.legend(legends)
plt.show()



In [17]:

    
# All right, Brian Roberts looks above the average profile; let's see how his
# profile behaves under different situations, split by how far away the
# closest defender was.
brian_roberts = shot_logs[shot_logs['player_name'] == 'brian roberts']
defender_dist = brian_roberts['CLOSE_DEF_DIST']
# Bucket boundaries on CLOSE_DEF_DIST: close <= 3.3 < medium <= 6.6 < far.
close_limit, medium_limit = 3.3, 6.6
normal_profile = profile(brian_roberts)
close_defender = profile(brian_roberts[defender_dist <= close_limit])
# Both bounds go into one mask: chaining two boolean selections makes pandas
# reindex the second mask and emit a UserWarning.
medium_range_defender = profile(
    brian_roberts[(defender_dist > close_limit) & (defender_dist <= medium_limit)]
)
far_defender = profile(brian_roberts[defender_dist > medium_limit])
plt.figure(figsize=(8,5))
plt.plot(*normal_profile)
plt.plot(*close_defender)
plt.plot(*medium_range_defender)
plt.plot(*far_defender)
# Dataset-wide curve, dashed, as the reference.
plt.plot(*avg_profile, '--')
plt.ylabel("Made shots")
plt.xlabel("Distance from basket (ft.)")
plt.grid()
plt.xticks(range(0,50, 5))
plt.legend(['Brian Avg.', 'Close Defender', 'Medium Range Defender', 'Far Defender', 'Avg'])
plt.show()









    



/root/anaconda3/lib/python3.5/site-packages/ipykernel/__main__.py:6: UserWarning: Boolean Series key will be reindexed to match DataFrame index.



In [ ]:

	GAME_ID	MATCHUP	LOCATION	W	FINAL_MARGIN	SHOT_NUMBER	PERIOD	GAME_CLOCK	SHOT_CLOCK	DRIBBLES	...	SHOT_DIST	PTS_TYPE	SHOT_RESULT	CLOSEST_DEFENDER	CLOSEST_DEFENDER_PLAYER_ID	CLOSE_DEF_DIST	FGM	PTS	player_name	player_id
128064	21400006	OCT 29, 2014 - BKN @ BOS	A	L	-16	5	3	1:52	18.3	5	...	8.7	2	missed	Smart, Marcus	203935	0.8	0	0	jarrett jack	101127
128065	21400006	OCT 29, 2014 - BKN @ BOS	A	L	-16	6	4	11:28	19.8	4	...	0.6	2	made	Turner, Evan	202323	0.6	1	2	jarrett jack	101127
128066	21400006	OCT 29, 2014 - BKN @ BOS	A	L	-16	7	4	11:10	23.0	2	...	16.9	2	made	Thornton, Marcus	201977	4.2	1	2	jarrett jack	101127
128067	21400006	OCT 29, 2014 - BKN @ BOS	A	L	-16	8	4	2:37	9.1	4	...	18.3	2	missed	Bradley, Avery	202340	3.0	0	0	jarrett jack	101127
128068	21400006	OCT 29, 2014 - BKN @ BOS	A	L	-16	9	4	0:12	NaN	5	...	5.1	2	made	Bradley, Avery	202340	2.3	1	2	jarrett jack	101127

	GAME_ID	MATCHUP	LOCATION	W	FINAL_MARGIN	SHOT_NUMBER	PERIOD	GAME_CLOCK	SHOT_CLOCK	DRIBBLES	...	SHOT_DIST	PTS_TYPE	SHOT_RESULT	CLOSEST_DEFENDER	CLOSEST_DEFENDER_PLAYER_ID	CLOSE_DEF_DIST	FGM	PTS	player_name	player_id
45834	21400900	MAR 04, 2015 - CLE @ TOR	A	W	8	1	1	9:09	13.7	9	...	7.0	2	missed	Johnson, James	201949	0.8	0	0	lebron james	2544
45835	21400900	MAR 04, 2015 - CLE @ TOR	A	W	8	2	1	6:08	15.2	8	...	5.4	2	missed	Valanciunas, Jonas	202685	2.5	0	0	lebron james	2544
45836	21400900	MAR 04, 2015 - CLE @ TOR	A	W	8	3	1	4:38	12.3	0	...	23.2	3	made	Johnson, James	201949	3.5	1	3	lebron james	2544
45837	21400900	MAR 04, 2015 - CLE @ TOR	A	W	8	4	1	0:02	NaN	0	...	27.1	3	missed	Ross, Terrence	203082	3.9	0	0	lebron james	2544
45838	21400900	MAR 04, 2015 - CLE @ TOR	A	W	8	5	2	10:17	20.8	2	...	3.1	2	made	Johnson, James	201949	3.5	1	2	lebron james	2544