notebook.community

Edit and run



In [2]:

    
import pandas as pd
import numpy as np

df = pd.read_csv('data/data.csv') # read in the csv file
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46129 entries, 0 to 46128
Data columns (total 64 columns):
Unnamed: 0            46129 non-null int64
Date                  46129 non-null object
GameID                46129 non-null int64
Drive                 46129 non-null int64
qtr                   46129 non-null int64
down                  39006 non-null float64
time                  46102 non-null object
TimeUnder             46102 non-null float64
TimeSecs              46102 non-null float64
PlayTimeDiff          46075 non-null float64
SideofField           46063 non-null object
yrdln                 46021 non-null float64
yrdline100            46021 non-null float64
ydstogo               46129 non-null int64
ydsnet                46129 non-null int64
GoalToGo              46021 non-null float64
FirstDown             42811 non-null float64
posteam               42878 non-null object
DefensiveTeam         42878 non-null object
desc                  46129 non-null object
PlayAttempted         46129 non-null int64
Yards.Gained          46129 non-null int64
sp                    46129 non-null int64
Touchdown             46129 non-null int64
ExPointResult         1131 non-null object
TwoPointConv          89 non-null object
DefTwoPoint           5 non-null object
Safety                46129 non-null int64
PlayType              46129 non-null object
Passer                19398 non-null object
PassAttempt           46129 non-null int64
PassOutcome           19435 non-null object
PassLength            19291 non-null object
PassLocation          19291 non-null object
InterceptionThrown    46129 non-null int64
Interceptor           467 non-null object
Rusher                13080 non-null object
RushAttempt           46129 non-null int64
RunLocation           12969 non-null object
RunGap                9588 non-null object
Receiver              18458 non-null object
Reception             46129 non-null int64
ReturnResult          2340 non-null object
Returner              2490 non-null object
Tackler1              24903 non-null object
Tackler2              3356 non-null object
FieldGoalResult       1001 non-null object
FieldGoalDistance     989 non-null float64
Fumble                46129 non-null int64
RecFumbTeam           481 non-null object
RecFumbPlayer         481 non-null object
Sack                  46129 non-null int64
Challenge.Replay      46129 non-null int64
ChalReplayResult      413 non-null object
Accepted.Penalty      46129 non-null int64
PenalizedTeam         3535 non-null object
PenaltyType           1952 non-null object
PenalizedPlayer       3404 non-null object
Penalty.Yards         46129 non-null int64
PosTeamScore          42878 non-null float64
DefTeamScore          42878 non-null float64
ScoreDiff             42878 non-null float64
AbsScoreDiff          42878 non-null float64
Season                46129 non-null int64
dtypes: float64(13), int64(21), object(30)
memory usage: 22.5+ MB






    



//anaconda/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (26) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)



In [3]:

    
from sklearn import preprocessing



In [4]:

    
#List of attributes which aren't going to be used for analysis
columns_to_delete = ['Unnamed: 0', 'Date', 'time', 'TimeUnder', 
                     'RushAttempt', 
                     'PlayAttempted']

for col in columns_to_delete:
    if col in df:
        del df[col]



In [5]:

    
#Defining list of column names of each of the scales of variables being used.
#Interval and Ratio features are grouped together, and binary features are separated from other ordinal features
continuous_features = ['TimeSecs', 'PlayTimeDiff', 'yrdln', 'yrdline100',
                       'ydstogo', 'ydsnet', 'Yards.Gained', 'Penalty.Yards',
                       'ScoreDiff', 'AbsScoreDiff']
ordinal_features = ['Drive', 'qtr', 'down']
binary_features = ['GoalToGo', 'FirstDown','sp', 'Touchdown', 'Safety', 'Fumble']
categorical_features = df.columns.difference(continuous_features).difference(ordinal_features)



In [6]:

    
# Remove rows representing Timeouts and Two-Minute-Warnings
df = df[[play not in ["Timeout", "Two Minute Warning"] for play in df.PlayType]]



In [7]:

    
#Replace NaNs in categorical and ordinal columns with -1
df = df.replace(to_replace=np.nan,value=-1)

#Coercing the data columns to the correct types
df[continuous_features] = df[continuous_features].astype(np.float64)
df[ordinal_features] = df[ordinal_features].astype(np.int64)
df[binary_features] = df[binary_features].astype(np.int8)



In [8]:

    
#Setup seaborn
import seaborn as sns
cmap = sns.diverging_palette(220, 10, as_cmap=True) # one of the many color mappings

#Setup plotly
import plotly
plotly.offline.init_notebook_mode() # run at the start of every notebook

#Setup matplotlib
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore', DeprecationWarning)

#Embed figures in the Jupyter Notebook
%matplotlib inline

#Use GGPlot style for matplotlib
plt.style.use('ggplot')



In [9]:

    
df['PlayType'].unique()









    Out[9]:





array(['Kickoff', 'Run', 'Pass', 'Sack', 'No Play', 'Field Goal', 'Punt',
       'Quarter End', 'Extra Point', 'QB Kneel', 'Onside Kick',
       'End of Game', 'Spike'], dtype=object)



In [10]:

    
team_analysis = df[['posteam', 'PlayType', 'Yards.Gained']]
team_analysis = team_analysis[team_analysis.posteam != -1]
team_analysis = team_analysis[(team_analysis.PlayType == 'Run') | (team_analysis.PlayType == 'Pass')]



In [11]:

    
team_analysis.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 31425 entries, 1 to 46128
Data columns (total 3 columns):
posteam         31425 non-null object
PlayType        31425 non-null object
Yards.Gained    31425 non-null float64
dtypes: float64(1), object(2)
memory usage: 982.0+ KB



In [12]:

    
team_analysis['PlayType'].unique()









    Out[12]:





array(['Run', 'Pass'], dtype=object)



In [13]:

    
team_grouped = team_analysis.groupby(['posteam', 'PlayType'], sort=True)

#get the count of each playtype per team
teams_count = team_grouped.count()
#name the column 'count'
teams_count.columns = ['count']
#ungroups the dataframe
teams_count = teams_count.reset_index()

#gets the total number of plays for each team (Run + Pass)
def teamSum(x):
    return teams_count[teams_count.posteam == x.posteam]['count'].sum()

#add to teams_count as column 'totals'
totals = teams_count.apply(teamSum, axis=1)
teams_count['totals'] = totals

#finds percentage of PlayTypes for each team
def teamPercent(x):
    temp =  x['count'] / x['totals']
    return temp

#adds percentages to teams_count
teams_count['percentages'] = teams_count.apply(teamPercent, axis=1)


teams_count_perc = teams_count[['posteam', 'PlayType', 'percentages']]
teams_count_perc









    Out[13]:






  
    
      
      posteam
      PlayType
      percentages
    
  
  
    
      0
      ARI
      Pass
      0.568367
    
    
      1
      ARI
      Run
      0.431633
    
    
      2
      ATL
      Pass
      0.604854
    
    
      3
      ATL
      Run
      0.395146
    
    
      4
      BAL
      Pass
      0.638863
    
    
      5
      BAL
      Run
      0.361137
    
    
      6
      BUF
      Pass
      0.485417
    
    
      7
      BUF
      Run
      0.514583
    
    
      8
      CAR
      Pass
      0.495540
    
    
      9
      CAR
      Run
      0.504460
    
    
      10
      CHI
      Pass
      0.530303
    
    
      11
      CHI
      Run
      0.469697
    
    
      12
      CIN
      Pass
      0.526646
    
    
      13
      CIN
      Run
      0.473354
    
    
      14
      CLE
      Pass
      0.620690
    
    
      15
      CLE
      Run
      0.379310
    
    
      16
      DAL
      Pass
      0.568501
    
    
      17
      DAL
      Run
      0.431499
    
    
      18
      DEN
      Pass
      0.600198
    
    
      19
      DEN
      Run
      0.399802
    
    
      20
      DET
      Pass
      0.649231
    
    
      21
      DET
      Run
      0.350769
    
    
      22
      GB
      Pass
      0.576424
    
    
      23
      GB
      Run
      0.423576
    
    
      24
      HOU
      Pass
      0.577425
    
    
      25
      HOU
      Run
      0.422575
    
    
      26
      IND
      Pass
      0.620863
    
    
      27
      IND
      Run
      0.379137
    
    
      28
      JAC
      Pass
      0.638454
    
    
      29
      JAC
      Run
      0.361546
    
    
      ...
      ...
      ...
      ...
    
    
      34
      MIN
      Pass
      0.496175
    
    
      35
      MIN
      Run
      0.503825
    
    
      36
      NE
      Pass
      0.632056
    
    
      37
      NE
      Run
      0.367944
    
    
      38
      NO
      Pass
      0.637405
    
    
      39
      NO
      Run
      0.362595
    
    
      40
      NYG
      Pass
      0.610020
    
    
      41
      NYG
      Run
      0.389980
    
    
      42
      NYJ
      Pass
      0.580211
    
    
      43
      NYJ
      Run
      0.419789
    
    
      44
      OAK
      Pass
      0.627715
    
    
      45
      OAK
      Run
      0.372285
    
    
      46
      PHI
      Pass
      0.588458
    
    
      47
      PHI
      Run
      0.411542
    
    
      48
      PIT
      Pass
      0.618802
    
    
      49
      PIT
      Run
      0.381198
    
    
      50
      SD
      Pass
      0.634470
    
    
      51
      SD
      Run
      0.365530
    
    
      52
      SEA
      Pass
      0.504124
    
    
      53
      SEA
      Run
      0.495876
    
    
      54
      SF
      Pass
      0.582139
    
    
      55
      SF
      Run
      0.417861
    
    
      56
      STL
      Pass
      0.530864
    
    
      57
      STL
      Run
      0.469136
    
    
      58
      TB
      Pass
      0.547131
    
    
      59
      TB
      Run
      0.452869
    
    
      60
      TEN
      Pass
      0.600653
    
    
      61
      TEN
      Run
      0.399347
    
    
      62
      WAS
      Pass
      0.572308
    
    
      63
      WAS
      Run
      0.427692
    
  

64 rows × 3 columns



In [14]:

    
#this is to format the data to make plotting easier

trp = teams_count_perc[teams_count_perc.PlayType == 'Run']
tpp = teams_count_perc[teams_count_perc.PlayType == 'Pass']

trp =trp.set_index('posteam')
tpp =tpp.set_index('posteam')

pass_run = pd.concat([trp, tpp], axis=1)
pass_run









    Out[14]:






  
    
      
      PlayType
      percentages
      PlayType
      percentages
    
    
      posteam
      
      
      
      
    
  
  
    
      ARI
      Run
      0.431633
      Pass
      0.568367
    
    
      ATL
      Run
      0.395146
      Pass
      0.604854
    
    
      BAL
      Run
      0.361137
      Pass
      0.638863
    
    
      BUF
      Run
      0.514583
      Pass
      0.485417
    
    
      CAR
      Run
      0.504460
      Pass
      0.495540
    
    
      CHI
      Run
      0.469697
      Pass
      0.530303
    
    
      CIN
      Run
      0.473354
      Pass
      0.526646
    
    
      CLE
      Run
      0.379310
      Pass
      0.620690
    
    
      DAL
      Run
      0.431499
      Pass
      0.568501
    
    
      DEN
      Run
      0.399802
      Pass
      0.600198
    
    
      DET
      Run
      0.350769
      Pass
      0.649231
    
    
      GB
      Run
      0.423576
      Pass
      0.576424
    
    
      HOU
      Run
      0.422575
      Pass
      0.577425
    
    
      IND
      Run
      0.379137
      Pass
      0.620863
    
    
      JAC
      Run
      0.361546
      Pass
      0.638454
    
    
      KC
      Run
      0.470523
      Pass
      0.529477
    
    
      MIA
      Run
      0.361171
      Pass
      0.638829
    
    
      MIN
      Run
      0.503825
      Pass
      0.496175
    
    
      NE
      Run
      0.367944
      Pass
      0.632056
    
    
      NO
      Run
      0.362595
      Pass
      0.637405
    
    
      NYG
      Run
      0.389980
      Pass
      0.610020
    
    
      NYJ
      Run
      0.419789
      Pass
      0.580211
    
    
      OAK
      Run
      0.372285
      Pass
      0.627715
    
    
      PHI
      Run
      0.411542
      Pass
      0.588458
    
    
      PIT
      Run
      0.381198
      Pass
      0.618802
    
    
      SD
      Run
      0.365530
      Pass
      0.634470
    
    
      SEA
      Run
      0.495876
      Pass
      0.504124
    
    
      SF
      Run
      0.417861
      Pass
      0.582139
    
    
      STL
      Run
      0.469136
      Pass
      0.530864
    
    
      TB
      Run
      0.452869
      Pass
      0.547131
    
    
      TEN
      Run
      0.399347
      Pass
      0.600653
    
    
      WAS
      Run
      0.427692
      Pass
      0.572308



In [15]:

    
#plot playtype percentage per team

pass_run.columns = ['PlayType1', 'RunPercentage', 'PlayType2', 'PassPercentage']
pass_run = pass_run.sort_values(by = 'RunPercentage')
pass_run[['RunPercentage', 'PassPercentage']].plot(kind = 'bar', stacked=True)









    Out[15]:





<matplotlib.axes._subplots.AxesSubplot at 0x11a212b38>



In [16]:

    
team_yards = team_grouped.sum().reset_index()
team_yards['totals'] = totals

def teamAvgYards(x):
    temp =  x['Yards.Gained'] / x['totals']
    return temp

team_yards['AvgYards'] = team_yards.apply(teamAvgYards, axis=1)
team_yards = team_yards[['posteam', 'PlayType', 'AvgYards']]

tyr = team_yards[team_yards.PlayType == 'Run']
typ = team_yards[team_yards.PlayType == 'Pass']

tyr =tyr.set_index('posteam').sort_values(by = 'AvgYards')
typ =typ.set_index('posteam').sort_values(by = 'AvgYards')



In [17]:

    
tyr.plot(kind='bar')









    Out[17]:





<matplotlib.axes._subplots.AxesSubplot at 0x11a6ef5c0>



In [18]:

    
typ.plot(kind='bar')









    Out[18]:





<matplotlib.axes._subplots.AxesSubplot at 0x11c596198>



In [19]:

    
team_avg_yards = pd.concat([tyr, typ], axis=1)
team_avg_yards.columns = ['a', 'AvgRunYards', 'b', 'AvgPassYards']
team_avg_yards = team_avg_yards[['AvgRunYards', 'AvgPassYards']]
team_avg_yards.sort_values(by = 'AvgPassYards', inplace=True)
team_avg_yards.plot(kind='bar')









    Out[19]:





<matplotlib.axes._subplots.AxesSubplot at 0x11a6ef400>



In [20]:

    
team_avg_yards.sort_values(by = 'AvgRunYards', inplace=True)
team_avg_yards.plot(kind='bar')









    Out[20]:





<matplotlib.axes._subplots.AxesSubplot at 0x11c9d9c88>



In [ ]:

	posteam	PlayType	percentages
0	ARI	Pass	0.568367
1	ARI	Run	0.431633
2	ATL	Pass	0.604854
3	ATL	Run	0.395146
4	BAL	Pass	0.638863
5	BAL	Run	0.361137
6	BUF	Pass	0.485417
7	BUF	Run	0.514583
8	CAR	Pass	0.495540
9	CAR	Run	0.504460
10	CHI	Pass	0.530303
11	CHI	Run	0.469697
12	CIN	Pass	0.526646
13	CIN	Run	0.473354
14	CLE	Pass	0.620690
15	CLE	Run	0.379310
16	DAL	Pass	0.568501
17	DAL	Run	0.431499
18	DEN	Pass	0.600198
19	DEN	Run	0.399802
20	DET	Pass	0.649231
21	DET	Run	0.350769
22	GB	Pass	0.576424
23	GB	Run	0.423576
24	HOU	Pass	0.577425
25	HOU	Run	0.422575
26	IND	Pass	0.620863
27	IND	Run	0.379137
28	JAC	Pass	0.638454
29	JAC	Run	0.361546
...	...	...	...
34	MIN	Pass	0.496175
35	MIN	Run	0.503825
36	NE	Pass	0.632056
37	NE	Run	0.367944
38	NO	Pass	0.637405
39	NO	Run	0.362595
40	NYG	Pass	0.610020
41	NYG	Run	0.389980
42	NYJ	Pass	0.580211
43	NYJ	Run	0.419789
44	OAK	Pass	0.627715
45	OAK	Run	0.372285
46	PHI	Pass	0.588458
47	PHI	Run	0.411542
48	PIT	Pass	0.618802
49	PIT	Run	0.381198
50	SD	Pass	0.634470
51	SD	Run	0.365530
52	SEA	Pass	0.504124
53	SEA	Run	0.495876
54	SF	Pass	0.582139
55	SF	Run	0.417861
56	STL	Pass	0.530864
57	STL	Run	0.469136
58	TB	Pass	0.547131
59	TB	Run	0.452869
60	TEN	Pass	0.600653
61	TEN	Run	0.399347
62	WAS	Pass	0.572308
63	WAS	Run	0.427692

	PlayType	percentages	PlayType	percentages
posteam
ARI	Run	0.431633	Pass	0.568367
ATL	Run	0.395146	Pass	0.604854
BAL	Run	0.361137	Pass	0.638863
BUF	Run	0.514583	Pass	0.485417
CAR	Run	0.504460	Pass	0.495540
CHI	Run	0.469697	Pass	0.530303
CIN	Run	0.473354	Pass	0.526646
CLE	Run	0.379310	Pass	0.620690
DAL	Run	0.431499	Pass	0.568501
DEN	Run	0.399802	Pass	0.600198
DET	Run	0.350769	Pass	0.649231
GB	Run	0.423576	Pass	0.576424
HOU	Run	0.422575	Pass	0.577425
IND	Run	0.379137	Pass	0.620863
JAC	Run	0.361546	Pass	0.638454
KC	Run	0.470523	Pass	0.529477
MIA	Run	0.361171	Pass	0.638829
MIN	Run	0.503825	Pass	0.496175
NE	Run	0.367944	Pass	0.632056
NO	Run	0.362595	Pass	0.637405
NYG	Run	0.389980	Pass	0.610020
NYJ	Run	0.419789	Pass	0.580211
OAK	Run	0.372285	Pass	0.627715
PHI	Run	0.411542	Pass	0.588458
PIT	Run	0.381198	Pass	0.618802
SD	Run	0.365530	Pass	0.634470
SEA	Run	0.495876	Pass	0.504124
SF	Run	0.417861	Pass	0.582139
STL	Run	0.469136	Pass	0.530864
TB	Run	0.452869	Pass	0.547131
TEN	Run	0.399347	Pass	0.600653
WAS	Run	0.427692	Pass	0.572308