In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('data/data.csv') # read in the csv file
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46129 entries, 0 to 46128
Data columns (total 64 columns):
Unnamed: 0            46129 non-null int64
Date                  46129 non-null object
GameID                46129 non-null int64
Drive                 46129 non-null int64
qtr                   46129 non-null int64
down                  39006 non-null float64
time                  46102 non-null object
TimeUnder             46102 non-null float64
TimeSecs              46102 non-null float64
PlayTimeDiff          46075 non-null float64
SideofField           46063 non-null object
yrdln                 46021 non-null float64
yrdline100            46021 non-null float64
ydstogo               46129 non-null int64
ydsnet                46129 non-null int64
GoalToGo              46021 non-null float64
FirstDown             42811 non-null float64
posteam               42878 non-null object
DefensiveTeam         42878 non-null object
desc                  46129 non-null object
PlayAttempted         46129 non-null int64
Yards.Gained          46129 non-null int64
sp                    46129 non-null int64
Touchdown             46129 non-null int64
ExPointResult         1131 non-null object
TwoPointConv          89 non-null object
DefTwoPoint           5 non-null object
Safety                46129 non-null int64
PlayType              46129 non-null object
Passer                19398 non-null object
PassAttempt           46129 non-null int64
PassOutcome           19435 non-null object
PassLength            19291 non-null object
PassLocation          19291 non-null object
InterceptionThrown    46129 non-null int64
Interceptor           467 non-null object
Rusher                13080 non-null object
RushAttempt           46129 non-null int64
RunLocation           12969 non-null object
RunGap                9588 non-null object
Receiver              18458 non-null object
Reception             46129 non-null int64
ReturnResult          2340 non-null object
Returner              2490 non-null object
Tackler1              24903 non-null object
Tackler2              3356 non-null object
FieldGoalResult       1001 non-null object
FieldGoalDistance     989 non-null float64
Fumble                46129 non-null int64
RecFumbTeam           481 non-null object
RecFumbPlayer         481 non-null object
Sack                  46129 non-null int64
Challenge.Replay      46129 non-null int64
ChalReplayResult      413 non-null object
Accepted.Penalty      46129 non-null int64
PenalizedTeam         3535 non-null object
PenaltyType           1952 non-null object
PenalizedPlayer       3404 non-null object
Penalty.Yards         46129 non-null int64
PosTeamScore          42878 non-null float64
DefTeamScore          42878 non-null float64
ScoreDiff             42878 non-null float64
AbsScoreDiff          42878 non-null float64
Season                46129 non-null int64
dtypes: float64(13), int64(21), object(30)
memory usage: 22.5+ MB
//anaconda/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (26) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [3]:
from sklearn import preprocessing

In [4]:
#List of attributes which aren't going to be used for analysis
columns_to_delete = ['Unnamed: 0', 'Date', 'time', 'TimeUnder', 
                     'RushAttempt', 
                     'PlayAttempted']

for col in columns_to_delete:
    if col in df:
        del df[col]

In [5]:
#Defining list of column names of each of the scales of variables being used.
#Interval and Ratio features are grouped together, and binary features are separated from other ordinal features
continuous_features = ['TimeSecs', 'PlayTimeDiff', 'yrdln', 'yrdline100',
                       'ydstogo', 'ydsnet', 'Yards.Gained', 'Penalty.Yards',
                       'ScoreDiff', 'AbsScoreDiff']
ordinal_features = ['Drive', 'qtr', 'down']
binary_features = ['GoalToGo', 'FirstDown','sp', 'Touchdown', 'Safety', 'Fumble']
categorical_features = df.columns.difference(continuous_features).difference(ordinal_features)

In [6]:
# Remove rows representing Timeouts and Two-Minute-Warnings
df = df[[play not in ["Timeout", "Two Minute Warning"] for play in df.PlayType]]

In [7]:
#Replace NaNs in categorical and ordinal columns with -1
df = df.replace(to_replace=np.nan,value=-1)

#Coercing the data columns to the correct types
df[continuous_features] = df[continuous_features].astype(np.float64)
df[ordinal_features] = df[ordinal_features].astype(np.int64)
df[binary_features] = df[binary_features].astype(np.int8)

In [8]:
#Setup seaborn
import seaborn as sns
cmap = sns.diverging_palette(220, 10, as_cmap=True) # one of the many color mappings

#Setup plotly
import plotly
plotly.offline.init_notebook_mode() # run at the start of every notebook

#Setup matplotlib
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore', DeprecationWarning)

#Embed figures in the Jupyter Notebook
%matplotlib inline

#Use GGPlot style for matplotlib
plt.style.use('ggplot')



In [9]:
df['PlayType'].unique()


Out[9]:
array(['Kickoff', 'Run', 'Pass', 'Sack', 'No Play', 'Field Goal', 'Punt',
       'Quarter End', 'Extra Point', 'QB Kneel', 'Onside Kick',
       'End of Game', 'Spike'], dtype=object)

In [10]:
team_analysis = df[['posteam', 'PlayType', 'Yards.Gained']]
team_analysis = team_analysis[team_analysis.posteam != -1]
team_analysis = team_analysis[(team_analysis.PlayType == 'Run') | (team_analysis.PlayType == 'Pass')]

In [11]:
team_analysis.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 31425 entries, 1 to 46128
Data columns (total 3 columns):
posteam         31425 non-null object
PlayType        31425 non-null object
Yards.Gained    31425 non-null float64
dtypes: float64(1), object(2)
memory usage: 982.0+ KB

In [12]:
team_analysis['PlayType'].unique()


Out[12]:
array(['Run', 'Pass'], dtype=object)

In [13]:
team_grouped = team_analysis.groupby(['posteam', 'PlayType'], sort=True)

#get the count of each playtype per team
teams_count = team_grouped.count()
#name the column 'count'
teams_count.columns = ['count']
#ungroups the dataframe
teams_count = teams_count.reset_index()

#gets the total number of plays for each team (Run + Pass)
def teamSum(x):
    return teams_count[teams_count.posteam == x.posteam]['count'].sum()

#add to teams_count as column 'totals'
totals = teams_count.apply(teamSum, axis=1)
teams_count['totals'] = totals

#finds percentage of PlayTypes for each team
def teamPercent(x):
    temp =  x['count'] / x['totals']
    return temp

#adds percentages to teams_count
teams_count['percentages'] = teams_count.apply(teamPercent, axis=1)


teams_count_perc = teams_count[['posteam', 'PlayType', 'percentages']]
teams_count_perc


Out[13]:
posteam PlayType percentages
0 ARI Pass 0.568367
1 ARI Run 0.431633
2 ATL Pass 0.604854
3 ATL Run 0.395146
4 BAL Pass 0.638863
5 BAL Run 0.361137
6 BUF Pass 0.485417
7 BUF Run 0.514583
8 CAR Pass 0.495540
9 CAR Run 0.504460
10 CHI Pass 0.530303
11 CHI Run 0.469697
12 CIN Pass 0.526646
13 CIN Run 0.473354
14 CLE Pass 0.620690
15 CLE Run 0.379310
16 DAL Pass 0.568501
17 DAL Run 0.431499
18 DEN Pass 0.600198
19 DEN Run 0.399802
20 DET Pass 0.649231
21 DET Run 0.350769
22 GB Pass 0.576424
23 GB Run 0.423576
24 HOU Pass 0.577425
25 HOU Run 0.422575
26 IND Pass 0.620863
27 IND Run 0.379137
28 JAC Pass 0.638454
29 JAC Run 0.361546
... ... ... ...
34 MIN Pass 0.496175
35 MIN Run 0.503825
36 NE Pass 0.632056
37 NE Run 0.367944
38 NO Pass 0.637405
39 NO Run 0.362595
40 NYG Pass 0.610020
41 NYG Run 0.389980
42 NYJ Pass 0.580211
43 NYJ Run 0.419789
44 OAK Pass 0.627715
45 OAK Run 0.372285
46 PHI Pass 0.588458
47 PHI Run 0.411542
48 PIT Pass 0.618802
49 PIT Run 0.381198
50 SD Pass 0.634470
51 SD Run 0.365530
52 SEA Pass 0.504124
53 SEA Run 0.495876
54 SF Pass 0.582139
55 SF Run 0.417861
56 STL Pass 0.530864
57 STL Run 0.469136
58 TB Pass 0.547131
59 TB Run 0.452869
60 TEN Pass 0.600653
61 TEN Run 0.399347
62 WAS Pass 0.572308
63 WAS Run 0.427692

64 rows × 3 columns


In [14]:
#this is to format the data to make plotting easier

trp = teams_count_perc[teams_count_perc.PlayType == 'Run']
tpp = teams_count_perc[teams_count_perc.PlayType == 'Pass']

trp =trp.set_index('posteam')
tpp =tpp.set_index('posteam')

pass_run = pd.concat([trp, tpp], axis=1)
pass_run


Out[14]:
PlayType percentages PlayType percentages
posteam
ARI Run 0.431633 Pass 0.568367
ATL Run 0.395146 Pass 0.604854
BAL Run 0.361137 Pass 0.638863
BUF Run 0.514583 Pass 0.485417
CAR Run 0.504460 Pass 0.495540
CHI Run 0.469697 Pass 0.530303
CIN Run 0.473354 Pass 0.526646
CLE Run 0.379310 Pass 0.620690
DAL Run 0.431499 Pass 0.568501
DEN Run 0.399802 Pass 0.600198
DET Run 0.350769 Pass 0.649231
GB Run 0.423576 Pass 0.576424
HOU Run 0.422575 Pass 0.577425
IND Run 0.379137 Pass 0.620863
JAC Run 0.361546 Pass 0.638454
KC Run 0.470523 Pass 0.529477
MIA Run 0.361171 Pass 0.638829
MIN Run 0.503825 Pass 0.496175
NE Run 0.367944 Pass 0.632056
NO Run 0.362595 Pass 0.637405
NYG Run 0.389980 Pass 0.610020
NYJ Run 0.419789 Pass 0.580211
OAK Run 0.372285 Pass 0.627715
PHI Run 0.411542 Pass 0.588458
PIT Run 0.381198 Pass 0.618802
SD Run 0.365530 Pass 0.634470
SEA Run 0.495876 Pass 0.504124
SF Run 0.417861 Pass 0.582139
STL Run 0.469136 Pass 0.530864
TB Run 0.452869 Pass 0.547131
TEN Run 0.399347 Pass 0.600653
WAS Run 0.427692 Pass 0.572308

In [15]:
#plot playtype percentage per team

pass_run.columns = ['PlayType1', 'RunPercentage', 'PlayType2', 'PassPercentage']
pass_run = pass_run.sort_values(by = 'RunPercentage')
pass_run[['RunPercentage', 'PassPercentage']].plot(kind = 'bar', stacked=True)


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a212b38>

In [16]:
team_yards = team_grouped.sum().reset_index()
team_yards['totals'] = totals

def teamAvgYards(x):
    temp =  x['Yards.Gained'] / x['totals']
    return temp

team_yards['AvgYards'] = team_yards.apply(teamAvgYards, axis=1)
team_yards = team_yards[['posteam', 'PlayType', 'AvgYards']]

tyr = team_yards[team_yards.PlayType == 'Run']
typ = team_yards[team_yards.PlayType == 'Pass']

tyr =tyr.set_index('posteam').sort_values(by = 'AvgYards')
typ =typ.set_index('posteam').sort_values(by = 'AvgYards')

In [17]:
tyr.plot(kind='bar')


Out[17]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a6ef5c0>

In [18]:
typ.plot(kind='bar')


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x11c596198>

In [19]:
team_avg_yards = pd.concat([tyr, typ], axis=1)
team_avg_yards.columns = ['a', 'AvgRunYards', 'b', 'AvgPassYards']
team_avg_yards = team_avg_yards[['AvgRunYards', 'AvgPassYards']]
team_avg_yards.sort_values(by = 'AvgPassYards', inplace=True)
team_avg_yards.plot(kind='bar')


Out[19]:
<matplotlib.axes._subplots.AxesSubplot at 0x11a6ef400>

In [20]:
team_avg_yards.sort_values(by = 'AvgRunYards', inplace=True)
team_avg_yards.plot(kind='bar')


Out[20]:
<matplotlib.axes._subplots.AxesSubplot at 0x11c9d9c88>

In [ ]: