In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

df = pd.read_csv('data/data.csv') # read in the csv file
df.info()

#List of attributes which aren't going to be used for analysis
columns_to_delete = ['Unnamed: 0', 'Date', 'time', 'TimeUnder', 
                     'RushAttempt', 
                     'PlayAttempted']

for col in columns_to_delete:
    if col in df:
        del df[col]
        
#Defining list of column names of each of the scales of variables being used.
#Interval and Ratio features are grouped together, and binary features are separated from other ordinal features
continuous_features = ['TimeSecs', 'PlayTimeDiff', 'yrdln', 'yrdline100',
                       'ydstogo', 'ydsnet', 'Yards.Gained', 'Penalty.Yards',
                       'ScoreDiff', 'AbsScoreDiff']
ordinal_features = ['Drive', 'qtr', 'down']
binary_features = ['GoalToGo', 'FirstDown','sp', 'Touchdown', 'Safety', 'Fumble']
categorical_features = df.columns.difference(continuous_features).difference(ordinal_features)

# Remove rows representing Timeouts and Two-Minute-Warnings
df = df[[play not in ["Timeout", "Two Minute Warning"] for play in df.PlayType]]

# Remove rows representing Timeouts and Two-Minute-Warnings
df = df[[play not in ["Timeout", "Two Minute Warning"] for play in df.PlayType]]

#Setup seaborn
import seaborn as sns
cmap = sns.diverging_palette(220, 10, as_cmap=True) # one of the many color mappings

#Setup plotly
import plotly
plotly.offline.init_notebook_mode() # run at the start of every notebook

#Setup matplotlib
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore', DeprecationWarning)

#Embed figures in the Jupyter Notebook
%matplotlib inline

#Use GGPlot style for matplotlib
plt.style.use('ggplot')


//anaconda/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (26) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46129 entries, 0 to 46128
Data columns (total 64 columns):
Unnamed: 0            46129 non-null int64
Date                  46129 non-null object
GameID                46129 non-null int64
Drive                 46129 non-null int64
qtr                   46129 non-null int64
down                  39006 non-null float64
time                  46102 non-null object
TimeUnder             46102 non-null float64
TimeSecs              46102 non-null float64
PlayTimeDiff          46075 non-null float64
SideofField           46063 non-null object
yrdln                 46021 non-null float64
yrdline100            46021 non-null float64
ydstogo               46129 non-null int64
ydsnet                46129 non-null int64
GoalToGo              46021 non-null float64
FirstDown             42811 non-null float64
posteam               42878 non-null object
DefensiveTeam         42878 non-null object
desc                  46129 non-null object
PlayAttempted         46129 non-null int64
Yards.Gained          46129 non-null int64
sp                    46129 non-null int64
Touchdown             46129 non-null int64
ExPointResult         1131 non-null object
TwoPointConv          89 non-null object
DefTwoPoint           5 non-null object
Safety                46129 non-null int64
PlayType              46129 non-null object
Passer                19398 non-null object
PassAttempt           46129 non-null int64
PassOutcome           19435 non-null object
PassLength            19291 non-null object
PassLocation          19291 non-null object
InterceptionThrown    46129 non-null int64
Interceptor           467 non-null object
Rusher                13080 non-null object
RushAttempt           46129 non-null int64
RunLocation           12969 non-null object
RunGap                9588 non-null object
Receiver              18458 non-null object
Reception             46129 non-null int64
ReturnResult          2340 non-null object
Returner              2490 non-null object
Tackler1              24903 non-null object
Tackler2              3356 non-null object
FieldGoalResult       1001 non-null object
FieldGoalDistance     989 non-null float64
Fumble                46129 non-null int64
RecFumbTeam           481 non-null object
RecFumbPlayer         481 non-null object
Sack                  46129 non-null int64
Challenge.Replay      46129 non-null int64
ChalReplayResult      413 non-null object
Accepted.Penalty      46129 non-null int64
PenalizedTeam         3535 non-null object
PenaltyType           1952 non-null object
PenalizedPlayer       3404 non-null object
Penalty.Yards         46129 non-null int64
PosTeamScore          42878 non-null float64
DefTeamScore          42878 non-null float64
ScoreDiff             42878 non-null float64
AbsScoreDiff          42878 non-null float64
Season                46129 non-null int64
dtypes: float64(13), int64(21), object(30)
memory usage: 22.5+ MB

In [2]:
df_playtype = df[['qtr', 'down', 'TimeSecs', 'posteam', 'DefensiveTeam', 'PlayType', 'ScoreDiff']]
df_playtype = df_playtype.dropna(subset=['down']) 
df_playtype = df_playtype[df_playtype.PlayType != 'No Play']
df_playtype = df_playtype[df_playtype.PlayType != 'QB Kneel']

In [3]:
playtype_corr = df_playtype.corr()
sns.heatmap(playtype_corr)


Out[3]:
<matplotlib.axes._subplots.AxesSubplot at 0x10633f908>

In [4]:
dfp_sample = df_playtype[['down', 'PlayType']].sample(20000)
dfp_sample_reset = dfp_sample.groupby(['down', 'PlayType']).size().reset_index()

In [21]:
a_ = dfp_sample_reset[dfp_sample_reset.PlayType == 'Field Goal']
b_ = dfp_sample_reset[dfp_sample_reset.PlayType == 'Pass']
c_ = dfp_sample_reset[dfp_sample_reset.PlayType == 'Run']
d_ = dfp_sample_reset[dfp_sample_reset.PlayType == 'Sack']
e_ = dfp_sample_reset[dfp_sample_reset.PlayType == 'Spike']
f_ = dfp_sample_reset[dfp_sample_reset.PlayType == 'Punt']

a_ = a_[['down', 0]]
a_.columns = ['down', 'Field Goal']

b_ = b_[['down', 0]]
b_.columns = ['down', 'Pass']

c_ = c_[['down', 0]]
c_.columns = ['down', 'Run']

d_ = d_[['down', 0]]
d_.columns = ['down', 'Sack']

e_ = e_[['down', 0]]
e_.columns = ['down', 'Spike']

f_ = f_[['down', 0]]
f_.columns = ['down', 'Punt']

full = pd.concat([a_.set_index('down'), b_.set_index('down'), c_.set_index('down'), 
          d_.set_index('down'),e_.set_index('down'),f_.set_index('down')], axis = 1)


Out[21]:
down Punt
17 4.0 1343

In [6]:
full.plot(kind='bar', stacked= True)


Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x10636d0b8>

In [39]:
#Isolate releant attributes
df_playtype = df[['qtr', 'down', 'TimeSecs', 'PlayType']]

#Group and then re-flatten
dfp_sample = df_playtype[['down', 'PlayType']].sample(20000)
dfp_sample_reset = dfp_sample.groupby(['down', 'PlayType']).size().reset_index()

#Data transformation to get it to a plot-able form
playtypes = ['Field Goal', 'Pass', 'Run', 'Sack', 'Spike', 'Punt']
dfp_list = [dfp_sample_reset[dfp_sample_reset.PlayType == x] for x in playtypes]
for i, x in enumerate(dfp_list):
    dfp_list[i] = dfp_list[i][['down',0]]
    dfp_list[i].columns = ['down', playtypes[i]]

#Concatenate the list into one data frame and then plot it
pd.concat([x.set_index('down') for x in dfp_list], axis=1).plot(kind='bar', stacked=True)


Out[39]:
<matplotlib.axes._subplots.AxesSubplot at 0x1523d3eb8>

In [ ]: