In [2]:
import pandas as pd
import numpy as np
df = pd.read_csv('data/data.csv') # load the raw dataset (path is relative to the working directory)
df.info() # print column names, dtypes and non-null counts
In [3]:
from sklearn import preprocessing
In [4]:
#List of attributes which aren't going to be used for analysis
columns_to_delete = ['Unnamed: 0', 'Date', 'time', 'TimeUnder',
                     'RushAttempt',
                     'PlayAttempted']
#Drop them in one call; errors='ignore' skips any name that is not a column,
#so the cell still works when it is re-run after the columns are already gone
df = df.drop(columns=columns_to_delete, errors='ignore')
In [5]:
#Column names grouped by the measurement scale of each variable.
#Interval and ratio features share one list; binary features are listed
#separately from the remaining ordinal features.
continuous_features = [
    'TimeSecs', 'PlayTimeDiff', 'yrdln', 'yrdline100',
    'ydstogo', 'ydsnet', 'Yards.Gained', 'Penalty.Yards',
    'ScoreDiff', 'AbsScoreDiff',
]
ordinal_features = ['Drive', 'qtr', 'down']
binary_features = ['GoalToGo', 'FirstDown', 'sp', 'Touchdown', 'Safety', 'Fumble']
#Every column that is neither continuous nor ordinal
#(the binary columns are not excluded here, so they remain in this index)
categorical_features = df.columns.difference(continuous_features + ordinal_features)
In [6]:
# Remove rows representing Timeouts and Two-Minute-Warnings.
# isin() builds the boolean mask in one vectorized pass; rows whose PlayType
# is missing are kept, as they are not in the exclusion list.
df = df[~df['PlayType'].isin(["Timeout", "Two Minute Warning"])]
In [7]:
#Replace NaNs with -1. replace() is applied to the whole frame, so NaNs in the
#continuous columns become -1 as well, not only those in categorical and ordinal columns.
#NOTE(review): the -1 placeholders then take part in later arithmetic on continuous
#columns (e.g. sums of Yards.Gained) - confirm this is intended.
df = df.replace(to_replace=np.nan,value=-1)
#Coercing the data columns to the correct types
#(the integer casts below rely on the NaNs having been replaced above)
df[continuous_features] = df[continuous_features].astype(np.float64)
df[ordinal_features] = df[ordinal_features].astype(np.int64)
df[binary_features] = df[binary_features].astype(np.int8)
In [8]:
#Setup seaborn
import seaborn as sns
cmap = sns.diverging_palette(220, 10, as_cmap=True) # diverging palette returned as a colormap object; not referenced in the cells visible here
#Setup plotly
import plotly
plotly.offline.init_notebook_mode() # enables offline plotly rendering in the notebook; run at the start of every notebook
#Setup matplotlib
import matplotlib.pyplot as plt
import warnings
#Silence DeprecationWarning messages so they do not clutter cell output
warnings.simplefilter('ignore', DeprecationWarning)
#Embed figures in the Jupyter Notebook
%matplotlib inline
#Use GGPlot style for matplotlib
plt.style.use('ggplot')
In [9]:
#Distinct play types still present after the cleaning steps
df.PlayType.unique()
Out[9]:
In [10]:
#Keep only Run and Pass plays that have a possessing team (-1 marks a missing posteam),
#restricted to the three columns used for the per-team analysis
has_posteam = df['posteam'] != -1
run_or_pass = df['PlayType'].isin(['Run', 'Pass'])
team_analysis = df.loc[has_posteam & run_or_pass, ['posteam', 'PlayType', 'Yards.Gained']]
In [11]:
team_analysis.info() # row count and dtypes of the filtered Run/Pass subset
In [12]:
#Check which play types remain in the filtered subset
team_analysis.PlayType.unique()
Out[12]:
In [13]:
#Group plays by possessing team and play type; later cells reuse this grouping
team_grouped = team_analysis.groupby(['posteam', 'PlayType'], sort=True)

#Count of each play type per team. count() tallies the non-null values of the
#only non-key column (Yards.Gained), which is then renamed to 'count'.
teams_count = team_grouped.count()
teams_count.columns = ['count']
#Turn the (posteam, PlayType) group keys back into ordinary columns
teams_count = teams_count.reset_index()

#Total number of plays for each team (Run + Pass), broadcast onto every row of
#that team. transform('sum') computes this in one grouped pass instead of
#re-filtering the whole frame once per row.
totals = teams_count.groupby('posteam')['count'].transform('sum')
teams_count['totals'] = totals

#Share of each play type within a team's plays
teams_count['percentages'] = teams_count['count'] / teams_count['totals']
teams_count_perc = teams_count[['posteam', 'PlayType', 'percentages']]
teams_count_perc
Out[13]:
In [14]:
#Reshape for plotting: one team-indexed frame per play type, placed side by side
is_run = teams_count_perc['PlayType'] == 'Run'
is_pass = teams_count_perc['PlayType'] == 'Pass'
trp = teams_count_perc.loc[is_run].set_index('posteam')
tpp = teams_count_perc.loc[is_pass].set_index('posteam')
#Columns come out as the Run pair (PlayType, percentages) followed by the Pass pair
pass_run = pd.concat([trp, tpp], axis=1)
pass_run
Out[14]:
In [15]:
#Plot the run/pass split per team as stacked bars, ordered by run share.
#The four concatenated columns are renamed positionally: Run pair first, Pass pair second.
pass_run.columns = ['PlayType1', 'RunPercentage', 'PlayType2', 'PassPercentage']
pass_run = pass_run.sort_values('RunPercentage')
pass_run.loc[:, ['RunPercentage', 'PassPercentage']].plot.bar(stacked=True)
Out[15]:
In [16]:
#Average yards gained per play for each (team, play type) pair.
#The mean is taken over that play type's own plays, so a team's run average is
#not scaled by how many passes it attempted (and vice versa). Dividing by the
#team's combined Run + Pass play count would mix play-calling frequency into
#the figure instead of giving a per-play average.
team_yards = team_grouped['Yards.Gained'].mean().reset_index(name='AvgYards')

#One team-indexed frame per play type, ordered by average yards for plotting
tyr = team_yards[team_yards.PlayType == 'Run'].set_index('posteam').sort_values(by='AvgYards')
typ = team_yards[team_yards.PlayType == 'Pass'].set_index('posteam').sort_values(by='AvgYards')
In [17]:
#Bar chart of average run yards per team
tyr.plot.bar()
Out[17]:
In [18]:
#Bar chart of average pass yards per team
typ.plot.bar()
Out[18]:
In [19]:
#Put each team's run and pass averages side by side, aligned on the team index.
#Naming the two Series up front avoids a positional rename with throwaway column names.
team_avg_yards = pd.concat(
    [tyr['AvgYards'].rename('AvgRunYards'), typ['AvgYards'].rename('AvgPassYards')],
    axis=1,
)
#Rebind the sorted result rather than sorting in place: an in-place sort on a
#frame derived by column selection can raise SettingWithCopyWarning
team_avg_yards = team_avg_yards.sort_values(by='AvgPassYards')
team_avg_yards.plot(kind='bar')
Out[19]:
In [20]:
#Same comparison, now ordered by average run yards. The sorted frame is rebound
#instead of mutated in place, which avoids SettingWithCopyWarning on a derived frame.
team_avg_yards = team_avg_yards.sort_values(by='AvgRunYards')
team_avg_yards.plot(kind='bar')
Out[20]:
In [ ]: