In [2]:
# Silence all warnings for a cleaner presentation.
# NOTE(review): a blanket filter also hides deprecation warnings — consider
# narrowing to specific categories.
import warnings
warnings.filterwarnings('ignore')
In [3]:
# %pylab star-imports numpy and matplotlib into the namespace; later cells
# rely on the injected names (`sort`, `isnan`, `pylab`, `plt`, `np`).
# NOTE(review): %pylab is discouraged in favor of explicit imports plus
# %matplotlib inline, but removing it would break those bare names below.
%pylab inline
import pandas as pd
import seaborn as sns
In [4]:
# Load the raw fantasy-football data; also keep a subset with week 1 removed
# (a later cell notes week 1 has no previous-weeks' mean features).
# NOTE(review): hardcoded absolute path — breaks on any other machine;
# consider a configurable data directory.
raw_df = pd.read_csv("/home/brianb/Downloads/odsc_football_modeling_data_2.csv")
df_no_week_1 = raw_df[raw_df.week > 1]
In [5]:
# List the available feature columns in alphabetical order.
ff_cols = raw_df.columns
# Fix: qualify as np.sort — the bare `sort` only exists because %pylab
# star-imported numpy; the explicit form survives removing %pylab.
np.sort(ff_cols.values)
Out[5]:
In [6]:
# Peek at the first rows of every feature column.
raw_df[ff_cols].head()
Out[6]:
In [7]:
# Same preview, but with week 1 excluded.
df_no_week_1[ff_cols].head()
Out[7]:
Pandas is a Python library you may have heard of that is great for exploring data interactively. Let's look at what we already did with Pandas and what else we can do!
In [9]:
raw_df = pd.read_csv("/home/brianb/Downloads/odsc_football_modeling_data_2.csv")
It reads a CSV file into a DataFrame.
DataFrames are one of the most widely used data structures in data science.
Popularized by R, they provide a standardized matrix-style format for interacting with your data. Most data can fit into this row and column format: financial transactions, iPhone app user records, medical histories, etc.
In [10]:
# Select a single column (returns a Series).
raw_df['full_name'].head()
Out[10]:
In [11]:
# Select multiple columns with a list of names (returns a DataFrame).
raw_df[['full_name', 'position', 'team']].head()
Out[11]:
In [13]:
# Boolean-mask filtering: keep only rows after week 1.
raw_df[raw_df.week > 1].head()
Out[13]:
We can just output the entire dataframe to the console, but that doesn't scale beyond a couple hundred rows.
In [1]: df = DataFrame(np.random.randn(10, 4))
In [2]: df
Out[2]:
0 1 2 3
0 0.469112 -0.282863 -1.509059 -1.135632
1 1.212112 -0.173215 0.119209 -1.044236
2 -0.861849 -2.104569 -0.494929 1.071804
3 0.721555 -0.706771 -1.039575 0.271860
4 -0.424972 0.567020 0.276232 -1.087401
5 -0.673690 0.113648 -1.478427 0.524988
6 0.404705 0.577046 -1.715002 -1.039268
7 -0.370647 -1.157892 -1.344312 0.844885
8 1.075770 -0.109050 1.643563 -1.469388
9 0.357021 -0.674600 -1.776904 -0.968914
In [16]:
# Re-derive the no-week-1 subset and count missing values per column.
df_no_week_1 = raw_df[raw_df.week > 1]
df_no_week_1.isnull().sum()
Out[16]:
We will mainly use seaborn examples in this presentation. It's very intuitive and powerful to use.
In [10]:
# Histogram of the raw target (fanduel_points).
# Fix: create the sized figure BEFORE plotting — the original called
# pylab.figure(figsize=...) last, which opened a new empty figure and left
# the histogram on a default-sized one.
pylab.figure(figsize=(15,15))
pylab.hist(raw_df['fanduel_points'],
           normed=True,
           bins=np.linspace(-1, 35, 12),
           alpha=0.35,
           label='fanduel_points')
pylab.legend()
Out[10]:
In [11]:
# Log-transform the target to reduce right skew, then plot its histogram.
# Fix: the original filtered the copy with `> 1` but computed the log from a
# separate `> 0` slice of raw_df, which evaluates log(points - 1) on
# non-positive inputs (-inf/NaN) before index alignment discards them.
# Computing from the already-filtered copy gives the same kept values
# without the invalid log inputs.
transformed_target = pd.DataFrame.copy(raw_df[raw_df.fanduel_points > 1])
transformed_target['fanduel_points'] = np.log(transformed_target['fanduel_points'] - 1)
pylab.figure(figsize=(15,15))
pylab.hist(transformed_target['fanduel_points'],
           normed=True,
           bins=np.linspace(-1, 4, 100),
           alpha=0.35,
           label='fanduel_points')
pylab.legend()
Out[11]:
In [13]:
no_nans = raw_df[raw_df[raw_df.fanduel_points< 3].notnull()]
no_nans = raw_df[raw_df.position != 'UNK'] #remove Unknowns from this dataframe
no_nans.groupby('position').size()
bad_performances = pd.DataFrame({'count' : no_nans.groupby('position').size()}).reset_index()
bad_performances = bad_performances.sort(['count'], ascending=[0])
print bad_performances
g = sns.factorplot("position", "count",
data=bad_performances, kind="bar",
size=15, palette="pastel", dropna=True, x_order=bad_performances.position.values)
In [14]:
#Let's look at an individual player
# Combine row conditions with & (bitwise AND); each clause needs parentheses.
raw_df[(raw_df.full_name =='Tom Brady') & (raw_df.week == 2)]
Out[14]:
In [15]:
#All positions are not created equal -- some score many more points than others
# Fix: the original indexed raw_df with a boolean *DataFrame*
# (raw_df[raw_df.fanduel_points > 1].notnull()), which NaN-masks every cell
# outside the subset before grouping. A plain row filter states the intent
# directly and avoids the accidental masking.
raw_df[raw_df.fanduel_points > 1].groupby('position')['fanduel_points'].sum()
Out[15]:
In [19]:
#Since a few positions seem to score all the points, let's zoom in on those
plot_order = ['TE', 'WR', 'RB',
              'K', 'QB']
# Keep only the high-scoring positions; show mean points per position.
top_positions_only = raw_df[raw_df.position.isin(plot_order)]
top_positions_only.groupby('position')['fanduel_points'].mean()
Out[19]:
In [20]:
# Violin plots are a nice alternative to boxplots that also show interesting detail about
# the shape of the distribution
# Boolean mask of rows with a non-null target, used to index both series.
nonnull_subset = top_positions_only['fanduel_points'].notnull()
plt.figure(figsize=(12, 6))
# NOTE(review): positional (values, groups) violinplot arguments are the old
# seaborn API; newer versions expect x=/y= keywords.
sns.violinplot(top_positions_only['fanduel_points'][nonnull_subset],
               top_positions_only['position'][nonnull_subset],
               inner='box',
               order=plot_order,
               bw=1,
               size=16)
Out[20]:
In [18]:
# QB's and kickers score the most points. Let's look into those using a Histogram
qb_k = ['K', 'QB']
qb_k_data = raw_df[raw_df.position.isin(qb_k)]
# groupby(...).groups maps each position label to its row index labels.
groups = qb_k_data.groupby('position').groups
pylab.figure(figsize=(15,5))
# Overlay one translucent histogram per position on shared bins.
# NOTE: dict.iteritems() is Python 2 (matches the `print x` statements
# elsewhere in this notebook).
for key, row_ids in groups.iteritems():
    pylab.hist(qb_k_data['fanduel_points'][row_ids].values,
               normed=True,
               bins=np.linspace(-10, 50, 50),
               alpha=0.35,
               label=str(key))
pylab.legend()
Out[18]:
In [19]:
# We can use a Facetgrid to analyze teams
def vertical_p95_line(x, **kwargs):
    """Draw a vertical line at the 95th percentile of x.

    Fix: renamed from vertical_mean_line — the function has always plotted
    np.percentile(x, 95), not the mean, and the old name was misleading.
    Extra kwargs are forwarded to plt.axvline.
    """
    plt.axvline(np.percentile(x, 95), **kwargs)

teams = ['NE', 'CHI']
# Restrict to two teams, weeks 1-3, and facet histograms by team x week.
team_data = raw_df[raw_df.team.isin(teams)]
team_data = team_data[team_data.week < 4]
g = sns.FacetGrid(team_data, row="team", col="week",
                  margin_titles=True, dropna=True, size=4)
bins = np.linspace(-3, 30, 30)
g.map(plt.hist, "fanduel_points", color="black", bins=bins,
      lw=0, normed=True)
g.map(vertical_p95_line, 'fanduel_points')
Out[19]:
In [20]:
# We can use a Heatmap to analyze teams kicker and qb performance
teams = ['NE', 'CHI', 'NYG', 'DET', 'NYJ']
# Restrict the QB/kicker frame to these teams, weeks 1-9.
team_data = qb_k_data[qb_k_data.team.isin(teams)]
team_data = team_data[team_data.week < 10]
# Pivot to team rows x week columns of aggregated fanduel_points.
ptable = pd.pivot_table(
    team_data,
    values='fanduel_points',
    index=["team"],
    columns='week')
# Keep teams in the order listed above; weeks with no data become 0.
reorder_teams = ptable.reindex(teams).fillna(0)
pylab.figure(figsize=(15,5))
sns.heatmap(reorder_teams.astype(int), annot=True, fmt="d", cmap="YlGnBu")
# Zero values are bye weeks
Out[20]:
In [21]:
# Are previous week's points a good predictor of current week's points?
# Let's consider only kicker and QB data for these teams
# We have to exclude week 1 here since there is no previous weeks' mean
team_data_no_week_1 = team_data[team_data.week > 1]
# Scatter of prior-weeks' mean vs. actual points, with rug plots on margins.
grid = sns.JointGrid(team_data_no_week_1['mean_fanduel_points'],
                     team_data_no_week_1['fanduel_points'], space=0, size=10, ratio=50)
grid.plot_joint(plt.scatter, color="g")
grid.plot_marginals(sns.rugplot, height=1, color="g")
Out[21]:
In [22]:
# We can use jointplot (uses JointGrid internally) to get a quick regression line for this
sns.jointplot('mean_fanduel_points', 'fanduel_points', data=team_data_no_week_1,
              kind="reg", color=sns.color_palette()[1], size=9)
Out[22]:
In [23]:
# QB's are significantly more important than any other position. Let's dig in
qb_df = raw_df[raw_df.position == 'QB']
In [24]:
# Passing attempts from previous weeks --- is there a trend with next week's performance? No
sns.jointplot(qb_df['mean_passing_att'],
              qb_df['fanduel_points'], kind="reg", size=9)
Out[24]:
In [25]:
import sklearn
In [21]:
# Let's prep for modeling
# Drop week 1 (no prior-week features) and non-positive scores.
exclude_week_1 = top_positions_only[top_positions_only.week > 1]
model_data = pd.DataFrame.copy(exclude_week_1)
model_data = model_data[model_data.fanduel_points > 0]
# Sanity check: no NaNs left in the target column.
print np.isnan(model_data['fanduel_points']).sum()
# Let's cut our target out so we don't train on it
target = model_data.pop('fanduel_points')
# We don't need player id's --- let's throw this away
throw_away = model_data.pop('player_id')
# NOTE(review): sklearn.cross_validation is the pre-0.18 module name
# (model_selection in later sklearn versions).
import sklearn.cross_validation
# 80/20 train/test split with a fixed seed for reproducibility.
(train_data,
 test_data,
 train_target,
 test_target) = sklearn.cross_validation.train_test_split(
    model_data, target, test_size=0.2, random_state=1337
)
In [27]:
#Handle categorical vars
import sklearn.preprocessing
import sklearn.feature_extraction
from sklearn.feature_extraction import DictVectorizer

#Let's do one-hot encoding in sklearn using DictVectorizer
encoder = DictVectorizer(sparse=False)
categorical_vars = ['full_name', 'position', 'team', 'week', 'opponent', 'home_team', 'away_team']
# Fill missing values with a sentinel so they get their own indicator column.
vardata = train_data[categorical_vars].fillna('MISSING')
# Fit and transform the training records in one pass (the original fit,
# then re-transformed the same records).
train_catdata = encoder.fit_transform(vardata.to_dict(orient='records'))
test_vardata = test_data[categorical_vars].fillna('MISSING')
# Fix: dropped the redundant second [categorical_vars] selection —
# test_vardata is already restricted to those columns.
test_catdata = encoder.transform(test_vardata.to_dict(orient='records'))
pd.DataFrame(train_catdata).describe()
Out[27]:
In [28]:
#Handle numeric vars
# NOTE(review): Imputer is the pre-0.20 sklearn class (SimpleImputer later).
from sklearn.preprocessing import Imputer
imputer = Imputer(strategy='median')
# Fix: list(set(...)) yields a nondeterministic column order, silently
# reordering the feature matrix between runs. Preserve the DataFrame's own
# column order instead.
numeric_vars = [c for c in train_data.columns if c not in categorical_vars]
numdata = train_data[numeric_vars]
# Learn per-column medians on train, then fill NaNs in both splits.
imputer.fit(numdata)
train_numdata = imputer.transform(numdata)
test_numdata = imputer.transform(test_data[numeric_vars])
In [29]:
# Full feature matrices: imputed numerics + one-hot categoricals.
train_this = np.hstack([train_numdata, train_catdata])
test_this = np.hstack([test_numdata, test_catdata])
In [30]:
import sklearn
from sklearn.linear_model import LinearRegression
print np.any(isnan(train_numdata))
print np.all(np.isfinite(train_numdata))
lr = LinearRegression(fit_intercept=False)
lr.fit(train_numdata, train_target)
lr_predictions = pd.Series(lr.predict(test_numdata),
name='Linear Regression')
In [31]:
# Hex-bin joint plot of predicted vs. actual points for the linear model.
p_df = pd.DataFrame({'Prediction': lr_predictions,
                     'Actual': test_target.values})
pylab.figure(figsize=(10, 10))
sns.jointplot('Actual', 'Prediction', data=p_df,
              kind="hex", color=sns.color_palette()[1])
# Let's take a look at how predictions track actuals; note the model above
# was fit on just the numeric vars (not the categoricals).
Out[31]:
In [32]:
from sklearn import metrics

# Display name -> sklearn regression metric function.
test_metrics = {
    'Explained Variance': metrics.explained_variance_score,
    'MAE': metrics.mean_absolute_error,
    'MSE': metrics.mean_squared_error,
    'MedAE': metrics.median_absolute_error,
    'R2': metrics.r2_score
}

def metrics_report(*predictions):
    """Score each named prediction Series against test_target.

    Returns a DataFrame with one row per prediction set (indexed by the
    Series' .name) and one column per metric in test_metrics.
    """
    rows = []
    for preds in predictions:
        row = {'name': preds.name}
        row.update(
            (metric_name, test_metrics[metric_name](test_target, preds))
            for metric_name in sorted(test_metrics)
        )
        rows.append(row)
    return pd.DataFrame.from_records(rows).set_index('name')

metrics_report(lr_predictions)
Out[32]:
In [33]:
# We need to add reference models to track a baseline performance that we can compare our other models to
# Constant baseline: always predict the training-set mean.
mean_response = np.mean(train_target)
mean_predictions = pd.Series(np.ones_like(test_target) * mean_response,
                             name='Mean Response')
# Constant baseline: always predict the training-set median.
median_response = np.median(train_target)
median_predictions = pd.Series(np.ones_like(test_target) * median_response,
                               name='Median Response')
metrics_report(mean_predictions,
               median_predictions,
               lr_predictions)
Out[33]:
In [34]:
#Time for ElasticNet
# Grid-search ElasticNet over regularization strength (alpha) and L1/L2
# mix (l1_ratio).
# NOTE(review): sklearn.grid_search is the pre-0.18 module name
# (model_selection in later versions).
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import ElasticNet
estimator = ElasticNet()
parameters = {
    'alpha': np.linspace(0.1, 2, 10, endpoint=True),
    'l1_ratio': np.linspace(0, 1, 10, endpoint=True)
}
enet = GridSearchCV(estimator, parameters)
enet.fit(train_numdata, train_target)
Out[34]:
In [35]:
# Report best hyperparameters and the per-combination CV scores.
# NOTE(review): the rest of this notebook uses Python 2 print statements;
# without `from __future__ import print_function`, print() emits "()" and
# print(a, b) prints a tuple — confirm which interpreter this ran under.
print(enet.best_params_, enet.best_score_)
print()
print("Grid scores on development set:")
print()
for params, mean_score, scores in enet.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))
print()
In [36]:
# Refine the search on a narrower grid around alpha/l1_ratio in [0.4, 0.6].
estimator2 = ElasticNet()
parameters2 = {
    'alpha': np.linspace(0.4, 0.6, 10, endpoint=True),
    'l1_ratio': np.linspace(0.4, 0.6, 10, endpoint=True)
}
enet2 = GridSearchCV(estimator2, parameters2)
enet2.fit(train_numdata, train_target)
Out[36]:
In [46]:
# Same report as above, for the refined grid.
# NOTE(review): this duplicates the earlier reporting cell — a small helper
# function would avoid the copy/paste.
print(enet2.best_params_, enet2.best_score_)
print()
print("Grid scores on development set:")
print()
for params, mean_score, scores in enet2.grid_scores_:
    print("%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params))
print()
In [47]:
# Predict with the coarse grid search and plot predicted vs. actual.
# NOTE(review): this uses `enet` (coarse grid), not the refined `enet2` —
# confirm which was intended.
enet_predictions = pd.Series(enet.predict(test_numdata),
                             name='Elastic Net')
p_df = pd.DataFrame({'Enet Prediction': enet_predictions,
                     'Actual': test_target.values})
pylab.figure(figsize=(10, 10))
sns.jointplot('Actual', 'Enet Prediction', data=p_df, kind="hex",
              color=sns.color_palette()[2])
Out[47]:
In [239]:
# Grid-search a random forest over tree count and depth. Unlike the linear
# models, this fits on the full matrix (numeric + one-hot categoricals).
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
estimator = RandomForestRegressor()
parameters = {'n_estimators': (5, 10, 15, 20, 25, 30, 35),
              'max_depth': (3, 5, 7, 9, 11),
              }
# n_jobs=3: run the CV fits in parallel.
# NOTE(review): no random_state on the forest, so results vary across runs.
rfr = GridSearchCV(estimator, parameters, n_jobs=3)
rfr.fit(train_this, train_target)
rfr_predictions = pd.Series(rfr.predict(test_this),
                            name='Random Forest')
p_df = pd.DataFrame({'RF Prediction': rfr_predictions,
                     'Actual': test_target.values})
pylab.figure(figsize=(10, 10))
sns.jointplot('Actual', 'RF Prediction', data=p_df, kind="hex",
              color=sns.color_palette()[3])
Out[239]:
In [240]:
# Compare all models fitted so far against the constant baselines.
metrics_report(mean_predictions,
               median_predictions,
               lr_predictions,
               enet_predictions,
               rfr_predictions)
Out[240]:
In [241]:
# Per-model prediction errors.
# NOTE(review): lr_predictions has a default 0..n-1 index while test_target
# keeps its original row labels (cf. the Polynomial Lasso cell, which sets
# index=test_target.index explicitly) — verify these subtractions align.
lr_diffs = lr_predictions - test_target
lr_diffs.name = 'LinearRegression Error'
rfr_diffs = rfr_predictions - test_target
rfr_diffs.name = 'RandomForest Error'
# NOTE(review): lr_diffs/rfr_diffs are computed but unused — the plot below
# compares the raw predictions; was the intent to plot the error series?
sns.jointplot(lr_predictions, rfr_predictions, kind='resid', color=sns.color_palette()[4])
Out[241]:
In [242]:
# Quadratic polynomial features + L1-regularized regression, as a pipeline.
from sklearn.linear_model import Lasso
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
degree = 2
model = make_pipeline(PolynomialFeatures(degree), Lasso())
model.fit(train_numdata, train_target)
# Align the prediction Series to the test-target index so arithmetic and
# plotting match rows correctly.
poly_preds = pd.Series(model.predict(test_numdata),
                       name='Polynomial Lasso',
                       index=test_target.index)
sns.jointplot(test_target,
              poly_preds,
              kind='resid',
              color=sns.color_palette()[5])
Out[242]:
In [243]:
# Final scoreboard: every model against the constant baselines.
metrics_report(mean_predictions,
               median_predictions,
               lr_predictions,
               enet_predictions,
               rfr_predictions,
               poly_preds)
Out[243]:
In [ ]: