In [ ]:
import os
import os.path as osp
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.graphics.regressionplots import influence_plot
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from k2datascience.utils import ax_formatter, save_fig, size
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
In [ ]:
data_dir = osp.realpath(osp.join(os.getcwd(), '..', 'data',
'linear_regression'))
In [ ]:
batting = pd.read_csv(osp.join(data_dir, 'batting.csv'))
category_cols = (
'stint',
'league_id',
'triple',
'cs',
'ibb',
'hbp',
'sf',
'g_idp',
)
for col in category_cols:
batting.loc[:, col] = batting.loc[:, col].astype('category')
batting.info()
batting.head()
batting.describe()
In [ ]:
player = pd.read_csv(osp.join(data_dir, 'player.csv'))
category_cols = (
'bats',
'birth_month',
'death_month',
'throws',
)
for col in category_cols:
player.loc[:, col] = player.loc[:, col].astype('category')
player.info()
player.head()
player.describe()
In [ ]:
salary = pd.read_csv(osp.join(data_dir, 'salary.csv'))
category_cols = (
'team_id',
'league_id',
)
for col in category_cols:
salary.loc[:, col] = salary.loc[:, col].astype('category')
salary.info()
salary.head()
salary.describe()
In [ ]:
team = pd.read_csv(osp.join(data_dir, 'team.csv'))
category_cols = (
'league_id',
'div_id',
'div_win',
'lg_win',
'rank',
'team_id',
'wc_win',
'ws_win',
)
for col in category_cols:
team.loc[:, col] = team.loc[:, col].astype('category')
team.info()
team.head()
team.describe()
In [ ]:
mean_salary = (salary
.groupby('year')
.mean()
.reset_index())
mean_salary.corr()
In [ ]:
ax = mean_salary.plot(x='year', y='salary', figsize=(8, 6),
label='Mean Salary')
ax.set_title('Mean Salary vs Year', fontsize=size['title'])
ax.legend(fontsize=size['legend'])
ax.set_xlabel('Year', fontsize=size['label'])
ax.set_ylabel('Mean Salary (x $1000)', fontsize= size['label'])
ax.yaxis.set_major_formatter(ax_formatter['thousands'])
plt.show();
In [ ]:
lr = smf.ols(formula=f'salary ~ year', data=mean_salary).fit()
lr.summary()
In [ ]:
# Data
ax = mean_salary.plot(x='year', y='salary',
figsize=(8, 6), label='Mean Salary')
# Regression Line
ax.plot(mean_salary.year, lr.predict(mean_salary.year),
linestyle='--', label='Linear Regression')
# Confidence Intervals
std, upper, lower = wls_prediction_std(lr)
ax.plot(mean_salary.year, lower, alpha=0.5, color='black',
label='Confidence Interval', linestyle='-.')
ax.plot(mean_salary.year, upper, alpha=0.5, color='black',
linestyle='-.')
ax.set_title('Mean Salary vs Year', fontsize=size['title'])
ax.legend(fontsize=size['legend'])
ax.set_xlabel('Year', fontsize=size['label'])
ax.set_ylabel('Mean Salary (x $1000)', fontsize= size['label'])
ax.yaxis.set_major_formatter(ax_formatter['thousands'])
plt.show();
In [ ]:
fig = plt.figure('Salary Boxp Plot', figsize=(12, 6),
facecolor='white', edgecolor='black')
rows, cols = (1, 1)
ax0 = plt.subplot2grid((rows, cols), (0, 0))
sns.boxplot(x='year', y='salary', data=salary,
fliersize=2, ax=ax0)
ax0.set_title('Salary vs Year', fontsize=size['title'])
ax0.set_xlabel('Year', fontsize=size['label'])
ax0.set_ylabel('Salary (x $1000)', fontsize=size['label'])
ax0.yaxis.set_major_formatter(ax_formatter['thousands'])
fig.autofmt_xdate()
plt.show();
The Gini Coefficient is a means to represent the income or wealth distribution of a population.
In [ ]:
gini = {}
for year in salary.year.unique():
salaries = (salary.query(f'year == {year}')
.salary
.sort_values())
n = salaries.size
gini[year] = ((2 * np.sum(salaries * (np.arange(n) + 1)))
/ (n * salaries.sum())
- ((n + 1) / n))
gini = (pd.Series(gini)
.reset_index()
.rename(columns={'index': 'year', 0: 'gini'}))
In [ ]:
gini.corr()
In [ ]:
ax = gini.plot(x='year', y='gini', figsize=(8, 6),
label='Gini Coefficient')
ax.set_title('Gini Coefficient vs Year', fontsize=size['title'])
ax.legend(fontsize=size['legend'])
ax.set_xlabel('Year', fontsize=size['label'])
ax.set_ylabel('Gini Coefficient', fontsize= size['label'])
plt.show();
In [ ]:
features = ' + '.join([f'np.power(year, {x + 1})' for x in range(2)])
quadratic_model = smf.ols(formula=f'gini ~ {features}', data=gini).fit()
quadratic_model.summary()
log_model = smf.ols(formula='gini ~ np.log(year) * year', data=gini).fit()
log_model.summary()
In [ ]:
fig = plt.figure('Regression Plot', figsize=(10, 5),
facecolor='white', edgecolor='black')
rows, cols = (1, 2)
ax0 = plt.subplot2grid((rows, cols), (0, 0))
ax1 = plt.subplot2grid((rows, cols), (0, 1), sharey=ax0)
# Regression Lines
ax0.plot(gini.year, quadratic_model.predict(gini.year),
color='red', linestyle='-.',
label='Quadratic Regression')
std0, upper, lower = wls_prediction_std(quadratic_model)
ax0.plot(gini.year, lower, alpha=0.5, color='black',
label='Confidence Interval', linestyle='-.')
ax0.plot(gini.year, upper, alpha=0.5, color='black',
linestyle='-.')
ax1.plot(gini.year, log_model.predict(gini.year),
color='red', linestyle='--',
label='Logrithmic Regression')
std, upper, lower = wls_prediction_std(log_model)
ax1.plot(gini.year, lower, alpha=0.5, color='black',
label='Confidence Interval', linestyle='-.')
ax1.plot(gini.year, upper, alpha=0.5, color='black',
linestyle='-.')
# Data
for ax in (ax0, ax1):
gini.plot(x='year', y='gini', label='Gini Coefficient', ax=ax)
ax.legend(fontsize=size['legend'])
ax.set_xlabel('Year', fontsize=size['label'])
ax.set_ylabel('Gini Coefficient', fontsize= size['label'])
fig.autofmt_xdate()
plt.tight_layout()
plt.suptitle('Gini Coefficient vs Year',
fontsize=size['super_title'], y=1.05)
plt.show();
In [ ]:
fig = plt.figure('Residual Plot', figsize=(12, 5),
facecolor='white', edgecolor='black')
rows, cols = (1, 2)
ax0 = plt.subplot2grid((rows, cols), (0, 0))
ax1 = plt.subplot2grid((rows, cols), (0, 1))
# Quadratic Model
ax0.scatter(quadratic_model.fittedvalues, quadratic_model.resid)
ax0.set_title('Quadratic Model', fontsize=size['title'])
# Logrithmic Model
ax1.scatter(log_model.fittedvalues, log_model.resid)
ax1.set_title('Logrithmic Model', fontsize=size['title'])
for ax in (ax0, ax1):
ax.set_xlabel('Fitted Values', fontsize=size['label'])
ax.set_ylabel('Raw Residuals', fontsize=size['label'])
plt.show();
In [ ]:
fig = plt.figure('Influence Plot', figsize=(10, 10),
facecolor='white', edgecolor='black')
rows, cols = (1, 2)
ax0 = plt.subplot2grid((rows, cols), (0, 0))
ax1 = plt.subplot2grid((rows, cols), (0, 1))
# Quadradic Model
influence = influence_plot(quadratic_model, ax=ax0)
ax0.set_title('Quadratic Model',
fontsize=size['title'])
# Logrithmic Model
influence = influence_plot(log_model, ax=ax1)
ax1.set_title('Logrithmic Model',
fontsize=size['title'])
for ax in (ax0, ax1):
ax.set_xlabel('H Leverage',
fontsize=size['label'])
ax.set_ylabel('Studentized Residuals',
fontsize=size['label'])
plt.show();
In [ ]:
fig = plt.figure('Team Heatmap', figsize=(10, 8),
facecolor='white', edgecolor='black')
rows, cols = (1, 1)
ax0 = plt.subplot2grid((rows, cols), (0, 0))
sns.heatmap(team.corr(),
annot=False, cbar_kws={'orientation': 'vertical'},
fmt='.2f', linewidths=1, vmin=-1, vmax=1, ax=ax0)
ax0.set_title('Team Dataset', fontsize=size['title'])
ax0.set_xticklabels(ax0.xaxis.get_majorticklabels(),
fontsize=size['label'], rotation=80)
ax0.set_yticklabels(ax0.yaxis.get_majorticklabels(),
fontsize=size['label'], rotation=0)
plt.show();
In [ ]:
features = [
'g',
'w',
'bb',
'ab',
'fp',
'ipouts',
'ha',
'er',
'double',
]
model = ' + '.join(features)
team_model = smf.ols(formula=f'h ~ {model}', data=team).fit()
team_model.summary()
In [ ]:
features.remove('bb')
In [ ]:
new_model = ' + '.join(features)
team_model = smf.ols(formula=f'h ~ {new_model}', data=team).fit()
team_model.summary()
In [ ]:
fig = plt.figure('Residual Plot', figsize=(8, 6),
facecolor='white', edgecolor='black')
rows, cols = (1, 1)
ax0 = plt.subplot2grid((rows, cols), (0, 0))
# Model
ax0.scatter(team_model.fittedvalues, team_model.resid)
ax0.set_title('Team Model', fontsize=size['title'])
ax0.set_xlabel('Fitted Values', fontsize=size['label'])
ax0.set_ylabel('Raw Residuals', fontsize=size['label'])
plt.show();
In [ ]:
fig = plt.figure('Influence Plot', figsize=(10, 10),
facecolor='white', edgecolor='black')
rows, cols = (1, 1)
ax0 = plt.subplot2grid((rows, cols), (0, 0))
# Team Model
influence = influence_plot(team_model, ax=ax0)
ax0.set_title('Team Model',
fontsize=size['title'])
ax.set_xlabel('H Leverage',
fontsize=size['label'])
ax.set_ylabel('Studentized Residuals',
fontsize=size['label'])
plt.show();
In [ ]:
with sns.axes_style("white"):
fig = plt.figure('At Bats, Wins, Games Played Scatter',
figsize=(10, 6), facecolor='white',
edgecolor='black')
ax = Axes3D(fig)
ax.view_init(15, 45)
sc = ax.scatter(team.ab, team.w, team.g,
c=team.h, cmap='gnuplot',
vmin=team.h.min(), vmax=team.h.max())
plt.colorbar(sc)
ax.set_title(f'Data Colored by Hits',
fontsize=size['title'], y=1.02)
ax.set_xlabel('\nAt Bats', fontsize=size['label'])
ax.set_ylabel('\nWins', fontsize=size['label'])
ax.set_zlabel('\nGames Played', fontsize=size['label'])
plt.suptitle('Team\nAt Bats vs Wins vs Games Played',
fontsize=size['super_title'], x=0.4, y=1.15)
plt.show();
In [ ]:
batting_data = (batting
.set_index('player_id')
.join((player.loc[:, ['player_id', 'bats']]
.set_index('player_id')))
.loc[:, ['year', 'g', 'ab', 'bats', 'h']])
batting_data.info()
In [ ]:
batting_data.head()
In [ ]:
year_avg = (batting_data
.groupby(['player_id', 'year'])[['g', 'ab', 'bats', 'h']]
.sum()
.reset_index())
year_avg.head()
In [ ]:
player_avg = (year_avg
.groupby(['player_id'])[['g', 'ab', 'h']]
.mean()
.reset_index())
player_avg.head()
In [ ]:
player = (player_avg
.set_index('player_id')
.join(player.loc[:, ['player_id', 'bats']]
.set_index('player_id')))
player.head()
In [ ]:
fig = plt.figure('Player Heatmap', figsize=(5, 4),
facecolor='white', edgecolor='black')
rows, cols = (1, 1)
ax0 = plt.subplot2grid((rows, cols), (0, 0))
sns.heatmap(player.corr(),
annot=True, cbar_kws={'orientation': 'vertical'},
fmt='.2f', linewidths=5, vmin=-1, vmax=1, ax=ax0)
ax0.set_title('Player Dataset Correlation',
fontsize=size['title'])
ax0.set_xticklabels(ax0.xaxis.get_majorticklabels(),
fontsize=size['label'], rotation=80)
ax0.set_yticklabels(ax0.yaxis.get_majorticklabels(),
fontsize=size['label'], rotation=0)
plt.show();
In [ ]:
grid = sns.pairplot(player.dropna(),
diag_kws={'alpha': 0.5, 'bins': 10,
'edgecolor': 'black'},
plot_kws={'alpha': 0.7})
grid.fig.suptitle('Player Data Correlation',
fontsize=size['super_title'], y=1.03)
cols = (player
.select_dtypes(exclude=['category'])
.columns)
for n, col in enumerate(cols):
grid.axes[cols.size - 1, n].set_xlabel(cols[n],
fontsize=size['label'])
grid.axes[n, 0].set_ylabel(cols[n], fontsize=size['label'])
plt.show();
In [ ]:
features = [
'g',
'ab',
'bats',
]
model = ' + '.join(features)
batting_model = smf.ols(formula=f'h ~ {model}', data=player).fit()
batting_model.summary()
In [ ]:
fig = plt.figure('Residual Plot', figsize=(8, 6),
facecolor='white', edgecolor='black')
rows, cols = (1, 1)
ax0 = plt.subplot2grid((rows, cols), (0, 0))
# Model
ax0.scatter(batting_model.fittedvalues, batting_model.resid,
alpha=0.5)
ax0.set_title('Batting Model', fontsize=size['title'])
ax0.set_xlabel('Fitted Values', fontsize=size['label'])
ax0.set_ylabel('Raw Residuals', fontsize=size['label'])
plt.show();
In [ ]:
with sns.axes_style("white"):
fig = plt.figure(', Wins, Games Played Scatter',
figsize=(10, 6), facecolor='white',
edgecolor='black')
ax = Axes3D(fig)
sc = ax.scatter(batting_data.g, batting_data.ab, batting_data.h,
c=batting_data.bats.cat.codes, cmap='gnuplot')
cbar = plt.colorbar(sc, ticks=[-1, 0, 1, 2])
cbar.ax.set_yticklabels(['N/A', 'Both', 'Left', 'Right'],
fontsize=size['legend'])
ax.set_title(f'Data Colored by Batting Hand',
fontsize=size['title'], y=1.02)
ax.set_xlabel('\nGames Played', fontsize=size['label'])
ax.set_ylabel('\nAt Bats', fontsize=size['label'])
ax.set_zlabel('\nHits', fontsize=size['label'])
plt.suptitle('Batter\nGames Played vs At Bats vs Hits',
fontsize=size['super_title'], x=0.4, y=1.15)
plt.show();