Linear Models

Timothy Helton


Imports


In [ ]:
import os
import os.path as osp

import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import seaborn as sns
import statsmodels.formula.api as smf
from statsmodels.graphics.regressionplots import influence_plot
from statsmodels.sandbox.regression.predstd import wls_prediction_std

from k2datascience.utils import ax_formatter, save_fig, size

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline

Load Data


In [ ]:
# Absolute path to the linear-regression data set, one directory
# above the notebook's working directory: ../data/linear_regression
data_dir = osp.realpath(
    osp.join(os.getcwd(), os.pardir, 'data', 'linear_regression')
)

Batting


In [ ]:
# Load the batting statistics and convert the discrete,
# low-cardinality columns to categoricals (smaller memory footprint,
# clearer summaries).
batting = pd.read_csv(osp.join(data_dir, 'batting.csv'))
category_cols = (
    'stint',
    'league_id',
    'triple',
    'cs',
    'ibb',
    'hbp',
    'sf',
    'g_idp',
)
batting = batting.astype({col: 'category' for col in category_cols})

batting.info()
batting.head()
batting.describe()

Player


In [ ]:
# Load the player biographical table; handedness and month fields are
# discrete, so store them as categoricals.
player = pd.read_csv(osp.join(data_dir, 'player.csv'))
category_cols = (
    'bats',
    'birth_month',
    'death_month',
    'throws',
)
player = player.astype({col: 'category' for col in category_cols})

player.info()
player.head()
player.describe()

Salary


In [ ]:
# Load the salary table; team and league identifiers are discrete
# labels, so store them as categoricals.
salary = pd.read_csv(osp.join(data_dir, 'salary.csv'))
category_cols = (
    'team_id',
    'league_id',
)
salary = salary.astype({col: 'category' for col in category_cols})

salary.info()
salary.head()
salary.describe()

Team


In [ ]:
# Load the team-season table; identifiers and win/loss flags are
# discrete, so store them as categoricals.
team = pd.read_csv(osp.join(data_dir, 'team.csv'))
category_cols = (
    'league_id',
    'div_id',
    'div_win',
    'lg_win',
    'rank',
    'team_id',
    'wc_win',
    'ws_win',
)
team = team.astype({col: 'category' for col in category_cols})

team.info()
team.head()
team.describe()

Exercise 1:

  1. Compute the correlation between mean salary and year.
  2. Generate a graph of mean salary per year.

In [ ]:
# Mean salary for each season; reset the index so `year` is a regular
# column for the plots and regressions below.
mean_salary = salary.groupby('year').mean().reset_index()

# Pearson correlation between year and mean salary.
mean_salary.corr()

In [ ]:
# Trend of the league-wide mean salary over time.
ax = mean_salary.plot(x='year', y='salary', figsize=(8, 6),
                      label='Mean Salary')

ax.set_title('Mean Salary vs Year', fontsize=size['title'])
ax.set_xlabel('Year', fontsize=size['label'])
ax.set_ylabel('Mean Salary (x $1000)', fontsize=size['label'])
ax.yaxis.set_major_formatter(ax_formatter['thousands'])
ax.legend(fontsize=size['legend'])

plt.show();

Exercise 2:

  1. Find the best line that approximates mean salary with respect to years.
  2. Plot this line together with the data from Exercise 1.

In [ ]:
# Ordinary least squares fit of mean salary on year.
# Plain string literal: the original used an f-string with no
# placeholders, which is misleading.
lr = smf.ols(formula='salary ~ year', data=mean_salary).fit()
lr.summary()

In [ ]:
# Data
ax = mean_salary.plot(x='year', y='salary',
                      figsize=(8, 6), label='Mean Salary')

# Regression Line: predict from the full frame so the formula machinery
# can look up the `year` column by name.
ax.plot(mean_salary.year, lr.predict(mean_salary),
        linestyle='--', label='Linear Regression')

# Prediction Intervals
# wls_prediction_std returns (std, lower, upper); the original unpacked
# `std, upper, lower`, which swapped the two bounds.
std, lower, upper = wls_prediction_std(lr)
ax.plot(mean_salary.year, lower, alpha=0.5, color='black',
        label='Confidence Interval', linestyle='-.')
ax.plot(mean_salary.year, upper, alpha=0.5, color='black',
        linestyle='-.')

ax.set_title('Mean Salary vs Year', fontsize=size['title'])
ax.legend(fontsize=size['legend'])
ax.set_xlabel('Year', fontsize=size['label'])
ax.set_ylabel('Mean Salary (x $1000)', fontsize=size['label'])
ax.yaxis.set_major_formatter(ax_formatter['thousands'])

plt.show();

Exercise 3: Create a box plot for salaries per year.


In [ ]:
# Distribution of individual salaries for each season.
# (Figure name corrected: was "Salary Boxp Plot".)
fig = plt.figure('Salary Box Plot', figsize=(12, 6),
                 facecolor='white', edgecolor='black')
rows, cols = (1, 1)
ax0 = plt.subplot2grid((rows, cols), (0, 0))

sns.boxplot(x='year', y='salary', data=salary,
            fliersize=2, ax=ax0)

ax0.set_title('Salary vs Year', fontsize=size['title'])
ax0.set_xlabel('Year', fontsize=size['label'])
ax0.set_ylabel('Salary (x $1000)', fontsize=size['label'])
ax0.yaxis.set_major_formatter(ax_formatter['thousands'])

# Rotate the crowded year tick labels.
fig.autofmt_xdate()

plt.show();

Exercise 4:

  1. From the previous graph we can see an increasing disparity in salaries as time increases.
    1. How would you measure disparity in salaries?
    2. Compute the correlation of disparity and years.
    3. Find the best line that approximates disparity with respect to years.

The Gini Coefficient is a means to represent the income or wealth distribution of a population.

  • The Gini coefficient measures the inequality among values of a frequency distribution.
  • G = 0 represents perfect equality
  • G = 1 expresses maximal inequality
$$G = \frac{2 \sum_{i=1}^n i y_i}{n \sum_{i=1}^n y_i} - \frac{n + 1}{n}$$

In [ ]:
# Gini coefficient of the salary distribution for each season,
# using the formula above:
#     G = 2 * sum(i * y_i) / (n * sum(y_i)) - (n + 1) / n
# with salaries y sorted ascending and ranked i = 1..n.
gini = {}
for year in salary.year.unique():
    salaries = (salary.query(f'year == {year}')
                .salary
                .sort_values())
    n = salaries.size
    ranks = np.arange(1, n + 1)

    gini[year] = (2 * (salaries * ranks).sum() / (n * salaries.sum())
                  - (n + 1) / n)
gini = (pd.Series(gini)
        .reset_index()
        .rename(columns={'index': 'year', 0: 'gini'}))

In [ ]:
# Pearson correlation between year and the Gini coefficient.
gini.corr()

In [ ]:
# Trend of salary inequality (Gini coefficient) over time.
ax = gini.plot(x='year', y='gini', figsize=(8, 6),
               label='Gini Coefficient')

ax.set_title('Gini Coefficient vs Year', fontsize=size['title'])
ax.set_xlabel('Year', fontsize=size['label'])
ax.set_ylabel('Gini Coefficient', fontsize=size['label'])
ax.legend(fontsize=size['legend'])

plt.show();

In [ ]:
# Quadratic model: gini ~ year + year**2.
poly_terms = ' + '.join(f'np.power(year, {degree})' for degree in (1, 2))
quadratic_model = smf.ols(formula=f'gini ~ {poly_terms}', data=gini).fit()
quadratic_model.summary()

# Logarithmic model with a log(year) x year interaction.
log_model = smf.ols(formula='gini ~ np.log(year) * year', data=gini).fit()
log_model.summary()

In [ ]:
fig = plt.figure('Regression Plot', figsize=(10, 5),
                 facecolor='white', edgecolor='black')
rows, cols = (1, 2)
ax0 = plt.subplot2grid((rows, cols), (0, 0))
ax1 = plt.subplot2grid((rows, cols), (0, 1), sharey=ax0)

# Regression Lines: predict from the full frame so the formula
# machinery can look up `year` by name.
ax0.plot(gini.year, quadratic_model.predict(gini),
         color='red', linestyle='-.',
         label='Quadratic Regression')
# wls_prediction_std returns (std, lower, upper); the original unpacked
# `std, upper, lower`, swapping the two bounds.
std0, lower, upper = wls_prediction_std(quadratic_model)
ax0.plot(gini.year, lower, alpha=0.5, color='black',
         label='Confidence Interval', linestyle='-.')
ax0.plot(gini.year, upper, alpha=0.5, color='black',
         linestyle='-.')

# Spelling corrected from "Logrithmic".
ax1.plot(gini.year, log_model.predict(gini),
         color='red', linestyle='--',
         label='Logarithmic Regression')
std, lower, upper = wls_prediction_std(log_model)
ax1.plot(gini.year, lower, alpha=0.5, color='black',
         label='Confidence Interval', linestyle='-.')
ax1.plot(gini.year, upper, alpha=0.5, color='black',
         linestyle='-.')

# Data
for ax in (ax0, ax1):
    gini.plot(x='year', y='gini', label='Gini Coefficient', ax=ax)
    ax.legend(fontsize=size['legend'])
    ax.set_xlabel('Year', fontsize=size['label'])
    ax.set_ylabel('Gini Coefficient', fontsize=size['label'])

fig.autofmt_xdate()
plt.tight_layout()
plt.suptitle('Gini Coefficient vs Year',
             fontsize=size['super_title'], y=1.05)

plt.show();

In [ ]:
fig = plt.figure('Residual Plot', figsize=(12, 5),
                 facecolor='white', edgecolor='black')
rows, cols = (1, 2)
ax0 = plt.subplot2grid((rows, cols), (0, 0))
ax1 = plt.subplot2grid((rows, cols), (0, 1))

# Quadratic Model
ax0.scatter(quadratic_model.fittedvalues, quadratic_model.resid)

ax0.set_title('Quadratic Model', fontsize=size['title'])

# Logarithmic Model (spelling corrected from "Logrithmic")
ax1.scatter(log_model.fittedvalues, log_model.resid)

ax1.set_title('Logarithmic Model', fontsize=size['title'])

# Residuals vs fitted values: structure here would indicate a poor
# functional form or heteroscedasticity.
for ax in (ax0, ax1):
    ax.set_xlabel('Fitted Values', fontsize=size['label'])
    ax.set_ylabel('Raw Residuals', fontsize=size['label'])

plt.show();

In [ ]:
fig = plt.figure('Influence Plot', figsize=(10, 10),
                 facecolor='white', edgecolor='black')
rows, cols = (1, 2)
ax0 = plt.subplot2grid((rows, cols), (0, 0))
ax1 = plt.subplot2grid((rows, cols), (0, 1))

# Quadratic Model
influence = influence_plot(quadratic_model, ax=ax0)

ax0.set_title('Quadratic Model',
              fontsize=size['title'])

# Logarithmic Model (spelling corrected from "Logrithmic")
influence = influence_plot(log_model, ax=ax1)

ax1.set_title('Logarithmic Model',
              fontsize=size['title'])

# Leverage vs studentized residuals: large markers flag observations
# with a disproportionate influence on the fit.
for ax in (ax0, ax1):
    ax.set_xlabel('H Leverage',
                  fontsize=size['label'])
    ax.set_ylabel('Studentized Residuals',
                  fontsize=size['label'])

plt.show();

Findings

  • Adding a cubic term yielded results similar to the quadratic regression.
  • Both models appear to be overfitting at the end of the data.
  • The quadratic model does not fit the data as well as the logarithmic model.
  • The quadratic model has a far smaller confidence interval than the logarithmic model.
  • A small handful of data points are having a large impact on both models.

Exercise 5:

  1. Build a predictive model for the amount of hits for a team given Games played, Wins, Walks by batters, At bats, Fielding percentage, Outs Pitched (innings pitched x 3), Hits allowed, Earned runs allowed, Doubles. To solve this problem you will use team.csv.
    1. How does your model measure accuracy?
    2. What was the score for its accuracy?
    3. Choose two features and create a 3d plot of feature1, feature2, h.

In [ ]:
# Pairwise correlations of the team statistics, to guide feature
# selection for the hits model.
fig, ax0 = plt.subplots(num='Team Heatmap', figsize=(10, 8),
                        facecolor='white', edgecolor='black')

sns.heatmap(team.corr(), annot=False, fmt='.2f', linewidths=1,
            vmin=-1, vmax=1, cbar_kws={'orientation': 'vertical'},
            ax=ax0)

ax0.set_title('Team Dataset', fontsize=size['title'])
ax0.set_xticklabels(ax0.xaxis.get_majorticklabels(),
                    fontsize=size['label'], rotation=80)
ax0.set_yticklabels(ax0.yaxis.get_majorticklabels(),
                    fontsize=size['label'], rotation=0)

plt.show();

In [ ]:
# Candidate predictors for team hits (exercise 5).
features = [
    'g',        # games played
    'w',        # wins
    'bb',       # walks by batters
    'ab',       # at bats
    'fp',       # fielding percentage
    'ipouts',   # outs pitched (innings pitched x 3)
    'ha',       # hits allowed
    'er',       # earned runs allowed
    'double',   # doubles
]
model = ' + '.join(features)
team_model = smf.ols(formula=f'h ~ {model}', data=team).fit()
team_model.summary()
Findings
  • The feature bb (walks) is not statistically relevant and will be removed.

In [ ]:
# Walks (`bb`) was not statistically significant in the fit above, so
# drop it from the feature list before refitting.
features.remove('bb')

In [ ]:
# Refit the hits model without the dropped `bb` feature.
team_model = smf.ols(formula='h ~ ' + ' + '.join(features),
                     data=team).fit()
team_model.summary()

In [ ]:
# Residuals vs fitted values for the team hits model.
fig, ax0 = plt.subplots(num='Residual Plot', figsize=(8, 6),
                        facecolor='white', edgecolor='black')

ax0.scatter(team_model.fittedvalues, team_model.resid)

ax0.set_title('Team Model', fontsize=size['title'])
ax0.set_xlabel('Fitted Values', fontsize=size['label'])
ax0.set_ylabel('Raw Residuals', fontsize=size['label'])

plt.show();

In [ ]:
fig = plt.figure('Influence Plot', figsize=(10, 10),
                 facecolor='white', edgecolor='black')
rows, cols = (1, 1)
ax0 = plt.subplot2grid((rows, cols), (0, 0))

# Team Model
influence = influence_plot(team_model, ax=ax0)

ax0.set_title('Team Model',
              fontsize=size['title'])

# Label this figure's axes. The original called `ax.set_xlabel`, but
# `ax` was a stale loop variable left over from an earlier cell, so
# the labels landed on the wrong axes.
ax0.set_xlabel('H Leverage',
               fontsize=size['label'])
ax0.set_ylabel('Studentized Residuals',
               fontsize=size['label'])

plt.show();

In [ ]:
with sns.axes_style("white"):
    fig = plt.figure('At Bats, Wins, Games Played Scatter',
                     figsize=(10, 6), facecolor='white',
                     edgecolor='black')
    ax = Axes3D(fig)
    ax.view_init(15, 45)

    # Color each (at bats, wins, games) point by the team's hits.
    sc = ax.scatter(team.ab, team.w, team.g,
                    c=team.h, cmap='gnuplot',
                    vmin=team.h.min(), vmax=team.h.max())
    plt.colorbar(sc)

# Plain string: the original f-string had no placeholders.
ax.set_title('Data Colored by Hits',
             fontsize=size['title'], y=1.02)
ax.set_xlabel('\nAt Bats', fontsize=size['label'])
ax.set_ylabel('\nWins', fontsize=size['label'])
ax.set_zlabel('\nGames Played', fontsize=size['label'])

plt.suptitle('Team\nAt Bats vs Wins vs Games Played',
             fontsize=size['super_title'], x=0.4, y=1.15)

plt.show();
Findings
  • The F-statistic is not equal to zero, so a relationship exists between hits and at least one of the features.
    • This is confirmed by the P-values for all features being zero.
      • All included features exhibit a relationship to hits.
    • The features with the largest coefficient confidence are:
      • At Bats
      • Wins
      • Hits Allowed
  • The model has an $R^2$ value of 0.957.
  • Numerous outliers are present in the data.
  • The leverage values do not indicate that any specific points have a disproportionate impact on the data.
  • This model is an appropriate representation of the data and may be used to make predictions.

Exercise 6:

  1. Build a similar model to predict average hits per year based on Games played, at bats and whether a player is a left or right handed batter. Consider only those players who are either left or right handed batters and for the moment do not worry about missing data or ambidextrous batters.

In [ ]:
# Attach each player's batting hand to their batting records and keep
# only the columns needed for the model.
batting_data = (batting
                .set_index('player_id')
                .join(player.set_index('player_id').loc[:, ['bats']])
                .loc[:, ['year', 'g', 'ab', 'bats', 'h']])
batting_data.info()

In [ ]:
# Preview the joined batting records.
batting_data.head()

In [ ]:
# Season totals per player. `bats` is excluded from the aggregation:
# it is a categorical (batting hand) and cannot meaningfully be
# summed — depending on the pandas version it would either raise or be
# silently dropped. The handedness is re-joined from the player table
# in a later cell.
year_avg = (batting_data
            .groupby(['player_id', 'year'])[['g', 'ab', 'h']]
            .sum()
            .reset_index())
year_avg.head()

In [ ]:
# Career-average games, at bats and hits for each player.
player_avg = (year_avg
              .groupby('player_id')[['g', 'ab', 'h']]
              .mean()
              .reset_index())
player_avg.head()

In [ ]:
# Join batting hand onto the per-player averages.
# NOTE(review): this rebinds `player`, clobbering the raw player table
# loaded earlier — the cell is not idempotent (a second run fails
# because `player_id` is no longer a column of `player`). A distinct
# name such as `player_summary` would be safer; kept as-is because
# downstream cells reference `player`.
player = (player_avg
          .set_index('player_id')
          .join(player.loc[:, ['player_id', 'bats']]
                .set_index('player_id')))
player.head()

In [ ]:
# Correlation heatmap of the per-player averages.
fig, ax0 = plt.subplots(num='Player Heatmap', figsize=(5, 4),
                        facecolor='white', edgecolor='black')

sns.heatmap(player.corr(), annot=True, fmt='.2f', linewidths=5,
            vmin=-1, vmax=1, cbar_kws={'orientation': 'vertical'},
            ax=ax0)

ax0.set_title('Player Dataset Correlation',
              fontsize=size['title'])
ax0.set_xticklabels(ax0.xaxis.get_majorticklabels(),
                    fontsize=size['label'], rotation=80)
ax0.set_yticklabels(ax0.yaxis.get_majorticklabels(),
                    fontsize=size['label'], rotation=0)

plt.show();

In [ ]:
# Pairwise scatter plots of the numeric player averages.
grid = sns.pairplot(player.dropna(),
                    diag_kws={'alpha': 0.5, 'bins': 10,
                              'edgecolor': 'black'},
                    plot_kws={'alpha': 0.7})

grid.fig.suptitle('Player Data Correlation',
                  fontsize=size['super_title'], y=1.03)

# Re-label the grid's outer axes with the numeric column names.
cols = (player
        .select_dtypes(exclude=['category'])
        .columns)
for n, col in enumerate(cols):
    grid.axes[len(cols) - 1, n].set_xlabel(col, fontsize=size['label'])
    grid.axes[n, 0].set_ylabel(col, fontsize=size['label'])

plt.show();

In [ ]:
# Model average hits from games played, at bats and batting hand
# (exercise 6).
predictors = [
    'g',
    'ab',
    'bats',
]

batting_model = smf.ols(formula='h ~ ' + ' + '.join(predictors),
                        data=player).fit()
batting_model.summary()
Findings
  • All of the features are statistically relevant.
  • The model accurately represents the data.

In [ ]:
# Residuals vs fitted values for the batting model.
fig, ax0 = plt.subplots(num='Residual Plot', figsize=(8, 6),
                        facecolor='white', edgecolor='black')

ax0.scatter(batting_model.fittedvalues, batting_model.resid,
            alpha=0.5)

ax0.set_title('Batting Model', fontsize=size['title'])
ax0.set_xlabel('Fitted Values', fontsize=size['label'])
ax0.set_ylabel('Raw Residuals', fontsize=size['label'])

plt.show();

In [ ]:
with sns.axes_style("white"):
    # Figure name repaired: the original was the truncated leftover
    # ', Wins, Games Played Scatter'.
    fig = plt.figure('Games Played, At Bats, Hits Scatter',
                     figsize=(10, 6), facecolor='white',
                     edgecolor='black')
    ax = Axes3D(fig)

    # Color each point by the player's batting-hand category code.
    sc = ax.scatter(batting_data.g, batting_data.ab, batting_data.h,
                    c=batting_data.bats.cat.codes, cmap='gnuplot')

    # NOTE(review): tick labels assume codes map as -1=missing, 0='B',
    # 1='L', 2='R' (alphabetical category order) — confirm against
    # batting_data.bats.cat.categories.
    cbar = plt.colorbar(sc, ticks=[-1, 0, 1, 2])
    cbar.ax.set_yticklabels(['N/A', 'Both', 'Left', 'Right'],
                            fontsize=size['legend'])

# Plain string: the original f-string had no placeholders.
ax.set_title('Data Colored by Batting Hand',
             fontsize=size['title'], y=1.02)
ax.set_xlabel('\nGames Played', fontsize=size['label'])
ax.set_ylabel('\nAt Bats', fontsize=size['label'])
ax.set_zlabel('\nHits', fontsize=size['label'])

plt.suptitle('Batter\nGames Played vs At Bats vs Hits',
             fontsize=size['super_title'], x=0.4, y=1.15)

plt.show();