In [1]:
## imports
%matplotlib inline
%config InlineBackend.figure_format='retina'
# %load_ext autoreload
# the "1" means: always reload modules marked with "%aimport"
# %autoreload 1
from __future__ import absolute_import, division, print_function
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib.pyplot import GridSpec
import seaborn as sns
# import mplsvds
# import mpld3
import numpy as np
import pandas as pd
import os, sys
from tqdm import tqdm
import warnings
# Apply seaborn's "poster" plotting context, with fonts scaled by 1.3, to the figures drawn below.
sns.set_context("poster", font_scale=1.3)
In [2]:
# Load the gzip-compressed dyad table and, in the same chain, rename the
# Alpha_3 column to playerCountry.
dfd = (
    pd.read_csv("../data/redcard/crowdstorm_disaggregated.csv.gz", compression='gzip')
    .rename(columns={'Alpha_3': 'playerCountry'})
)
The dataset is available as a list with 146,028 dyads of players and referees and includes details from players, details from referees and details regarding the interactions of player-referees. A summary of the variables of interest can be seen below. A detailed description of all variables included can be seen in the README file on the project website. -- https://docs.google.com/document/d/1uCF5wmbcL90qvrk_J27fWAvDcDNrO9o_APkicwRkOKc/edit
Variable Name: | Variable Description: |
---|---|
playerShort | short player ID |
player | player name |
club | player club |
leagueCountry | country of player club (England, Germany, France, and Spain) |
height | player height (in cm) |
weight | player weight (in kg) |
position | player position |
games | number of games in the player-referee dyad |
goals | number of goals in the player-referee dyad |
yellowCards | number of yellow cards player received from the referee |
yellowReds | number of yellow-red cards player received from the referee |
redCards | number of red cards player received from the referee |
photoID | ID of player photo (if available) |
rater1 | skin rating of photo by rater 1 |
rater2 | skin rating of photo by rater 2 |
refNum | unique referee ID number (referee name removed for anonymizing purposes) |
refCountry | unique referee country ID number |
meanIAT | mean implicit bias score (using the race IAT) for referee country |
nIAT | sample size for race IAT in that particular country |
seIAT | standard error for mean estimate of race IAT |
meanExp | mean explicit bias score (using a racial thermometer task) for referee country |
nExp | sample size for explicit bias in that particular country |
seExp | standard error for mean estimate of explicit bias measure |
In [3]:
# Preview the first rows of the loaded dyad table.
dfd.head()
Out[3]:
In [4]:
# (number of rows, number of columns) of the loaded table.
dfd.shape
Out[4]:
In [5]:
# List the column labels available in the table.
dfd.columns
Out[5]:
In [6]:
## how many distinct players in the dataset?
# keep the array of unique player IDs; its length is the number of players
players = pd.unique(dfd['playerShort'])
len(players)
Out[6]:
In [7]:
## how many distinct refs in the dataset?
# nunique() counts the distinct non-null referee IDs.
dfd['refNum'].nunique()
Out[7]:
In [8]:
## what's the distribution of skin tone among players?
# keep only the player ID and skintone rating columns, then collapse the
# repeated rows to one row per distinct combination
player_df = dfd.loc[
    :, ['playerShort', 'rater1', 'rater2', 'skintone']
].drop_duplicates()
In [9]:
# sanity check: player_df is expected to hold exactly 2,053 rows
assert len(player_df) == 2053, "Error: player_df is not unique by player."
In [35]:
# Preview the first rows of the de-duplicated player table.
player_df.head()
Out[35]:
In [36]:
## visualize the distribution of skin tone among players
# one bar chart per rater, side by side on a shared y-axis
fig, axes = plt.subplots(1, 2, figsize=(16,8), sharey=True)
rater_panels = [('rater1', "Rater 1's Ratings"), ('rater2', "Rater 2's Ratings")]
for ax, (rater_col, panel_title) in zip(axes, rater_panels):
    # number of players at each rating value given by this rater
    rating_counts = player_df.groupby(rater_col)['playerShort'].count()
    rating_counts.plot.bar(ax=ax)
    ax.set_xlabel("Skintone")
    ax.set_title(panel_title)
# the y-axis is shared, so only the left panel is labelled
axes[0].set_ylabel("Count of Players")
fig.tight_layout();
In [42]:
## visualize the average skin tone
fig, ax = plt.subplots(figsize=(14,8))
# number of players at each value of the combined 'skintone' column
# (presumably the mean of the two raters' scores -- confirm against the data README)
skintone_counts = player_df.groupby('skintone')['playerShort'].count()
skintone_counts.plot.bar(ax=ax)
ax.set_title("Avg. Skintone Rating of Players")
ax.set_xlabel("Skintone")
ax.set_ylabel("Count of Players")
fig.tight_layout()
In [12]:
## how many yellow, yellowRed, and red cards are given (total) to players as a function of skintone?
# per-player card totals (one row per playerShort/skintone pair)
card_cols = ['yellowCards', 'yellowReds', 'redCards', 'allreds']
g = dfd.groupby(['playerShort', 'skintone'], as_index=False)[card_cols].sum()

# one stacked panel per card type: (column, upper y-limit, panel title)
total_panels = [
    ('yellowCards', 800, "Yellow Cards (Total)"),
    ('yellowReds', 30, "Yellow-Red Cards (Total)"),
    ('redCards', 30, "Red Cards (Total)"),
]
fig, axes = plt.subplots(3, 1, figsize=(14,12), sharex=True)
for ax, (col, y_max, panel_title) in zip(axes, total_panels):
    g[['skintone', col]].boxplot(by='skintone', ax=ax)
    ax.set_ylim((-2, y_max))
    ax.set_title(panel_title, size=14)
    ax.set_xlabel("")
# only the bottom panel (the last `ax` from the loop) carries the x-axis label
ax.set_xlabel("Avg. Skintone", size=14)
# blank out the figure-level title that the grouped boxplot adds
fig.suptitle("")
fig.tight_layout();
In [15]:
## what about looking at avg cards-per-game for number of games played?
# total games per player, summed over all of that player's rows
total_games_by_player = dfd.groupby(['playerShort'], as_index=False)['games'].sum()
# attach the game totals to the per-player card totals
games_and_cards = pd.merge(g, total_games_by_player, on='playerShort')
# cards-per-game rate for each card type: (total column, rate column)
for card_col, rate_col in [('yellowCards', 'yellowNorm'),
                           ('yellowReds', 'yellowRedNorm'),
                           ('redCards', 'redNorm')]:
    games_and_cards[rate_col] = games_and_cards[card_col]/games_and_cards['games']

# one stacked panel per rate: (column, y-limits, panel title)
rate_panels = [
    ('yellowNorm', (-0.01, .4), "Yellow Cards (Per Game)"),
    ('yellowRedNorm', (-0.001, .03), "Yellow-Red Cards (Per Game)"),
    ('redNorm', (-0.001, .03), "Red Cards (Per Game)"),
]
fig, axes = plt.subplots(3, 1, figsize=(14,12), sharex=True)
for ax, (col, y_limits, panel_title) in zip(axes, rate_panels):
    games_and_cards[['skintone', col]].boxplot(by='skintone', ax=ax)
    ax.set_ylim(y_limits)
    ax.set_title(panel_title, size=14)
    ax.set_xlabel("")
# only the bottom panel (the last `ax` from the loop) carries the x-axis label
ax.set_xlabel("Avg. Skintone", size=14)
# blank out the figure-level title that the grouped boxplot adds
fig.suptitle("")
fig.tight_layout();
In [33]:
## if we divide players into light (skintone < 0.5) and dark (skintone > 0.5), what do we see?
def binary_skintone(s, threshold=0.5):
    """Collapse a numeric skintone rating into a 'light' / 'dark' label.

    Parameters
    ----------
    s : float or None
        Skintone rating to classify.
    threshold : float, optional
        Cut point between the two groups. Defaults to 0.5, the value that
        was previously hard-coded.

    Returns
    -------
    str or None
        'light' if ``s < threshold``, 'dark' if ``s > threshold``, and None
        when ``s`` equals the threshold or is missing (None / NaN).
    """
    # None has no meaningful ordering against a number (TypeError on Python 3,
    # silently "less than" on Python 2), so treat it as missing up front.
    if s is None:
        return None
    if s < threshold:
        return 'light'
    if s > threshold:
        return 'dark'
    # s == threshold, or NaN (every comparison with NaN is False)
    return None
# label every player as 'light' / 'dark' (None when exactly on the 0.5 boundary)
games_and_cards['skintone_binary'] = games_and_cards['skintone'].apply(binary_skintone)
# keep only the players that received a binary label
bin_only_df = games_and_cards.dropna(subset=['skintone_binary'])

# same per-game figures as above, grouped by the binary label:
# (column, y-limits, panel title)
binary_panels = [
    ('yellowNorm', (-0.01, .4), "Yellow Cards (Per Game)"),
    ('yellowRedNorm', (-0.001, .02), "Yellow-Red Cards (Per Game)"),
    ('redNorm', (-0.001, .02), "Red Cards (Per Game)"),
]
fig, axes = plt.subplots(1, 3, figsize=(14,5), sharex=True)
for ax, (col, y_limits, panel_title) in zip(axes, binary_panels):
    bin_only_df[['skintone_binary', col]].boxplot(by='skintone_binary', ax=ax)
    ax.set_ylim(y_limits)
    ax.set_title(panel_title, size=14)
    ax.set_xlabel("Skintone (Binary)", size=14)
# blank out the figure-level title that the grouped boxplot adds
fig.suptitle("")
fig.tight_layout();