In [1]:
## imports

%matplotlib inline
%config InlineBackend.figure_format='retina'
# %load_ext autoreload
# the "1" means: always reload modules marked with "%aimport"
# %autoreload 1

from __future__ import absolute_import, division, print_function
import matplotlib as mpl
from matplotlib import pyplot as plt
from matplotlib.pyplot import GridSpec
import seaborn as sns
# import mplsvds
# import mpld3
import numpy as np
import pandas as pd
import os, sys
from tqdm import tqdm
import warnings

# use seaborn's "poster" plotting context with font sizes scaled by 1.3;
# this is a global setting, so it affects the figures drawn after this cell runs
sns.set_context("poster", font_scale=1.3)


/Users/ben/miniconda3/lib/python3.4/site-packages/IPython/html.py:14: ShimWarning: The `IPython.html` package has been deprecated. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.
  "`IPython.html.widgets` has moved to `ipywidgets`.", ShimWarning)

In [2]:
# load the gzipped, disaggregated CSV and expose its 'Alpha_3' column
# under the more descriptive name 'playerCountry'
dfd = (
    pd.read_csv("../data/redcard/crowdstorm_disaggregated.csv.gz", compression='gzip')
      .rename(columns={'Alpha_3': 'playerCountry'})
)

Data Structure

The dataset is available as a list with 146,028 dyads of players and referees and includes details from players, details from referees and details regarding the interactions of player-referees. A summary of the variables of interest can be seen below. A detailed description of all variables included can be seen in the README file on the project website. -- https://docs.google.com/document/d/1uCF5wmbcL90qvrk_J27fWAvDcDNrO9o_APkicwRkOKc/edit

Variable Name: Variable Description:
playerShort short player ID
player player name
club player club
leagueCountry country of player club (England, Germany, France, and Spain)
height player height (in cm)
weight player weight (in kg)
position player position
games number of games in the player-referee dyad
goals number of goals in the player-referee dyad
yellowCards number of yellow cards player received from the referee
yellowReds number of yellow-red cards player received from the referee
redCards number of red cards player received from the referee
photoID ID of player photo (if available)
rater1 skin rating of photo by rater 1
rater2 skin rating of photo by rater 2
refNum unique referee ID number (referee name removed for anonymizing purposes)
refCountry unique referee country ID number
meanIAT mean implicit bias score (using the race IAT) for referee country
nIAT sample size for race IAT in that particular country
seIAT standard error for mean estimate of race IAT
meanExp mean explicit bias score (using a racial thermometer task) for referee country
nExp sample size for explicit bias in that particular country
seExp standard error for mean estimate of explicit bias measure

In [3]:
# preview the first five rows of dfd (head() is called with its default row count)
dfd.head()


Out[3]:
playerShort player club leagueCountry birthday height weight position games victories ... meanIAT nIAT seIAT meanExp nExp seExp skintone allreds allredsStrict refCount
0 lucas-wilchez Lucas Wilchez Real Zaragoza Spain 31.08.1983 177.0 72.0 Attacking Midfielder 1 0 ... 0.326391 712.0 0.000564 0.396000 750.0 0.002696 0.375 0 0 1
1 john-utaka John Utaka Montpellier HSC France 08.01.1982 179.0 82.0 Right Winger 1 0 ... 0.203375 40.0 0.010875 -0.204082 49.0 0.061504 0.750 0 0 1
2 abdon-prats Abdón Prats RCD Mallorca Spain 17.12.1992 181.0 79.0 NaN 1 0 ... 0.369894 1785.0 0.000229 0.588297 1897.0 0.001002 NaN 0 0 3
3 pablo-mari Pablo Marí RCD Mallorca Spain 31.08.1993 191.0 87.0 Center Back 1 1 ... 0.369894 1785.0 0.000229 0.588297 1897.0 0.001002 NaN 0 0 3
4 ruben-pena Rubén Peña Real Valladolid Spain 18.07.1991 172.0 70.0 Right Midfielder 1 1 ... 0.369894 1785.0 0.000229 0.588297 1897.0 0.001002 NaN 0 0 3

5 rows × 32 columns


In [4]:
# (number of rows, number of columns) of the loaded frame
dfd.shape


Out[4]:
(426572, 32)

In [5]:
# list every column label in dfd (after the Alpha_3 -> playerCountry rename)
dfd.columns


Out[5]:
Index(['playerShort', 'player', 'club', 'leagueCountry', 'birthday', 'height',
       'weight', 'position', 'games', 'victories', 'ties', 'defeats', 'goals',
       'yellowCards', 'yellowReds', 'redCards', 'photoID', 'rater1', 'rater2',
       'refNum', 'refCountry', 'playerCountry', 'meanIAT', 'nIAT', 'seIAT',
       'meanExp', 'nExp', 'seExp', 'skintone', 'allreds', 'allredsStrict',
       'refCount'],
      dtype='object')

In [6]:
## count the distinct players, identified by their short player ID
# array of the distinct 'playerShort' values, in order of first appearance
players = pd.unique(dfd['playerShort'])
len(players)


Out[6]:
2053

In [7]:
## how many distinct refs in the dataset?
# nunique() counts the distinct referee IDs; with pandas' default (dropna=True)
# missing values are not counted
dfd['refNum'].nunique()


Out[7]:
3147

In [8]:
## build a per-player table of skin tone ratings to study their distribution
# keep the player ID plus the rating columns, then collapse the
# repeated rows down to the distinct combinations
rating_columns = ['playerShort', 'rater1', 'rater2', 'skintone']
player_df = dfd[rating_columns].drop_duplicates()

In [9]:
# sanity check: player_df must hold exactly one row per distinct player.
# The expected row count is derived from dfd rather than hard-coded
# (it is 2,053 for the file loaded above), so the check does not have to be
# edited when the data file changes, and the uniqueness claim in the error
# message is tested directly.
assert player_df['playerShort'].is_unique, "Error: player_df is not unique by player."
assert player_df.shape[0] == dfd['playerShort'].nunique(), \
    "Error: player_df does not contain one row for every player in dfd."

In [35]:
# preview the first five rows of the deduplicated per-player ratings table
player_df.head()


Out[35]:
playerShort rater1 rater2 skintone
0 lucas-wilchez 0.25 0.50 0.375
1 john-utaka 0.75 0.75 0.750
2 abdon-prats NaN NaN NaN
3 pablo-mari NaN NaN NaN
4 ruben-pena NaN NaN NaN

In [36]:
## visualize the distribution of skin tone among players, one panel per rater
# (ratings column, panel title) for each of the two side-by-side panels
rater_panels = [
    ('rater1', "Rater 1's Ratings"),
    ('rater2', "Rater 2's Ratings"),
]
fig, axes = plt.subplots(1, 2, figsize=(16,8), sharey=True)
for panel_ax, (rater_col, panel_title) in zip(axes, rater_panels):
    # bar height = number of players that received each rating from this rater
    player_df.groupby(rater_col)['playerShort'].count().plot(kind='bar', ax=panel_ax)
    panel_ax.set_xlabel("Skintone")
    panel_ax.set_title(panel_title)
# the y axis is shared, so only the left-hand panel gets a y label
axes[0].set_ylabel("Count of Players")
fig.tight_layout();



In [42]:
## visualize the distribution of the 'skintone' column (the "Avg. Skintone Rating")
fig, ax = plt.subplots(figsize=(14,8))
# number of players at each distinct 'skintone' value
skintone_counts = player_df.groupby('skintone')['playerShort'].count()
skintone_counts.plot(kind='bar', ax=ax)
ax.set_title("Avg. Skintone Rating of Players")
ax.set_ylabel("Count of Players")
ax.set_xlabel("Skintone")
fig.tight_layout()



In [12]:
## total yellow, yellow-red and red cards received per player, as a function of skintone
# sum every card column over all rows of each (player, skintone) pair
card_columns = ['yellowCards', 'yellowReds', 'redCards', 'allreds']
g = dfd.groupby(['playerShort', 'skintone'], as_index=False)[card_columns].sum()

# (column to plot, panel title, y-axis limits) for each stacked panel, top to bottom
total_panels = [
    ('yellowCards', "Yellow Cards (Total)", (-2, 800)),
    ('yellowReds', "Yellow-Red Cards (Total)", (-2, 30)),
    ('redCards', "Red Cards (Total)", (-2, 30)),
]

fig, axes = plt.subplots(3, 1, figsize=(14,12), sharex=True)
for ax, (card_col, panel_title, y_range) in zip(axes, total_panels):
    # one box per skintone value
    g[['skintone', card_col]].boxplot(by='skintone', ax=ax)
    ax.set_ylim(y_range)
    ax.set_title(panel_title, size=14)

# the x axis is shared: blank the x labels of the upper panels and
# label only the bottom one
for upper_ax in axes[:-1]:
    upper_ax.set_xlabel("")
axes[-1].set_xlabel("Avg. Skintone", size=14)

# clear the figure-level title before tightening the layout
fig.suptitle("")
fig.tight_layout();



In [15]:
## normalise the card totals by games played: cards per game for each player

# total number of games per player, summed over all of that player's rows
total_games_by_player = dfd.groupby(['playerShort'], as_index=False)['games'].sum()

# attach each player's game total to the per-player card totals
games_and_cards = pd.merge(g, total_games_by_player, on='playerShort')

# cards per game = card total / total games; the new columns are added in this order
for card_col, norm_col in [('yellowCards', 'yellowNorm'),
                           ('yellowReds', 'yellowRedNorm'),
                           ('redCards', 'redNorm')]:
    games_and_cards[norm_col] = games_and_cards[card_col] / games_and_cards['games']

# (column to plot, panel title, y-axis limits) for each stacked panel, top to bottom
per_game_panels = [
    ('yellowNorm', "Yellow Cards (Per Game)", (-0.01, .4)),
    ('yellowRedNorm', "Yellow-Red Cards (Per Game)", (-0.001, .03)),
    ('redNorm', "Red Cards (Per Game)", (-0.001, .03)),
]

fig, axes = plt.subplots(3, 1, figsize=(14,12), sharex=True)
for ax, (norm_col, panel_title, y_range) in zip(axes, per_game_panels):
    # one box per skintone value
    games_and_cards[['skintone', norm_col]].boxplot(by='skintone', ax=ax)
    ax.set_ylim(y_range)
    ax.set_title(panel_title, size=14)

# the x axis is shared: blank the x labels of the upper panels and
# label only the bottom one
for upper_ax in axes[:-1]:
    upper_ax.set_xlabel("")
axes[-1].set_xlabel("Avg. Skintone", size=14)

# clear the figure-level title before tightening the layout
fig.suptitle("")
fig.tight_layout();



In [33]:
## if we divide players into light (skintone < 0.5) and dark (skintone > 0.5), what do we see?

def binary_skintone(s):
    """Collapse a numeric skintone rating into a binary label.

    Returns 'light' when ``s < 0.5`` and 'dark' when ``s > 0.5``.
    Any other value yields None: that is exactly 0.5, and also NaN,
    because both comparisons evaluate to False for NaN.
    """
    if s < 0.5:
        return 'light'
    if s > 0.5:
        return 'dark'
    # neither comparison held: s is exactly 0.5 (or NaN)
    return None
    
# label every player as 'light' / 'dark' (None when neither applies)
games_and_cards['skintone_binary'] = games_and_cards['skintone'].apply(binary_skintone)
# same figures as above, restricted to the players that received a binary label
bin_only_df = games_and_cards.dropna(subset=['skintone_binary'])

# (column to plot, panel title, y-axis limits) for each panel, left to right
binary_panels = [
    ('yellowNorm', "Yellow Cards (Per Game)", (-0.01, .4)),
    ('yellowRedNorm', "Yellow-Red Cards (Per Game)", (-0.001, .02)),
    ('redNorm', "Red Cards (Per Game)", (-0.001, .02)),
]

fig, axes = plt.subplots(1, 3, figsize=(14,5), sharex=True)
for ax, (norm_col, panel_title, y_range) in zip(axes, binary_panels):
    # one box per binary skintone label
    bin_only_df[['skintone_binary', norm_col]].boxplot(by='skintone_binary', ax=ax)
    ax.set_ylim(y_range)
    ax.set_title(panel_title, size=14)
    ax.set_xlabel("Skintone (Binary)", size=14)

# clear the figure-level title before tightening the layout
fig.suptitle("")
fig.tight_layout();