In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import datetime
import os
import pandas as pd
import numpy as np
import pkg_resources
import matplotlib.pyplot as plt
import seaborn as sns
import time
import scipy.stats as stats
from sklearn import metrics

%matplotlib inline

# autoreload makes it easier to interactively work on code in imported libraries
%load_ext autoreload
%autoreload 2

# Set pandas display options so we can read more of the comment text.
pd.set_option('max_colwidth', 300)

import model_bias_analysis

In [2]:
# Print python version
import sys
print(sys.version)


2.7.13 (default, Nov 24 2017, 17:33:09) 
[GCC 6.3.0 20170516]

Load and pre-process Madlibs data


In [3]:
# Read madlibs dataset, convert Label to Booleans
madlibs_df = pd.read_csv('eval_datasets/bias_madlibs_77k_scored_prod_models.csv')
madlibs_df.loc[madlibs_df.Label == 'BAD', 'Label'] = True
madlibs_df.loc[madlibs_df.Label == 'NOT_BAD', 'Label'] = False
# Needed for AUC metrics to work properly
madlibs_df['Label'] = madlibs_df['Label'].astype(bool)

In [4]:
# Add boolean identity columns, e.g. "hug american" should have True in the "american" column.
madlibs_identity_terms = model_bias_analysis.read_identity_terms('bias_madlibs_data/adjectives_people.txt')
model_bias_analysis.add_subgroup_columns_from_text(madlibs_df, 'Text', madlibs_identity_terms)
madlibs_df.head()


Out[4]:
Label Template Text id RockV6_2:TOXICITY Rock:TOXICITY lesbian gay bisexual transgender ... older young younger teenage millenial middle aged elderly blind deaf paralyzed
0 True verb_adj murder american 0 0.837852 0.811245 False False False False ... False False False False False False False False False False
1 False verb_adj respect trans 1 0.066053 0.251806 False False False False ... False False False False False False False False False False
2 False verb_adj respect homosexual 2 0.188938 0.831167 False False False False ... False False False False False False False False False False
3 False verb_adj hug mexican 3 0.059981 0.098826 False False False False ... False False False False False False False False False False
4 True verb_adj kill sikh 4 0.743793 0.888041 False False False False ... False False False False False False False False False False

5 rows × 56 columns
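For reference, a minimal sketch of what add_subgroup_columns_from_text is assumed to do: add one boolean column per identity term, set to True when that term appears in the comment text. The word-boundary matching below is an assumption; the real logic lives in model_bias_analysis.

import re

def add_subgroup_columns_sketch(df, text_column, terms):
    # Hypothetical stand-in for model_bias_analysis.add_subgroup_columns_from_text:
    # mark each row True if the identity term appears in the text (case-insensitive).
    for term in terms:
        pattern = re.compile(r'\b%s\b' % re.escape(term), re.IGNORECASE)
        df[term] = df[text_column].apply(
            lambda text, p=pattern: bool(p.search(str(text))))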

Load and pre-process Civil Comments data


In [5]:
def convert_to_bool_column(df, src_column, dst_column):
    df.loc[df[src_column] >= 0.5, dst_column] = True
    df.loc[df[src_column] < 0.5, dst_column] = False
    # convert NaNs to False
    df.loc[df[src_column].isnull(), dst_column] = False
    # Needed for AUC metrics to work properly
    df[dst_column] = df[dst_column].astype(bool)

In [6]:
# Read Civil Comments dataset, set Label column to be a Boolean toxicity value.
civil_comments_df = pd.read_csv('eval_datasets/civil_comments_scored_prod_models.csv')
convert_to_bool_column(civil_comments_df, 'toxicity', 'Label')

In [7]:
# Convert the identity label columns to booleans
civil_comments_identity_terms = [
    'male', 'female', 'transgender', 'other_gender', 'heterosexual',
    'homosexual_gay_or_lesbian', 'bisexual', 'other_sexual_orientation', 'christian',
    'jewish', 'muslim', 'hindu', 'buddhist', 'atheist', 'other_religion', 'black',
    'white', 'asian', 'latino', 'other_race_or_ethnicity',
    'physical_disability', 'intellectual_or_learning_disability',
    'psychiatric_or_mental_illness', 'other_disability']
for identity in civil_comments_identity_terms:
    convert_to_bool_column(civil_comments_df, identity, identity)
civil_comments_df.head()


Out[7]:
Unnamed: 0 Unnamed: 0.1 Unnamed: 0.1.1 article_id asian atheist bisexual black buddhist christian ... sexual_explicit threat toxicity toxicity_annotator_count transgender white wow Rock:TOXICITY RockV6_2:TOXICITY Label
0 0 0 0 138380 False False False False False False ... 0.000000 0.000000 0.783333 60 False False 0 0.623230 0.865134 True
1 1 1 1 160937 False False False False False False ... 0.000000 0.000000 0.628571 70 False False 0 0.773702 0.831807 True
2 2 2 2 366992 False False False False False False ... 0.028571 0.014286 0.571429 70 False False 0 0.544393 0.607978 True
3 3 3 3 351749 False False False False False False ... 0.000000 0.000000 0.675000 80 False False 0 0.683339 0.869096 True
4 4 4 4 147521 False False False False False False ... 0.000000 0.413333 0.800000 75 False False 0 0.876304 0.836591 True

5 rows × 51 columns


In [8]:
# Create another view of the data, limited to comments shorter than 100 characters.
civil_comments_short_df = civil_comments_df.loc[civil_comments_df['Text'].str.len() < 100]

Measure bias


In [9]:
# Calculate bias metrics on each dataset
madlibs_bias_metrics = model_bias_analysis.compute_bias_metrics_for_models(
    madlibs_df,
    madlibs_identity_terms,
    ['Rock:TOXICITY', 'RockV6_2:TOXICITY'],
    'Label')
civil_comments_bias_metrics = model_bias_analysis.compute_bias_metrics_for_models(
    civil_comments_df,
    civil_comments_identity_terms,
    ['Rock:TOXICITY', 'RockV6_2:TOXICITY'],
    'Label')

In [10]:
civil_comments_short_bias_metrics = model_bias_analysis.compute_bias_metrics_for_models(
    civil_comments_short_df,
    civil_comments_identity_terms,
    ['Rock:TOXICITY', 'RockV6_2:TOXICITY'],
    'Label')
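For context, the subgroup_auc, bpsn_auc, and bnsp_auc columns produced above follow the standard unintended-bias AUC definitions: the subgroup AUC restricts to comments mentioning the identity, BPSN mixes background positives with subgroup negatives, and BNSP mixes background negatives with subgroup positives. A hedged sketch of how one subgroup's values could be computed directly (the notebook's actual numbers come from model_bias_analysis.compute_bias_metrics_for_models):

from sklearn.metrics import roc_auc_score

def subgroup_aucs_sketch(df, subgroup, model, label='Label'):
    # Illustrative only; assumes boolean subgroup and label columns.
    in_group = df[df[subgroup]]
    background = df[~df[subgroup]]
    # Subgroup AUC: only comments that mention the subgroup.
    subgroup_auc = roc_auc_score(in_group[label], in_group[model])
    # BPSN AUC: background positives plus subgroup negatives.
    bpsn = pd.concat([background[background[label]], in_group[~in_group[label]]])
    bpsn_auc = roc_auc_score(bpsn[label], bpsn[model])
    # BNSP AUC: background negatives plus subgroup positives.
    bnsp = pd.concat([background[~background[label]], in_group[in_group[label]]])
    bnsp_auc = roc_auc_score(bnsp[label], bnsp[model])
    return subgroup_auc, bpsn_auc, bnsp_auc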

In [11]:
# Sort bias metrics by the average of the three Rock:TOXICITY bias AUCs
# (subgroup, BPSN, and BNSP AUC).
def sort_by_weighted_pinned_auc(bias_metrics):
    bias_metrics['weighted_pinned_auc'] = (
        bias_metrics['Rock:TOXICITY_bnsp_auc'] +
        bias_metrics['Rock:TOXICITY_bpsn_auc'] +
        bias_metrics['Rock:TOXICITY_subgroup_auc']) / 3
    bias_metrics = bias_metrics.sort_values('weighted_pinned_auc', ascending=True)
    return bias_metrics.drop('weighted_pinned_auc', axis=1)

madlibs_bias_metrics = sort_by_weighted_pinned_auc(madlibs_bias_metrics)
#civil_comments_bias_metrics = sort_by_weighted_pinned_auc(civil_comments_bias_metrics)
#civil_comments_short_bias_metrics = sort_by_weighted_pinned_auc(civil_comments_short_bias_metrics)

In [12]:
civil_comments_short_bias_metrics


Out[12]:
Rock:TOXICITY_bnsp_auc Rock:TOXICITY_bpsn_auc Rock:TOXICITY_negative_aeg Rock:TOXICITY_positive_aeg Rock:TOXICITY_subgroup_auc subgroup subset_size RockV6_2:TOXICITY_bnsp_auc RockV6_2:TOXICITY_bpsn_auc RockV6_2:TOXICITY_negative_aeg RockV6_2:TOXICITY_positive_aeg RockV6_2:TOXICITY_subgroup_auc
0 0.941721 0.912627 0.131592 -0.034134 0.905559 male 4874 0.950170 0.930810 0.115947 -0.047009 0.919983
1 0.936841 0.900076 0.171412 -0.043163 0.883369 female 4001 0.946961 0.918171 0.176532 -0.038343 0.899323
2 0.924908 0.823183 0.336985 -0.174747 0.740532 transgender 183 0.895972 0.889983 0.269538 -0.178842 0.768658
3 NaN NaN NaN NaN NaN other_gender 0 NaN NaN NaN NaN NaN
4 0.936999 0.727653 0.342266 -0.027626 0.721014 heterosexual 58 0.934094 0.874530 0.211589 -0.189727 0.793478
5 0.981018 0.574589 0.432685 0.105010 0.705605 homosexual_gay_or_lesbian 743 0.927494 0.824487 0.297307 -0.078862 0.768287
6 0.936755 0.821174 0.328169 -0.082697 0.825000 bisexual 18 0.911440 0.908253 0.178779 -0.182233 0.837500
7 NaN NaN NaN NaN NaN other_sexual_orientation 0 NaN NaN NaN NaN NaN
8 0.927412 0.931726 0.124032 -0.089159 0.908062 christian 2320 0.936972 0.944863 0.091696 -0.110184 0.920382
9 0.952931 0.793307 0.339744 -0.026733 0.794870 jewish 393 0.949480 0.840675 0.329010 -0.070221 0.807151
10 0.941178 0.812724 0.335553 -0.064606 0.782752 muslim 1624 0.929436 0.882686 0.264886 -0.130706 0.812552
11 0.965379 0.948592 0.124223 -0.017988 0.956522 hindu 26 0.961482 0.935407 0.084628 -0.179294 0.927536
12 0.980238 0.935431 0.074269 -0.021691 1.000000 buddhist 19 0.971272 0.958542 0.162026 -0.119254 1.000000
13 0.964900 0.880696 0.234939 -0.004288 0.913462 atheist 109 0.968360 0.906158 0.250211 -0.050035 0.936699
14 0.944304 0.940336 0.139779 -0.112832 0.932099 other_religion 33 0.956875 0.960683 0.137891 -0.042913 0.975309
15 0.945123 0.785825 0.354367 -0.047339 0.765804 black 1054 0.968965 0.754640 0.399983 -0.021438 0.775819
16 0.941012 0.807950 0.331320 -0.055697 0.783766 white 2230 0.959880 0.830020 0.372516 -0.080897 0.817256
17 0.918172 0.918779 0.141527 -0.109582 0.876223 asian 437 0.933219 0.901915 0.207158 -0.059013 0.857295
18 0.906234 0.896002 0.157164 -0.104974 0.826731 latino 170 0.936828 0.898580 0.164774 -0.101880 0.851154
19 0.937924 0.898786 0.199009 -0.148223 0.884921 other_race_or_ethnicity 37 0.973826 0.873768 0.323046 -0.041782 0.920635
20 NaN 0.925284 0.081078 NaN NaN physical_disability 7 NaN 0.892209 0.004097 NaN NaN
21 0.834910 0.913476 0.289715 -0.238664 0.571429 intellectual_or_learning_disability 11 0.816865 0.831496 0.291579 -0.234149 0.535714
22 0.950433 0.786838 0.358592 -0.053193 0.781171 psychiatric_or_mental_illness 495 0.950900 0.858471 0.286049 -0.065712 0.838466
23 NaN 0.880045 0.364894 NaN NaN other_disability 1 NaN 0.969800 0.237191 NaN NaN

In [13]:
# Define functions for printing bias metric heatmaps

N_AEG_COLORS = 12

aeg_negative_colors = sns.cubehelix_palette(N_AEG_COLORS,  # n_colors
                                            0.6,  # start at magenta
                                            0,  # no rotation
                                            3,  # gamma
                                            1,  # hue
                                            1,  # light
                                            reverse=True,
                                            as_cmap=False)[int(N_AEG_COLORS/2):]

aeg_positive_colors = sns.cubehelix_palette(N_AEG_COLORS,  # n_colors
                                            2,  # greenish
                                            0,  # no rotation
                                            3,  # gamma
                                            1,  # hue
                                            1,  # light
                                            reverse=False,
                                            as_cmap=False)[:int(N_AEG_COLORS/2)]
aeg_colors = aeg_negative_colors + aeg_positive_colors

def limit_subgroups(bias_metrics, subgroups):
    return bias_metrics.loc[bias_metrics.subgroup.isin(subgroups)]

def print_auc_heatmap(bias_metrics, subgroups, model, out=None, color_palette=None):
    bias_metrics_for_subgroups = limit_subgroups(bias_metrics, subgroups)
    if not color_palette:
        color_palette = sns.cubehelix_palette(100,  # n_colors
                                              0.6,  # start at magenta
                                              0,  # no rotation
                                              3,  # gamma
                                              1,  # hue
                                              1.2,  # light
                                              reverse=True,
                                              as_cmap=False)
    model_bias_analysis.plot_auc_heatmap(bias_metrics_for_subgroups, [model], color_palette, out=out)

def print_aeg_heatmap(bias_metrics, subgroups, model, out=None, color_palette=None):
    bias_metrics_for_subgroups = limit_subgroups(bias_metrics, subgroups)
    if not color_palette:
        color_palette = aeg_colors
    model_bias_analysis.plot_aeg_heatmap(bias_metrics_for_subgroups, [model], color_palette, out=out)

def get_identities_over_n(df, n, all_identities):
    # Return the identities that appear in more than n rows of df.
    results = []
    for identity in all_identities:
        num_records = len(df.query(identity + '==True'))
        if num_records > n:
            results.append(identity)
    return results
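The rendering itself is delegated to model_bias_analysis.plot_auc_heatmap and plot_aeg_heatmap. As a rough sketch of the assumed shape (not the actual implementation), an AUC heatmap can be produced by pivoting the metrics into a subgroup-by-metric matrix and passing it to seaborn:

def auc_heatmap_sketch(bias_metrics, model, color_palette):
    # Hypothetical equivalent of plot_auc_heatmap: one row per subgroup,
    # one column per AUC-based bias metric for the given model.
    cols = [model + '_subgroup_auc', model + '_bpsn_auc', model + '_bnsp_auc']
    matrix = bias_metrics.set_index('subgroup')[cols]
    plt.figure(figsize=(7, 0.4 * len(matrix)))
    sns.heatmap(matrix, annot=True, fmt='.3f', vmin=0.5, vmax=1.0,
                cmap=color_palette, cbar=False)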

In [14]:
# Print Civil Comments heatmaps. Limit identities to those with more than 100
# short comments written about them.
CIVIL_COMMENTS_HEATMAP_IDENTITIES = get_identities_over_n(civil_comments_short_df, 100, civil_comments_identity_terms)
print_auc_heatmap(civil_comments_bias_metrics, CIVIL_COMMENTS_HEATMAP_IDENTITIES, 'Rock:TOXICITY', '/tmp/civil_aucs_1.png')
print_auc_heatmap(civil_comments_bias_metrics, CIVIL_COMMENTS_HEATMAP_IDENTITIES, 'RockV6_2:TOXICITY', '/tmp/civil_aucs_6.png')
print_aeg_heatmap(civil_comments_bias_metrics, CIVIL_COMMENTS_HEATMAP_IDENTITIES, 'Rock:TOXICITY', '/tmp/civil_aegs_1.png')
print_aeg_heatmap(civil_comments_bias_metrics, CIVIL_COMMENTS_HEATMAP_IDENTITIES, 'RockV6_2:TOXICITY', '/tmp/civil_aegs_6.png')

# Print Civil Comments Short Comments heatmaps
print_auc_heatmap(civil_comments_short_bias_metrics, CIVIL_COMMENTS_HEATMAP_IDENTITIES, 'Rock:TOXICITY', '/tmp/civil_short_aucs_1.png')
print_auc_heatmap(civil_comments_short_bias_metrics, CIVIL_COMMENTS_HEATMAP_IDENTITIES, 'RockV6_2:TOXICITY', '/tmp/civil_short_aucs_6.png')
print_aeg_heatmap(civil_comments_short_bias_metrics, CIVIL_COMMENTS_HEATMAP_IDENTITIES, 'Rock:TOXICITY', '/tmp/civil_short_aegs_1.png')
print_aeg_heatmap(civil_comments_short_bias_metrics, CIVIL_COMMENTS_HEATMAP_IDENTITIES, 'RockV6_2:TOXICITY', '/tmp/civil_short_aegs_6.png')

# Print Madlibs heatmaps, using the same number of identities as the Civil Comments heatmaps.
num_identities = 20
MADLIB_HEATMAP_IDENTITIES = madlibs_identity_terms[:num_identities]
print_auc_heatmap(madlibs_bias_metrics, MADLIB_HEATMAP_IDENTITIES, 'Rock:TOXICITY', '/tmp/madlibs_aucs_1.png')
print_auc_heatmap(madlibs_bias_metrics, MADLIB_HEATMAP_IDENTITIES, 'RockV6_2:TOXICITY', '/tmp/madlibs_aucs_6.png')
print_aeg_heatmap(madlibs_bias_metrics, MADLIB_HEATMAP_IDENTITIES, 'Rock:TOXICITY', '/tmp/madlibs_aegs_1.png')
print_aeg_heatmap(madlibs_bias_metrics, MADLIB_HEATMAP_IDENTITIES, 'RockV6_2:TOXICITY', '/tmp/madlibs_aegs_6.png')

In [15]:
def get_percent_toxic(df):
    toxic_comments = df.query('Label == True')
    return len(toxic_comments) / len(df)
    

def print_count_and_percent_toxic(df, identity):
    # Select all comments where the identity column is True.
    identity_comments = df.query(identity + ' == True')
    # Conditions can also be combined with & (and), e.g.
    # df.query(identity + ' == True & Label == True') selects only the toxic ones.

    # Print the results.
    num_comments = len(identity_comments)
    percent_toxic = get_percent_toxic(identity_comments)
    print('%d comments refer to the %s identity, %.2f%% are toxic' % (
        num_comments,
        identity,
        # Multiply percent_toxic by 100 for easier reading.
        100 * percent_toxic))

for identity in civil_comments_identity_terms:
    print_count_and_percent_toxic(civil_comments_df, identity)
    
# Print the results for all comments
num_comments = len(civil_comments_df)
percent_toxic = get_percent_toxic(civil_comments_df)
print('%d comments for all identities, %.2f%% are toxic' % (
    num_comments,
    # multiply percent_toxic by 100 for easier reading.
    100 * percent_toxic))


44484 comments refer to the male identity, 15.03% are toxic
53429 comments refer to the female identity, 13.68% are toxic
2499 comments refer to the transgender identity, 21.29% are toxic
10 comments refer to the other_gender identity, 30.00% are toxic
1291 comments refer to the heterosexual identity, 22.77% are toxic
10997 comments refer to the homosexual_gay_or_lesbian identity, 28.38% are toxic
287 comments refer to the bisexual identity, 21.60% are toxic
10 comments refer to the other_sexual_orientation identity, 30.00% are toxic
40423 comments refer to the christian identity, 9.09% are toxic
7651 comments refer to the jewish identity, 16.21% are toxic
21006 comments refer to the muslim identity, 22.76% are toxic
580 comments refer to the hindu identity, 10.52% are toxic
588 comments refer to the buddhist identity, 11.73% are toxic
1412 comments refer to the atheist identity, 13.10% are toxic
325 comments refer to the other_religion identity, 14.46% are toxic
14901 comments refer to the black identity, 31.39% are toxic
25082 comments refer to the white identity, 28.06% are toxic
4578 comments refer to the asian identity, 12.36% are toxic
2004 comments refer to the latino identity, 19.36% are toxic
508 comments refer to the other_race_or_ethnicity identity, 18.70% are toxic
82 comments refer to the physical_disability identity, 7.32% are toxic
93 comments refer to the intellectual_or_learning_disability identity, 6.45% are toxic
4889 comments refer to the psychiatric_or_mental_illness identity, 21.07% are toxic
5 comments refer to the other_disability identity, 20.00% are toxic
1804875 comments for all identities, 8.00% are toxic
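Because the identity columns are already booleans, the same counts can also be computed without query strings; an equivalent vectorized sketch:

for identity in civil_comments_identity_terms:
    mask = civil_comments_df[identity]
    print('%d comments refer to the %s identity, %.2f%% are toxic' % (
        mask.sum(), identity, 100 * civil_comments_df.loc[mask, 'Label'].mean()))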

In [16]:
def plot_background_histogram(data, model, title, label_col='Label', out=None):
    fig = plt.figure()
    toxic_scores = data.query(label_col + ' == True')[model]
    toxic_scores = toxic_scores.rename(title)
    non_toxic_scores = data.query(label_col + ' == False')[model]
    non_toxic_scores = non_toxic_scores.rename(title)
    sns.distplot(non_toxic_scores, color='#2b6f39', bins=30)
    sns.distplot(toxic_scores, color='#d38fc5', bins=30)
    if out:
        fig.savefig(out, format='png', bbox_inches='tight')
    return fig
    
def plot_histogram(data, identity, model, title, label_col='Label', out=None):
    fig = plt.figure()
    toxic_scores = data.query(identity + ' == True & ' + label_col + ' == True')[model]
    toxic_scores = toxic_scores.rename(title)
    non_toxic_scores = data.query(identity + ' == True & '+ label_col + ' == False')[model]
    non_toxic_scores = non_toxic_scores.rename(title)
    sns.distplot(non_toxic_scores, color='#2b6f39', axlabel=title)
    sns.distplot(toxic_scores, color='#d38fc5', axlabel=title)
    if out:
        fig.savefig(out, format='png', bbox_inches='tight')
    return fig

In [17]:
plot_background_histogram(civil_comments_df, 'Rock:TOXICITY', 
                          'Background - All Comments - TOXICITY@1', out='/tmp/hist_background_all_1.png')
plot_background_histogram(civil_comments_df, 'RockV6_2:TOXICITY',
                          'Background - All Comments - TOXICITY@6', out='/tmp/hist_background_all_6.png')
plot_background_histogram(civil_comments_short_df, 'Rock:TOXICITY', 
                          'Background - Short Comments - TOXICITY@1', out='/tmp/hist_background_short_1.png')
plot_background_histogram(civil_comments_short_df, 'RockV6_2:TOXICITY',
                          'Background - Short Comments - TOXICITY@6', out='/tmp/hist_background_short_6.png')


Out[17]:
[Histograms of non-toxic vs. toxic score distributions: Background - All Comments and Short Comments, for TOXICITY@1 and TOXICITY@6]
In [18]:
plot_histogram(civil_comments_df, 'homosexual_gay_or_lesbian', 'Rock:TOXICITY', 
               'homosexual_gay_or_lesbian - All Comments - TOXICITY@1', out='/tmp/hist_hgl_all_1.png')
plot_histogram(civil_comments_df, 'homosexual_gay_or_lesbian', 'RockV6_2:TOXICITY',
               'homosexual_gay_or_lesbian - All Comments - TOXICITY@6', out='/tmp/hist_hgl_all_6.png')
plot_histogram(civil_comments_short_df, 'homosexual_gay_or_lesbian', 'Rock:TOXICITY', 
               'homosexual_gay_or_lesbian - Short Comments - TOXICITY@1', out='/tmp/hist_hgl_short_1.png')
plot_histogram(civil_comments_short_df, 'homosexual_gay_or_lesbian', 'RockV6_2:TOXICITY',
               'homosexual_gay_or_lesbian - Short Comments - TOXICITY@6', out='/tmp/hist_hgl_short_6.png')


Out[18]:
[Histograms of non-toxic vs. toxic score distributions for homosexual_gay_or_lesbian: All Comments and Short Comments, for TOXICITY@1 and TOXICITY@6]