In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import pandas as pd
import numpy as np
import pkg_resources
import matplotlib.pyplot as plt
import seaborn as sns
import time
import scipy.stats as stats
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.layers import Input
from keras.layers import Conv1D
from keras.layers import MaxPooling1D
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Dense
from keras.optimizers import RMSprop
from keras.models import Model
%matplotlib inline
# autoreload makes it easier to interactively work on code in imported libraries
%load_ext autoreload
%autoreload 2
In [2]:
# These files will be provided to tutorial participants via Google Cloud Storage
train_v1_df = pd.read_csv('../input/fat-star-tutorial-data/public_train_v1.csv')
validate_df = pd.read_csv('../input/fat-star-tutorial-data/public_validate.csv')
test_df = pd.read_csv('../input/fat-star-tutorial-data/public_test.csv')
Let's examine some rows in these datasets. Note that columns like toxicity and male contain fractional scores between 0.0 and 1.0, representing the fraction of raters who applied that label. We query for "male >= 0" to exclude rows where the male identity is not labeled.
In [3]:
train_v1_df[['toxicity', 'male', 'comment_text']].query('male >= 0').head()
Out[3]:
We need to convert the toxicity and identity columns to booleans so they can be used by our neural net and metric calculations. For this tutorial, we will treat any value >= 0.5 as True (i.e. a comment is considered toxic if 50% or more of the crowd raters labeled it as toxic). Note that this code also converts missing identity fields to False.
In [4]:
# List all identities
identity_columns = [
'male', 'female', 'transgender', 'other_gender', 'heterosexual', 'homosexual_gay_or_lesbian',
'bisexual', 'other_sexual_orientation', 'christian', 'jewish', 'muslim', 'hindu', 'buddhist',
'atheist', 'other_religion', 'black', 'white', 'asian', 'latino', 'other_race_or_ethnicity',
'physical_disability', 'intellectual_or_learning_disability', 'psychiatric_or_mental_illness', 'other_disability']
def convert_to_bool(df, col_name):
    df[col_name] = np.where(df[col_name] >= 0.5, True, False)

for df in [train_v1_df, validate_df, test_df]:
    for col in ['toxicity'] + identity_columns:
        convert_to_bool(df, col)
train_v1_df[['toxicity', 'male', 'comment_text']].head()
Out[4]:
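As an optional sanity check (this cell is an addition, assuming the conversion above ran cleanly), we can look at what fraction of comments in each split is now labeled toxic:

In [ ]:
# Optional sanity check: fraction of comments labeled toxic in each split
# after the boolean conversion above (mean of a boolean column is the True fraction).
for name, df in [('train_v1', train_v1_df), ('validate', validate_df), ('test', test_df)]:
    print(name, df['toxicity'].mean())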
This code creates and trains a convolutional neural net using the Keras framework. The neural net accepts a text comment, encoded as a sequence of integers, and outputs a probability that the comment is toxic. Don't worry if you do not understand all of this code, as we will be treating this neural net as a black box later in the tutorial.
In [5]:
MAX_SEQUENCE_LENGTH = 250
MAX_NUM_WORDS = 10000
TOXICITY_COLUMN = 'toxicity'
TEXT_COLUMN = 'comment_text'
EMBEDDINGS_PATH = '../data/glove.6B/glove.6B.100d.txt'
EMBEDDINGS_DIMENSION = 100
DROPOUT_RATE = 0.3
LEARNING_RATE = 0.00005
NUM_EPOCHS = 1 # TODO: increase this
BATCH_SIZE = 128
In [6]:
def pad_text(texts, tokenizer):
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_SEQUENCE_LENGTH)

def train_model(train_df, validate_df, tokenizer):
    # Prepare data
    train_text = pad_text(train_df[TEXT_COLUMN], tokenizer)
    train_labels = to_categorical(train_df[TOXICITY_COLUMN])
    validate_text = pad_text(validate_df[TEXT_COLUMN], tokenizer)
    validate_labels = to_categorical(validate_df[TOXICITY_COLUMN])

    # Load embeddings
    embeddings_index = {}
    with open(EMBEDDINGS_PATH) as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    embedding_matrix = np.zeros((len(tokenizer.word_index) + 1,
                                 EMBEDDINGS_DIMENSION))
    num_words_in_embedding = 0
    for word, i in tokenizer.word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            num_words_in_embedding += 1
            # Words not found in the embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    # Create model layers.
    def get_convolutional_neural_net_layers():
        """Returns (input_layer, output_layer)"""
        sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
        embedding_layer = Embedding(len(tokenizer.word_index) + 1,
                                    EMBEDDINGS_DIMENSION,
                                    weights=[embedding_matrix],
                                    input_length=MAX_SEQUENCE_LENGTH,
                                    trainable=False)
        x = embedding_layer(sequence_input)
        x = Conv1D(128, 5, activation='relu', padding='same')(x)
        x = MaxPooling1D(5, padding='same')(x)
        x = Conv1D(128, 5, activation='relu', padding='same')(x)
        x = MaxPooling1D(5, padding='same')(x)
        x = Conv1D(128, 5, activation='relu', padding='same')(x)
        x = MaxPooling1D(40, padding='same')(x)
        x = Flatten()(x)
        x = Dropout(DROPOUT_RATE)(x)
        x = Dense(128, activation='relu')(x)
        preds = Dense(2, activation='softmax')(x)
        return sequence_input, preds

    # Compile model.
    input_layer, output_layer = get_convolutional_neural_net_layers()
    model = Model(input_layer, output_layer)
    model.compile(loss='categorical_crossentropy',
                  optimizer=RMSprop(lr=LEARNING_RATE),
                  metrics=['acc'])

    # Train model.
    model.fit(train_text,
              train_labels,
              batch_size=BATCH_SIZE,
              epochs=NUM_EPOCHS,
              validation_data=(validate_text, validate_labels),
              verbose=2)
    return model
In [7]:
MODEL_NAME_V1 = 'fat_star_tutorial_v1'
tokenizer_v1 = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_v1.fit_on_texts(train_v1_df[TEXT_COLUMN])
model_v1 = train_model(train_v1_df, validate_df, tokenizer_v1)
In [ ]:
test_comments_padded = pad_text(test_df[TEXT_COLUMN], tokenizer_v1)
# Column 1 of the softmax output is the predicted probability that the comment is toxic.
test_df[MODEL_NAME_V1] = model_v1.predict(test_comments_padded)[:, 1]
In [ ]:
# Print some records to compare our model results with the correct labels
test_df[[TOXICITY_COLUMN, TEXT_COLUMN, MODEL_NAME_V1]].head(10)
In [ ]:
# Get a list of identity columns that have >= 100 True records. This will remove groups such
# as "other_disability" which do not have enough records to calculate meaningful metrics.
identities_with_over_100_records = []
for identity in identity_columns:
    num_records = len(test_df.query(identity + '==True'))
    if num_records >= 100:
        identities_with_over_100_records.append(identity)
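As a quick check (an optional inspection step, not required for the rest of the tutorial), we can see which identities were kept and how many positive examples each has in the test set:

In [ ]:
# Optional check: number of True records per retained identity in the test set.
for identity in identities_with_over_100_records:
    print(identity, test_df[identity].sum())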
In [ ]:
def compute_normalized_pinned_auc(df, subgroup, model_name):
    subgroup_non_toxic = df[df[subgroup] & ~df[TOXICITY_COLUMN]]
    subgroup_toxic = df[df[subgroup] & df[TOXICITY_COLUMN]]
    background_non_toxic = df[~df[subgroup] & ~df[TOXICITY_COLUMN]]
    background_toxic = df[~df[subgroup] & df[TOXICITY_COLUMN]]
    within_subgroup_mwu = normalized_mwu(subgroup_non_toxic, subgroup_toxic, model_name)
    cross_negative_mwu = normalized_mwu(subgroup_non_toxic, background_toxic, model_name)
    cross_positive_mwu = normalized_mwu(background_non_toxic, subgroup_toxic, model_name)
    return np.mean([1 - within_subgroup_mwu, 1 - cross_negative_mwu, 1 - cross_positive_mwu])
def normalized_mwu(data1, data2, model_name):
    """Returns the fraction of (data1, data2) pairs where the data1 score is higher than the data2 score."""
    scores_1 = data1[model_name]
    scores_2 = data2[model_name]
    n1 = len(scores_1)
    n2 = len(scores_2)
    u, _ = stats.mannwhitneyu(scores_1, scores_2, alternative='less')
    return u / (n1 * n2)
def compute_pinned_auc(df, identity, model_name):
    # Create combined_df, containing an equal number of comments that refer to the identity, and
    # that belong to the background distribution.
    identity_df = df[df[identity]]
    nonidentity_df = df[~df[identity]].sample(len(identity_df), random_state=25)
    combined_df = pd.concat([identity_df, nonidentity_df])

    # Calculate the Pinned AUC
    true_labels = combined_df[TOXICITY_COLUMN]
    predicted_labels = combined_df[model_name]
    return metrics.roc_auc_score(true_labels, predicted_labels)

def get_bias_metrics(df, model_name):
    bias_metrics_df = pd.DataFrame({
        'subgroup': identities_with_over_100_records,
        'pinned_auc': [compute_pinned_auc(df, identity, model_name)
                       for identity in identities_with_over_100_records],
        'normalized_pinned_auc': [compute_normalized_pinned_auc(df, identity, model_name)
                                  for identity in identities_with_over_100_records]
    })
    # Re-order columns and sort bias metrics
    return bias_metrics_df[['subgroup', 'pinned_auc', 'normalized_pinned_auc']].sort_values('pinned_auc')

def calculate_overall_auc(df, model_name):
    true_labels = df[TOXICITY_COLUMN]
    predicted_labels = df[model_name]
    return metrics.roc_auc_score(true_labels, predicted_labels)
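Roughly speaking, each "1 - normalized_mwu(non_toxic, toxic, model)" term above estimates the probability that a toxic comment receives a higher score than a non-toxic comment, i.e. an AUC restricted to those two sets of comments. The normalized pinned AUC for a subgroup is then the average of three such terms:

normalized_pinned_auc = mean of [ AUC(subgroup non-toxic vs. subgroup toxic), AUC(subgroup non-toxic vs. background toxic), AUC(background non-toxic vs. subgroup toxic) ]

Values close to 1.0 mean the model separates toxic from non-toxic comments well for that subgroup, both within the subgroup and relative to the background; lower values suggest unintended bias against the subgroup.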
In [ ]:
bias_metrics_df = get_bias_metrics(test_df, MODEL_NAME_V1)
bias_metrics_df
In [ ]:
calculate_overall_auc(test_df, MODEL_NAME_V1)
We can graph a histogram of comment scores for each identity. In the following graphs, the X axis represents the toxicity score assigned by our model, and the Y axis represents the normalized density of comments. Blue curves are comments whose true label is non-toxic, while red curves are those whose true label is toxic.
We can see that for some identities, such as Asian, the model scores most non-toxic comments below 0.2 and most toxic comments above 0.2. This indicates that for the Asian identity, our model is able to distinguish between toxic and non-toxic comments. However, for the black identity, there are many non-toxic comments with scores over 0.5, along with many toxic comments with scores below 0.5. This shows that for the black identity, our model is less accurate at separating toxic comments from non-toxic comments.
In [ ]:
# Plot toxicity distributions of different identities to visualize bias.
def plot_histogram(identity):
    toxic_scores = test_df.query(identity + ' == True & toxicity == True')[MODEL_NAME_V1]
    non_toxic_scores = test_df.query(identity + ' == True & toxicity == False')[MODEL_NAME_V1]
    plt.figure()  # Start a new figure so each identity gets its own plot.
    sns.distplot(non_toxic_scores, color="skyblue", axlabel=identity)
    sns.distplot(toxic_scores, color="red", axlabel=identity)

for identity in bias_metrics_df['subgroup']:
    plot_histogram(identity)
One possible reason for bias in the model is that the training data itself is biased. In our case, the initial training data contained a higher proportion of toxic vs. non-toxic comments for the "homosexual_gay_or_lesbian" identity. We have another dataset which contains additional non-toxic comments that refer to the "homosexual_gay_or_lesbian" group. If we train a new model using this data, we should see a small improvement in bias against this category (TODO: verify this).
In [ ]:
# Load new training data and convert fields to booleans.
train_v2_df = pd.read_csv('../input/fat-star-tutorial-data/public_train_v2.csv')
for col in ['toxicity'] + identity_columns:
    convert_to_bool(train_v2_df, col)
# Create a new model using the same structure as our model_v1.
MODEL_NAME_V2 = 'fat_star_tutorial_v2'
tokenizer_v2 = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer_v2.fit_on_texts(train_v2_df[TEXT_COLUMN])
model_v2 = train_model(train_v2_df, validate_df, tokenizer_v2)
In [ ]:
test_comments_padded_v2 = pad_text(test_df[TEXT_COLUMN], tokenizer_v2)
test_df[MODEL_NAME_V2] = model_v2.predict(test_comments_padded_v2)[:, 1]
In [ ]:
bias_metrics_v2_df = get_bias_metrics(test_df, MODEL_NAME_V2)
bias_metrics_v2_df
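To make the before/after comparison easier to read, the two bias metric tables can be joined on subgroup (an optional convenience step; the comparison_df name below is just for illustration):

In [ ]:
# Optional: join the v1 and v2 bias metrics so the per-subgroup change is visible side by side.
comparison_df = bias_metrics_df.merge(bias_metrics_v2_df, on='subgroup', suffixes=('_v1', '_v2'))
comparison_df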