In [0]:
!pip3 install --quiet "tensorflow>=1.11"
!pip3 install --quiet sentencepiece
In [0]:
import os
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn import metrics
import sentencepiece
import zipfile
from google.colab import auth
from google.colab import files
from IPython.display import HTML, display
Use Kaggle's My Account page to download a kaggle.json API token, then re-upload it here.
In [0]:
!mkdir -p /root/.kaggle
token_file = "/root/.kaggle/kaggle.json"
uploaded = files.upload()
with open(token_file, "wb") as f:
  f.write(uploaded["kaggle.json"])
os.chmod(token_file, 0o600)
In [0]:
import kaggle
Download the test set and extract the labeled portion
In [0]:
kaggle.api.competition_download_file('jigsaw-toxic-comment-classification-challenge', 'test.csv')
In [0]:
kaggle.api.competition_download_file('jigsaw-toxic-comment-classification-challenge', 'test_labels.csv')
In [0]:
test_labels = pd.read_csv('test_labels.csv.zip', index_col='id')
testset = test_labels.loc[test_labels['toxic'] != -1].join(
    pd.read_csv('test.csv.zip', index_col='id'))
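A quick sanity check (an addition, not part of the original notebook): confirm how many labeled comments remain after dropping the unscored rows and peek at a few of them.
In [0]:
print(testset.shape)
testset.head()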
Load the pre-trained toxicity model from Google Cloud Storage
In [0]:
auth.authenticate_user()
In [0]:
!mkdir -p tfjs_model
!gsutil -m cp -R gs://conversationai-public/public_models/tfjs/v1/* tfjs_model
In [0]:
predict_fn = tf.contrib.predictor.from_saved_model(
    'tfjs_model', signature_def_key='predict')
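Optionally, the saved_model_cli tool that ships with TensorFlow can list the signature defs and input tensors the SavedModel exposes; this inspection step is an optional addition, not part of the original notebook.
In [0]:
!saved_model_cli show --dir tfjs_model --all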
Load the SentencePiece model and preprocess the test data
In [0]:
sp = sentencepiece.SentencePieceProcessor()
sp.Load('tfjs_model/assets/universal_encoder_8k_spm.model')
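To illustrate what the model will be fed, here is a made-up example sentence: EncodeAsIds returns a list of integer token ids, which the scoring loop below packages as a sparse tensor via values, indices, and dense_shape. The sentence is arbitrary and only for illustration.
In [0]:
example = "You are a great person."  # arbitrary example sentence
ids = sp.EncodeAsIds(example)
print(ids)                                # token ids fed as 'values'
print([(0, i) for i in range(len(ids))])  # sparse 'indices' for a 1 x len(ids) input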
Score the sentences with the toxicity model
In [0]:
def progress(value, max=100):
  return HTML("""
      <progress
          value='{value}'
          max='{max}'
          style='width: 100%'
      >
        {value}
      </progress>
  """.format(value=value, max=max))
In [0]:
tox_scores = []
nrows = testset.shape[0]
out = display(progress(0, nrows), display_id=True)
for offset in range(0, nrows):
  out.update(progress(offset, nrows))
  values = sp.EncodeAsIds(testset['comment_text'].iloc[offset])
  tox_scores.append(predict_fn({
      'values': values,
      'indices': [(0, i) for i in range(len(values))],
      'dense_shape': [1, len(values)]})['toxicity/probabilities'][0, 1])
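Scoring every comment takes a while, so it can be worth persisting the scores before moving on; this step and the file name below are an optional addition to the notebook.
In [0]:
pd.Series(tox_scores, index=testset.index, name='tfjs_toxicity').to_csv(
    'tfjs_tox_scores.csv', header=True)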
In [0]:
!gsutil cp gs://conversationai-public/public_models/tfjs/perspectiveapi.csv .
In [0]:
perspective_api = pd.read_csv('perspectiveapi.csv')
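The ROC computation below assumes the Perspective API scores are row-aligned with testset; a quick length check (an addition to the original notebook) guards against a silent mismatch.
In [0]:
assert len(perspective_api) == len(testset), (len(perspective_api), len(testset))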
In [0]:
top_kernel = kaggle.api.kernels_output(kernel='tunguz/superblend', path='.')
In [0]:
top_kernel_scores = test_labels.loc[test_labels['toxic'] != -1].join(
    pd.read_csv('superblend.csv', index_col='id'), rsuffix='_predicted')
In [0]:
plt.figure()
fpr, tpr, _ = metrics.roc_curve(testset['toxic'], tox_scores)
plt.plot(fpr, tpr, label='TensorFlow.js model')
fpr, tpr, _ = metrics.roc_curve(testset['toxic'],
                                perspective_api['PerspectiveAPI'].values)
plt.plot(fpr, tpr, label='Perspective API')
fpr, tpr, _ = metrics.roc_curve(top_kernel_scores['toxic'],
                                top_kernel_scores['toxic_predicted'])
plt.plot(fpr, tpr, label='Top scoring Kaggle kernel')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.legend(loc='lower right')
plt.ylim(0.75, 1.0)
plt.xlim(0.0, 0.25)
plt.title('Performance on Kaggle Toxic Comments Challenge Test Set')
plt.show()
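For a single-number comparison alongside the ROC curves, the area under each curve can also be computed directly; these calls reuse the same inputs as the plot above.
In [0]:
print('TensorFlow.js model AUC: %.4f' % metrics.roc_auc_score(
    testset['toxic'], tox_scores))
print('Perspective API AUC:     %.4f' % metrics.roc_auc_score(
    testset['toxic'], perspective_api['PerspectiveAPI'].values))
print('Top Kaggle kernel AUC:   %.4f' % metrics.roc_auc_score(
    top_kernel_scores['toxic'], top_kernel_scores['toxic_predicted']))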