Jigsaw Unintended Bias in Toxicity Classification
Detect toxicity across a diverse range of conversations
https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification/data#
In [10]:
!pip install -U -q kaggle
!echo {"username":"<YOUR_USERNAME>","key":"<YOUR_API_KEY>"} > C:\Users\albah\.kaggle\kaggle.json
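On Linux/macOS the Kaggle API expects the credentials at ~/.kaggle/kaggle.json with owner-only permissions instead. A minimal sketch of the equivalent setup (fill in your own username and API key):
In [ ]:
import json, os
# Assumes a Unix-like system; on Windows the file lives under C:\Users\<user>\.kaggle\
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)
with open(os.path.join(kaggle_dir, "kaggle.json"), "w") as f:
    json.dump({"username": "<YOUR_USERNAME>", "key": "<YOUR_API_KEY>"}, f)
# the kaggle CLI warns if the credentials file is readable by other users
os.chmod(os.path.join(kaggle_dir, "kaggle.json"), 0o600)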
In [1]:
!kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification -f test.csv -p ../datasets
In [ ]:
!kaggle competitions download -c jigsaw-unintended-bias-in-toxicity-classification -f train.csv -p ../datasets
In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
import string
from joblib import Parallel, delayed
from tqdm.notebook import tqdm
%matplotlib inline
In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
stop_words = set(stopwords.words('english'))
stem = SnowballStemmer('english')
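A quick sanity check on the NLTK setup (an illustrative cell, not part of the original pipeline): the Snowball stemmer collapses inflected forms to a common stem, and stopword membership is an exact, lowercase match.
In [ ]:
# stemming reduces inflected forms: expect 'run' and 'toxic'
print(stem.stem("running"), stem.stem("toxicity"))
# stopword lookup is case-sensitive: expect True, False
print("the" in stop_words, "The" in stop_words)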
In [3]:
train_df = pd.read_csv("../datasets/train.csv.zip")
train_df = train_df[['id','comment_text', 'target']]
test_df = pd.read_csv("../datasets/test.csv.zip")
In [4]:
train_df.head()
Out[4]:
In [5]:
train_df.target.hist()
Out[5]:
In [6]:
train_df.shape
Out[6]:
In [7]:
test_df.head()
Out[7]:
In [8]:
test_df.shape
Out[8]:
In [9]:
# downsample to 100k rows to keep tokenization and model fitting tractable
train_df = train_df.sample(100000, random_state=42)
In [10]:
train_df.shape
Out[10]:
Create tokens
In [11]:
def tokenize(text):
    """Drop punctuation and stopwords, stem the remaining tokens, and rejoin into a string."""
    tokens = []
    for token in word_tokenize(text):
        if token in string.punctuation:  # skips single-character punctuation tokens
            continue
        if token in stop_words:
            continue
        tokens.append(stem.stem(token))
    return " ".join(tokens)
In [12]:
train_tokens = Parallel(n_jobs=-1, verbose=1)(delayed(tokenize)(text) for text in train_df['comment_text'].tolist())
In [13]:
train_tokens[0]
Out[13]:
In [14]:
test_tokens = Parallel(n_jobs=-1, verbose=1)(delayed(tokenize)(text) for text in test_df['comment_text'].tolist())
In [16]:
len(train_tokens + test_tokens)
Out[16]:
In [17]:
vect = TfidfVectorizer()
vect.fit(train_tokens + test_tokens)
Out[17]:
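Fitting the vectorizer on the concatenated train and test tokens means the vocabulary covers both splits (common practice in this competition, since the test text is public). The fitted vocabulary size can be inspected directly:
In [ ]:
len(vect.vocabulary_)  # number of distinct stemmed terms in the TF-IDF vocabulary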
In [18]:
X = vect.transform(train_tokens)
y = train_df['target']
In [19]:
reg = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42, max_depth=10)
reg.fit(X, y)
Out[19]:
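The cell above fits on all sampled rows. To get a rough sense of generalization before submitting, a hold-out split scored with ROC AUC on the binarized target is a reasonable proxy (the official metric is a bias-aware variant of AUC). A minimal sketch, refitting a fresh forest on the split:
In [ ]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
val_reg = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42, max_depth=10)
val_reg.fit(X_tr, y_tr)
# the competition treats target >= 0.5 as toxic, so score against binarized labels
roc_auc_score((y_val >= 0.5).astype(int), val_reg.predict(X_val))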
In [20]:
test_X = vect.transform(test_tokens)
test_y = reg.predict(test_X)
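Finally, the predictions can be written in the competition's submission format (a sketch; the sample submission for this competition uses `id` and `prediction` columns):
In [ ]:
submission = pd.DataFrame({'id': test_df['id'], 'prediction': test_y})
submission.to_csv('submission.csv', index=False)
submission.head()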