This is a reimplementation of J. Howard's "Improved LSTM baseline: GloVe + dropout" Kaggle kernel in fastai/PyTorch. The original kernel achieves a private score of 0.09783, which ties for place 2747/4551.
-- Wayne Nixalo
Imports
In [22]:
import pathlib
import os
import torchtext
# from torchtext.data import Field
from torchtext import data
# import spacy
import pandas as pd
import numpy as np
# from torchtext.data import TabularDataset
Paths
In [2]:
data_path = pathlib.Path('../../data')
comp_path = data_path/'competitions/jigsaw-toxic-comment-classification-challenge'
EMBEDDING_FILE = 'glove/glove.6B.50d.txt'
TRAIN_DATA_FILE= 'train.csv'
TEST_DATA_FILE = 'test.csv'
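The embedding file isn't actually read anywhere in this notebook yet; as a rough sketch, it could be loaded into a {word: vector} dict, roughly the way the original Keras kernel does it. The get_coefs helper and the assumption that EMBEDDING_FILE lives under data_path are mine:
In [ ]:
# Sketch (assumption): parse the GloVe text format ("word v1 v2 ... v50") into
# a dict of float32 vectors. Assumes the file sits at data_path/EMBEDDING_FILE.
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*line.rstrip().split(' '))
                        for line in open(data_path/EMBEDDING_FILE, encoding='utf8'))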
Config parameters
In [3]:
embed_sz = 50 # embedding vector columns (factors)
max_feat = 20000 # embedding vector rows (words)
maxlen = 100 # words in comment to use
Data Loading
In [4]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# train = pd.read_csv(comp_path/TRAIN_DATA_FILE)
# test = pd.read_csv(comp_path/TEST_DATA_FILE)
In [19]:
# SEE: Aside 1, Aside 2
# TEXT = Field(sequential=True, tokenize='spacy', lower=True)
TEXT = data.Field(sequential=True, tokenize=lambda x: x.split(), lower=True)
LABEL = data.Field(sequential=False, use_vocab=False)
In [20]:
# trainval_datafields = [("id", None), ("comment_text", TEXT)]
# trainval_datafields.extend((clss, LABEL) for clss in list_classes)
# test_datafields = [("id", None), ("comment_text", TEXT)]
# train_dataset = data.TabularDataset(
#     path=comp_path/TRAIN_DATA_FILE, format='csv',
#     skip_header=True, fields=trainval_datafields)
# test_dataset = data.TabularDataset(
#     path=comp_path/TEST_DATA_FILE, format='csv',
#     skip_header=True, fields=test_datafields)
# # TEXT.build_vocab(train_dataset)
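To finish the torchtext route (kept commented out, like the cell above, since the fastai path below is what's actually used), a rough sketch of building the vocab from the GloVe vectors and batching with a BucketIterator might look like this. Note that sort_within_batch belongs on the iterator rather than on TabularDataset, and the exact location of the GloVe file is an assumption:
In [ ]:
# Sketch (assumption): complete the torchtext pipeline started above.
# from torchtext.vocab import Vectors
# glove_vectors = Vectors(name=str(data_path/EMBEDDING_FILE))   # 50-dim GloVe vectors, path assumed
# TEXT.build_vocab(train_dataset, max_size=max_feat, vectors=glove_vectors)
# train_iter = data.BucketIterator(
#     train_dataset, batch_size=64,
#     sort_key=lambda ex: len(ex.comment_text),
#     sort_within_batch=True)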
Experimenting with fastai
In [24]:
from fastai.nlp import *
In [26]:
train_df = pd.read_csv(comp_path/TRAIN_DATA_FILE)
In [29]:
# SEE: Aside 3
model = LanguageModelData.from_dataframes(
    path=comp_path, field=TEXT, col="comment_text",
    train_df=train_df, val_df=train_df, test_df=train_df,
    bs=64, min_freq=3)
In [33]:
em_sz = 200
nh = 500
nl = 3
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))
In [32]:
learner = model.get_model(opt_fn, em_sz, nh, nl,
                          dropouti=0.05, dropout=0.05, wdrop=0.1,
                          dropoute=0.02, dropouth=0.05)
In [37]:
learner.clip = 0.3 # gradient clipping
In [66]:
learner.model.parameters
The labels are already binary-encoded (0/1), so there is no need to numericalize them -- hence use_vocab=False in the LABEL field.
In [35]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train = pd.read_csv(comp_path/TRAIN_DATA_FILE)
# test = pd.read_csv(comp_path/TEST_DATA_FILE)
In [40]:
train[list_classes][55:65]
The spaCy tokenizer takes far longer to run through this data than the naive whitespace split:
TEXT = Field(sequential=True, tokenize=lambda x: x.split(), lower=True)
vs
TEXT = Field(sequential=True, tokenize='spacy', lower=True)
This is largely because comment id 206058417140 contains a huge number of exclamation marks -- see the fastai forum discussion.
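A quick way to see the gap is to time both tokenizers on a slice of the comments. This is just a sketch; it assumes spaCy's English model is installed and only uses the first 1000 rows of train_df:
In [ ]:
# Sketch (assumption): compare whitespace splitting against spaCy tokenization
# on a small sample of the training comments.
import time
import spacy

nlp = spacy.load('en')    # 'en_core_web_sm' on newer spaCy installs
sample = train_df['comment_text'].head(1000)

t0 = time.time()
_ = [s.split() for s in sample]
print(f'whitespace split: {time.time() - t0:.3f}s')

t0 = time.time()
_ = [[tok.text for tok in nlp(s)] for s in sample]
print(f'spaCy tokenizer:  {time.time() - t0:.3f}s')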
Example usage of fastai.nlp.LanguageModelData:
TEXT = data.Field(lower=True, tokenize=spacy_tok)
FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=64, bptt=70, min_freq=10)
em_sz = 200
nh = 200
nl = 3
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))
learner = md.get_model(opt_fn, em_sz, nh, nl, dropouti=0.55, dropout=0.05,
                       wdrop=0.1, dropoute=0.02, dropouth=0.05)
learner.reg_fn = seq2seq_reg
learner.clip = 0.3
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2)
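Following that example, fitting the learner defined earlier in this notebook would presumably look like the cell below; the hyperparameters are copied from the example rather than tuned here:
In [ ]:
# Sketch (assumption): train the language model with the same settings as the example above.
learner.reg_fn = seq2seq_reg                               # AR/TAR regularization from fastai
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2)  # 4 SGDR cycles, cycle length doubling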