This is a reimplementation of J. Howard's "Improved LSTM baseline: GloVe + dropout" Kaggle kernel in fastai/PyTorch. The original kernel achieves a private score of 0.09783, which ties for place 2747/4551.
-- Wayne Nixalo
Imports
In [22]:
import pathlib
import os
import torchtext
# from torchtext.data import Field
from torchtext import data
# import spacy
import pandas as pd
import numpy as np
# from torchtext.data import TabularDataset
Paths
In [2]:
data_path = pathlib.Path('../../data')
comp_path = data_path/'competitions/jigsaw-toxic-comment-classification-challenge'
EMBEDDING_FILE = 'glove/glove.6B.50d.txt'
TRAIN_DATA_FILE= 'train.csv'
TEST_DATA_FILE = 'test.csv'
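The embedding file isn't actually read anywhere in this notebook yet; as a rough sketch, it could be loaded into a {word: vector} dict, roughly the way the original Keras kernel does it. The get_coefs helper and the assumption that EMBEDDING_FILE lives under data_path are mine:
In [ ]:
# Sketch (assumption): parse the GloVe text format ("word v1 v2 ... v50") into
# a dict of float32 vectors. Assumes the file sits at data_path/EMBEDDING_FILE.
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*line.rstrip().split(' '))
                        for line in open(data_path/EMBEDDING_FILE, encoding='utf8'))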
Config parameters
In [3]:
embed_sz = 50 # embedding vector columns (factors)
max_feat = 20000 # embedding vector rows (words)
maxlen = 100 # words in comment to use
Data Loading
In [4]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
# train = pd.read_csv(comp_path/TRAIN_DATA_FILE)
# test = pd.read_csv(comp_path/TEST_DATA_FILE)
In [19]:
# SEE: Aside 1, Aside 2
# TEXT = Field(sequential=True, tokenize='spacy', lower=True)
TEXT = data.Field(sequential=True, tokenize=lambda x: x.split(), lower=True)
LABEL = data.Field(sequential=False, use_vocab=False)
In [20]:
# trainval_datafields = [("id", None), ("comment_text", TEXT)]
# trainval_datafields.extend((clss, LABEL) for clss in list_classes)
# test_datafields = [("id", None), ("comment_text", TEXT)]
# train_dataset = data.TabularDataset(
#     path=comp_path/TRAIN_DATA_FILE, format='csv',
#     skip_header=True, fields=trainval_datafields)
# test_dataset = data.TabularDataset(
#     path=comp_path/TEST_DATA_FILE, format='csv',
#     skip_header=True, fields=test_datafields)
# # TEXT.build_vocab(train_dataset)
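To finish the torchtext route (kept commented out, like the cell above, since the fastai path below is what's actually used), a rough sketch of building the vocab from the GloVe vectors and batching with a BucketIterator might look like this. Note that sort_within_batch belongs on the iterator rather than on TabularDataset, and the exact location of the GloVe file is an assumption:
In [ ]:
# Sketch (assumption): complete the torchtext pipeline started above.
# from torchtext.vocab import Vectors
# glove_vectors = Vectors(name=str(data_path/EMBEDDING_FILE))   # 50-dim GloVe vectors, path assumed
# TEXT.build_vocab(train_dataset, max_size=max_feat, vectors=glove_vectors)
# train_iter = data.BucketIterator(
#     train_dataset, batch_size=64,
#     sort_key=lambda ex: len(ex.comment_text),
#     sort_within_batch=True)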
Experimenting with fastai
In [24]:
from fastai.nlp import *
In [26]:
train_df = pd.read_csv(comp_path/TRAIN_DATA_FILE)
In [29]:
# SEE: Aside 3
model = LanguageModelData.from_dataframes(
    path=comp_path, field=TEXT, col="comment_text",
    train_df=train_df, val_df=train_df, test_df=train_df,
    bs=64, min_freq=3)
In [33]:
em_sz = 200
nh = 500
nl = 3
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))
In [32]:
learner = model.get_model(opt_fn, em_sz, nh, nl,
                          dropouti=0.05, dropout=0.05, wdrop=0.1,
                          dropoute=0.02, dropouth=0.05)
In [37]:
learner.clip = 0.3 # gradient clipping
In [66]:
learner.model.parameters
The labels are already binary-encoded (0/1), so there is no need to numericalize them -- hence use_vocab=False in the LABEL field.
In [35]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train = pd.read_csv(comp_path/TRAIN_DATA_FILE)
# test = pd.read_csv(comp_path/TEST_DATA_FILE)
In [40]:
train[list_classes][55:65]
The spaCy tokenizer takes far longer to run through this data than the naive whitespace split:
TEXT = Field(sequential=True, tokenize=lambda x: x.split(), lower=True)
vs
TEXT = Field(sequential=True, tokenize='spacy', lower=True)
This is largely because comment id 206058417140 contains a huge number of exclamation marks -- see the fastai forum discussion.
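A quick way to see the gap is to time both tokenizers on a slice of the comments. This is just a sketch; it assumes spaCy's English model is installed and only uses the first 1000 rows of train_df:
In [ ]:
# Sketch (assumption): compare whitespace splitting against spaCy tokenization
# on a small sample of the training comments.
import time
import spacy

nlp = spacy.load('en')    # 'en_core_web_sm' on newer spaCy installs
sample = train_df['comment_text'].head(1000)

t0 = time.time()
_ = [s.split() for s in sample]
print(f'whitespace split: {time.time() - t0:.3f}s')

t0 = time.time()
_ = [[tok.text for tok in nlp(s)] for s in sample]
print(f'spaCy tokenizer:  {time.time() - t0:.3f}s')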
Example usage of fastai.nlp.LanguageModelData:
TEXT = data.Field(lower=True, tokenize=spacy_tok)
FILES = dict(train=TRN_PATH, validation=VAL_PATH, test=VAL_PATH)
md = LanguageModelData.from_text_files(PATH, TEXT, **FILES, bs=64, bptt=70, min_freq=10)
em_sz = 200
nh = 200
nl = 3
opt_fn = partial(optim.Adam, betas=(0.7, 0.99))
learner = md.get_model(opt_fn, em_sz, nh, nl, dropouti=0.55, dropout=0.05,
                       wdrop=0.1, dropoute=0.02, dropouth=0.05)
learner.reg_fn = seq2seq_reg
learner.clip = 0.3
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2)
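Following that example, fitting the learner defined earlier in this notebook would presumably look like the cell below; the hyperparameters are copied from the example rather than tuned here:
In [ ]:
# Sketch (assumption): train the language model with the same settings as the example above.
learner.reg_fn = seq2seq_reg                               # AR/TAR regularization from fastai
learner.fit(3e-3, 4, wds=1e-6, cycle_len=1, cycle_mult=2)  # 4 SGDR cycles, cycle length doubling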