Pretraining on WT103


In [ ]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [ ]:
#export
from exp.nb_12a import *

Data

One-time download


In [ ]:
#path = datasets.Config().data_path()
#version = '103' #2

In [ ]:
#! wget https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-{version}-v1.zip -P {path}
#! unzip -q -n {path}/wikitext-{version}-v1.zip  -d {path}
#! mv {path}/wikitext-{version}/wiki.train.tokens {path}/wikitext-{version}/train.txt
#! mv {path}/wikitext-{version}/wiki.valid.tokens {path}/wikitext-{version}/valid.txt
#! mv {path}/wikitext-{version}/wiki.test.tokens {path}/wikitext-{version}/test.txt

Split the articles: WT103 ships as one big text file, so we need to chunk it into separate articles if we want to be able to shuffle them at the beginning of each epoch.


In [ ]:
path = datasets.Config().data_path()/'wikitext-103'

In [ ]:
def istitle(line):
    # An article title in WT103 looks like ' = Title = ' on its own line;
    # subsection headers use two or more '=' signs, so [^=] excludes them.
    return len(re.findall(r'^ = [^=]* = $', line)) != 0

In [ ]:
def read_wiki(filename):
    articles = []
    with open(filename, encoding='utf8') as f:
        lines = f.readlines()
    current_article = ''
    for i,line in enumerate(lines):
        current_article += line
        # An article ends when a blank line is followed by a new title line.
        if i < len(lines)-2 and lines[i+1] == ' \n' and istitle(lines[i+2]):
            current_article = current_article.replace('<unk>', UNK)
            articles.append(current_article)
            current_article = ''
    # Don't forget the last article, which has no title after it.
    current_article = current_article.replace('<unk>', UNK)
    articles.append(current_article)
    return articles
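
A quick sanity check of the title heuristic, on strings mimicking the WT103 format (illustrative examples, not lines from the actual file):


In [ ]:
assert istitle(' = Some Article Title = \n')        # article title: starts a new article
assert not istitle(' = = Some Subsection = = \n')   # subsection header: stays inside the article
assert not istitle(' An ordinary line of text \n')  # body text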

In [ ]:
train = TextList(read_wiki(path/'train.txt'), path=path) #+read_wiki(path/'test.txt')
valid = TextList(read_wiki(path/'valid.txt'), path=path)

In [ ]:
len(train), len(valid)

In [ ]:
sd = SplitData(train, valid)

In [ ]:
proc_tok,proc_num = TokenizeProcessor(),NumericalizeProcessor()

In [ ]:
# A language model has no separate labels (the targets are just the inputs
# shifted by one token), so we label every text with a dummy 0.
ll = label_by_func(sd, lambda x: 0, proc_x = [proc_tok,proc_num])

In [ ]:
with open(path/'ld.pkl', 'wb') as f: pickle.dump(ll, f)

In [ ]:
with open(path/'ld.pkl', 'rb') as f: ll = pickle.load(f)

In [ ]:
bs,bptt = 128,70
data = lm_databunchify(ll, bs, bptt)
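
lm_databunchify streams the concatenated texts into bs parallel sequences and cuts them into chunks of bptt tokens, with the targets shifted one token to the right. A minimal toy sketch of that layout (not the actual implementation; bs_demo and bptt_demo are made-up names):


In [ ]:
# Toy illustration of language-model batching, not the real lm_databunchify:
stream = torch.arange(20)              # pretend these are the token ids of the whole corpus
bs_demo, bptt_demo = 2, 4
n = stream.size(0) // bs_demo          # tokens per parallel stream
streams = stream[:n*bs_demo].view(bs_demo, n)
x = streams[:, :bptt_demo]             # first batch of inputs
y = streams[:, 1:bptt_demo+1]          # targets: the same tokens, shifted by one
x, y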

In [ ]:
vocab = ll.train.proc_x[-1].vocab
len(vocab)

Model


In [ ]:
# The five dropout probabilities (in the order expected by get_language_model
# in nb_12a: output, hidden, input, embed, weight), scaled down by 0.2 since
# WT103 is large enough that we need little regularization.
dps = np.array([0.1, 0.15, 0.25, 0.02, 0.2]) * 0.2
tok_pad = vocab.index(PAD)

In [ ]:
emb_sz, nh, nl = 300, 300, 2
model = get_language_model(len(vocab), emb_sz, nh, nl, tok_pad, *dps)

In [ ]:
cbs = [partial(AvgStatsCallback,accuracy_flat),
       CudaCallback, Recorder,
       partial(GradientClipping, clip=0.1),
       partial(RNNTrainer, α=2., β=1.),  # AR and TAR regularization coefficients
       ProgressCallback]
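
The α and β passed to RNNTrainer weight the activation regularization (AR) and temporal activation regularization (TAR) penalties from the AWD-LSTM paper (Merity et al.). A minimal sketch of what those two terms compute, assuming out holds the final (dropped-out) LSTM outputs and raw_out the pre-dropout ones, both of shape (bs, seq_len, n_hid):


In [ ]:
# Sketch of the AR/TAR penalties (illustrative; RNNTrainer adds them to the loss):
def ar_tar_sketch(out, raw_out, alpha=2., beta=1.):
    ar  = alpha * out.float().pow(2).mean()                               # AR: keep activations small
    tar = beta * (raw_out[:,1:] - raw_out[:,:-1]).float().pow(2).mean()   # TAR: keep them smooth over time
    return ar + tar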

In [ ]:
learn = Learner(model, data, cross_entropy_flat, lr=5e-3, cb_funcs=cbs, opt_func=adam_opt())

In [ ]:
lr = 5e-3
# 1cycle-style schedules: warm up for the first 30% of training, anneal for the rest.
sched_lr  = combine_scheds([0.3,0.7], cos_1cycle_anneal(lr/10., lr, lr/1e5))
sched_mom = combine_scheds([0.3,0.7], cos_1cycle_anneal(0.8, 0.7, 0.8))
cbsched = [ParamScheduler('lr', sched_lr), ParamScheduler('mom', sched_mom)]
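
As a quick check of the schedule's shape (assuming the combined scheduler is, as in the earlier annealing notebook, a plain function of training progress in [0,1]):


In [ ]:
# Sample the learning rate at a few points of training progress:
[float(sched_lr(t)) for t in (0., 0.15, 0.3, 0.6, 0.95)]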

In [ ]:
learn.fit(10, cbs=cbsched)

In [ ]:
torch.save(learn.model.state_dict(), path/'pretrained.pth')
with open(path/'vocab.pkl', 'wb') as f: pickle.dump(vocab, f)
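
These two files are all a downstream fine-tuning notebook needs. A sketch of restoring them in a later session (the model has to be rebuilt with the same hyperparameters before the weights can be loaded):


In [ ]:
# Sketch: restoring the pretrained language model later on.
with open(path/'vocab.pkl', 'rb') as f: vocab = pickle.load(f)
model = get_language_model(len(vocab), emb_sz, nh, nl, tok_pad, *dps)
model.load_state_dict(torch.load(path/'pretrained.pth'))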
