Create training and validation samples for BERT

I'm borrowing some code from my train-test-validation project in order to produce train/test divide for BERT.

Getting the train/test vs. validation split right can be challenging, because we want to avoid repeating authors from the train/test set in validation. (Or in both train and test for that matter.) Authorial diction is constant enough that this could become an unfair advantage for genres with a few prolific authors. We also want to ensure that the positive & negative classes within a given set have a similar distribution across historical time. (Otherwise the model will become a model of language change.) Building sets where all these conditions hold is more involved than a random sample of volumes.


In [42]:
import sys, glob
import os, csv, random
import numpy as np
import pandas as pd
from scipy import stats

Managing the validation split.

The functions defined below are used to create a train/test/validation divide, while also ensuring

  1. No author is present in more than one of those sets, so we don't overfit on a specific style.
  2. Positive and negative classes are equally distributed across time (so we don't end up modeling language change instead of genre!)

But the best way to understand the overall workflow may be to scan down a few cells to the bottom function, train_and_validate().


In [2]:
def evenlymatchdate(meta, tt_positives, v_positives, negatives):
    '''
    Given a metadata file, two lists of positive indexes and a (larger) list
    of negative indexes, this assigns negatives that match the date distribution
    of the two positive lists as closely as possible, working randomly so that
    neither list gets "a first shot" at maximally close matches.
    
    The task is complicated by our goal of ensuring that authors are only
    represented in the train/test OR the validation set. To do this while
    using as much of our sample as we can, we encourage the algorithm to choose
    works from already-selected authors when they fit the date parameters needed.
    This is the function of the selected_neg_unmatched set: works by authors we have
    chosen, not yet matched to a positive work.
    '''
    
    assert len(negatives) > (len(tt_positives) + len(v_positives))
    authors = dict()
    authors['tt'] = set(meta.loc[tt_positives, 'author'])
    authors['v'] = set(meta.loc[v_positives, 'author'])
    
    neg_matched = dict()
    neg_matched['tt'] = []
    neg_matched['v'] = []
    neg_unmatched = dict()
    neg_unmatched['v'] = []
    neg_unmatched['tt'] = []
    
    negative_meta = meta.loc[negatives, : ]
    
    allpositives = [(x, 'tt') for x in tt_positives]
    allpositives.extend([(x, 'v') for x in v_positives])
    random.shuffle(allpositives)
    
    for idx, settype in allpositives:
        if settype == 'v':
            inversetype = 'tt'
        else:
            inversetype = 'v'
            
        date = meta.loc[idx, 'firstpub']
        found = False
        negative_meta = negative_meta.assign(diff = np.abs(negative_meta['firstpub'] - date))
        
        for idx2 in neg_unmatched[settype]:
            matchdate = meta.loc[idx2, 'firstpub']
            if abs(matchdate - date) < 3:
                neg_matched[settype].append(idx2)
                location = neg_unmatched[settype].index(idx2)
                neg_unmatched[settype].pop(location)
                found = True
                break
        
        if not found:
            candidates = []
            for i in range(200):
                aspirants = negative_meta.index[negative_meta['diff'] == i].tolist()
                
                # the following section insures that authors in
                # traintest don't end up also in validation
                for a in aspirants:
                    asp_author = meta.loc[a, 'author']
                    if asp_author not in authors[inversetype]:
                        # don't even consider books by authors already
                        # in the other set
                        candidates.append(a)
                        
                if len(candidates) > 0:
                    break
        
            chosen = random.sample(candidates, 1)[0]
            chosenauth = negative_meta.loc[chosen, 'author']
            allbyauth = negative_meta.index[negative_meta['author'] == chosenauth].tolist()
            authors[settype].add(chosenauth)
            
            if len(allbyauth) < 1:
                print('error')
                
            for idx3 in allbyauth:
                if idx3 == chosen:
                    neg_matched[settype].append(idx3)
                    # the one we actually chose
                else:
                    neg_unmatched[settype].append(idx3)
                    # others by same author, to be considered first in future
            
            negative_meta.drop(allbyauth, inplace = True)
            
            if len(negative_meta) == 0:
                print('Exhausted negatives! This is surprising.')
                break
    
    # other books by same authors can be added to the set in the end
    tt_neg = neg_matched['tt'] + neg_unmatched['tt']
    v_neg = neg_matched['v'] + neg_unmatched['v']
    
    remaining_neg = negative_meta.index.tolist()

    return tt_neg, v_neg, remaining_neg

In [3]:
def tags2tagset(x):
    ''' function that will be applied to transform
    fantasy | science-fiction into {'fantasy', 'science-fiction'} '''
    if type(x) == float:
        return set()
    else:
        return set(x.split(' | '))

def divide_training_from_validation(tags4positive, tags4negative, sizecap, metadatapath):
    ''' This function divides a dataset into two parts: a training-and-test set, and a
    validation set. We ensure that authors are represented in one set *or* the other,
    not both.
    
    A model is optimized by gridsearch and crossvalidation on the training-and-test set. Then this model
    is applied to the validation set, and accuracy is recorded.
    '''
    
    meta = pd.read_csv(metadatapath)
    column_of_sets = meta['genretags'].apply(tags2tagset)
    meta = meta.assign(tagset = column_of_sets)
    
    overlap = []
    negatives = []
    positives = []
    
    for idx, row in meta.iterrows():
        if 'drop' in row['tagset']:
            continue
            # these works were dropped and will not be present in the data folder
            
        posintersect = len(row['tagset'] & tags4positive)
        negintersect = len(row['tagset'] & tags4negative)
        
        if posintersect and negintersect:
            overlap.append(idx)
        elif posintersect:
            positives.append(idx)
        elif negintersect:
            negatives.append(idx)
            
    print()
    print('-------------')
    print('Begin construction of validation split.')
    print("Positives/negatives:", len(positives), len(negatives))
    
    random.shuffle(overlap)
    print('Overlap (assigned to pos class): ' + str(len(overlap)))
    positives.extend(overlap)
    
    # We do selection by author
    positiveauthors = list(set(meta.loc[positives, 'author'].tolist()))
    
    random.shuffle(positiveauthors)
    
    traintest_pos = []
    validation_pos = []
    donewithtraintest = False
    
    for auth in positiveauthors:
        this_auth_indices = meta.index[meta['author'] == auth].tolist()
        confirmed_auth_indices = []
        for idx in this_auth_indices:
            if idx in positives:
                confirmed_auth_indices.append(idx)
        
        if not donewithtraintest:
            traintest_pos.extend(confirmed_auth_indices)
        else:
            validation_pos.extend(confirmed_auth_indices)
        
        if len(traintest_pos) > sizecap:
            # that's deliberately > rather than >= because we want a cushion
            donewithtraintest = True
    
    # Now let's get a set of negatives that match the positives' distribution
    # across the time axis.
    
    traintest_neg, validation_neg, remaining_neg = evenlymatchdate(meta, traintest_pos, validation_pos, negatives)
    
    # For Bert, we want an equal number of positive and negative vols,
    # because there will be no subsequent winnowing. This departs from
    # our practice with versatiletrainer2.
    
    if len(traintest_neg) > len(traintest_pos):
        k = len(traintest_pos)
        traintest_neg = random.sample(traintest_neg, k)
        
    traintest = meta.loc[traintest_pos + traintest_neg, : ]
    realclass = ([1] * len(traintest_pos)) + ([0] * len(traintest_neg))
    traintest = traintest.assign(realclass = realclass)
    print("Traintest pos/neg:", len(traintest_pos), len(traintest_neg))
    
    if len(validation_neg) > len(validation_pos):
        validation_neg = validation_neg[0: len(validation_pos)]
        # we want the balance of pos and neg examples to be even
        
    print("Validation pos/neg:", len(validation_pos), len(validation_neg))
    
    validation = meta.loc[validation_pos + validation_neg, : ]
    realclass = ([1] * len(validation_pos)) + ([0] * len(validation_neg))
    validation = validation.assign(realclass = realclass)
    
    return traintest, validation

Getting snippets.

Once we have lists of volumes for the train and validation sets, we iterate through each list and get BERT-sized snippets from each volume in the list. A parameter n defines the maximum number we can take.

The lines are shuffled and written to file in BERT-appropriate format.

We also save a row-level metadata file for the validation ("dev") set; this can be used later to group snippets by volume and interpret accuracy at the volume level.


In [60]:
def confirm_separation(df1, df2):
    '''
    Just some stats on the train/test vs validation split.
    '''
    
    authors1 = set(df1['author'])
    authors2 = set(df2['author'])
    overlap = authors1.intersection(authors2)
    if len(overlap) > 0:
        print('Overlap: ', overlap)
    
    pos1date = np.mean(df1.loc[df1.realclass == 0, 'firstpub'])
    neg1date = np.mean(df1.loc[df1.realclass == 1, 'firstpub'])
    pos2date = np.mean(df2.loc[df2.realclass == 0, 'firstpub'])
    neg2date = np.mean(df2.loc[df2.realclass == 1, 'firstpub'])
    
    print("Traintest mean date pos:", pos1date, "neg:", neg1date)
    print("Validation mean date pos", pos2date, "neg:", neg2date)
    print()

def get_snippets(anid, n, snipmaxlen):
    '''
    Returns snippets from a file. The number of snippets
    is determined by the parameter n, the length of
    snippets by snipmaxlen.
    '''
    
    inpath = '../newtexts/' + anid + '.txt'
    with open(inpath, encoding = 'utf-8') as f:
        filelines = f.readlines()
        words = []
        for line in filelines:
            newwords = line.strip().split()
            words.extend(newwords)
            
    startsnip = False
    snippets = []
    snip = []
            
    # Note that we skip the first
    for i in range(256, len(words)):
        w = words[i]
        if w.startswith('<p') and w.endswith('>'):
            continue
            # these are pagebreak marks
            # I've inserted
            
        if not startsnip and (w.endswith('.') or w.endswith('?') or w.endswith('"') or w.endswith(',')):
            startsnip = True
        elif startsnip and len(snip) < snipmaxlen:
            snip.append(w.lower())
            # we assume an uncased model
            
        elif len(snip) >= snipmaxlen:
            snippets.append(' '.join(snip))
            snip = []
            startsnip = False
    
    # Note that we deliberately don't take the last (incomplete)
    # snippet. It's likely to be uncharacteristic. We also skip the
    # next to the last snippet for the same reason.
    
    max_n = len(snippets) - 1
    
    if max_n > n: 
        snippets = random.sample(snippets[0 : -1], n)
        
        # we do this random sampling in hopes of getting snippets across
        # the whole length of the book (minus uncharacteristic start and end)
    
    return snippets, max_n

def bertformat(df, n, snipmaxlen):
    poslines = []
    neglines = []
    index = 0
    snip_maxes = []

    for idx, row in df.iterrows():
        the_id = row.docid
        the_class = row.realclass
        
        snippets, max_n = get_snippets(the_id, n, snipmaxlen)
        snip_maxes.append(max_n)
        
        for s in snippets:
            line = dict()
            line['docid'] = the_id
            line['idx'] = str(index)
            line['class'] = str(the_class)
            line['dummy'] = 'd'
            line['text'] = s
            # lineindex-tab-classlabel-tab-dummycolumn-tab-text
            
            if line['class'] == '1': 
                poslines.append(line)
            else:
                neglines.append(line)
                
            index += 1
    
    # we want equal pos and neg representation
    minlen = min(len(poslines), len(neglines))
    
    poslines = poslines[0 : minlen]
    neglines = neglines[0 : minlen]
    
    lines = poslines + neglines
    
    random.shuffle(lines)
    outframe = pd.DataFrame(lines)
    # the random shuffle is extremely important, given the way BERT works!
    
    print('Average possible snippets:', sum(snip_maxes) / len(snip_maxes))
    print('Actual taken per vol: ', str((minlen * 2) / len(df)))
    
    return outframe
    
def create_traindev(modelname, tags4positive, tags4negative, sizecap, metadatapath, n, snipmaxlen):
    
    for i in range(1):
        fullname = modelname
        
        traintest, validation = divide_training_from_validation(tags4positive, tags4negative, sizecap, metadatapath)
        
        confirm_separation(traintest, validation)
    
        traintest.to_csv('bertmeta/train_vols_' + fullname + '.csv', index = False)
        validation.to_csv('bertmeta/dev_vols_' + fullname + '.csv', index = False)
        
        print()
        print('Metadata written. Writing training data ...')
        
        train_df = bertformat(traintest, n, snipmaxlen)
        train4bert = train_df.loc[ : , ['idx', 'class', 'dummy', 'text']]
        train4bert.to_csv('bertdata/train_' + fullname + '.tsv', sep = '\t', header = False, index = False, quoting = csv.QUOTE_NONE)
                
        print('... and validation data.')
        print()
        dev_df = bertformat(validation, n, snipmaxlen)
        dev4bert = dev_df.loc[ : , ['idx', 'class', 'dummy', 'text']]
        dev4bert.to_csv('bertdata/dev_' + fullname + '.tsv', sep = '\t', header = False, index = False, quoting = csv.QUOTE_NONE)
        devmeta = dev_df.loc[ : , ['idx', 'docid', 'class']]
        devmeta.to_csv('bertmeta/dev_rows_' + fullname + '.tsv', sep = '\t', index = False)
        # we can use this to interpret results later, grouping them by docid

In [70]:
create_traindev('Goth512max', {'lochorror', 'pbgothic', 'locghost', 'stangothic', 'chihorror'},
        {'random', 'chirandom'}, 125, '../meta/finalmeta.csv', 300, 512)


-------------
Begin construction of validation split.
Positives/negatives: 166 370
Overlap (assigned to pos class): 0
Traintest pos/neg: 128 128
Validation pos/neg: 38 38
Traintest mean date pos: 1872.75 neg: 1872.140625
Validation mean date pos 1895.578947368421 neg: 1894.3684210526317


Metadata written. Writing training data ...
Average possible snippets: 161.03515625
Actual taken per vol:  134.9296875
... and validation data.

Average possible snippets: 171.56578947368422
Actual taken per vol:  144.39473684210526

In [66]:
create_traindev('SF512max', {'anatscifi', 'locscifi', 'chiscifi', 'femscifi'},
        {'random', 'chirandom'}, 125, '../meta/finalmeta.csv', 300, 512)


-------------
Begin construction of validation split.
Positives/negatives: 175 347
Overlap (assigned to pos class): 23
Traintest pos/neg: 126 126
Validation pos/neg: 72 72
Traintest mean date pos: 1938.8015873015872 neg: 1940.9920634920634
Validation mean date pos 1933.486111111111 neg: 1933.625


Metadata written. Writing training data ...
Average possible snippets: 158.38492063492063
Actual taken per vol:  138.515873015873
... and validation data.

Average possible snippets: 152.54166666666666
Actual taken per vol:  140.72222222222223

In [71]:
create_traindev('Mystery512max', {'locdetective', 'locdetmyst', 'chimyst', 'det100'},
        {'random', 'chirandom'}, 125, '../meta/finalmeta.csv', 300, 512)


-------------
Begin construction of validation split.
Positives/negatives: 206 327
Overlap (assigned to pos class): 43
Traintest pos/neg: 126 126
Validation pos/neg: 123 123
Traintest mean date pos: 1930.0873015873017 neg: 1930.3809523809523
Validation mean date pos 1930.9756097560976 neg: 1931.520325203252


Metadata written. Writing training data ...
Average possible snippets: 160.76190476190476
Actual taken per vol:  144.86507936507937
... and validation data.

Average possible snippets: 144.9959349593496
Actual taken per vol:  127.32520325203252

deprecated

I was checking to make sure that snippets are evenly distributed across the length of a book.


In [21]:
def number_snippets(anid):
    '''
    Returns 128-word snippets from a file. The number of snippets
    is determined by the parameter n.
    '''
    
    inpath = '../newtexts/' + anid + '.txt'
    with open(inpath, encoding = 'utf-8') as f:
        filelines = f.readlines()
        words = []
        for line in filelines:
            newwords = line.strip().split()
            words.extend(newwords)
            
    startsnip = False
    snippets = []
    snip = []
            
    # Note that we skip the first
    for i in range(256, len(words)):
        w = words[i]
        if w.startswith('<p') and w.endswith('>'):
            continue
            # these are pagebreak marks
            # I've inserted
            
        if not startsnip and w.endswith('.') or w.endswith('?') or w.endswith('"'):
            startsnip = True
        elif startsnip and len(snip) < 127:
            snip.append(w.lower())
            # we assume an uncased model
            
        elif len(snip) >= 127:
            snippets.append(' '.join(snip))
            snip = []
            startsnip = False
    
    # Note that we deliberately don't take the last (incomplete)
    # snippet. It's likely to be uncharacteristic. We also skip the
    # next to the last snippet for the same reason.
    
    snipdict = dict()
    for i, s in enumerate(snippets):
        snipdict[s] = i
    maxsnip = i
    
    return snipdict, maxsnip

for d in docs:
    sd, maxsnip = number_snippets(d)
    indexes = []
    for s in data[3]:
        if s in sd:
            indexes.append(sd[s])
    avg =(sum(indexes) / len(indexes))
    print(avg/maxsnip)


0.5045118110236221
0.5
0.5
0.5
0.5035007610350076
0.5
0.5009651898734178
0.504534693877551
0.5
0.4980990099009901
0.5
0.5028068518664254
0.48463201663201666
0.5
0.5
0.5104120603015075
0.5
0.5
0.5
0.5
0.5
0.5
0.5
0.5006194968553459
0.4972224299065421
0.486419595314164
0.4939302325581395
0.4984158415841584
0.5099687546496057
0.5
0.5
0.5
0.5
0.5064872107186358
0.5
0.5004580152671756
0.5046385224274407
0.5
0.5
0.5
0.5
0.5
0.5034262101534829
0.5
0.5
0.5
0.5
0.5
0.5
0.503568345323741
0.5
0.49811644832605534
0.5
0.4956188811188812
0.5
0.5
0.5
0.5
0.5
0.5
0.48377516339869286
0.4952910447761194
0.5
0.49448452012383903
0.5
0.49851863857374396
0.497234608985025
0.4896973590292648
0.5
0.5
0.5
0.4989453125
0.5
0.5053383458646616
0.5
0.5
0.4922493150684931
0.5009964912280701
0.5
0.4973909648633575
0.4998831615120275
0.5
0.4922722852512156
0.5
0.5
0.5
0.5081350844277673
0.4982775263951735
0.4934920127795527
0.5
0.5034285714285714
0.49877389984825493
0.5059312977099236
0.5036221374045802
0.5064183796856107
0.5071705069124424
0.5
0.5
0.5
0.5
0.5
0.49573958333333334
0.4961507352941177
0.5
0.5
0.5
0.5098395604395605
0.5
0.5021161118404295
0.4925776173285198
0.5
0.5
0.5
0.5
0.5028456790123457
0.5
0.5055822102425876
0.49890978886756243
0.49858823529411767
0.5
0.5
0.49356249999999996
0.4970330250990753
0.49625255338904367
0.4991332007952286
0.49971276595744685
0.5048638655462185
0.5
0.5
0.5109560585885486
0.5
0.5
0.5128966376089664
0.5
0.5
0.5
0.48904489795918366
0.5037951635846373
0.5
0.5
0.5
0.5006254071661238
0.5
0.5039267299864315

Create sentiment data


In [54]:
def get_snips(text):
    
    words = text.replace('\t', ' ').replace('<br />', ' ').split()
    snips = []
    
    for floor in range(0, len(words), 128):
        ceiling = floor + 128
        if ceiling > len(words):
            ceiling = len(words)
            
        s = words[floor : ceiling]
        if len(s) > 64:
            snips.append(' '.join(s))
                
    return snips

def get_dataset(rootfolder):
    
    negpaths = glob.glob(rootfolder + '/neg/*.txt')
    pospaths = glob.glob(rootfolder + '/pos/*.txt')
    paths = [(0, x) for x in negpaths] + [(1, x) for x in pospaths]
    
    index = 0
    lines = []
    
    for label, p in paths:
        docid = p.split('/')[-1].replace('.txt', '')
        
        with open(p) as f:
            text = f.read().strip()
            snips = get_snips(text)
            
            for s in snips:
                line = dict()
                line['docid'] = docid
                line['idx'] = index
                line['class'] = str(label)
                line['dummy'] = 'd'
                line['text'] = s
                index += 1
                lines.append(line)
    
    random.shuffle(lines)
    outframe = pd.DataFrame(lines)
    
    return outframe
            
fullname = 'sentiment'

train_df = get_dataset('/Volumes/TARDIS/aclImdb/train')

train4bert = train_df.loc[ : , ['idx', 'class', 'dummy', 'text']]
train4bert.to_csv('bertdata/train_' + fullname + '.tsv', sep = '\t', header = False, index = False, quoting = csv.QUOTE_NONE)

test_df = get_dataset('/Volumes/TARDIS/aclImdb/test')
dev4bert = test_df.loc[ : , ['idx', 'class', 'dummy', 'text']]
dev4bert.to_csv('bertdata/dev_' + fullname + '.tsv', sep = '\t', header = False, index = False, quoting = csv.QUOTE_NONE)
devmeta = test_df.loc[ : , ['idx', 'docid', 'class']]
devmeta.to_csv('bertmeta/dev_rows_' + fullname + '.tsv', sep = '\t', index = False)
# we can use this to interpret results later, grouping them by docid

In [53]:
rootfolder = '/Volumes/TARDIS/aclImdb/train'
negpaths = glob.glob(rootfolder + '/neg/*.txt')
n = negpaths[0]
docid = n.split('/')[-1].replace('.txt', '')
print(n, docid)


/Volumes/TARDIS/aclImdb/train/neg/0_3.txt 0_3

In [38]:
sum(np.array(dev_df['class'], dtype = 'int8') > 3)


Out[38]:
312

In [39]:
len(dev_df)


Out[39]:
583

How long is the average movie review?


In [69]:
import glob
def lengths_dataset(rootfolder):
    
    negpaths = glob.glob(rootfolder + '/neg/*.txt')
    pospaths = glob.glob(rootfolder + '/pos/*.txt')
    paths = [(0, x) for x in negpaths] + [(1, x) for x in pospaths]
    
    lens = []
    
    for label, p in paths:
        
        with open(p) as f:
            words = f.read().split()
            lens.append(len(words))
            
    
    return sum(lens) / len(lens)

lengths_dataset('/Volumes/TARDIS/aclImdb/train')


Out[69]:
233.7872

In [ ]: