I'm borrowing some code from my train-test-validation project in order to produce a train/test divide for BERT.
Getting the train/test vs. validation split right can be challenging, because we want to avoid repeating authors from the train/test set in validation. (Or in both train and test, for that matter.) Authorial diction is constant enough that this could become an unfair advantage for genres with a few prolific authors. We also want to ensure that the positive & negative classes within a given set have a similar distribution across historical time. (Otherwise the model will become a model of language change.) Building sets where all these conditions hold is more involved than drawing a random sample of volumes.
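To make those conditions concrete, here is a minimal sketch of how a finished split could be verified, assuming two DataFrames with 'author', 'firstpub', and 'realclass' columns (the shapes produced below). The Kolmogorov-Smirnov test is just one way to compare date distributions; the confirm_separation() function further down simply compares mean dates.
In [ ]:
from scipy import stats

def check_split(traintest, validation):
    # a sketch, not part of the pipeline below
    # condition 1: no author appears on both sides of the divide
    shared = set(traintest['author']) & set(validation['author'])
    assert len(shared) == 0, shared
    # condition 2: within each set, positive and negative classes
    # should be similarly distributed across historical time
    for name, df in [('traintest', traintest), ('validation', validation)]:
        posdates = df.loc[df['realclass'] == 1, 'firstpub']
        negdates = df.loc[df['realclass'] == 0, 'firstpub']
        ks, pval = stats.ks_2samp(posdates, negdates)
        print(name, 'KS statistic:', round(ks, 3), 'p:', round(pval, 3))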
In [42]:
import sys, glob
import os, csv, random
import numpy as np
import pandas as pd
from scipy import stats
The functions defined below are used to create a train/test/validation divide, while also ensuring that authors appear on only one side of the divide and that positive and negative classes are matched on date of first publication.
But the best way to understand the overall workflow may be to scan down a few cells to the bottom function, create_traindev().
In [2]:
def evenlymatchdate(meta, tt_positives, v_positives, negatives):
'''
Given a metadata file, two lists of positive indexes and a (larger) list
of negative indexes, this assigns negatives that match the date distribution
of the two positive lists as closely as possible, working randomly so that
neither list gets "a first shot" at maximally close matches.
The task is complicated by our goal of ensuring that authors are only
represented in the train/test OR the validation set. To do this while
using as much of our sample as we can, we encourage the algorithm to choose
works from already-selected authors when they fit the date parameters needed.
    This is the function of the neg_unmatched lists: works by authors we have
    already chosen, not yet matched to a positive work.
'''
assert len(negatives) > (len(tt_positives) + len(v_positives))
authors = dict()
authors['tt'] = set(meta.loc[tt_positives, 'author'])
authors['v'] = set(meta.loc[v_positives, 'author'])
neg_matched = dict()
neg_matched['tt'] = []
neg_matched['v'] = []
neg_unmatched = dict()
neg_unmatched['v'] = []
neg_unmatched['tt'] = []
negative_meta = meta.loc[negatives, : ]
allpositives = [(x, 'tt') for x in tt_positives]
allpositives.extend([(x, 'v') for x in v_positives])
random.shuffle(allpositives)
for idx, settype in allpositives:
if settype == 'v':
inversetype = 'tt'
else:
inversetype = 'v'
date = meta.loc[idx, 'firstpub']
found = False
negative_meta = negative_meta.assign(diff = np.abs(negative_meta['firstpub'] - date))
for idx2 in neg_unmatched[settype]:
matchdate = meta.loc[idx2, 'firstpub']
if abs(matchdate - date) < 3:
neg_matched[settype].append(idx2)
location = neg_unmatched[settype].index(idx2)
neg_unmatched[settype].pop(location)
found = True
break
if not found:
candidates = []
for i in range(200):
aspirants = negative_meta.index[negative_meta['diff'] == i].tolist()
                # the following section ensures that authors in
                # train/test don't end up also in validation (or vice versa)
for a in aspirants:
asp_author = meta.loc[a, 'author']
if asp_author not in authors[inversetype]:
# don't even consider books by authors already
# in the other set
candidates.append(a)
if len(candidates) > 0:
break
            # we assume at least one candidate was found within the 200-year window
            chosen = random.sample(candidates, 1)[0]
chosenauth = negative_meta.loc[chosen, 'author']
allbyauth = negative_meta.index[negative_meta['author'] == chosenauth].tolist()
authors[settype].add(chosenauth)
            if len(allbyauth) < 1:
                print('error: chosen volume not found in negative_meta')
for idx3 in allbyauth:
if idx3 == chosen:
neg_matched[settype].append(idx3)
# the one we actually chose
else:
neg_unmatched[settype].append(idx3)
# others by same author, to be considered first in future
negative_meta.drop(allbyauth, inplace = True)
if len(negative_meta) == 0:
print('Exhausted negatives! This is surprising.')
break
    # leftover unmatched works by already-chosen authors are folded in at the end
tt_neg = neg_matched['tt'] + neg_unmatched['tt']
v_neg = neg_matched['v'] + neg_unmatched['v']
remaining_neg = negative_meta.index.tolist()
return tt_neg, v_neg, remaining_neg
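A toy example may make the return values concrete; the metadata below is invented purely for illustration.
In [ ]:
toy = pd.DataFrame({'author': ['A', 'B', 'C', 'C', 'D', 'E', 'F', 'G'],
                    'firstpub': [1890, 1900, 1891, 1893, 1901, 1895, 1902, 1899]})
# volume 0 is a train/test positive, volume 1 a validation positive,
# and the remaining six volumes are negatives to be distributed
tt_neg, v_neg, remaining_neg = evenlymatchdate(toy, [0], [1], [2, 3, 4, 5, 6, 7])
print(tt_neg, v_neg, remaining_neg)
# e.g. [2, 3] [4] [5, 6, 7]: both volumes by author C land on the train/test
# side, one matched by date and one carried along unmatched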
In [3]:
def tags2tagset(x):
''' function that will be applied to transform
fantasy | science-fiction into {'fantasy', 'science-fiction'} '''
    if type(x) == float:
        # missing genretags come in as NaN, which is a float
        return set()
else:
return set(x.split(' | '))
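# For example (illustrative values):
#   tags2tagset('fantasy | science-fiction')  ->  {'fantasy', 'science-fiction'}
#   tags2tagset(float('nan'))                 ->  set()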
def divide_training_from_validation(tags4positive, tags4negative, sizecap, metadatapath):
''' This function divides a dataset into two parts: a training-and-test set, and a
validation set. We ensure that authors are represented in one set *or* the other,
not both.
    A model is optimized by grid search and cross-validation on the training-and-test
    set. Then this model is applied to the validation set, and accuracy is recorded.
'''
meta = pd.read_csv(metadatapath)
column_of_sets = meta['genretags'].apply(tags2tagset)
meta = meta.assign(tagset = column_of_sets)
overlap = []
negatives = []
positives = []
for idx, row in meta.iterrows():
if 'drop' in row['tagset']:
continue
# these works were dropped and will not be present in the data folder
posintersect = len(row['tagset'] & tags4positive)
negintersect = len(row['tagset'] & tags4negative)
if posintersect and negintersect:
overlap.append(idx)
elif posintersect:
positives.append(idx)
elif negintersect:
negatives.append(idx)
print()
print('-------------')
print('Begin construction of validation split.')
print("Positives/negatives:", len(positives), len(negatives))
random.shuffle(overlap)
print('Overlap (assigned to pos class): ' + str(len(overlap)))
positives.extend(overlap)
# We do selection by author
positiveauthors = list(set(meta.loc[positives, 'author'].tolist()))
random.shuffle(positiveauthors)
traintest_pos = []
validation_pos = []
donewithtraintest = False
for auth in positiveauthors:
this_auth_indices = meta.index[meta['author'] == auth].tolist()
confirmed_auth_indices = []
for idx in this_auth_indices:
if idx in positives:
confirmed_auth_indices.append(idx)
if not donewithtraintest:
traintest_pos.extend(confirmed_auth_indices)
else:
validation_pos.extend(confirmed_auth_indices)
if len(traintest_pos) > sizecap:
# that's deliberately > rather than >= because we want a cushion
donewithtraintest = True
# Now let's get a set of negatives that match the positives' distribution
# across the time axis.
traintest_neg, validation_neg, remaining_neg = evenlymatchdate(meta, traintest_pos, validation_pos, negatives)
    # For BERT, we want an equal number of positive and negative vols,
# because there will be no subsequent winnowing. This departs from
# our practice with versatiletrainer2.
if len(traintest_neg) > len(traintest_pos):
k = len(traintest_pos)
traintest_neg = random.sample(traintest_neg, k)
traintest = meta.loc[traintest_pos + traintest_neg, : ]
realclass = ([1] * len(traintest_pos)) + ([0] * len(traintest_neg))
traintest = traintest.assign(realclass = realclass)
print("Traintest pos/neg:", len(traintest_pos), len(traintest_neg))
if len(validation_neg) > len(validation_pos):
validation_neg = validation_neg[0: len(validation_pos)]
# we want the balance of pos and neg examples to be even
print("Validation pos/neg:", len(validation_pos), len(validation_neg))
validation = meta.loc[validation_pos + validation_neg, : ]
realclass = ([1] * len(validation_pos)) + ([0] * len(validation_neg))
validation = validation.assign(realclass = realclass)
return traintest, validation
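The whole divide can be exercised on toy metadata. Everything in the next cell is invented for illustration, including the scratch file name.
In [ ]:
# toy run: invented metadata, hypothetical scratch file
toymeta = pd.DataFrame({
    'docid': ['a1', 'b1', 'c1', 'c2', 'd1', 'e1', 'f1', 'g1', 'h1', 'i1'],
    'author': ['A', 'B', 'C', 'C', 'D', 'E', 'F', 'G', 'H', 'I'],
    'firstpub': [1890, 1900, 1891, 1893, 1901, 1895, 1902, 1899, 1897, 1894],
    'genretags': ['gothic', 'gothic', 'random', 'random', 'random',
                  'random', 'random', 'random', 'gothic', 'random']})
toymeta.to_csv('toy_metadata.csv', index = False)
tt, val = divide_training_from_validation({'gothic'}, {'random'}, 1, 'toy_metadata.csv')
print(set(tt['author']) & set(val['author']))    # expect an empty set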
Once we have lists of volumes for the train and validation sets, we iterate through each list and get BERT-sized snippets from each volume in the list. A parameter n defines the maximum number we can take.
The lines are shuffled and written to file in BERT-appropriate format.
We also save a row-level metadata file for the validation ("dev") set; this can be used later to group snippets by volume and interpret accuracy at the volume level.
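For reference, each row of the training and dev files is a headerless, four-column TSV line; a hypothetical example (the text is a placeholder):
In [ ]:
# lineindex \t classlabel \t dummycolumn \t text
print('\t'.join(['0', '1', 'd', 'it was a dark and stormy night ...']))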
In [60]:
def confirm_separation(df1, df2):
'''
Just some stats on the train/test vs validation split.
'''
authors1 = set(df1['author'])
authors2 = set(df2['author'])
overlap = authors1.intersection(authors2)
if len(overlap) > 0:
print('Overlap: ', overlap)
    pos1date = np.mean(df1.loc[df1.realclass == 1, 'firstpub'])
    neg1date = np.mean(df1.loc[df1.realclass == 0, 'firstpub'])
    pos2date = np.mean(df2.loc[df2.realclass == 1, 'firstpub'])
    neg2date = np.mean(df2.loc[df2.realclass == 0, 'firstpub'])
    print("Traintest mean date pos:", pos1date, "neg:", neg1date)
    print("Validation mean date pos:", pos2date, "neg:", neg2date)
print()
def get_snippets(anid, n, snipmaxlen):
'''
Returns snippets from a file. The number of snippets
is determined by the parameter n, the length of
snippets by snipmaxlen.
'''
inpath = '../newtexts/' + anid + '.txt'
with open(inpath, encoding = 'utf-8') as f:
filelines = f.readlines()
words = []
for line in filelines:
newwords = line.strip().split()
words.extend(newwords)
startsnip = False
snippets = []
snip = []
    # Note that we skip the first 256 words; the start of a volume
    # is likely to be uncharacteristic front matter.
for i in range(256, len(words)):
w = words[i]
if w.startswith('<p') and w.endswith('>'):
continue
            # these are pagebreak marks I've inserted
if not startsnip and (w.endswith('.') or w.endswith('?') or w.endswith('"') or w.endswith(',')):
startsnip = True
elif startsnip and len(snip) < snipmaxlen:
snip.append(w.lower())
# we assume an uncased model
elif len(snip) >= snipmaxlen:
snippets.append(' '.join(snip))
snip = []
startsnip = False
# Note that we deliberately don't take the last (incomplete)
# snippet. It's likely to be uncharacteristic. We also skip the
# next to the last snippet for the same reason.
max_n = len(snippets) - 1
if max_n > n:
snippets = random.sample(snippets[0 : -1], n)
# we do this random sampling in hopes of getting snippets across
# the whole length of the book (minus uncharacteristic start and end)
return snippets, max_n
def bertformat(df, n, snipmaxlen):
poslines = []
neglines = []
index = 0
snip_maxes = []
for idx, row in df.iterrows():
the_id = row.docid
the_class = row.realclass
snippets, max_n = get_snippets(the_id, n, snipmaxlen)
snip_maxes.append(max_n)
for s in snippets:
line = dict()
line['docid'] = the_id
line['idx'] = str(index)
line['class'] = str(the_class)
line['dummy'] = 'd'
line['text'] = s
# lineindex-tab-classlabel-tab-dummycolumn-tab-text
if line['class'] == '1':
poslines.append(line)
else:
neglines.append(line)
index += 1
# we want equal pos and neg representation
minlen = min(len(poslines), len(neglines))
poslines = poslines[0 : minlen]
neglines = neglines[0 : minlen]
lines = poslines + neglines
random.shuffle(lines)
outframe = pd.DataFrame(lines)
# the random shuffle is extremely important, given the way BERT works!
print('Average possible snippets:', sum(snip_maxes) / len(snip_maxes))
print('Actual taken per vol: ', str((minlen * 2) / len(df)))
return outframe
def create_traindev(modelname, tags4positive, tags4negative, sizecap, metadatapath, n, snipmaxlen):
    fullname = modelname
    traintest, validation = divide_training_from_validation(tags4positive, tags4negative, sizecap, metadatapath)
    confirm_separation(traintest, validation)
    traintest.to_csv('bertmeta/train_vols_' + fullname + '.csv', index = False)
    validation.to_csv('bertmeta/dev_vols_' + fullname + '.csv', index = False)
    print()
    print('Metadata written. Writing training data ...')
    train_df = bertformat(traintest, n, snipmaxlen)
    train4bert = train_df.loc[ : , ['idx', 'class', 'dummy', 'text']]
    train4bert.to_csv('bertdata/train_' + fullname + '.tsv', sep = '\t', header = False, index = False, quoting = csv.QUOTE_NONE)
    print('... and validation data.')
    print()
    dev_df = bertformat(validation, n, snipmaxlen)
    dev4bert = dev_df.loc[ : , ['idx', 'class', 'dummy', 'text']]
    dev4bert.to_csv('bertdata/dev_' + fullname + '.tsv', sep = '\t', header = False, index = False, quoting = csv.QUOTE_NONE)
    devmeta = dev_df.loc[ : , ['idx', 'docid', 'class']]
    devmeta.to_csv('bertmeta/dev_rows_' + fullname + '.tsv', sep = '\t', index = False)
    # we can use this to interpret results later, grouping them by docid
In [70]:
create_traindev('Goth512max', {'lochorror', 'pbgothic', 'locghost', 'stangothic', 'chihorror'},
{'random', 'chirandom'}, 125, '../meta/finalmeta.csv', 300, 512)
In [66]:
create_traindev('SF512max', {'anatscifi', 'locscifi', 'chiscifi', 'femscifi'},
{'random', 'chirandom'}, 125, '../meta/finalmeta.csv', 300, 512)
In [71]:
create_traindev('Mystery512max', {'locdetective', 'locdetmyst', 'chimyst', 'det100'},
{'random', 'chirandom'}, 125, '../meta/finalmeta.csv', 300, 512)
In [21]:
def number_snippets(anid):
    '''
    Returns 128-word snippets from a file, as a dict mapping
    each snippet to its ordinal position in the volume.
    '''
inpath = '../newtexts/' + anid + '.txt'
with open(inpath, encoding = 'utf-8') as f:
filelines = f.readlines()
words = []
for line in filelines:
newwords = line.strip().split()
words.extend(newwords)
startsnip = False
snippets = []
snip = []
    # Note that we skip the first 256 words; the start of a volume
    # is likely to be uncharacteristic front matter.
for i in range(256, len(words)):
w = words[i]
if w.startswith('<p') and w.endswith('>'):
continue
            # these are pagebreak marks I've inserted
        if not startsnip and (w.endswith('.') or w.endswith('?') or w.endswith('"')):
startsnip = True
elif startsnip and len(snip) < 127:
snip.append(w.lower())
# we assume an uncased model
elif len(snip) >= 127:
snippets.append(' '.join(snip))
snip = []
startsnip = False
# Note that we deliberately don't take the last (incomplete)
# snippet. It's likely to be uncharacteristic. We also skip the
# next to the last snippet for the same reason.
    snipdict = dict()
    for i, s in enumerate(snippets):
        snipdict[s] = i
    maxsnip = len(snippets) - 1
    return snipdict, maxsnip
# Check where in each volume the snippets used by BERT fall: for each doc,
# report the average index of its matched snippets as a fraction of the
# volume's last snippet index. (docs and data are defined in cells not
# shown here; data[3] holds snippet texts.)
for d in docs:
    sd, maxsnip = number_snippets(d)
    indexes = []
    for s in data[3]:
        if s in sd:
            indexes.append(sd[s])
    avg = sum(indexes) / len(indexes)
    print(avg / maxsnip)
In [54]:
def get_snips(text):
    # split a review into nonoverlapping 128-word windows, dropping
    # any trailing window of 64 words or fewer
    words = text.replace('\t', ' ').replace('<br />', ' ').split()
snips = []
for floor in range(0, len(words), 128):
ceiling = floor + 128
if ceiling > len(words):
ceiling = len(words)
s = words[floor : ceiling]
if len(s) > 64:
snips.append(' '.join(s))
return snips
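# For example (illustrative arithmetic): a 300-word review yields windows
# of 128, 128, and 44 words; the trailing 44-word window is dropped because
# it is not longer than 64 words, so get_snips returns two snippets.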
def get_dataset(rootfolder):
    # build BERT-format rows from the pos/neg subfolders of an aclImdb split
negpaths = glob.glob(rootfolder + '/neg/*.txt')
pospaths = glob.glob(rootfolder + '/pos/*.txt')
paths = [(0, x) for x in negpaths] + [(1, x) for x in pospaths]
index = 0
lines = []
for label, p in paths:
docid = p.split('/')[-1].replace('.txt', '')
with open(p) as f:
text = f.read().strip()
snips = get_snips(text)
for s in snips:
line = dict()
line['docid'] = docid
line['idx'] = index
line['class'] = str(label)
line['dummy'] = 'd'
line['text'] = s
index += 1
lines.append(line)
random.shuffle(lines)
outframe = pd.DataFrame(lines)
return outframe
fullname = 'sentiment'
train_df = get_dataset('/Volumes/TARDIS/aclImdb/train')
train4bert = train_df.loc[ : , ['idx', 'class', 'dummy', 'text']]
train4bert.to_csv('bertdata/train_' + fullname + '.tsv', sep = '\t', header = False, index = False, quoting = csv.QUOTE_NONE)
test_df = get_dataset('/Volumes/TARDIS/aclImdb/test')
dev4bert = test_df.loc[ : , ['idx', 'class', 'dummy', 'text']]
dev4bert.to_csv('bertdata/dev_' + fullname + '.tsv', sep = '\t', header = False, index = False, quoting = csv.QUOTE_NONE)
devmeta = test_df.loc[ : , ['idx', 'docid', 'class']]
devmeta.to_csv('bertmeta/dev_rows_' + fullname + '.tsv', sep = '\t', index = False)
# we can use this to interpret results later, grouping them by docid
In [53]:
# quick check that docids come out of the paths correctly
rootfolder = '/Volumes/TARDIS/aclImdb/train'
negpaths = glob.glob(rootfolder + '/neg/*.txt')
n = negpaths[0]
docid = n.split('/')[-1].replace('.txt', '')
print(n, docid)
In [38]:
sum(np.array(dev_df['class'], dtype = 'int8') > 3)
In [39]:
len(dev_df)
In [69]:
import glob
def lengths_dataset(rootfolder):
negpaths = glob.glob(rootfolder + '/neg/*.txt')
pospaths = glob.glob(rootfolder + '/pos/*.txt')
paths = [(0, x) for x in negpaths] + [(1, x) for x in pospaths]
lens = []
for label, p in paths:
with open(p) as f:
words = f.read().split()
lens.append(len(words))
return sum(lens) / len(lens)
lengths_dataset('/Volumes/TARDIS/aclImdb/train')