Since I'm working with long documents, I'm not really concerned with BERT's raw predictions about individual text chunks. Instead, I need to know how good the predictions are once they're aggregated at the volume level.
This notebook answers that question, pairing BERT's predictions with a metadata file produced when the data was originally created. For a given TASK, this file will be named, for instance, bertmeta/dev_rows_{TASK_NAME}.tsv. It records the index (idx) of each text chunk as well as the docid (usually a volume-level ID) of the larger document the chunk came from.
We can then group predictions by docid and evaluate accuracy at the volume level. I have tried doing this two ways: by averaging logits and by binary voting on the chunk-level predictions.
My tentative conclusion is that in most cases binary voting is preferable; I'm not sure the logits are scaled in a way that produces a reliable mean.
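One possible hedge, sketched below but not run as part of this notebook, would be to squash each logit through a sigmoid before averaging, so every chunk contributes a bounded value to the volume mean. The helper names here are mine, purely for illustration.
In [ ]:
# Illustrative sketch only: two ways to aggregate chunk-level output
# into a volume-level prediction. The sigmoid rescaling is an untested
# assumption, not something verified to help on this task.
import numpy as np

def vote(chunk_preds):
    # binary voting: positive if at least half the chunks are predicted positive
    return int(np.mean(chunk_preds) >= 0.5)

def mean_sigmoid_logit(chunk_logits):
    # squash each logit into (0, 1) before averaging, so one extreme
    # chunk can't dominate the volume mean
    probs = 1 / (1 + np.exp(-np.asarray(chunk_logits, dtype=float)))
    return int(probs.mean() >= 0.5)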
In [213]:
# modules needed
import pandas as pd
from scipy.stats import pearsonr
import numpy as np
In [228]:
pred = pd.read_csv('reports/sf512max/predictions.tsv', sep = '\t', header = None, names = ['real', 'pred'])
pred.head()
Out[228]:
In [229]:
meta = pd.read_csv('bertmeta/dev_rows_SF512max.tsv', sep = '\t')
meta.head()
Out[229]:
In [230]:
pred.shape
Out[230]:
In [231]:
meta.shape
Out[231]:
In [232]:
# Here we're aligning the dataframes by setting the index of "pred"
# to match the idx column of "meta."
pred = pred.assign(idx = meta['idx'])
pred = pred.set_index('idx')
pred.head()
Out[232]:
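This alignment silently assumes that predictions.tsv and the metadata file list chunks in the same order; a cheap check is to confirm the row counts match before trusting it.
In [ ]:
# assumption: the two files were written in the same chunk order,
# so matching lengths is a necessary (if not sufficient) condition
assert len(pred) == len(meta)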
In [233]:
correct = []
right = 0
for idx, row in pred.iterrows():
    if row['pred'] == row['real']:
        correct.append(True)
        right += 1
    else:
        correct.append(False)
print(right / len(pred))
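The same chunk-level accuracy can be computed in one vectorized line, which makes a quick cross-check on the loop above:
In [ ]:
# vectorized equivalent of the loop above
print((pred['pred'] == pred['real']).mean())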
In [234]:
byvol = meta.groupby('docid')
rightvols = 0
allvols = 0
bertprobs = dict()

for vol, df in byvol:
    total = 0
    right = 0
    positive = 0
    df.set_index('idx', inplace = True)
    for idx, row in df.iterrows():
        total += 1
        true_class = row['class']
        predicted_class = pred.loc[idx, 'pred']
        assert true_class == pred.loc[idx, 'real']
        if true_class == predicted_class:
            right += 1
        if predicted_class:
            positive += 1
    # fraction of chunks predicted positive; reused below for correlation
    bertprobs[vol] = positive / total
    # a volume counts as right if at least half its chunks were right
    if right / total >= 0.5:
        rightvols += 1
    allvols += 1

print()
print('Overall accuracy:', rightvols / allvols)
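The loop above can also be cross-checked with a pandas groupby; this sketch assumes pred is still indexed by idx, as set two cells earlier.
In [ ]:
# cross-check: majority voting per volume, expressed with groupby
merged = meta.merge(pred.reset_index(), on = 'idx')
volright = merged.groupby('docid').apply(
    lambda df: (df['pred'] == df['class']).mean() >= 0.5)
print('Overall accuracy:', volright.mean())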
In [235]:
pred = pd.read_csv('reports/sf512max/logits.tsv', sep = '\t', header = None, names = ['real', 'pred'])
pred.head()
Out[235]:
In [236]:
correct = []
right = 0
for idx, row in pred.iterrows():
    if row['pred'] >= 0:
        predclass = 1
    else:
        predclass = 0
    if predclass == row['real']:
        correct.append(True)
        right += 1
    else:
        correct.append(False)
print(right / len(pred))
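Again, the vectorized form makes a quick cross-check:
In [ ]:
# vectorized: threshold the logits at zero, then compare to the true class
print(((pred['pred'] >= 0).astype(int) == pred['real']).mean())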
In [238]:
# Here we're aligning the dataframes by setting the index of "pred"
# to match the idx column of "meta."
pred = pred.assign(idx = meta['idx'])
pred = pred.set_index('idx')
pred.head()
Out[238]:
In [239]:
byvol = meta.groupby('docid')
rightvols = 0
allvols = 0

for vol, df in byvol:
    df.set_index('idx', inplace = True)
    predictions = []
    for idx, row in df.iterrows():
        predictions.append(pred.loc[idx, 'pred'])
        true_class = row['class']   # the same for every chunk in a volume
    # average the raw logits across chunks; a positive mean means a positive class
    volmean = sum(predictions) / len(predictions)
    if volmean >= 0:
        predicted_class = 1
    else:
        predicted_class = 0
    if true_class == predicted_class:
        rightvols += 1
    allvols += 1

print()
print('Overall accuracy:', rightvols / allvols)
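And the mean-logit rule has an equivalent groupby form (same ordering assumption as before):
In [ ]:
# cross-check: average the logits per volume and threshold at zero
merged = meta.merge(pred.reset_index(), on = 'idx')
byvol_mean = merged.groupby('docid').agg(
    volmean = ('pred', 'mean'), real = ('class', 'first'))
print('Overall accuracy:',
      ((byvol_mean['volmean'] >= 0).astype(int) == byvol_mean['real']).mean())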
In [38]:
def corrdist(filename, bertprobs):
    '''
    Checks the correlation between volume-level BERT probabilities
    (fraction of chunks predicted positive) and the mean probability
    assigned by ten bag-of-words logistic models stored in ../temp/.
    '''
    root = '../temp/' + filename
    logisticprob = dict()

    for i in range(0, 10):
        tt_df = pd.read_csv(root + str(i) + '.csv', index_col = 'docid')
        for key, value in bertprobs.items():
            if key in tt_df.index:
                l_prob = tt_df.loc[key, 'probability']
                if key not in logisticprob:
                    logisticprob[key] = []
                logisticprob[key].append(l_prob)

    a = []
    b = []
    for key, value in logisticprob.items():
        aval = sum(value) / len(value)   # mean probability across the ten models
        bval = bertprobs[key]
        a.append(aval)
        b.append(bval)

    print(pearsonr(a, b))
    print(len(a), len(b))

corrdist('BoWSF', bertprobs)
In [44]:
# Sanity check: how correlated are the predictions of successive
# logistic models with each other?
thisprobs = dict()
lastprobs = dict()
root = '../temp/BoWSF'

for i in range(0, 10):
    df = pd.read_csv(root + str(i) + '.csv', index_col = 'docid')
    a = []
    b = []
    for idx, row in df.iterrows():
        thisprobs[idx] = row.probability
        if idx in lastprobs:
            a.append(lastprobs[idx])
            b.append(thisprobs[idx])
    if len(a) > 0:
        print(pearsonr(a, b))
    lastprobs = thisprobs
    thisprobs = dict()
In [103]:
met = pd.read_csv('bertmeta/dev_rows_SF0_500.tsv', sep = '\t')
met.head()
Out[103]:
In [112]:
# Treat the task as regression: correlate each volume's mean logit
# with its true class.
byvol = meta.groupby('docid')
volpred = []
volreal = []

for vol, df in byvol:
    df.set_index('idx', inplace = True)
    predictions = []
    for idx, row in df.iterrows():
        predictions.append(pred.loc[idx, 'pred'])
        true_class = float(row['class'])
    volmean = sum(predictions) / len(predictions)
    volpred.append(volmean)
    volreal.append(true_class)

print()
print('Volume-level correlation:', pearsonr(volpred, volreal))