This notebook works up some quick and dirty bag-of-words models, to see how much this approach suffers when we cut whole documents into 128- or 256-word chunks.
We're going to use LogisticRegression from scikit-learn, and apply it in three ways:
To whole documents.
To BERT-sized chunks.
Aggregating the votes from BERT-sized chunks to produce a document-level prediction.
In [1]:
# Things that will come in handy
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter
from scipy.stats import pearsonr
import random, glob, csv
@InProceedings{maas-EtAl:2011:ACL-HLT2011, author = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher}, title = {Learning Word Vectors for Sentiment Analysis}, booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies}, month = {June}, year = {2011}, address = {Portland, Oregon, USA}, publisher = {Association for Computational Linguistics}, pages = {142--150}, url = {http://www.aclweb.org/anthology/P11-1015}}
In [12]:
# Load the whole-document sentiment data and carve off a 75/25 train/test split.
raw = pd.read_csv('sentimentdata.tsv', sep = '\t')
fullname = 'sentiment'
raw = raw.sample(frac = 1)  # sampling the whole frame is in effect a shuffle (unseeded)
cut = round(len(raw) * .75)
train, test = raw.iloc[ : cut, : ], raw.iloc[cut : , : ]
In [46]:
lex = Counter()
# Translation table mapping every non-alphabetic character in the
# Latin-1 range (0-255) to a space, so punctuation/digits become word breaks.
delchars = ''.join(chr(i) for i in range(256) if not chr(i).isalpha())
spaces = ' ' * len(delchars)
punct2space = str.maketrans(delchars, spaces)

def getwords(text):
    """Tokenize `text` into a list of words.

    HTML line breaks ('<br />') are treated as whitespace, every
    non-alphabetic character (in the 0-255 range) is converted to a
    space via `punct2space`, and the result is split on whitespace.
    Case is NOT changed here; callers lowercase beforehand.
    """
    cleaned = text.replace('<br />', ' ')
    return cleaned.translate(punct2space).split()
def get_dataset(rootfolder):
    """Read an aclImdb-style folder tree into a labeled DataFrame.

    Expects `rootfolder` to contain `neg/*.txt` (label 0) and
    `pos/*.txt` (label 1). Texts are lowercased and tokenized with
    `getwords` to build a corpus-frequency vocabulary.

    Returns:
        (vocab, df): `vocab` is the word list ranked by descending
        corpus frequency; `df` has columns 'sent' (0/1) and 'text',
        with rows shuffled (unseeded, so order varies run to run).
    """
    negpaths = glob.glob(rootfolder + '/neg/*.txt')
    pospaths = glob.glob(rootfolder + '/pos/*.txt')
    paths = [(0, x) for x in negpaths] + [(1, x) for x in pospaths]
    lex = Counter()
    labels = []
    texts = []
    for label, p in paths:
        with open(p) as f:
            text = f.read().strip().lower()
        # Counter.update with a word list increments one count per token
        lex.update(getwords(text))
        labels.append(label)
        texts.append(text)
    vocab = [x[0] for x in lex.most_common()]
    print(vocab[0:10])
    df = pd.DataFrame.from_dict({'sent': labels, 'text': texts})
    df = df.sample(frac = 1)
    # shuffle
    return vocab, df
In [47]:
def make_matrix(df, vocab, cut):
    """Build a length-normalized bag-of-words feature matrix.

    Args:
        df: DataFrame with columns 'sent' (0/1 label) and 'text'.
        vocab: word list ranked by frequency; only the first `cut`
            entries become feature columns (silently truncated if
            `cut` exceeds len(vocab)).
        cut: number of vocabulary features to use.

    Returns:
        (x, y): `x` is an (n_rows, cut) ndarray of word counts divided
        by document length; `y` is a list of int labels.
    """
    # column index for each of the `cut` most frequent words
    lexicon = {word: i for i, word in enumerate(vocab[:cut])}
    y = []
    x = []
    for i, row in df.iterrows():
        y.append(int(row['sent']))
        x_row = np.zeros(cut)
        words = getwords(row.text)
        for w in words:
            idx = lexicon.get(w)
            if idx is not None:
                x_row[idx] += 1
        # normalize by document length; original wrote np.sum(len(words)),
        # which is just len(words), and produced NaNs for empty texts
        if words:
            x_row = x_row / len(words)
        x.append(x_row)
    x = np.array(x)
    return x, y
In [63]:
# Grid search over vocabulary size and regularization for whole-document models.
triplets = []
vocab, train_df = get_dataset('/Volumes/TARDIS/aclImdb/train')
print('got training')
dummy, test_df = get_dataset('/Volumes/TARDIS/aclImdb/test')
print('got test')
for cut in range(3200, 5200, 200):
    # matrices and scaling depend only on `cut`, so build them once per cut
    # instead of once per regularization constant
    trainingset, train_y = make_matrix(train_df, vocab, cut)
    testset, test_y = make_matrix(test_df, vocab, cut)
    stdscaler = StandardScaler()
    stdscaler.fit(trainingset)
    scaledtraining = stdscaler.transform(trainingset)
    scaledtest = stdscaler.transform(testset)
    for reg_const in [.00001, .0001, .0003, .001, .01, .1]:
        model = LogisticRegression(C = reg_const)
        model.fit(scaledtraining, train_y)
        predictions = np.round([x[1] for x in model.predict_proba(scaledtest)])
        # sklearn metric signatures are (y_true, y_pred); the original reversed
        # them, which swaps precision and recall inside f1 (accuracy is symmetric)
        accuracy = accuracy_score(test_y, predictions)
        f1 = f1_score(test_y, predictions)
        print(cut, reg_const, f1, accuracy)
        triplets.append((accuracy, cut, reg_const))
random.shuffle(triplets)  # randomize ties before the stable sort
triplets.sort(key = lambda x: x[0])
print(triplets[-1])  # best (accuracy, cut, reg_const)
In [96]:
def get_datachunks(filepath):
    """Load a tab-separated chunk file and build a frequency-ranked vocabulary.

    The file has four unheadered columns: chunk id, label, a dummy
    column, and the chunk text. Quoting is disabled because the text
    may contain stray quote characters.

    Returns:
        (vocab, df): `vocab` is the word list ranked by descending
        frequency across all chunks; `df` keeps the 'sent' and 'text'
        columns in file order.
    """
    data = pd.read_csv(filepath, sep = '\t', header = None, names = ['idx', 'sent', 'dummy', 'text'], quoting = csv.QUOTE_NONE)
    lex = Counter()
    for _, row in data.iterrows():
        lex.update(getwords(row['text'].strip().lower()))
    vocab = [word for word, count in lex.most_common()]
    print(vocab[0:10])
    df = data.loc[ : , ['sent', 'text']]
    return vocab, df
# Grid search over vocabulary size and regularization for 128-word sentiment chunks.
triplets = []
vocab, train_df = get_datachunks('/Users/tunder/Dropbox/fiction/bert/bertdata/train_sentiment.tsv')
print('got training')
dummy, test_df = get_datachunks('/Users/tunder/Dropbox/fiction/bert/bertdata/dev_sentiment.tsv')
print('got test')
for cut in range(2200, 6200, 400):
    # matrices and scaling depend only on `cut`, so build them once per cut
    trainingset, train_y = make_matrix(train_df, vocab, cut)
    testset, test_y = make_matrix(test_df, vocab, cut)
    stdscaler = StandardScaler()
    stdscaler.fit(trainingset)
    scaledtraining = stdscaler.transform(trainingset)
    scaledtest = stdscaler.transform(testset)
    for reg_const in [.00001, .00005, .0001, .0003, .001]:
        model = LogisticRegression(C = reg_const)
        model.fit(scaledtraining, train_y)
        predictions = np.round([x[1] for x in model.predict_proba(scaledtest)])
        # sklearn metric signatures are (y_true, y_pred); the original reversed
        # them, which swaps precision and recall inside f1 (accuracy is symmetric)
        accuracy = accuracy_score(test_y, predictions)
        f1 = f1_score(test_y, predictions)
        print(cut, reg_const, f1, accuracy)
        triplets.append((accuracy, cut, reg_const))
random.shuffle(triplets)  # randomize ties before the stable sort
triplets.sort(key = lambda x: x[0])
print(triplets[-1])  # best (accuracy, cut, reg_const)
In [88]:
# Refit the best chunk-level configuration (5200 features, C = .0001)
# and keep the raw positive-class probabilities for document-level voting.
trainingset, train_y = make_matrix(train_df, vocab, 5200)
testset, test_y = make_matrix(test_df, vocab, 5200)
stdscaler = StandardScaler()
stdscaler.fit(trainingset)
scaledtraining = stdscaler.transform(trainingset)
scaledtest = stdscaler.transform(testset)
model = LogisticRegression(C = .0001)
model.fit(scaledtraining, train_y)
predictions = [prob[1] for prob in model.predict_proba(scaledtest)]
In [89]:
# Build a per-chunk prediction frame, indexed by chunk id so it can be
# joined against the metadata that maps chunks back to documents.
meta = pd.read_csv('bertmeta/dev_rows_sentiment.tsv', sep = '\t')
pred = pd.DataFrame.from_dict({'idx': meta['idx'], 'pred': predictions, 'real': test_y}).set_index('idx')
pred.head()
Out[89]:
In [93]:
# Chunk-level accuracy: threshold each probability at 0.5 and compare to truth.
right = 0
for idx, row in pred.iterrows():
    predclass = 1 if row['pred'] >= 0.5 else 0
    if predclass == row['real']:
        right += 1
print(right / len(pred))
In [94]:
# Aggregate chunk probabilities to document level: a document is predicted
# positive when the mean of its chunks' probabilities is >= 0.5.
# (The original also initialized total/right/positive/bertprobs, none of
# which were ever read; they are removed here.)
byvol = meta.groupby('docid')
rightvols = 0
allvols = 0
for vol, df in byvol:
    df.set_index('idx', inplace = True)
    # collect this document's chunk-level probabilities
    predicted = [pred.loc[idx, 'pred'] for idx in df.index]
    # every chunk of a document carries the same label; take the last row's,
    # matching the original loop's behavior
    true_class = df['class'].iloc[-1]
    volmean = sum(predicted) / len(predicted)
    if volmean >= 0.5:
        predicted_class = 1
    else:
        predicted_class = 0
    if true_class == predicted_class:
        rightvols += 1
    allvols += 1
print()
print('Overall accuracy:', rightvols / allvols)
In [102]:
# Grid search over vocabulary size and regularization for 256-word mystery-genre chunks.
triplets = []
vocab, train_df = get_datachunks('/Users/tunder/Dropbox/fiction/bert/bertdata/train_Mystery256.tsv')
print('got training')
dummy, test_df = get_datachunks('/Users/tunder/Dropbox/fiction/bert/bertdata/dev_Mystery256.tsv')
print('got test')
for cut in range(2000, 6200, 400):
    # matrices and scaling depend only on `cut`, so build them once per cut
    trainingset, train_y = make_matrix(train_df, vocab, cut)
    testset, test_y = make_matrix(test_df, vocab, cut)
    stdscaler = StandardScaler()
    stdscaler.fit(trainingset)
    scaledtraining = stdscaler.transform(trainingset)
    scaledtest = stdscaler.transform(testset)
    for reg_const in [.00001, .00005, .0001, .0003, .001]:
        model = LogisticRegression(C = reg_const)
        model.fit(scaledtraining, train_y)
        predictions = np.round([x[1] for x in model.predict_proba(scaledtest)])
        # sklearn metric signatures are (y_true, y_pred); the original reversed
        # them, which swaps precision and recall inside f1 (accuracy is symmetric)
        accuracy = accuracy_score(test_y, predictions)
        f1 = f1_score(test_y, predictions)
        print(cut, reg_const, f1, accuracy)
        triplets.append((accuracy, cut, reg_const))
random.shuffle(triplets)  # randomize ties before the stable sort
triplets.sort(key = lambda x: x[0])
print(triplets[-1])  # best (accuracy, cut, reg_const)
In [104]:
# Refit the best mystery-genre configuration (6000 features, C = .00001)
# and build the chunk-id-indexed prediction frame for document-level voting.
trainingset, train_y = make_matrix(train_df, vocab, 6000)
testset, test_y = make_matrix(test_df, vocab, 6000)
stdscaler = StandardScaler()
stdscaler.fit(trainingset)
scaledtraining = stdscaler.transform(trainingset)
scaledtest = stdscaler.transform(testset)
model = LogisticRegression(C = .00001)
model.fit(scaledtraining, train_y)
predictions = [prob[1] for prob in model.predict_proba(scaledtest)]
meta = pd.read_csv('bertmeta/dev_rows_Mystery256.tsv', sep = '\t')
pred = pd.DataFrame.from_dict({'idx': meta['idx'], 'pred': predictions, 'real': test_y}).set_index('idx')
pred.head()
Out[104]:
In [105]:
# Aggregate chunk probabilities to document level for the mystery-genre model:
# a document is predicted positive when its mean chunk probability is >= 0.5.
# (The original also initialized total/right/positive/bertprobs, none of
# which were ever read; they are removed here.)
byvol = meta.groupby('docid')
rightvols = 0
allvols = 0
for vol, df in byvol:
    df.set_index('idx', inplace = True)
    # collect this document's chunk-level probabilities
    predicted = [pred.loc[idx, 'pred'] for idx in df.index]
    # every chunk of a document carries the same label; take the last row's,
    # matching the original loop's behavior
    true_class = df['class'].iloc[-1]
    volmean = sum(predicted) / len(predicted)
    if volmean >= 0.5:
        predicted_class = 1
    else:
        predicted_class = 0
    if true_class == predicted_class:
        rightvols += 1
    allvols += 1
print()
print('Overall accuracy:', rightvols / allvols)
Aside: It's really remarkable how powerful binary voting can be. In this case models of genre at 256-word scale are pretty awful (75.5% accuracy) but aggregate up to 87.7% accuracy. But that's still not quite in the same league with models that can see whole novels; in that case the detective/mystery genre can be modeled with more than 91% accuracy. Something is lost when we can't see the whole elephant at once.
In [ ]: