Logistic models for blog post

This notebook works up some quick-and-dirty bag-of-words models to see how much this approach suffers when we cut whole documents into 128- or 256-word chunks.

We're going to use LogisticRegression from scikit-learn, and apply it in three ways:

  1. To whole documents.

  2. To BERT-sized chunks.

  3. To BERT-sized chunks, aggregating their votes to produce a document-level prediction.


In [1]:
# Things that will come in handy

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from collections import Counter
from scipy.stats import pearsonr
import random, glob, csv

Modeling whole movie reviews from the IMDb dataset

@InProceedings{maas-EtAl:2011:ACL-HLT2011,
  author    = {Maas, Andrew L. and Daly, Raymond E. and Pham, Peter T. and Huang, Dan and Ng, Andrew Y. and Potts, Christopher},
  title     = {Learning Word Vectors for Sentiment Analysis},
  booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
  month     = {June},
  year      = {2011},
  address   = {Portland, Oregon, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {142--150},
  url       = {http://www.aclweb.org/anthology/P11-1015}
}


In [12]:
raw = pd.read_csv('sentimentdata.tsv', sep = '\t')

fullname = 'sentiment'

raw = raw.sample(frac = 1)
# that is in effect a shuffle

cut = round(len(raw) * .75)

train = raw.iloc[0: cut, : ]
test = raw.iloc[cut : , : ]

In [46]:
delchars = ''.join(c for c in map(chr, range(256)) if not c.isalpha())
spaces = ' ' * len(delchars)
punct2space = str.maketrans(delchars, spaces)

def getwords(text):
    # replace HTML breaks with spaces, map every other non-alphabetic
    # character to a space, and split on whitespace
    text = text.replace('<br />', ' ')
    return text.translate(punct2space).split()

def get_dataset(rootfolder):
    
    negpaths = glob.glob(rootfolder + '/neg/*.txt')
    pospaths = glob.glob(rootfolder + '/pos/*.txt')
    paths = [(0, x) for x in negpaths] + [(1, x) for x in pospaths]
    
    lex = Counter()
    labels = []
    texts = []
    
    for label, p in paths:
        
        with open(p) as f:
            text = f.read().strip().lower()
            words = getwords(text)
            for w in words:
                lex[w] += 1
            labels.append(label)
            texts.append(text)

    vocab = [x[0] for x in lex.most_common()]
    print(vocab[0:10])
    
    df = pd.DataFrame.from_dict({'sent': labels, 'text': texts})
    df = df.sample(frac = 1)
    # shuffle
    
    return vocab, df
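
As a quick sanity check on the tokenizer, here is what getwords does to a made-up snippet (not from the dataset). Note that apostrophes split contractions, and that lowercasing happens upstream, before getwords is called:

getwords("This movie was great!<br />I'd watch it again...")
# -> ['This', 'movie', 'was', 'great', 'I', 'd', 'watch', 'it', 'again']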

In [47]:
def make_matrix(df, vocab, cut):
    
    # map the `cut` most frequent words to column indices
    lexicon = {vocab[i]: i for i in range(cut)}
    
    y = []
    x = []
    
    for i, row in df.iterrows():
        y.append(int(row['sent']))
        x_row = np.zeros(cut)
        words = getwords(row.text)
        for w in words:
            if w in lexicon:
                idx = lexicon[w]
                x_row[idx] = x_row[idx] + 1
        
        # normalize counts to relative frequencies
        x_row = x_row / len(words)
        
        x.append(x_row)
    
    x = np.array(x)
    
    return x, y
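
To make the representation concrete, here's a toy illustration of what make_matrix produces; the vocabulary and text are hypothetical, not drawn from the dataset:

toy_df = pd.DataFrame({'sent': [1], 'text': ['the movie was good the end']})
toy_vocab = ['the', 'movie', 'good', 'bad']
x, y = make_matrix(toy_df, toy_vocab, 4)
# x -> [[0.333, 0.167, 0.167, 0.0]]  (counts [2, 1, 1, 0] over 6 words)
# y -> [1]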

In [63]:
triplets = []

vocab, train_df = get_dataset('/Volumes/TARDIS/aclImdb/train')
print('got training')
dummy, test_df = get_dataset('/Volumes/TARDIS/aclImdb/test')
print('got test')

for cut in range(3200, 5200, 200):

    # the matrices depend only on the vocabulary cutoff, so build them
    # once per cut rather than once per regularization constant
    trainingset, train_y = make_matrix(train_df, vocab, cut)
    testset, test_y = make_matrix(test_df, vocab, cut)

    for reg_const in [.00001, .0001, .0003, .001, .01, .1]:
        
        model = LogisticRegression(C = reg_const)
        stdscaler = StandardScaler()
        stdscaler.fit(trainingset)
        scaledtraining = stdscaler.transform(trainingset)
        model.fit(scaledtraining, train_y)

        scaledtest = stdscaler.transform(testset)
        predictions = [x[1] for x in model.predict_proba(scaledtest)]
        predictions = np.round(predictions)
        accuracy = accuracy_score(test_y, predictions)
        f1 = f1_score(test_y, predictions)
        print(cut, reg_const, f1, accuracy)
        triplets.append((accuracy, cut, reg_const))

# shuffle first so that ties in accuracy are broken at random,
# then sort ascending and report the best setting
random.shuffle(triplets)
triplets.sort(key = lambda x: x[0])
print(triplets[-1])


['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']
got training
['the', 'and', 'a', 'of', 'to', 'is', 'it', 'in', 'i', 'this']
got test
3200 1e-05 0.8492538504508818 0.84888
3200 0.0001 0.8725717483411942 0.87248
3200 0.0003 0.8759947214779862 0.87596
3200 0.001 0.8708091908091908 0.87068
3200 0.01 0.8593687574910108 0.8592
3200 0.1 0.8512644321041908 0.85108
3400 1e-05 0.8499640488935049 0.84976
3400 0.0001 0.8742203742203742 0.87416
3400 0.0003 0.8748 0.8748
3400 0.001 0.8709380870538391 0.87084
3400 0.01 0.8584463921239045 0.85852
3400 0.1 0.8480916336256958 0.84828
3600 1e-05 0.8521301254895692 0.852
3600 0.0001 0.8736008954269269 0.87352
3600 0.0003 0.8757154865308411 0.8758
3600 0.001 0.8704771053474224 0.87056
3600 0.01 0.8545301407772832 0.85492
3600 0.1 0.8419151583256411 0.84244
3800 1e-05 0.8524065027628734 0.85256
3800 0.0001 0.8751649934002641 0.87516
3800 0.0003 0.8758065162505511 0.87604
3800 0.001 0.8714886796233221 0.87172
3800 0.01 0.8534115566985454 0.85408
3800 0.1 0.8392627173213135 0.84024
4000 1e-05 0.8529435322386865 0.85312
4000 0.0001 0.8755904251060764 0.87568
4000 0.0003 0.8764180061730871 0.87668
4000 0.001 0.8711863047748868 0.87148
4000 0.01 0.8523853468683545 0.85284
4000 0.1 0.8385646624387895 0.83912
4200 1e-05 0.853651696646769 0.85388
4200 0.0001 0.8747645573678516 0.875
4200 0.0003 0.875270888514327 0.87568
4200 0.001 0.8710971988121036 0.87152
4200 0.01 0.8497605344709622 0.85068
4200 0.1 0.8349569277835923 0.836
4400 1e-05 0.8540999359385011 0.85424
4400 0.0001 0.8760264370118166 0.8762
4400 0.0003 0.8754516983859311 0.87592
4400 0.001 0.8709379393452501 0.87148
4400 0.01 0.8495090938355062 0.8504
4400 0.1 0.8326770071526158 0.83344
4600 1e-05 0.854051454676605 0.85432
4600 0.0001 0.8750802310654686 0.87544
4600 0.0003 0.8748443087307645 0.8754
4600 0.001 0.8700233250221184 0.87072
4600 0.01 0.8499718445820931 0.8508
4600 0.1 0.8285047481088041 0.82952
4800 1e-05 0.8537866335244356 0.85412
4800 0.0001 0.8751154108626712 0.87556
4800 0.0003 0.8741705875256364 0.87484
4800 0.001 0.8679169349645848 0.86872
4800 0.01 0.847539596179422 0.84868
4800 0.1 0.824866040852504 0.82612
5000 1e-05 0.8540813869491933 0.85456
5000 0.0001 0.8754522794886227 0.87608
5000 0.0003 0.8742953776775648 0.87512
5000 0.001 0.8682489706950837 0.86944
5000 0.01 0.8438511326860841 0.8456
5000 0.1 0.8213622040882281 0.82312
(0.87668, 4000, 0.0003)
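
A side note on the modeling idiom: the scaler and the classifier always travel together, so the paired fit/transform calls above could be bundled into a scikit-learn Pipeline, which removes the chance of forgetting to transform the test set. A minimal sketch of the idea, equivalent to one iteration of the loop above:

from sklearn.pipeline import make_pipeline

pipe = make_pipeline(StandardScaler(), LogisticRegression(C = .0003))
pipe.fit(trainingset, train_y)
probs = pipe.predict_proba(testset)[:, 1]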

Cut the reviews down to 128-word chunks; how does the model perform?

Here I'm using the same data files that were given to BERT.
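
The chunk files themselves were produced in an earlier preprocessing step, so the chunking code doesn't appear in this notebook. For the gist of it, here's a minimal sketch of fixed-length chunking; the real pipeline may differ in details such as chunk boundaries and the handling of short remainders:

def chunk_words(words, size = 128):
    # slice a token list into consecutive chunks of at most `size` words
    return [words[i : i + size] for i in range(0, len(words), size)]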


In [96]:
def get_datachunks(filepath):
    
    data = pd.read_csv(filepath, sep = '\t', header = None, names = ['idx', 'sent', 'dummy', 'text'], quoting = csv.QUOTE_NONE)
    
    lex = Counter()
    
    for i, row in data.iterrows():
        text = row['text'].strip().lower()
        words = getwords(text)
        for w in words:
            lex[w] += 1

    vocab = [x[0] for x in lex.most_common()]
    print(vocab[0:10])
    
    df = data.loc[ : , ['sent', 'text']]
    
    return vocab, df

triplets = []

vocab, train_df = get_datachunks('/Users/tunder/Dropbox/fiction/bert/bertdata/train_sentiment.tsv')
print('got training')
dummy, test_df = get_datachunks('/Users/tunder/Dropbox/fiction/bert/bertdata/dev_sentiment.tsv')
print('got test')

for cut in range(2200, 6200, 400):

    # as above, build the matrices once per vocabulary cutoff
    trainingset, train_y = make_matrix(train_df, vocab, cut)
    testset, test_y = make_matrix(test_df, vocab, cut)

    for reg_const in [.00001, .00005, .0001, .0003, .001]:
        
        model = LogisticRegression(C = reg_const)
        stdscaler = StandardScaler()
        stdscaler.fit(trainingset)
        scaledtraining = stdscaler.transform(trainingset)
        model.fit(scaledtraining, train_y)

        scaledtest = stdscaler.transform(testset)
        predictions = [x[1] for x in model.predict_proba(scaledtest)]
        predictions = np.round(predictions)
        accuracy = accuracy_score(test_y, predictions)
        f1 = f1_score(test_y, predictions)
        print(cut, reg_const, f1, accuracy)
        triplets.append((accuracy, cut, reg_const))

random.shuffle(triplets)
triplets.sort(key = lambda x: x[0])
print(triplets[-1])


['the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i', 'this']
got training
['the', 'and', 'a', 'of', 'to', 'is', 'in', 'it', 'i', 'this']
got test
2200 1e-05 0.7959050511868602 0.792454125291722
2200 5e-05 0.8080544755826236 0.8045806067816775
2200 0.0001 0.8107440429402385 0.8071889443097058
2200 0.0003 0.8112095722189735 0.8079439893836087
2200 0.001 0.8087246808318672 0.8059762961607102
2600 1e-05 0.7992070107459056 0.796069189584954
2600 5e-05 0.8111916445324809 0.8080812703061364
2600 0.0001 0.8132145107570438 0.8100947238365441
2600 0.0003 0.8128761750185974 0.8100718436827895
2600 0.001 0.8105947703563201 0.8080812703061364
3000 1e-05 0.8009198719393966 0.7979682423465886
3000 5e-05 0.8134257800189095 0.8103692856815998
3000 0.0001 0.8152501914155745 0.8122912185969889
3000 0.0003 0.8147196682891653 0.8118793758294056
3000 0.001 0.8115392429064807 0.8089735963025672
3400 1e-05 0.8019611830362187 0.7994554523406397
3400 5e-05 0.813970803907854 0.8113531322930491
3400 0.0001 0.8166403607666292 0.8139385896673226
3400 0.0003 0.8157609676836974 0.8132064247471743
3400 0.001 0.8125777064448314 0.8103006452203358
3800 1e-05 0.8046912073757712 0.802246831098705
3800 5e-05 0.8157966040462428 0.8133437056697022
3800 0.0001 0.8172901305553676 0.814602114126207
3800 0.0003 0.81690077232284 0.8144877133574338
3800 0.001 0.8115555153379784 0.8094083192239052
4200 1e-05 0.8045966610867303 0.8023612318674781
4200 5e-05 0.8160202513335142 0.8137555484372855
4200 0.0001 0.8173111673292375 0.8149453164325264
4200 0.0003 0.815909604519774 0.8136182675147577
4200 0.001 0.8102952171647372 0.808424472612456
4600 1e-05 0.8044088357776571 0.8022697112524596
4600 5e-05 0.8165902365193034 0.8142360316661328
4600 0.0001 0.8185410128468538 0.8161122042740128
4600 0.0003 0.8158043443864288 0.8135496270534938
4600 0.001 0.8097373188405798 0.8077609481535716
5000 1e-05 0.8057524629147322 0.8037569212465108
5000 5e-05 0.8170797901212231 0.8149453164325264
5000 0.0001 0.8180934951083396 0.8157918821214478
5000 0.0003 0.815861584799692 0.8139614698210772
5000 0.001 0.8096383355266142 0.8080355099986272
5400 1e-05 0.8065817506006073 0.80474076785796
5400 5e-05 0.8178139482310387 0.8156088408914108
5400 0.0001 0.8183954802259887 0.8161350844277674
5400 0.0003 0.8145349231848316 0.8124513796732714
5400 0.001 0.8082092262985833 0.8067084610808585
5800 1e-05 0.807816112798658 0.8060220564682194
5800 5e-05 0.8179370573994886 0.8158834027364664
5800 0.0001 0.8182928484190528 0.8161808447352766
5800 0.0003 0.8143604137899813 0.8123598590582529
5800 0.001 0.8063528077141235 0.8047178877042054
(0.8161808447352766, 5800, 0.0001)

How much can we improve our chunk-level results by aggregating them?


In [88]:
# refit a strong chunk-level model, with settings close to the best found above
trainingset, train_y = make_matrix(train_df, vocab, 5200)
testset, test_y = make_matrix(test_df, vocab, 5200)
model = LogisticRegression(C = .0001)
stdscaler = StandardScaler()
stdscaler.fit(trainingset)
scaledtraining = stdscaler.transform(trainingset)
model.fit(scaledtraining, train_y)

scaledtest = stdscaler.transform(testset)
predictions = [x[1] for x in model.predict_proba(scaledtest)]

In [89]:
# make a dataframe
meta = pd.read_csv('bertmeta/dev_rows_sentiment.tsv', sep = '\t')
pred = pd.DataFrame.from_dict({'idx': meta['idx'], 'pred': predictions, 'real': test_y})
pred = pred.set_index('idx')
pred.head()


Out[89]:
           pred  real
idx
996    0.161673     0
38639  0.284180     1
5648   0.055561     0
43581  0.590964     1
36158  0.660262     1

In [93]:
# chunk-level accuracy: threshold each predicted probability at 0.5
right = 0

for idx, row in pred.iterrows():
    if row['pred'] >= 0.5:
        predclass = 1
    else:
        predclass = 0
        
    if predclass == row['real']:
        right += 1

print(right / len(pred))


0.8162266050427859

In [94]:
byvol = meta.groupby('docid')
rightvols = 0
allvols = 0

for vol, df in byvol:
    df.set_index('idx', inplace = True)
    predicted = []
    for idx, row in df.iterrows():
        predicted.append(pred.loc[idx, 'pred'])
        # every chunk of a volume carries the same true class
        true_class = row['class']
    
    # average the chunk probabilities to get a volume-level prediction
    volmean = sum(predicted) / len(predicted)
    if volmean >= 0.5:
        predicted_class = 1
    else:
        predicted_class = 0
    
    if true_class == predicted_class:
        rightvols += 1
    allvols += 1

print()
print('Overall accuracy:', rightvols / allvols)


Overall accuracy: 0.86454402849027
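
Incidentally, the same mean-probability aggregation can be written more compactly with pandas. A sketch that should reproduce the loop above, assuming (as the loop does) that meta carries docid, idx, and class columns:

agg = meta.loc[:, ['docid', 'idx', 'class']].merge(pred, left_on = 'idx', right_index = True)
byvol = agg.groupby('docid').agg(volmean = ('pred', 'mean'), true_class = ('class', 'first'))
volacc = ((byvol['volmean'] >= 0.5).astype(int) == byvol['true_class']).mean()
print('Overall accuracy:', volacc)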

What about the parallel problem for genre?

We use the same data that was passed to BERT.


In [102]:
triplets = []

vocab, train_df = get_datachunks('/Users/tunder/Dropbox/fiction/bert/bertdata/train_Mystery256.tsv')
print('got training')
dummy, test_df = get_datachunks('/Users/tunder/Dropbox/fiction/bert/bertdata/dev_Mystery256.tsv')
print('got test')

for cut in range(2000, 6200, 400):

    # again, build the matrices once per vocabulary cutoff
    trainingset, train_y = make_matrix(train_df, vocab, cut)
    testset, test_y = make_matrix(test_df, vocab, cut)

    for reg_const in [.00001, .00005, .0001, .0003, .001]:
        
        model = LogisticRegression(C = reg_const)
        stdscaler = StandardScaler()
        stdscaler.fit(trainingset)
        scaledtraining = stdscaler.transform(trainingset)
        model.fit(scaledtraining, train_y)

        scaledtest = stdscaler.transform(testset)
        predictions = [x[1] for x in model.predict_proba(scaledtest)]
        predictions = np.round(predictions)
        accuracy = accuracy_score(test_y, predictions)
        f1 = f1_score(test_y, predictions)
        print(cut, reg_const, f1, accuracy)
        triplets.append((accuracy, cut, reg_const))

random.shuffle(triplets)
triplets.sort(key = lambda x: x[0])
print(triplets[-1])


['the', 'and', 'to', 'a', 'of', 'i', 'he', 'in', 'was', 'it']
got training
['the', 'and', 'to', 'of', 'a', 'i', 'he', 'in', 'was', '”']
got test
2000 1e-05 0.7380035630136199 0.744955805019953
2000 5e-05 0.7374129774222086 0.7453847014507888
2000 0.0001 0.7321996032586715 0.7407041360534069
2000 0.0003 0.7232408229436059 0.7325923992093387
2000 0.001 0.7135843513220967 0.7236788125163167
2400 1e-05 0.7414472798600026 0.7492820646701227
2400 5e-05 0.7369196471842591 0.7469324581359789
2400 0.0001 0.73058597016667 0.7416738149405139
2400 0.0003 0.7198266606804739 0.732349979487562
2400 0.001 0.7091865673101371 0.7227277813001156
2800 1e-05 0.7440613026819923 0.7508671166971246
2800 5e-05 0.7382283753036907 0.7468392197814493
2800 0.0001 0.7325671089170009 0.7413940998769254
2800 0.0003 0.7211278485901893 0.7307276321187484
2800 0.001 0.7093805583481555 0.7196882109424533
3200 1e-05 0.7449090804795502 0.7512214224443367
3200 5e-05 0.7397687260693052 0.7477902509976504
3200 0.0001 0.7333962009969016 0.7416738149405139
3200 0.0003 0.7207193330381039 0.7295155335098646
3200 0.001 0.7086511054618287 0.7178980345354865
3600 1e-05 0.7465601589970954 0.7526945884459031
3600 5e-05 0.743108178103478 0.7502890388990415
3600 0.0001 0.7347056449287896 0.7426061984858091
3600 0.0003 0.7219459750033598 0.7299257822697945
3600 0.001 0.7090664617748752 0.7175623764591802
4000 1e-05 0.7449929154061196 0.7516503188751725
4000 5e-05 0.7397850289384121 0.7476410696304031
4000 0.0001 0.7330264450393883 0.7415246335732667
4000 0.0003 0.7172339442476513 0.7261030097340843
4000 0.001 0.7014902383611229 0.7108865102748667
4400 1e-05 0.745502665797246 0.7525454070786559
4400 5e-05 0.7386530014641287 0.7470256964905083
4400 0.0001 0.7299945983486381 0.7390071980009697
4400 0.0003 0.7142279163857047 0.7233990974527281
4400 0.001 0.6978890298765716 0.7069704993846269
4800 1e-05 0.7435098650051922 0.7512773654570544
4800 5e-05 0.7361022364217251 0.7458508932234363
4800 0.0001 0.7272339021217945 0.737981576101145
4800 0.0003 0.7111723495507385 0.722466713907433
4800 0.001 0.6922032056506383 0.7042106440905531
5200 1e-05 0.7429693364901542 0.7509976503934659
5200 5e-05 0.7343204899604621 0.74437772722187
5200 0.0001 0.7265977297465401 0.7377018610375564
5200 0.0003 0.7106312615504328 0.7226158952746802
5200 0.001 0.6916382750900417 0.7046395405213889
5600 1e-05 0.7468342268595914 0.7543169358147167
5600 5e-05 0.7391068802724248 0.7485548055047925
5600 0.0001 0.7304648094979437 0.7409092604333719
5600 0.0003 0.7129800322123687 0.7241822996307761
5600 0.001 0.6942886529915194 0.7062432402192966
6000 1e-05 0.747579157700584 0.7550068996382352
6000 5e-05 0.7392405063291139 0.7483869764666393
6000 0.0001 0.7299044080653276 0.7397158094953941
6000 0.0003 0.7117383980949062 0.7223548278819976
6000 0.001 0.6930394431554525 0.7039495766978704
(0.7550068996382352, 6000, 1e-05)

And now, aggregating the genre chunks


In [104]:
# refit the best model from the grid search above (cut = 6000, C = .00001)

trainingset, train_y = make_matrix(train_df, vocab, 6000)
testset, test_y = make_matrix(test_df, vocab, 6000)
model = LogisticRegression(C = .00001)
stdscaler = StandardScaler()
stdscaler.fit(trainingset)
scaledtraining = stdscaler.transform(trainingset)
model.fit(scaledtraining, train_y)

scaledtest = stdscaler.transform(testset)
predictions = [x[1] for x in model.predict_proba(scaledtest)]

# make a dataframe
meta = pd.read_csv('bertmeta/dev_rows_Mystery256.tsv', sep = '\t')
pred = pd.DataFrame.from_dict({'idx': meta['idx'], 'pred': predictions, 'real': test_y})
pred = pred.set_index('idx')
pred.head()


Out[104]:
           pred  real
idx
42274  0.372493     0
47664  0.248213     0
834    0.545889     1
17125  0.713855     1
33412  0.247855     0

In [105]:
byvol = meta.groupby('docid')
rightvols = 0
allvols = 0

for vol, df in byvol:
    df.set_index('idx', inplace = True)
    predicted = []
    for idx, row in df.iterrows():
        predicted.append(pred.loc[idx, 'pred'])
        # every chunk of a volume carries the same true class
        true_class = row['class']
    
    # average the chunk probabilities to get a volume-level prediction
    volmean = sum(predicted) / len(predicted)
    if volmean >= 0.5:
        predicted_class = 1
    else:
        predicted_class = 0
    
    if true_class == predicted_class:
        rightvols += 1
    allvols += 1

print()
print('Overall accuracy:', rightvols / allvols)


Overall accuracy: 0.8770491803278688

Aside: It's really remarkable how powerful this kind of chunk-level voting can be. In this case, models of genre at the 256-word scale are pretty weak (75.5% accuracy), but their votes aggregate up to 87.7% accuracy at the volume level. That's still not quite in the same league as models that can see whole novels: there, the detective/mystery genre can be modeled with more than 91% accuracy. Something is lost when we can't see the whole elephant at once.
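
Strictly speaking, the loops above average probabilities before thresholding rather than taking a hard binary vote, which would threshold each chunk first and then take the majority. It's a one-line variation, sketched here under the same assumptions about pred and meta as before:

# threshold each chunk prediction, then take the mean vote per volume
votes = (pred['pred'] >= 0.5).astype(int)
volvote = votes.groupby(meta.set_index('idx')['docid']).mean()
predicted_class = (volvote >= 0.5).astype(int)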

