This is my attempt to implement Thompson & Mimno 2018 ("Authorless Topic Models"): find words that are unusually frequent for particular authors and assign them author-specific stop probabilities.
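In outline, as the code below realizes it: for each word in the lexicon, fit a gamma distribution to the word's per-author relative frequencies by the method of moments, take that distribution's 95th percentile as a threshold q, and for every author whose observed frequency f exceeds q record

stopprob = 1 - (q / f)

so that dropping the word with probability stopprob pulls that author's frequency back down to roughly the threshold.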
In [13]:
import sys, csv, math, random
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
from scipy.stats import pearsonr
from scipy.stats import gamma
from collections import Counter
In [5]:
rv = gamma(3., loc = 0., scale = 2.)   # toy gamma (shape 3, scale 2), just to eyeball the distribution family
In [8]:
fig, ax = plt.subplots()
x = np.linspace(0, 20)
ax.plot(x, rv.pdf(x), 'r-', lw=5, alpha=0.6, label='gamma pdf')
plt.show()
In [9]:
meta = pd.read_csv('../../metadata/filtered_fiction_plus_18c.tsv', sep = '\t', index_col = 'docid')
meta = meta[~meta.index.duplicated(keep = 'first')]
In [124]:
# build a per-author Counter of character words from the two character tables
authorvocab = dict()
with open('/Users/tunder/data/character_table_18c19c.tsv', encoding = 'utf-8') as f:
    for line in f:
        fields = line.strip().split('\t')
        docid = fields[0]
        if docid not in meta.index:
            continue
        author = meta.loc[docid, 'author']
        if author not in authorvocab:
            authorvocab[author] = Counter()
        words = fields[5].split()
        for w in words:
            authorvocab[author][w] += 1
with open('/Users/tunder/data/character_table_post1900.tsv', encoding = 'utf-8') as f:
    for line in f:
        fields = line.strip().split('\t')
        docid = fields[0]
        if docid not in meta.index:
            continue
        author = meta.loc[docid, 'author']
        if author not in authorvocab:
            authorvocab[author] = Counter()
        words = fields[5].split()
        for w in words:
            authorvocab[author][w] += 1
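(A note on my assumption about the input format: each row of these character tables should have the docid in field 0 and a space-separated word list in field 5; the intermediate columns aren't used. The cell below just peeks at the first row to confirm.)
In [ ]:
with open('/Users/tunder/data/character_table_18c19c.tsv', encoding = 'utf-8') as f:
    first = next(f).strip().split('\t')
print(len(first), first[0], first[5][:60])   # expect a docid and the start of a word list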
In [128]:
allwords = Counter()
ctr = 0
for author, vocab in authorvocab.items():
    allwords = allwords + vocab   # Counter addition copies everything each time, so this pass is slow
    ctr += 1
    if ctr % 10 == 1:
        print(ctr)
lexicon = [x[0] for x in allwords.most_common()]   # token-frequency lexicon; replaced by the author-count version below
print(len(lexicon))
In [130]:
allwords = Counter()
ctr = 0
for author, vocab in authorvocab.items():
    ctr += 1
    if ctr % 100 == 1:
        print(ctr)
    for w in vocab.keys():
        allwords[w] += 1   # count distinct authors using each word
print(len(allwords))
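A toy check of what this loop computes (my own example): the counts are numbers of distinct authors using each word, not token frequencies.
In [ ]:
toy = {'A': Counter({'the': 10, 'whale': 3}), 'B': Counter({'the': 7})}
df = Counter()
for vocab in toy.values():
    for w in vocab:
        df[w] += 1
print(df)   # Counter({'the': 2, 'whale': 1})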
In [144]:
lexicon = allwords.most_common()
lexicon = [x for x in lexicon if x[1] > 20]   # keep words used by more than 20 authors
print(len(lexicon))
In [145]:
with open('../dataprep/ficlexicon.tsv', mode = 'w', encoding = 'utf-8') as f:
    f.write('word\tnumauthors\n')
    for word, authfreq in lexicon:
        f.write(word + '\t' + str(authfreq) + '\n')
In [146]:
authsums = dict()
for author, vocab in authorvocab.items():
    authsum = sum(vocab.values())
    authsums[author] = authsum
In [148]:
lexicon[0:10]
In [149]:
authorstops = dict()
ctr = 0
for word, authfreq in lexicon:
    ctr += 1
    if ctr % 100 == 1:
        print(ctr)
    vector = []
    authors = []
    for author, vocab in authorvocab.items():
        authsum = authsums[author]
        if authsum > 0:
            authors.append(author)
            vector.append(vocab[word] / authsum)
    vector = np.array(vector)
    variance = np.std(vector) ** 2
    mean = np.mean(vector)
    k = (mean ** 2) / variance   # gamma shape, by the method of moments
    theta = variance / mean      # gamma scale
    g = gamma(k, loc = 0., scale = theta)
    threshold = g.ppf(0.95)      # 95th-percentile relative frequency under the fitted gamma
    for auth, freq in zip(authors, vector):
        if freq > threshold:
            # dropping the word with this probability pulls the author back to the threshold
            stopprob = 1 - (threshold / freq)
            if stopprob > 0:
                if auth not in authorstops:
                    authorstops[auth] = []
                authorstops[auth].append((stopprob, word))
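As a sanity check on that method-of-moments fit (my own addition, with made-up shape and scale), synthetic gamma draws should roughly recover the parameters:
In [ ]:
rng = np.random.default_rng(0)
sample = rng.gamma(shape=2.0, scale=0.003, size=500)   # pretend per-author relative frequencies
m = sample.mean()
v = sample.std() ** 2
k_hat = m ** 2 / v    # should land near 2.0
theta_hat = v / m     # should land near 0.003
print(k_hat, theta_hat, gamma(k_hat, loc = 0., scale = theta_hat).ppf(0.95))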
In [153]:
with open('../dataprep/authorless.tsv', mode = 'w', encoding = 'utf-8') as f:
    f.write('author\tword\tstopprob\n')
    for author, tuplelist in authorstops.items():
        for stopprob, word in tuplelist:
            f.write(str(author) + '\t' + word + '\t' + str(stopprob) + '\n')
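This table is meant to be consumed at corpus-preparation time: each occurrence of a flagged word gets dropped with its author-specific stop probability before topic modeling. A minimal sketch of that step (downsample is a hypothetical helper of mine, not code from the paper):
In [ ]:
stoptable = pd.read_csv('../dataprep/authorless.tsv', sep = '\t')
stopprobs = {(r.author, r.word): r.stopprob for r in stoptable.itertuples()}

def downsample(author, tokens):
    # drop each token with its author-specific stop probability
    return [w for w in tokens if random.random() >= stopprobs.get((author, w), 0.0)]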
In [152]:
author
In [143]:
ct = 0
for auth, stoplist in authorstops.items():   # renamed from k, v so the gamma shape k isn't clobbered
    ct += 1
    if ct > 10:
        break
    print(auth, stoplist)
In [119]:
rv = gamma(k, loc = 0., scale = theta)   # gamma fitted to the last word examined above
fig, ax = plt.subplots()
x = np.linspace(0, rv.ppf(.999))
ax.plot(x, rv.cdf(x), 'r-', lw=5, alpha=0.6, label='gamma cdf')
plt.show()
In [122]:
threshold = gamma.ppf(0.94, a = k, loc = 0., scale = theta)
threshold
In [123]:
ctr = 0
for author, vocab in authorvocab.items():
    authsum = sum(vocab.values())
    if authsum > 0:
        hathfreq = vocab[word] / authsum
        if hathfreq > threshold:
            # print(author, hathfreq)
            ctr += 1
print(ctr, ctr / len(authors))   # share of authors above the 0.94 threshold for the last word examined
In [71]:
len(authors)
In [107]:
rv.ppf(.99)