Implementing authorless topic models

This notebook is my attempt to implement Thompson & Mimno 2018 ("Authorless Topic Models"): find words that individual authors use far more often than the corpus norm, and give each such author-word pair a probability of being dropped ("stopped") before topic modeling, so that topics are not dominated by the vocabulary quirks of single authors.
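
In outline: build per-author word counts from the character tables, then, for each word in a shared lexicon, fit a gamma distribution to its per-author relative frequencies and flag authors whose usage falls above the 95th percentile. A minimal self-contained sketch of that core step, matching the loop in In [149] below (the function name and packaging are mine, not part of the original):

import numpy as np
from scipy.stats import gamma

def stop_probabilities(freqs, quantile = 0.95):
    """Per-author stop probabilities for one word, given its per-author relative frequencies.

    Fit a gamma distribution by the method of moments (k = mean^2/var, theta = var/mean),
    then give any author whose frequency exceeds the chosen quantile a probability of
    dropping the word that would, in expectation, pull their usage back to that threshold.
    """
    freqs = np.asarray(freqs, dtype = float)
    mean, var = freqs.mean(), freqs.var()
    if mean == 0 or var == 0:                 # degenerate word: nothing to downsample
        return np.zeros_like(freqs)
    k, theta = mean ** 2 / var, var / mean
    threshold = gamma(k, loc = 0., scale = theta).ppf(quantile)
    stopprobs = np.zeros_like(freqs)
    over = freqs > threshold
    stopprobs[over] = 1 - threshold / freqs[over]
    return stopprobs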


In [13]:
import sys, csv, math, random
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
from scipy.stats import pearsonr
from scipy.stats import gamma
from collections import Counter

In [5]:
rv = gamma(3., loc = 0., scale = 2.)

In [8]:
fig, ax = plt.subplots()
x = np.linspace(0, 20)
ax.plot(x, rv.pdf(x), 'r-', lw=5, alpha=0.6, label='gamma pdf')
ax.legend()
plt.show()



In [9]:
meta = pd.read_csv('../../metadata/filtered_fiction_plus_18c.tsv', sep = '\t', index_col = 'docid')
meta = meta[~meta.index.duplicated(keep = 'first')]

In [124]:
authorvocab = dict()

# Both character tables have the document id in column 0 and a space-separated
# token list in column 5; accumulate a Counter of words for each author.
for path in ['/Users/tunder/data/character_table_18c19c.tsv',
             '/Users/tunder/data/character_table_post1900.tsv']:
    with open(path, encoding = 'utf-8') as f:
        for line in f:
            fields = line.strip().split('\t')
            docid = fields[0]
            if docid not in meta.index:
                continue
            author = meta.loc[docid, 'author']
            if author not in authorvocab:
                authorvocab[author] = Counter()
            for w in fields[5].split():
                authorvocab[author][w] += 1

In [128]:
allwords = Counter()
ctr = 0
for author, vocab in authorvocab.items():
    allwords = allwords + vocab
    ctr += 1
    if ctr % 10 == 1:
        print(ctr)
lexicon = [x[0] for x in allwords.most_common()]
print(len(lexicon))


1
11
21
31
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-128-ff67214ebaf3> in <module>()
      2 ctr = 0
      3 for author, vocab in authorvocab.items():
----> 4     allwords = allwords + vocab
      5     ctr += 1
      6     if ctr % 10 == 1:

/Users/tunder/miniconda3/lib/python3.5/collections/__init__.py in __add__(self, other)
    697             newcount = count + other[elem]
    698             if newcount > 0:
--> 699                 result[elem] = newcount
    700         for elem, count in other.items():
    701             if elem not in self and count > 0:

KeyboardInterrupt: 

Adding Counters together rebuilds the result from scratch on every pass, which is far too slow for tens of thousands of authors. The cell below takes a faster (and subtly different) tack: instead of summing token counts, it simply counts how many authors use each word.

In [130]:
allwords = Counter()
ctr = 0
for author, vocab in authorvocab.items():
    ctr += 1
    if ctr % 100 == 1:
        print(ctr)
    for w in vocab.keys():
        allwords[w] += 1          # one count per author that uses w, not per token
print(len(allwords))


[progress output elided: the counter printed every 100 authors, up to 39901]
2163783

In [144]:
lexicon = allwords.most_common()
lexicon = [x for x in lexicon if x[1] > 20]    # keep words used by more than 20 authors
print(len(lexicon))


144689

In [145]:
with open('../dataprep/ficlexicon.tsv', mode = 'w', encoding = 'utf-8') as f:
    f.write('word\tnumauthors\n')
    for word, authfreq in lexicon:
        f.write(word + '\t' + str(authfreq) + '\n')

In [146]:
authsums = dict()
for author, vocab in authorvocab.items():
    authsum = sum(vocab.values())
    authsums[author] = authsum

In [148]:
lexicon[0:10]


Out[148]:
[('said', 39089),
 ('had', 38899),
 ('was', 38623),
 ('went', 37809),
 ('took', 37772),
 ('came', 37759),
 ('eyes', 37644),
 ('head', 37536),
 ('made', 37459),
 ('face', 37257)]
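
(Note that these counts are numbers of authors who use each word, not token totals.) The next cell is the heart of the method: for each word, fit a gamma distribution to its per-author relative frequencies by matching moments (k = mean^2/variance, theta = variance/mean, so that k*theta and k*theta^2 reproduce the sample mean and variance), take the 95th percentile of the fit as a threshold, and give any author above it a stop probability of 1 - threshold/frequency. A quick toy check of the moment matching (made-up numbers, illustrative only):

import numpy as np
from scipy.stats import gamma

toyfreqs = np.array([0.0, 0.0, 0.001, 0.002, 0.0005, 0.03])   # hypothetical per-author frequencies
m, v = toyfreqs.mean(), toyfreqs.var()
fitted = gamma(m ** 2 / v, loc = 0., scale = v / m)
print(fitted.mean(), m)      # the two means agree
print(fitted.var(), v)       # and so do the variances
print(fitted.ppf(0.95))      # the 95th-percentile threshold the loop below would use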

In [149]:
authorstops = dict()
ctr = 0

for word, authfreq in lexicon:
    ctr += 1
    if ctr % 100 == 1:
        print(ctr)

    # relative frequency of this word for every author who has any tokens at all
    vector = []
    authors = []
    for author, vocab in authorvocab.items():
        authsum = authsums[author]
        if authsum > 0:
            authors.append(author)
            vector.append(vocab[word] / authsum)
    vector = np.array(vector)

    variance = np.var(vector)
    mean = np.mean(vector)
    if mean == 0 or variance == 0:
        continue                      # degenerate word: nothing to flag

    # method-of-moments gamma fit: k * theta = mean, k * theta^2 = variance
    k = (mean ** 2) / variance
    theta = variance / mean
    g = gamma(k, loc = 0., scale = theta)
    threshold = g.ppf(0.95)

    # an author above the 95th percentile gets a stop probability that would,
    # in expectation, downsample their usage of the word back to the threshold
    for auth, freq in zip(authors, vector):
        if freq > threshold:
            stopprob = 1 - (threshold / freq)
            if stopprob > 0:
                if auth not in authorstops:
                    authorstops[auth] = []
                authorstops[auth].append((stopprob, word))


[progress output elided: the counter printed every 100 words, up to 144601]

In [153]:
with open('../dataprep/authorless.tsv', mode = 'w', encoding = 'utf-8') as f:
    f.write('author\tword\tstopprob\n')
    for author, tuplelist in authorstops.items():
        for stopprob, word in tuplelist:
            # str() because a few books lack an author in the metadata, so author can be NaN (see next cell)
            f.write(str(author) + '\t' + word + '\t' + str(stopprob) + '\n')
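
That table only records the probabilities; actually using them means downsampling each author's texts before topic modeling. A rough sketch of that downstream step (hypothetical helper names, not implemented in this notebook):

import csv, random

def load_stopprobs(path = '../dataprep/authorless.tsv'):
    """Read the author -> {word: stopprob} table written above."""
    table = dict()
    with open(path, encoding = 'utf-8') as f:
        reader = csv.DictReader(f, delimiter = '\t')
        for row in reader:
            table.setdefault(row['author'], dict())[row['word']] = float(row['stopprob'])
    return table

def downsample(tokens, author, table):
    """Drop each token with its author-specific stop probability, which in
    expectation pulls the author's usage of flagged words back to the threshold."""
    probs = table.get(author, {})
    return [t for t in tokens if random.random() >= probs.get(t, 0.0)]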

In [152]:
author


Out[152]:
nan

In [143]:
ct = 0
for k, v in authorstops.items():
    ct += 1
    if ct > 10:
        break
    print(k, v)


Desai, Boman. [(0.25812059078187333, 'head'), (0.36124520869418231, 'spoke'), (0.50755219098273274, 'smiled'), (0.48517158923900439, 'laughed')]
Telscombe, Anne. [(0.17834276121710135, 'had'), (0.1086797513108867, 'found'), (0.20616687730415972, 'heard'), (0.042133055897579208, 'began'), (0.038352625613586877, 'tried'), (0.34091142637881378, 'seemed')]
Dunne, Pete, [(0.2416786566184429, 'seemed')]
White, Rhoda Elizabeth Waterman. [(0.18660491535557633, 'heart')]
(Ira Louis), Reeves, Ira L. [(0.16919331565533602, 'turned'), (0.025528334218409277, 'arms'), (0.11961160162823903, 'seen'), (0.1583436599149296, 'name'), (0.53305207028913459, 'started'), (0.15929981725992315, 'wife'), (0.11369323706187862, 'returned'), (0.10866951786498835, 'man'), (0.20785453793872999, 'get'), (0.29537418719153619, 'reached')]
Lintz, Gertrude Davies. [(0.13405276694877699, 'had'), (0.067256592505715362, 'was'), (0.21479634621471644, 'life'), (0.19344237466209036, 'kept'), (0.1502592073336626, 'take'), (0.20000333095951028, 'get')]
Fuller, Thomas, [(0.26492627830903515, 'father'), (0.57668969671890502, 'wife'), (0.53882058346744843, 'heart'), (0.42581273007326126, 'make')]
Sarton, May, [(0.023895190186413395, 'thought'), (0.26870174044803619, 'felt')]
Nguỹên, Huy Tửơng, [(0.026825034065757425, 'turned'), (0.3694747324570915, 'arm'), (0.25142904144505951, 'said-take'), (0.040723154771953651, 'body')]
Harry, Myriam. [(0.47778253022100692, 'father'), (0.21879830374293674, 'mother'), (0.083722135104521089, 'arms')]

(The remaining cells are scratch work from an earlier single-word exploration of the gamma fit; k, theta, word, and authors here are leftovers from that exploration.)

In [119]:
rv = gamma(k, loc = 0., scale = theta)
fig, ax = plt.subplots()
x = np.linspace(0, rv.ppf(.999))
ax.plot(x, rv.cdf(x), 'r-', lw=5, alpha=0.6, label='gamma cdf')
ax.legend()
plt.show()



In [122]:
threshold = gamma.ppf(0.94, a = k, loc = 0., scale = theta)
threshold


Out[122]:
0.0044703025067474146

In [123]:
ctr = 0
for author, vocab in authorvocab.items():
    authsum = sum(vocab.values())
    if authsum > 0:
        hathfreq = vocab[word] / authsum
        if hathfreq > threshold:
            # print(author, hathfreq)
            ctr += 1
print(ctr, ctr / len(authors))


233 0.040346320346320345

In [71]:
len(authors)


Out[71]:
5775

In [107]:
rv.ppf(.99)


Out[107]:
0.0058971533739358376

In [ ]: