This is my attempt to implement Thompson & Mimno 2018 ("Authorless Topic Models"): find words that are unusually frequent for particular authors and assign them author-specific stop probabilities.
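In outline, as the code below realizes it: for each word in the lexicon, fit a gamma distribution to the word's per-author relative frequencies by the method of moments, take that distribution's 95th percentile as a threshold q, and for every author whose observed frequency f exceeds q record

stopprob = 1 - (q / f)

so that dropping the word with probability stopprob pulls that author's frequency back down to roughly the threshold.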
In [13]:
import sys, csv, math, random
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
from scipy.stats import pearsonr
from scipy.stats import gamma
from collections import Counter
In [5]:
rv = gamma(3., loc = 0., scale = 2.)   # toy gamma (shape 3, scale 2), just to eyeball the distribution family
In [8]:
fig, ax = plt.subplots()
x = np.linspace(0, 20)
ax.plot(x, rv.pdf(x), 'r-', lw=5, alpha=0.6, label='gamma pdf')
plt.show()
In [9]:
meta = pd.read_csv('../../metadata/filtered_fiction_plus_18c.tsv', sep = '\t', index_col = 'docid')
meta = meta[~meta.index.duplicated(keep = 'first')]
In [124]:
# build a per-author Counter of character words from the two character tables
authorvocab = dict()
with open('/Users/tunder/data/character_table_18c19c.tsv', encoding = 'utf-8') as f:
    for line in f:
        fields = line.strip().split('\t')
        docid = fields[0]
        if docid not in meta.index:
            continue
        author = meta.loc[docid, 'author']
        if author not in authorvocab:
            authorvocab[author] = Counter()
        words = fields[5].split()
        for w in words:
            authorvocab[author][w] += 1
with open('/Users/tunder/data/character_table_post1900.tsv', encoding = 'utf-8') as f:
    for line in f:
        fields = line.strip().split('\t')
        docid = fields[0]
        if docid not in meta.index:
            continue
        author = meta.loc[docid, 'author']
        if author not in authorvocab:
            authorvocab[author] = Counter()
        words = fields[5].split()
        for w in words:
            authorvocab[author][w] += 1
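(A note on my assumption about the input format: each row of these character tables should have the docid in field 0 and a space-separated word list in field 5; the intermediate columns aren't used. The cell below just peeks at the first row to confirm.)
In [ ]:
with open('/Users/tunder/data/character_table_18c19c.tsv', encoding = 'utf-8') as f:
    first = next(f).strip().split('\t')
print(len(first), first[0], first[5][:60])   # expect a docid and the start of a word list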
In [128]:
allwords = Counter()
ctr = 0
for author, vocab in authorvocab.items():
    allwords = allwords + vocab   # Counter addition copies everything each time, so this pass is slow
    ctr += 1
    if ctr % 10 == 1:
        print(ctr)
lexicon = [x[0] for x in allwords.most_common()]   # token-frequency lexicon; replaced by the author-count version below
print(len(lexicon))
In [130]:
allwords = Counter()
ctr = 0
for author, vocab in authorvocab.items():
    ctr += 1
    if ctr % 100 == 1:
        print(ctr)
    for w in vocab.keys():
        allwords[w] += 1   # count distinct authors using each word
print(len(allwords))
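A toy check of what this loop computes (my own example): the counts are numbers of distinct authors using each word, not token frequencies.
In [ ]:
toy = {'A': Counter({'the': 10, 'whale': 3}), 'B': Counter({'the': 7})}
df = Counter()
for vocab in toy.values():
    for w in vocab:
        df[w] += 1
print(df)   # Counter({'the': 2, 'whale': 1})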
In [144]:
lexicon = allwords.most_common()
lexicon = [x for x in lexicon if x[1] > 20]   # keep words used by more than 20 authors
print(len(lexicon))
In [145]:
with open('../dataprep/ficlexicon.tsv', mode = 'w', encoding = 'utf-8') as f:
    f.write('word\tnumauthors\n')
    for word, authfreq in lexicon:
        f.write(word + '\t' + str(authfreq) + '\n')
In [146]:
authsums = dict()
for author, vocab in authorvocab.items():
    authsum = sum(vocab.values())
    authsums[author] = authsum
In [148]:
lexicon[0:10]
In [149]:
authorstops = dict()
ctr = 0
for word, authfreq in lexicon:
    ctr += 1
    if ctr % 100 == 1:
        print(ctr)
    vector = []
    authors = []
    for author, vocab in authorvocab.items():
        authsum = authsums[author]
        if authsum > 0:
            authors.append(author)
            vector.append(vocab[word] / authsum)
    vector = np.array(vector)
    variance = np.std(vector) ** 2
    mean = np.mean(vector)
    k = (mean ** 2) / variance   # gamma shape, by the method of moments
    theta = variance / mean      # gamma scale
    g = gamma(k, loc = 0., scale = theta)
    threshold = g.ppf(0.95)      # 95th-percentile relative frequency under the fitted gamma
    for auth, freq in zip(authors, vector):
        if freq > threshold:
            # dropping the word with this probability pulls the author back to the threshold
            stopprob = 1 - (threshold / freq)
            if stopprob > 0:
                if auth not in authorstops:
                    authorstops[auth] = []
                authorstops[auth].append((stopprob, word))
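As a sanity check on that method-of-moments fit (my own addition, with made-up shape and scale), synthetic gamma draws should roughly recover the parameters:
In [ ]:
rng = np.random.default_rng(0)
sample = rng.gamma(shape=2.0, scale=0.003, size=500)   # pretend per-author relative frequencies
m = sample.mean()
v = sample.std() ** 2
k_hat = m ** 2 / v    # should land near 2.0
theta_hat = v / m     # should land near 0.003
print(k_hat, theta_hat, gamma(k_hat, loc = 0., scale = theta_hat).ppf(0.95))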
In [153]:
with open('../dataprep/authorless.tsv', mode = 'w', encoding = 'utf-8') as f:
    f.write('author\tword\tstopprob\n')
    for author, tuplelist in authorstops.items():
        for stopprob, word in tuplelist:
            f.write(str(author) + '\t' + word + '\t' + str(stopprob) + '\n')
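This table is meant to be consumed at corpus-preparation time: each occurrence of a flagged word gets dropped with its author-specific stop probability before topic modeling. A minimal sketch of that step (downsample is a hypothetical helper of mine, not code from the paper):
In [ ]:
stoptable = pd.read_csv('../dataprep/authorless.tsv', sep = '\t')
stopprobs = {(r.author, r.word): r.stopprob for r in stoptable.itertuples()}

def downsample(author, tokens):
    # drop each token with its author-specific stop probability
    return [w for w in tokens if random.random() >= stopprobs.get((author, w), 0.0)]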
In [152]:
author
In [143]:
ct = 0
for auth, stoplist in authorstops.items():   # renamed from k, v so the gamma shape k isn't clobbered
    ct += 1
    if ct > 10:
        break
    print(auth, stoplist)
In [119]:
rv = gamma(k, loc = 0., scale = theta)   # gamma fitted to the last word examined above
fig, ax = plt.subplots()
x = np.linspace(0, rv.ppf(.999))
ax.plot(x, rv.cdf(x), 'r-', lw=5, alpha=0.6, label='gamma cdf')
plt.show()
In [122]:
threshold = gamma.ppf(0.94, a = k, loc = 0., scale = theta)
threshold
In [123]:
ctr = 0
for author, vocab in authorvocab.items():
    authsum = sum(vocab.values())
    if authsum > 0:
        hathfreq = vocab[word] / authsum
        if hathfreq > threshold:
            # print(author, hathfreq)
            ctr += 1
print(ctr, ctr / len(authors))   # share of authors above the 0.94 threshold for the last word examined
In [71]:
len(authors)
In [107]:
rv.ppf(.99)