In [1]:
import numpy
import scipy
import pandas
import spacy
import textacy
In [2]:
# Plotting setup: render matplotlib figures inline in the notebook.
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
# Seaborn theme applied to all subsequent plots.
sns.set(style="whitegrid", color_codes=True)
In [3]:
# Load the preprocessed textacy Corpus of filed NCGA bills.
# NOTE(review): hardcoded absolute local path -- breaks on any other machine;
# consider a configurable DATA_DIR. Corpus provenance/version not recorded here.
corpus = textacy.Corpus.load(path='/home/immersinn/gits/ncga/data/processed/CORPUS_bills_filed_pipe01/',
                             name='CORPUS_bills_filed_pipe01',
                             compression='gzip')
In [4]:
# Lazy per-document stream of unigram + named-entity terms (as strings).
terms_lists = (doc.to_terms_list(ngrams=1, named_entities=True, as_strings=True) for doc in corpus)
# Vectorize into a tf-idf weighted document-term matrix, keeping at most
# 1000 terms that appear in at least 3 docs and at most 95% of docs.
doc_term_matrix, id2term = textacy.vsm.doc_term_matrix(\
    terms_lists,
    weighting='tfidf', normalize=True, smooth_idf=True,
    min_df=3, max_df=0.95, max_n_terms=1000)
# Display the (sparse) matrix repr as the cell output.
doc_term_matrix
Out[4]:
In [7]:
# Fit an initial 4-topic LDA model on the doc-term matrix.
# NOTE(review): LDA is conventionally fit on raw term counts rather than
# tf-idf weights -- confirm this choice against the textacy/sklearn docs.
n_topics = 4
model = textacy.tm.TopicModel('lda', n_topics=n_topics)
model.fit(doc_term_matrix)
In [8]:
doc_topic_matrix = model.transform(doc_term_matrix)
In [9]:
doc_topic_matrix.shape
Out[9]:
In [10]:
# Rows sum to unity...
doc_topic_matrix[:10,:]
Out[10]:
In [11]:
# Summary statistics over all doc-topic weights, flattened to one vector.
flat_topic_weights = doc_topic_matrix.ravel()
pandas.Series(flat_topic_weights).describe(percentiles=[0.75, 0.80, 0.90, 0.95, 0.975, 0.99])
Out[11]:
In [12]:
# Per-topic count of documents with weight > 0.1. The vectorized numpy
# column-sum replaces the Python builtin sum(), which iterates over the
# array's rows in Python -- slower and ambiguous about which axis it reduces.
(doc_topic_matrix > 0.1).sum(axis=0)
Out[12]:
In [13]:
# Build a table with one row per topic: its top 15 terms keyed by rank
# (0 = highest-weighted term).
top_term_rows = []
topic_labels = []
for topic_idx, top_terms in model.top_topic_terms(id2term, top_n=15, topics=range(n_topics)):
    topic_labels.append('topic ' + str(topic_idx))
    top_term_rows.append(dict(enumerate(top_terms)))
top_term_table = pandas.DataFrame(data=top_term_rows, index=topic_labels)
In [14]:
top_term_table.transpose()
Out[14]:
In [16]:
model.model.components_.shape
Out[16]:
In [17]:
model.model.components_[:,:4]
Out[17]:
In [118]:
# Calculate correlations between Topics in a given Model via the Topic - Term Weighting Vectors
def calc_p_topic_given_word(model, doc_topic_matrix, n_topics, dtm=None):
    """Estimate p(topic | word) for every term in the model's vocabulary.

    Bayes' rule: p(t|w) = p(w|t) * p(t) / p(w), where
      * p(w|t) comes from the model's topic-term components,
      * p(t) is estimated from the corpus doc-topic weights,
      * p(w) is estimated from the document-term matrix.

    Parameters
    ----------
    model : textacy.tm.TopicModel
        Fitted topic model; ``model.model.components_`` must be an
        (n_topics, n_terms) array of topic-term weights.
    doc_topic_matrix : numpy.ndarray
        (n_docs, n_topics) doc-topic weights; rows sum to ~1.
    n_topics : int
        Number of topics in the model.
    dtm : array-like, optional
        Document-term matrix used to estimate p(w). Defaults to the
        notebook-global ``doc_term_matrix`` for backward compatibility;
        pass it explicitly to remove the hidden global dependency.

    Returns
    -------
    numpy.ndarray of shape (n_topics, n_terms); each column sums to 1.
    """
    if dtm is None:
        # Backward-compatible fallback to the notebook-global matrix.
        dtm = doc_term_matrix
    # 01: Normalize topic-term weights into a distribution --> p(w|t)
    p_word_given_topic = model.model.components_.copy()
    p_word_given_topic = p_word_given_topic / p_word_given_topic.sum(axis=1).reshape((n_topics, 1))
    # 02: Estimate p(t) from the corpus. doc_topic_matrix has one row per
    # document, so its row count replaces the original's hidden dependency
    # on the notebook-global `corpus` (same value by construction).
    n_docs = doc_topic_matrix.shape[0]
    p_topics = (doc_topic_matrix.sum(axis=0) / n_docs).reshape((n_topics, 1))
    p_topics = p_topics + (1 - p_topics.sum()) / n_topics  # float round-off correction
    # 03: Estimate p(w) from term frequencies
    p_words = dtm.sum(axis=0) / dtm.sum()
    p_words = p_words + (1 - p_words.sum()) / dtm.shape[1]  # float round-off correction
    # 04: Bayes' rule, then force each column (term) to sum to exactly 1
    p_topic_given_word = p_word_given_topic * p_topics / p_words
    p_topic_given_word = p_topic_given_word + (1 - p_topic_given_word.sum(axis=0)) / n_topics
    return p_topic_given_word
def topic_cosdists(model, doc_topic_matrix, n_topics):
    """Pairwise cosine distances between topics via their p(t|w) profiles.

    Returns an (n_topics, n_topics) symmetric matrix where entry (i, j)
    is the cosine distance between topic i's and topic j's p(t|w) vectors.
    """
    # `import scipy` alone does NOT make scipy.spatial available; the
    # original only worked if another library had imported the submodule
    # as a side effect. Import it explicitly.
    from scipy.spatial.distance import pdist, squareform

    ptw = calc_p_topic_given_word(model, doc_topic_matrix, n_topics)
    # pdist operates over rows: one p(t|w) profile per topic.
    return squareform(pdist(ptw, 'cosine'))
def average_topic_distances(topic_distances, n_topics):
    """Average pairwise topic distance from a square distance matrix.

    Takes the strictly-lower-triangle entries of the symmetric
    (n_topics, n_topics) matrix and averages over the n*(n-1)/2
    distinct topic pairs.

    Raises
    ------
    ValueError
        If ``topic_distances`` is not (n_topics, n_topics).
    """
    # Fixes vs. original: (1) it indexed the notebook-global `corre_topics`
    # instead of the `topic_distances` argument; (2) a wrong shape[0] passed
    # silently because only shape[1] was validated; (3) scipy.tril_indices_from
    # was a deprecated alias of the numpy function, removed in modern scipy.
    if topic_distances.shape != (n_topics, n_topics):
        raise ValueError('topic_distances must have shape (n_topics, n_topics)')
    pair_dists = topic_distances[numpy.tril_indices_from(topic_distances, -1)]
    return pair_dists.sum() / (n_topics * (n_topics - 1) / 2)
In [109]:
ptw = calc_p_topic_given_word(model, doc_topic_matrix, n_topics)
In [113]:
corre_topics = topic_cosdists(model, doc_topic_matrix, n_topics)
corre_topics
Out[113]:
In [115]:
ave_dis = average_topic_distances(corre_topics, n_topics)
ave_dis
Out[115]:
In [141]:
# Sweep over model sizes: fit an LDA model for each n_topics in 3..30 (step 3)
# and record the average pairwise topic distance as a coarse separation metric.
# NOTE(review): this loop rebinds the notebook globals `n_topics`, `model`,
# and `doc_topic_matrix` that earlier cells used -- those cells' outputs are
# stale once this runs.
models = {}          # n_topics -> fitted TopicModel
models_dtms = {}     # n_topics -> doc-topic matrix
models_ave_dis = []  # records of {'n_topics', 'ave_dis'}
for n_topics in range(3,31,3):
    print('Fitting model for {} topics...'.format(n_topics))
    # Fit model
    model = textacy.tm.TopicModel('lda', n_topics=n_topics)
    model.fit(doc_term_matrix)
    doc_topic_matrix = model.transform(doc_term_matrix)
    # Calculate avg cosine similarity data
    corre_topics = topic_cosdists(model, doc_topic_matrix, n_topics)
    ave_dis = average_topic_distances(corre_topics, n_topics)
    # Store data
    models[n_topics] = model
    models_dtms[n_topics] = doc_topic_matrix
    models_ave_dis.append({'n_topics' : n_topics, 'ave_dis' : ave_dis})
In [151]:
# Tabulate the sweep results. The original assigned column names positionally
# (`.columns = ['avg_dist', 'n_topics']`), which silently mislabels the data
# whenever pandas' column ordering differs (dict-key ordering changed across
# pandas versions); renaming by name is order-independent.
models_ave_dis = pandas.DataFrame(models_ave_dis)
models_ave_dis = models_ave_dis.rename(columns={'ave_dis': 'avg_dist'})
models_ave_dis = models_ave_dis.sort_values(by='n_topics').reset_index(drop=True)
models_ave_dis
Out[151]:
In [155]:
# Average topic distance vs. model size. The original bare plt.plot left the
# figure unlabeled; use the explicit axes interface and label it so the
# figure stands alone when the notebook is skimmed.
fig, ax = plt.subplots()
ax.plot(models_ave_dis.n_topics, models_ave_dis.avg_dist, marker='o')
ax.set(xlabel='n_topics', ylabel='average pairwise topic cosine distance',
       title='Topic separation vs. number of topics');
Out[155]:
In [169]:
from IPython.display import Image
Do we need to build models with holdout data / cross-validation / etc. in order to estimate this metric accurately (see paper and below)?
In [170]:
Image(filename='images/perplexity_def.png')
Out[170]:
In [162]:
# Perplexity of each fitted model, evaluated on the SAME doc-term matrix it
# was trained on (no holdout). Lower is better, but training-set perplexity
# tends to keep improving with more topics -- see the question raised above
# about whether holdout / cross-validation is needed for a fair comparison.
perplexity = [{'n_topics' : key,
               'perp' : models[key].model.perplexity(doc_term_matrix)} \
              for key in models.keys()]
In [163]:
# Perplexity records -> sorted DataFrame. Chained construction instead of
# inplace=True mutation; the resulting frame is identical.
perplexity = pandas.DataFrame(perplexity).sort_values(by='n_topics')
In [168]:
plt.plot(perplexity.n_topics, perplexity.perp)
Out[168]:
In [165]:
perplexity
Out[165]:
In [ ]: