In [13]:
from gensim import corpora, models
import gensim
In [8]:
import numpy as np
import random
import pickle
In [9]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
Train LDA models with different numbers of topics
In [ ]:
texts = pickle.load(open('pub_articles_cleaned_super.pkl','rb'))
# Hold out 1,000 documents for evaluation; the rest are used for training
random.seed(42)
train_set = random.sample(range(len(texts)), len(texts)-1000)
train_idx = set(train_set)  # set lookup avoids an O(n^2) membership scan
test_set = [x for x in range(len(texts)) if x not in train_idx]
train_texts = [texts[i] for i in train_set]
test_texts = [texts[i] for i in test_set]
pickle.dump([train_set,test_set,train_texts,test_texts],open('pub_articles_train_test_sets.pkl','wb'))
In [ ]:
topicnums = [1,5,10,15,20,30,40,50,60,70,80,90,100]
dictionary = corpora.Dictionary(train_texts)
corpus = [dictionary.doc2bow(text) for text in train_texts]
ldamodels_bow = {}
for i in topicnums:
    np.random.seed(42)  # gensim's LdaModel draws from numpy's RNG, so seed numpy rather than the random module
    %time ldamodels_bow[i] = models.ldamodel.LdaModel(corpus, num_topics=i, id2word=dictionary)
    ldamodels_bow[i].save('ldamodels_bow_'+str(i)+'.lda')
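As a reminder of the bag-of-words format used above, doc2bow maps a token list to sparse (token_id, count) pairs; a toy illustration with made-up tokens:
In [ ]:
toy_dict = corpora.Dictionary([['topic', 'model', 'topic']])
toy_dict.doc2bow(['topic', 'model', 'topic'])   # e.g. [(0, 1), (1, 2)] -- one (id, count) pair per distinct token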
Evaluate on 1,000 documents not used in LDA training
In [ ]:
# http://radimrehurek.com/topic_modeling_tutorial/2%20-%20Topic%20Modeling.html
def intra_inter(lda_model, dictionary, test_docs, num_pairs=10000):
    # Split each test document into two halves and infer topics for each half
    part1 = [lda_model[dictionary.doc2bow(tokens[:len(tokens)//2])] for tokens in test_docs]
    part2 = [lda_model[dictionary.doc2bow(tokens[len(tokens)//2:])] for tokens in test_docs]
    # Average cosine similarity between corresponding halves (higher is better)
    corresp_parts = np.mean([gensim.matutils.cossim(p1, p2) for p1, p2 in zip(part1, part2)])
    # Average cosine similarity between random halves (lower is better)
    np.random.seed(42)  # np.random.randint draws from numpy's RNG, so seed numpy, not the random module
    random_pairs = np.random.randint(0, len(test_docs), size=(num_pairs, 2))
    random_parts = np.mean([gensim.matutils.cossim(part1[i[0]], part2[i[1]]) for i in random_pairs])
    return corresp_parts, random_parts
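For reference, gensim.matutils.cossim works on exactly these sparse (id, weight) lists; a minimal sketch with made-up topic weights:
In [ ]:
v1 = [(0, 0.8), (1, 0.2)]   # hypothetical topic distribution of one half-document
v2 = [(0, 0.6), (2, 0.4)]   # hypothetical distribution of an unrelated half
gensim.matutils.cossim(v1, v2)   # cosine similarity: dot product / product of norms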
In [ ]:
ldamodels_eval = {}
for i in topicnums:
    lda_model = models.ldamodel.LdaModel.load('ldamodels_bow_'+str(i)+'.lda')
    ldamodels_eval[i] = intra_inter(lda_model, dictionary, test_texts)
pickle.dump(ldamodels_eval,open('pub_ldamodels_eval.pkl','wb'))
In [10]:
topicnums = [1,5,10,15,20,30,40,50,60,70,80,90,100]
ldamodels_eval = pickle.load(open('pub_ldamodels_eval.pkl','rb'))
corresp_parts = [ldamodels_eval[i][0] for i in topicnums]
random_parts = [ldamodels_eval[i][1] for i in topicnums]
with sns.axes_style("whitegrid"):
    x = topicnums
    y1 = corresp_parts
    y2 = random_parts
    plt.plot(x, y1, label='Corresponding parts')
    plt.plot(x, y2, label='Random parts')
    plt.ylim([0.0, 1.0])
    plt.xlabel('Number of topics')
    plt.ylabel('Average cosine similarity')
    plt.legend()
    plt.show()
Measure overlap between topic vectors from different numbers of topics
In [ ]:
topicnums = [1,5,10,15,20,30,40,50,60,70,80,90,100]
lda_topics = {}
for i in topicnums:
    lda_model = models.ldamodel.LdaModel.load('ldamodels_bow_'+str(i)+'.lda')
    lda_topics_string = lda_model.show_topics(i)
    # Strip the "weight*word + ..." formatting, keeping only the words of each topic
    lda_topics[i] = ["".join([c if c.isalpha() else " " for c in topic[1]]).split() for topic in lda_topics_string]
pickle.dump(lda_topics,open('pub_lda_topics.pkl','wb'))
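The string munging above recovers each topic's top words from the formatted strings; show_topic returns them as (word, probability) pairs directly in recent gensim versions (older releases reverse the tuple order), so an equivalent sketch would be:
In [ ]:
# Sketch (assumes a recent gensim): top-10 words per topic without string parsing
lda_model = models.ldamodel.LdaModel.load('ldamodels_bow_10.lda')
topics_words = [[word for word, prob in lda_model.show_topic(t, topn=10)] for t in range(10)]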
In [ ]:
# http://billchambers.me/tutorials/2014/12/21/tf-idf-explained-in-python.html
def jaccard_similarity(query, document):
    intersection = set(query).intersection(set(document))
    union = set(query).union(set(document))
    return len(intersection)/len(union)
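For example, two topics that share two of four distinct words score 0.5:
In [ ]:
jaccard_similarity(['cat', 'dog', 'fish'], ['dog', 'fish', 'bird'])   # |{dog, fish}| / |{cat, dog, fish, bird}| = 0.5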
In [ ]:
lda_stability = {}
for i in range(len(topicnums)-1):
    # Pairwise Jaccard similarities between topics at consecutive topic-number settings
    jacc_sims = []
    for topic1 in lda_topics[topicnums[i]]:
        sims = [jaccard_similarity(topic1, topic2) for topic2 in lda_topics[topicnums[i+1]]]
        jacc_sims.append(sims)
    lda_stability[topicnums[i]] = jacc_sims
pickle.dump(lda_stability,open('pub_lda_stability.pkl','wb'))
In [11]:
topicnums = [1,5,10,15,20,30,40,50,60,70,80,90,100]
lda_stability = pickle.load(open('pub_lda_stability.pkl','rb'))
mean_stability = [np.array(lda_stability[i]).mean() for i in topicnums[:-1]]
with sns.axes_style("whitegrid"):
    x = topicnums[:-1]
    y = mean_stability
    plt.plot(x, y, label='Mean overlap')
    plt.ylim([0.0, 1.0])
    plt.xlabel('Number of topics')
    plt.ylabel('Average Jaccard similarity')
    plt.show()
In [ ]:
# Based on the similarity and stability plots above, settle on 20 topics
num_topics = 20
lda_model = models.ldamodel.LdaModel.load('ldamodels_bow_'+str(num_topics)+'.lda')
In [14]:
lda_topics = lda_model.show_topics(num_topics)
lda_topics_words = ["".join([c if c.isalpha() else " " for c in topic[1]]).split() for topic in lda_topics]
lda_topics_disp = [("topic "+str(i)+": ")+" ".join(topic) for i,topic in enumerate(lda_topics_words)]
lda_topics_disp
Out[14]:
Get topic distributions / probabilities for each article
In [ ]:
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
import psycopg2
In [ ]:
# Read database credentials (username on line 1, password on line 2) from a local file
with open("bubble_popper_postgres.txt","r") as myfile:
    lines = [line.rstrip("\n") for line in myfile]
db, us, pw = 'bubble_popper', lines[0], lines[1]
engine = create_engine('postgresql://%s:%s@localhost:5432/%s' % (us, pw, db))
connstr = "dbname='%s' user='%s' host='localhost' password='%s'" % (db, us, pw)
conn = psycopg2.connect(connstr)
In [ ]:
articles = pickle.load(open('pub_articles_trimmed.pkl','rb'))         # Article full content and other data
documents = pickle.load(open('pub_articles_cleaned_super.pkl','rb'))  # Article preprocessed bags of words

# Infer each document's topic distribution; reuse the training dictionary from
# above so token ids match the ones the model was trained on (a fresh
# per-document Dictionary would assign incompatible ids)
doc_probs = []
for doc in documents:
    doc_corp = dictionary.doc2bow(doc)
    doc_probs.append(lda_model[doc_corp])

articles = articles.drop(['content','title','url'], axis=1)
for i in range(num_topics):
    articles['topic'+str(i)] = 0.0
indices = articles.index.values.tolist()
for i, doc in zip(indices, doc_probs):
    for topic_id, prob in doc:
        articles.at[i, 'topic'+str(topic_id)] = prob   # .at replaces the deprecated set_value

pickle.dump(articles,open('pub_probabs_topic'+str(num_topics)+'.pkl','wb'))
articles.to_sql('article_data', engine, if_exists='replace')
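As a sanity check, the uploaded table can be read back through the same engine (a minimal sketch; assumes the article_data table written above):
In [ ]:
import pandas as pd
# Pull back a few rows to confirm the per-topic probability columns landed in Postgres
pd.read_sql_query('SELECT * FROM article_data LIMIT 5', engine)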