In [1]:
import os
import re
import operator
import matplotlib.pyplot as plt
import warnings
import gensim
import numpy as np
warnings.filterwarnings('ignore') # Let's not pay heed to them right now
from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.corpora import Dictionary
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
%matplotlib inline
In [2]:
from gensim.utils import lemmatize
from nltk.corpus import stopwords
In [8]:
import pandas as pd
import sqlalchemy
from sqlalchemy.dialects import postgresql as psql
from sqlalchemy import Column, Integer, String, DATE
from sqlalchemy.ext.declarative import declarative_base
Base = declarative_base()
class Document(Base):
    __tablename__ = 'document'

    id = Column(Integer, primary_key=True)
    title = Column(psql.TEXT)
    date = Column(DATE)
    doctype = Column(psql.TEXT)
    docnum = Column(psql.TEXT)
    subject = Column(psql.TEXT)
    body = Column(psql.TEXT)
    sign = Column(psql.TEXT)
    signtitle = Column(psql.TEXT)
    images = Column(psql.JSONB)
    raw_json = Column(psql.JSONB)

    def __repr__(self):
        return self.title
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
engine = create_engine('postgresql://dev:dev@localhost/dev')
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()
query = 'SELECT body FROM Document'
df = pd.read_sql_query(query, engine)
df.body = df.body.str.replace('\n', ' ').str.replace('\t', ' ')
df.to_csv('consolidated.csv', index=False, encoding='utf-8')
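The cell above builds an ORM session but exports through the raw engine; as a sanity check, the same rows could come through the session itself. A minimal sketch, assuming the `document` table is already populated (the `df_orm` name is hypothetical):
In [ ]:
# Hypothetical equivalent of the raw-SQL export above, via the ORM session.
# query(...).statement hands SQLAlchemy's compiled SELECT to pandas.
df_orm = pd.read_sql(session.query(Document.body).statement, session.bind)
assert len(df_orm) == len(df)  # both paths should return the same rows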
In [9]:
def build_texts(fname):
    """
    Generator to build tokenized texts from a file.

    Parameters:
    ----------
    fname: file to be read

    Yields:
    -------
    preprocessed, tokenized line
    """
    with open(fname) as f:
        for line in f:
            yield gensim.utils.simple_preprocess(line, deacc=True, min_len=3)
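A quick illustration of what these `simple_preprocess` settings do to one line; the output shown is indicative: `deacc=True` strips accents, `min_len=3` drops short tokens, and gensim's alphabetic tokenizer discards the digits.
In [ ]:
# Hypothetical one-line check of the preprocessing settings used above.
gensim.utils.simple_preprocess('Departmént Circular No. 2016-0034', deacc=True, min_len=3)
# -> ['department', 'circular']  ('no' is too short; digits are dropped)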
In [10]:
stops = set(stopwords.words('english')) # nltk stopwords list
In [11]:
def process_texts(texts):
    """
    Function to process texts. Steps taken:

    1. Stopword removal.
    2. Lemmatization (not stemming, since stemming can reduce interpretability).

    Parameters:
    ----------
    texts: tokenized texts

    Returns:
    -------
    texts: pre-processed tokenized texts
    """
    texts = [[word for word in line if word not in stops] for line in texts]
    # Collocation detection could be added here, e.g. texts = [bigram[line] for line in texts]
    # gensim.utils.lemmatize returns utf-8 encoded tokens of the form b'word/NN',
    # so decode and strip the POS tag, keeping nouns only.
    texts = [[word.decode('utf-8').split('/')[0]
              for word in lemmatize(' '.join(line),
                                    allowed_tags=re.compile('(NN)'),
                                    min_length=3)]
             for line in texts]
    return texts
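Note that `gensim.utils.lemmatize` depends on the optional `pattern` package and was removed in gensim 4.x. A rough drop-in for that environment, using NLTK's POS tagger plus WordNetLemmatizer to keep nouns only (a hypothetical replacement, not the code used in this run):
In [ ]:
# Hypothetical noun-only lemmatizer for gensim >= 4.0, where utils.lemmatize is gone.
# Requires: nltk.download('wordnet'); nltk.download('averaged_perceptron_tagger')
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

def lemmatize_nouns(tokens, min_length=3):
    """Keep tokens POS-tagged as nouns (NN*), lemmatized, length >= min_length."""
    return [wnl.lemmatize(word, pos='n')
            for word, tag in pos_tag(tokens)
            if tag.startswith('NN') and len(word) >= min_length]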
In [13]:
train_texts = list(build_texts('consolidated.csv'))
In [14]:
train_texts = process_texts(train_texts)
In [15]:
len(train_texts)
Out[15]:
In [16]:
train_texts_sklearn = [" ".join(ls) for ls in train_texts]
In [17]:
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                stop_words=list(stops),  # sklearn expects a list, not a set
                                lowercase=True,
                                max_df=0.5,
                                min_df=0.05,
                                ngram_range=(1, 4))
dtm_tf = tf_vectorizer.fit_transform(train_texts_sklearn)
print(dtm_tf.shape)
In [18]:
vocabulary_gensim = {}
vocab = Dictionary()
for key, val in tf_vectorizer.vocabulary_.items():
    vocabulary_gensim[val] = key
_ = vocab.merge_with(vocabulary_gensim)
corpus_vect_gensim = gensim.matutils.Sparse2Corpus(dtm_tf, documents_columns=False)
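A quick round-trip check that the sklearn document-term matrix and the merged gensim dictionary line up (a sketch; the exact terms printed depend on the fitted vocabulary):
In [ ]:
# Peek at the first document in bag-of-words form to confirm ids map to real terms.
first_doc = next(iter(corpus_vect_gensim))
print([(vocab[term_id], count) for term_id, count in first_doc[:10]])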
In [12]:
def evaluate_graph(dictionary, corpus, texts, limit):
    """
    Function to plot a num_topics vs. c_v coherence graph for LDA.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : tokenized texts used for the coherence calculation
    limit : topic limit (exclusive)

    Returns:
    -------
    lm_list : list of LDA topic models
    c_v : coherence values for the LDA models with the respective number of topics
    """
    c_v = []
    lm_list = []
    for num_topics in range(1, limit):
        lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        lm_list.append(lm)
        cm = CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')
        c_v.append(cm.get_coherence())

    # Show graph
    x = range(1, limit)
    plt.plot(x, c_v)
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    plt.legend(["c_v"], loc='best')  # a bare ("c_v") is just a string, not a tuple
    plt.show()

    return lm_list, c_v
In [13]:
lmlist, c_v = evaluate_graph(dictionary=vocab, corpus=corpus_vect_gensim, texts=train_texts, limit=10)
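The best number of topics under c_v can then be read off the sweep, e.g. (assuming the indexing above, where list index 0 corresponds to num_topics=1):
In [ ]:
# Pick the model with the highest c_v coherence from the sweep above.
best_idx = int(np.argmax(c_v))
print('best num_topics:', best_idx + 1, 'c_v:', c_v[best_idx])
best_lm = lmlist[best_idx]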
In [44]:
ldamodel = LdaModel(corpus=corpus_vect_gensim, num_topics=50, id2word=vocab)
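A quick look at a few of the 50 topics (`pprint` is already imported above; the exact words depend on the run):
In [ ]:
# Inspect a handful of topics from the 50-topic model.
pprint(ldamodel.show_topics(num_topics=5, num_words=10))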
In [20]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
In [45]:
pyLDAvis.gensim.prepare(ldamodel, corpus_vect_gensim, vocab)
Out[45]:
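The interactive panel can also be written out as a standalone page, handy for sharing outside the notebook (a sketch; the filename is arbitrary):
In [ ]:
# Save the pyLDAvis panel to a self-contained HTML file (hypothetical filename).
vis = pyLDAvis.gensim.prepare(ldamodel, corpus_vect_gensim, vocab)
pyLDAvis.save_html(vis, 'consolidated_lda_vis.html')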
In [46]:
ldamodel.save('consolidated_lda.bin')
vocab.save('consolidated_corpus.dict')
In [3]:
import pandas as pd
# Earlier cycle: keywords were read per-theme from an Excel thesaurus.
# themes = ['mnchn', 'spec_pop', 'geriatric', 'adolescent']
# keywords = {
#     theme: pd.read_excel('data/Thesaurus Initial List.xlsx', sheetname=theme,
#                          usecols=['AO', 'Keywords'],
#                          dtype={'Keywords': str}).fillna(method='ffill')
#     for theme in themes
# }
# keywords_final = {}
# for theme in themes:
#     keywords[theme].Keywords += ','
#     keywords_final[theme] = keywords[theme].groupby('AO').sum()['Keywords'].str.replace(
#         '"', '').str.replace('“', '').str.replace('&', ' ').str.replace(',', ' ').str.replace('/', '').reset_index()
fn = 'data/cycle_2/adolescent_new_AOs.csv'
df = pd.read_csv(fn)
In [4]:
df.columns
Out[4]:
In [5]:
lda_model = gensim.models.LdaModel.load('data/cycle_2/consolidated_lda.bin')
In [6]:
id2word = gensim.corpora.Dictionary.load('data/cycle_2/consolidated_corpus.dict')
In [7]:
# for key in keywords_final.keys():
#     keywords_final[key]['LDA'] = ''
df['LDA'] = ''
for i, r in df.iterrows():
    query = r['Final Keywords'].split()
    results = sorted(lda_model[id2word.doc2bow(query)], key=lambda tup: tup[1], reverse=True)
    # Write via df.at: assigning to the iterrows() row `r` only mutates a copy.
    df.at[i, 'LDA'] = ' '.join([word for word, _ in lda_model.show_topic(results[0][0], topn=20)])
In [8]:
df.to_csv(fn, index=False)
In [ ]: