In [1]:
import os
import re
import operator
import matplotlib.pyplot as plt
import warnings
import gensim
import numpy as np
warnings.filterwarnings('ignore')  # Let's not pay heed to them right now

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.corpora import Dictionary
from pprint import pprint

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

%matplotlib inline



In [2]:
from gensim.utils import lemmatize
from nltk.corpus import stopwords

In [8]:
import pandas as pd
import sqlalchemy
from sqlalchemy.dialects import postgresql as psql
from sqlalchemy import Column, Integer, String, DATE
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class Document(Base):                  
    __tablename__ = 'document'         

    id = Column(Integer, primary_key=True)

    title = Column(psql.TEXT)          
    date = Column(DATE)                
    doctype = Column(psql.TEXT)        
    docnum = Column(psql.TEXT)         
    subject = Column(psql.TEXT)        
    body = Column(psql.TEXT)           
    sign = Column(psql.TEXT)           
    signtitle = Column(psql.TEXT)      
    images = Column(psql.JSONB)        
    raw_json = Column(psql.JSONB)      

    def __repr__(self):
        return self.title or ''  # title may be NULL in the database

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('postgresql://dev:dev@localhost/dev')
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()

query = 'SELECT body FROM Document'

df = pd.read_sql_query(query, engine)

df.body = df.body.str.replace('\n', ' ').str.replace('\t', ' ')

df.to_csv('consolidated.csv', index=False, encoding='utf-8')
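
Note that the newlines were stripped above so each document sits on exactly one line of the CSV; build_texts below streams the file line by line. to_csv also writes a 'body' header line, which that streaming reader will pick up as a one-token document unless header=False is passed. A minimal sanity check:

In [ ]:
# Line count should match the number of documents (plus one for the header row)
with open('consolidated.csv') as f:
    print(sum(1 for _ in f), len(df))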

LDA Training


In [9]:
def build_texts(fname):
    """
    Function to build tokenized texts from file
    
    Parameters:
    ----------
    fname: File to be read
    
    Returns:
    -------
    Yields one preprocessed line at a time, as a list of tokens
    """
    with open(fname) as f:
        for line in f:
            yield gensim.utils.simple_preprocess(line, deacc=True, min_len=3)

In [10]:
stops = set(stopwords.words('english'))  # nltk stopwords list

In [11]:
def process_texts(texts):
    """
    Function to process texts. Following are the steps we take:
    
    1. Stopword removal.
    2. Collocation detection (disabled below; see the commented bigram line).
    3. Lemmatization (not stemming, since stemming can reduce interpretability).
    
    Parameters:
    ----------
    texts: Tokenized texts.
    
    Returns:
    -------
    texts: Pre-processed tokenized texts.
    """
    texts = [[word for word in line if word not in stops] for line in texts]
    # texts = [bigram[line] for line in texts]  # collocation detection, disabled
    # Keep only nouns (NN tags) of length >= 3; strip the POS tag that lemmatize appends
    texts = [[word.split('/')[0] for word in lemmatize(' '.join(line),
                    allowed_tags=re.compile('(NN)'), min_length=3)] for line in texts]
    return texts
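
A toy run of the pipeline (a minimal sketch; gensim.utils.lemmatize additionally requires the pattern package, and the exact output depends on its POS tagger):

In [ ]:
sample = [['the', 'doctors', 'reviewed', 'vaccination', 'records']]
print(process_texts(sample))
# e.g. [['doctor', 'vaccination', 'record']] -- only nouns of length >= 3 survive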

In [13]:
train_texts = list(build_texts('consolidated.csv'))

In [14]:
train_texts = process_texts(train_texts)

In [15]:
len(train_texts)


Out[15]:
2037

In [16]:
train_texts_sklearn = [" ".join(ls) for ls in train_texts]

Sklearn


In [17]:
tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                stop_words=stops,
                                lowercase=True,
                                max_df=0.5,
                                min_df=0.05,
                                ngram_range=(1, 4))
dtm_tf = tf_vectorizer.fit_transform(train_texts_sklearn)
print(dtm_tf.shape)


(2037, 732)
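
To eyeball what the vectorizer kept, peek at a few of the 732 learned n-gram features (a quick sketch; get_feature_names was the API in the sklearn versions contemporary with this notebook):

In [ ]:
print(tf_vectorizer.get_feature_names()[:10])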

In [18]:
vocabulary_gensim = {}
vocab = Dictionary()
for key, val in tf_vectorizer.vocabulary_.items():
    vocabulary_gensim[val] = key
_ = vocab.merge_with(vocabulary_gensim)   
corpus_vect_gensim = gensim.matutils.Sparse2Corpus(dtm_tf, documents_columns=False)
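
A quick sanity check on the sklearn-to-gensim bridge (a sketch): gensim ids should map back to the terms sklearn indexed, and each document should come out as a (term_id, count) bag of words.

In [ ]:
print([vocab[i] for i in range(5)])
first_doc = list(next(iter(corpus_vect_gensim)))
print([(vocab[term_id], count) for term_id, count in first_doc[:5]])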

In [12]:
def evaluate_graph(dictionary, corpus, texts, limit):
    """
    Function to plot the c_v coherence score of LDA models against the number of topics
    
    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : Tokenized texts (needed by the c_v coherence measure)
    limit : topic limit
    
    Returns:
    -------
    lm_list : List of LDA topic models
    c_v : Coherence values corresponding to the LDA model with respective number of topics
    """
    c_v = []
    lm_list = []
    for num_topics in range(1, limit):
        lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        lm_list.append(lm)
        cm = CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')
        c_v.append(cm.get_coherence())
        
    # Show graph
    x = range(1, limit)
    plt.plot(x, c_v)
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    plt.legend(("c_v",), loc='best')  # note the tuple: a bare string would be split into per-character labels
    plt.show()
    
    return lm_list, c_v

In [13]:
lmlist, c_v = evaluate_graph(dictionary=vocab, corpus=corpus_vect_gensim, texts=train_texts, limit=10)

[Plot: c_v coherence score vs. number of topics, for num_topics = 1..9]
In [44]:
ldamodel = LdaModel(corpus=corpus_vect_gensim, num_topics=50, id2word=vocab)
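
Before visualizing, a quick look at a few of the 50 learned topics:

In [ ]:
pprint(ldamodel.show_topics(num_topics=3, num_words=8))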

In [20]:
import pyLDAvis.gensim
pyLDAvis.enable_notebook()

In [45]:
pyLDAvis.gensim.prepare(ldamodel, corpus_vect_gensim, vocab)


Out[45]:
[pyLDAvis interactive topic visualization]
In [46]:
ldamodel.save('consolidated_lda.bin')
vocab.save('consolidated_corpus.dict')

Productionization


In [3]:
import pandas as pd
# themes = ['mnchn', 'spec_pop', 'geriatric', 'adolescent']
# keywords = {
#     theme: pd.read_excel('data/Thesaurus Initial List.xlsx', sheetname=theme, usecols=['AO', 'Keywords'], dtype=
#                          {'Keywords': str}).fillna(method='ffill') for theme in themes
# }
# keywords_final = {}
# for theme in themes:
#     keywords[theme].Keywords += ','
#     keywords_final[theme] = keywords[theme].groupby('AO').sum()['Keywords'].str.replace(
#         '"', '').str.replace('“', '').str.replace('&', ' ').str.replace(',', ' ').str.replace('/', '').reset_index()
fn = 'data/cycle_2/adolescent_new_AOs.csv'
df = pd.read_csv(fn)

In [4]:
df.columns


Out[4]:
Index(['AO', 'Final Keywords'], dtype='object')

In [5]:
lda_model = gensim.models.LdaModel.load('data/cycle_2/consolidated_lda.bin')

In [6]:
id2word = gensim.corpora.Dictionary.load('data/cycle_2/consolidated_corpus.dict')

In [7]:
# for key in keywords_final.keys():
#     keywords_final[key]['LDA'] = ''
df['LDA'] = ''
for i, r in df.iterrows():
    query = r['Final Keywords'].split()
    # Score the keyword bag-of-words against the trained model; best topic first
    results = sorted(lda_model[id2word.doc2bow(query)], key=lambda tup: tup[1], reverse=True)
    # Write back via .at -- assigning to the row returned by iterrows() does not modify df
    df.at[i, 'LDA'] = ' '.join([word for word, _ in lda_model.show_topic(results[0][0], topn=20)])
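
Spelled out for a single row (a sketch with a made-up keyword string; the real queries come from the 'Final Keywords' column): the keywords become a bag of words, the model scores each topic against it, and the top words of the best-scoring topic are kept.

In [ ]:
query = 'adolescent health nutrition immunization'.split()
bow = id2word.doc2bow(query)
topic_id, prob = max(lda_model[bow], key=lambda tup: tup[1])
print(topic_id, prob)
print(lda_model.show_topic(topic_id, topn=5))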

In [8]:
df.to_csv(fn, index=False)
