In [3]:
# -*- coding: utf-8 -*-
from __future__ import division
import jieba, os, re, string
from gensim import corpora, models
import pyLDAvis.gensim
import warnings
# from utils.utils import *
import pandas as pd
import pickle
warnings.filterwarnings("ignore")
testmode = False
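
The cell below relies on several helpers (`get_stop_words_list`, `preprocessing`, `tokenize`, `convertListToDict`, `savePickleFile`), presumably provided by the commented-out `from utils.utils import *`. They are not defined in this notebook, so the next cell is only a minimal sketch inferred from how the helpers are called; the real implementations may differ.

In [ ]:
# Minimal sketches of the helpers used below; signatures are inferred from the call sites.
def get_stop_words_list(path):
    # One stop word per line; decode to unicode so membership tests match jieba's tokens (Python 2)
    with open(path, 'r') as f:
        return set(line.strip().decode('utf-8') for line in f if line.strip())

def preprocessing(raw):
    # Collapse whitespace; the real helper may also strip markup and punctuation
    return re.sub(r'\s+', ' ', raw).strip()

def tokenize(text, stopwords, full_mode, HMM_mode_on):
    # Segment with jieba and drop stop words and empty tokens
    for token in jieba.cut(text, cut_all=full_mode, HMM=HMM_mode_on):
        token = token.strip()
        if token and token not in stopwords:
            yield token

def convertListToDict(pairs):
    # gensim returns sparse vectors as [(id, weight), ...]; turn them into {id: weight}
    return dict(pairs)

def savePickleFile(path, obj):
    # Persist any Python object to disk
    with open(path, 'wb') as f:
        pickle.dump(obj, f)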

In [ ]:
# Dictionary used by jieba for word segmentation
path_dict_for_tokenization = '/Users/easonchan/git/jieba/extra_dict/dict.txt.big'
# Stop-word frequency list
path_dict_for_stopwords = '/Users/easonchan/Desktop/stop words freq list'
# Path of the corpus
#path_corpora = '/Users/easonchan/temp/thenewslense'
#path_corpora = '/Users/easonchan/Desktop/xmlOutput'
# path_corpora = '/Users/easonchan/temp/UDNnews'  ## only 3k articles under UDN news
path_corpora = '/Users/easonchan/temp/iterativeUDNnews' ##20k articles under UDN news

## Hyperparameters for training the model
# Minimum length of a single document
min_length = 300 if 'UDNnews' not in path_corpora else 150
# Number of topics in LDA
#num_topics = 40 if 'UDNnews' not in path_corpora else 41 #=> little bit better than 55
num_topics = 40 if 'UDNnews' not in path_corpora else 100 #=> little bit better than 55
# Filter out tokens that appear in fewer than `no_below` documents (absolute number)
no_below_this_number = 20 if 'UDNnews' not in path_corpora else 15
# Filter out tokens that appear in more than `no_above` documents (fraction of total corpus size, *not* an absolute number)
no_above_fraction_of_doc = 0.15
# jieba full segmentation mode (cut_all); returns all possible word splits and may add token noise, default is False
full_mode = False
# Use jieba's HMM model to discover unknown words during tokenization, default is True
HMM_mode_on = True
# Drop topics whose weight in a document is below this threshold
remove_topic_so_less = 0.08
# Number of iterations when training the LDA model; the fewer the documents, the more iterations are needed to converge
num_of_iterations = 30 if testmode else 200
# Number of passes over the corpus
passes = 5 if testmode else 3

# Print all hyperparameters
parameters = {}
parameters['testmode'] = testmode
parameters['min_length'] = min_length
parameters['num_topics'] = num_topics
parameters['no_below_this_number'] = no_below_this_number
parameters['full_mode'] = full_mode
parameters['HMM_mode_on'] = HMM_mode_on
parameters['no_above_fraction_of_doc'] = no_above_fraction_of_doc
parameters['remove_topic_so_less'] = remove_topic_so_less
parameters['num_of_iterations'] = num_of_iterations
parameters['passes'] = passes
for k in parameters:
    print "Parameter for {0} is {1}".format(k,parameters[k])

# Load the stop-word list and set the jieba dictionary
stopwords = get_stop_words_list(path_dict_for_stopwords)
jieba.set_dictionary(path_dict_for_tokenization)
walk = os.walk(path_corpora)
doc_count=0
train_set = []
doc_mapping={}

## Start of preparing the training set: a list of token lists, one per document [[words_in_1st_doc],[words_in_2nd_doc]....]
for root, dirs, files in walk:
    for name in files:
        # Skip macOS folder metadata
        if name == '.DS_Store':
            continue
        f = open(os.path.join(root, name), 'r')
        raw = f.read()
        f.close()
        # Clean the raw text before tokenization
        preprocessed_text = preprocessing(raw)
        # Skip documents shorter than min_length
        if len(preprocessed_text) < min_length:
            continue
        tokens = tokenize(preprocessed_text, stopwords, full_mode, HMM_mode_on)
        train_set.append(list(tokens))

        # Map the running document index to the file name
        doc_mapping[doc_count] = name
        doc_count = doc_count + 1

print 'There are %i documents in the pool' % (doc_count)
## End of preparation 

## Start of preparing the corpus: a Bag-Of-Words (BOW) vector for each document
dic = corpora.Dictionary(train_set)
print "In the corpus there are", len(dic), "raw tokens"
denominator = len(dic)
dic.filter_extremes(no_below=no_below_this_number, no_above=no_above_fraction_of_doc)
numerator = len(dic)
print "After filtering, there are", len(dic), "unique tokens left in the corpus, a reduction of", round((1 - numerator / denominator) * 100, 2), "%"
corpus = [dic.doc2bow(text) for text in train_set]
## End of preparation 
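
## Toy illustration (not part of the pipeline): doc2bow turns each document into a sparse
## list of (token_id, count) pairs, which is exactly what `corpus` above holds.
toy_docs = [[u'機器', u'學習', u'新聞'], [u'新聞', u'政治']]
toy_dic = corpora.Dictionary(toy_docs)
print [toy_dic.doc2bow(d) for d in toy_docs]  # each inner list: [(token_id, count), ...]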

## Build a TF-IDF vector for each document and train the LDA model on top of it
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lda = models.LdaModel(corpus_tfidf, id2word = dic, num_topics = num_topics,iterations=num_of_iterations,passes = passes)
corpus_lda = lda[corpus_tfidf]
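
## Illustrative sanity check (not required by the pipeline): inspect the inferred topic
## mixture of the first document; lda[...] returns a sparse list of (topic_id, weight) pairs.
if doc_count > 0:
    first_doc_topics = lda[tfidf[corpus[0]]]
    print 'Topic mixture of "{0}": {1}'.format(doc_mapping[0], first_doc_topics)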


## Show all topics and their top-weighted words
for i in range(num_topics):
    print 'Topic %s : ' % (str(i)) + lda.print_topic(i)


## Interactive visualization of the LDA model (pyLDAvis); uncomment to display
#vis = pyLDAvis.gensim.prepare(lda, corpus, dic)
#pyLDAvis.display(vis)

## Evaluate the current model under the chosen num_topics
print '==============================='
# log_perplexity returns the per-word likelihood bound; perplexity = 2 ** (-bound)
print 'Per-word likelihood bound : ', lda.log_perplexity(corpus_tfidf)
print '==============================='

## Save model output
save_path = '/Users/easonchan/Desktop/LDAmodel/final_ldamodel'
lda.save(save_path)
print 'Model saved at {0}'.format(save_path)

## Save corpus output
save_path = '/Users/easonchan/Desktop/LDAmodel/corpus.pickle'
mappingFile = open(save_path, 'wb')
pickle.dump(corpus, mappingFile)
mappingFile.close()
print 'Corpus saved at {0}'.format(save_path)

# Save document mapping (document index -> file name)
path_mappingfile = '/Users/easonchan/Desktop/LDAmodel/documentmapping.pickle'
mappingFile = open(path_mappingfile, 'wb')
pickle.dump(doc_mapping, mappingFile)
mappingFile.close()
print 'Document mapping saved at {0}'.format(path_mappingfile)

# Save doc-to-topic matrix
doc_topic_matrix = {}
count = 0
for doc in corpus:
    dense_vector = {}
    # Apply the same TF-IDF transform used for training before inferring topics
    vector = convertListToDict(lda[tfidf[doc]])
    # Drop topics whose weight falls below remove_topic_so_less
    for topic in vector:
        if vector[topic] > remove_topic_so_less:
            dense_vector[topic] = vector[topic]
    doc_topic_matrix[count] = dense_vector
    count = count + 1
savePickleFile('doc_topic_matrix',doc_topic_matrix)
print 'doc to topic mapping saved at {0}'.format('doc_topic_matrix')
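
The saved artifacts can be reloaded in a later session roughly as follows. This is only a sketch that assumes the paths used above are unchanged; by default gensim stores the `id2word` mapping together with the model, so the dictionary does not need to be pickled separately here.

In [ ]:
# Sketch: reload the saved model, corpus, and document mapping (paths assumed unchanged)
import pickle
from gensim import models

lda = models.LdaModel.load('/Users/easonchan/Desktop/LDAmodel/final_ldamodel')
with open('/Users/easonchan/Desktop/LDAmodel/corpus.pickle', 'rb') as f:
    corpus = pickle.load(f)
with open('/Users/easonchan/Desktop/LDAmodel/documentmapping.pickle', 'rb') as f:
    doc_mapping = pickle.load(f)

print '{0} documents reloaded, first one is {1}'.format(len(corpus), doc_mapping[0])
print 'Top words of topic 0:', lda.print_topic(0)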