In [1]:
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# train_bio = pd.read_csv("input_light/biology.csv",encoding='utf-8')
# train_cooking = pd.read_csv("input_light/cooking.csv",encoding='utf-8')
# train_crypto = pd.read_csv("input_light/crypto.csv",encoding='utf-8')
# train_dyi = pd.read_csv("input_light/diy.csv",encoding='utf-8')
# train_robotic = pd.read_csv("input_light/robotics.csv",encoding='utf-8')
# train_travel = pd.read_csv("input_light/travel.csv",encoding='utf-8')
# # test_df = pd.read_csv("input_light/test.csv",encoding='utf-8')

# # df_list = []
# df_list = pd.concat([train_bio,train_cooking,train_crypto,train_dyi,train_robotic,train_travel])

In [3]:
df_list = pd.read_csv("input_light/total_dat_reformat.csv")

In [14]:
# Normalise tags: replace hyphens with underscores so each tag stays a single token
df_list['tags'] = df_list['tags'].map(lambda d: d.replace("-", "_"))

In [95]:
# Keep only tokens that start with at least three alphabetic characters
reg_alphabet = "[a-zA-Z]{3,}"

def tokenizer_words(doc, arr=True):
    tokens = [tok for tok in doc.split(' ') if re.match(reg_alphabet, tok)]
    if arr:
        return tokens
    return ' '.join(tokens)
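
A quick sanity check of the tokenizer on a toy sentence (the example string is illustrative only):

In [ ]:
print tokenizer_words("What is the best way to cook a turkey")
print tokenizer_words("What is the best way to cook a turkey", arr=False)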

In [60]:
sentence_stream = [tokenizer_words(senten) for senten in df_list['doc']]

In [38]:
bigram = gensim.models.Phrases(sentence_stream)

In [39]:
trigram = gensim.models.Phrases(bigram[sentence_stream])


/usr/local/lib/python2.7/dist-packages/gensim/models/phrases.py:248: UserWarning: For a faster implementation, use the gensim.models.phrases.Phraser class
  warnings.warn("For a faster implementation, use the gensim.models.phrases.Phraser class")
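
As the warning above notes, the trained Phrases models can be wrapped in gensim.models.phrases.Phraser to get a smaller, faster object for transforming token lists. A minimal sketch using the bigram and trigram models above (the phrased_stream name is just for illustration):

In [ ]:
bigram_phraser = gensim.models.phrases.Phraser(bigram)
trigram_phraser = gensim.models.phrases.Phraser(trigram)
# apply bigram merging first, then trigram merging, to each token list
phrased_stream = [trigram_phraser[bigram_phraser[tokens]] for tokens in sentence_stream]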

In [61]:
dictionary = gensim.corpora.Dictionary(sentence_stream)

In [65]:
corpus = [dictionary.doc2bow(text) for text in sentence_stream]
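
Each bag-of-words entry is a list of (token_id, count) pairs; a quick look at the first document, mapped back to words for readability (inspection only):

In [ ]:
print corpus[0][:10]
print [(dictionary[token_id], count) for token_id, count in corpus[0][:10]]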

In [78]:
model = gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=6, alpha='auto', eval_every=5)
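
To sanity-check the gensim model, the top words per topic can be listed with show_topics (inspection sketch; the exact output format varies slightly across gensim versions):

In [ ]:
for topic in model.show_topics(num_topics=6, num_words=10):
    print topic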

Sklearn LDA model


In [96]:
# Reuse the gensim dictionary's vocabulary for the sklearn vectorizer
vocab = dictionary.token2id.keys()

In [97]:
sentence_stream_str = [tokenizer_words(senten,arr=False) for senten in df_list['doc']]

In [93]:
# Plain CountVectorizer (raw counts, not tf-idf), restricted to the gensim vocabulary
countVect = CountVectorizer(vocabulary=vocab, analyzer='word')

In [98]:
# Note: this document-term matrix replaces the gensim bag-of-words corpus above
corpus = countVect.fit_transform(sentence_stream_str)

In [101]:
log_topics = []
# Try a range of topic counts and record the model perplexity for each (lower is better)
for num_topics in range(6, 40):
    lda = LatentDirichletAllocation(n_jobs=-1, n_topics=num_topics, max_iter=15, learning_method='online')
    lda.fit(corpus)
    log_topics.append(lda.perplexity(corpus))
# n_topics = np.argmax(lda.transform(corpus[0]))


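The topic-word matrix of the fitted sklearn model can be inspected through components_ together with the vectorizer vocabulary (a sketch; here lda is simply the last model fitted in the loop above):

In [ ]:
feature_names = countVect.get_feature_names()
for topic_idx, weights in enumerate(lda.components_):
    # take the ten highest-weighted words for this topic
    top_words = [feature_names[i] for i in weights.argsort()[:-11:-1]]
    print topic_idx, ' '.join(top_words)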

In [109]:
print log_topics
# Perplexity: lower is better, so pick the topic count with the smallest score
print np.argmin(log_topics) + 6


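The perplexity scores are easier to judge as a curve; a quick plot over the candidate topic counts, using the matplotlib import from the first cell (plotting sketch):

In [ ]:
topic_range = range(6, 40)
plt.plot(topic_range, log_topics, marker='o')
plt.xlabel('number of topics')
plt.ylabel('perplexity (lower is better)')
plt.title('sklearn LDA perplexity by number of topics')
plt.show()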
