In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
import re
import gensim
In [2]:
# train_bio = pd.read_csv("input_light/biology.csv",encoding='utf-8')
# train_cooking = pd.read_csv("input_light/cooking.csv",encoding='utf-8')
# train_crypto = pd.read_csv("input_light/crypto.csv",encoding='utf-8')
# train_dyi = pd.read_csv("input_light/diy.csv",encoding='utf-8')
# train_robotic = pd.read_csv("input_light/robotics.csv",encoding='utf-8')
# train_travel = pd.read_csv("input_light/travel.csv",encoding='utf-8')
# # test_df = pd.read_csv("input_light/test.csv",encoding='utf-8')
# # df_list = []
# df_list = pd.concat([train_bio,train_cooking,train_crypto,train_dyi,train_robotic,train_travel])
In [3]:
df_list = pd.read_csv("input_light/total_dat_reformat.csv")
In [14]:
# replace hyphens so multi-word tags like "home-improvement" become single tokens
df_list['tags'] = df_list['tags'].map(lambda d: d.replace("-", "_"))
In [95]:
reg_alphabet = "[a-zA-Z]{3,}"

def tokenizer_words(demo, arr=True):
    """Keep only tokens that start with at least three letters."""
    a = []
    for f in demo.split(' '):
        if re.match(reg_alphabet, f):
            a.append(f)
    if arr:
        return a
    return ' '.join(a)
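In [ ]:
# Not in the original notebook: a quick check of tokenizer_words. re.match anchors
# reg_alphabet at the start of each token, so tokens with fewer than three leading
# letters (and purely numeric tokens) are dropped.
print(tokenizer_words("to fix a leaky tap use ptfe tape"))
print(tokenizer_words("to fix a leaky tap use ptfe tape", arr=False))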
In [60]:
sentence_stream = [tokenizer_words(senten) for senten in df_list['doc']]
In [38]:
bigram = gensim.models.Phrases(sentence_stream)
In [39]:
trigram = gensim.models.Phrases(bigram[sentence_stream])
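In [ ]:
# Not in the original notebook: the bigram/trigram models above are trained but never
# applied downstream. A minimal sketch of actually transforming the token stream with
# the learned phrases (same indexing syntax the trigram cell already uses); the phrased
# stream could then replace sentence_stream when building the dictionary below.
phrased_stream = [trigram[bigram[senten]] for senten in sentence_stream]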
In [61]:
dictionary = gensim.corpora.Dictionary(sentence_stream)
In [65]:
corpus = [dictionary.doc2bow(text) for text in sentence_stream]
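In [ ]:
# Not in the original notebook: inspect the first bag-of-words vector to sanity-check
# the dictionary/corpus round-trip. Each entry is a (token_id, count) pair, and
# dictionary[token_id] maps the id back to its token.
print([(dictionary[token_id], count) for token_id, count in corpus[0][:10]])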
In [78]:
model = gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=6, alpha='auto', eval_every=5)
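In [ ]:
# Not in the original notebook: peek at the six learned topics. show_topics is a
# standard LdaModel method; num_words controls how many top words are shown per topic.
for topic_id, words in model.show_topics(num_topics=6, num_words=8):
    print(topic_id, words)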
In [96]:
vocab = list(dictionary.token2id.keys())
In [97]:
sentence_stream_str = [tokenizer_words(senten,arr=False) for senten in df_list['doc']]
In [93]:
# LDA works on raw term counts, so use a CountVectorizer restricted to the gensim vocabulary
count_vect = CountVectorizer(vocabulary=vocab, analyzer='word')
In [98]:
# document-term count matrix (kept separate from the gensim corpus built above)
count_matrix = count_vect.fit_transform(sentence_stream_str)
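In [ ]:
# Not in the original notebook: confirm the document-term matrix is aligned with the
# gensim vocabulary passed into the vectorizer (one column per dictionary token).
print(count_matrix.shape, len(vocab))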
In [101]:
perplexities = []
for n in range(6, 40):
    lda = LatentDirichletAllocation(n_jobs=-1, n_components=n, max_iter=15, learning_method='online')
    lda.fit(count_matrix)
    perplexities.append(lda.perplexity(count_matrix))
# n_topics = np.argmax(lda.transform(count_matrix[0]))
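In [ ]:
# Not in the original notebook: plot perplexity against the number of topics so the
# sweep above can be read off visually (lower perplexity is better).
plt.plot(range(6, 40), perplexities)
plt.xlabel('number of topics')
plt.ylabel('perplexity')
plt.show()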
In [109]:
print(perplexities)
# lower perplexity is better, so take the argmin (offset by the starting topic count of 6)
print(np.argmin(perplexities) + 6)