In [11]:
import re
import codecs
import jieba
from zhihu import Question
为什么大多数高中宁愿学习衡中,而不愿学习理念更加先进的人大附中等?https://www.zhihu.com/question/47145647
In [12]:
# Question object (from the local `zhihu` module) whose title and answers are
# analysed in the cells below; the argument is presumably the Zhihu question ID.
# NOTE(review): the link in the text just above points at question 47145647,
# whereas the value used here is 59436426 — confirm which one is intended.
q = Question('59436426')
In [13]:
# Load the stop-word list from a UTF-8 text file: every line of the file
# becomes one entry, with surrounding whitespace (including the trailing
# newline) trimmed off.
stop_words = []
with codecs.open('stop_words.txt', 'r', encoding='utf-8') as stop_file:
    stop_words = [entry.strip() for entry in stop_file.readlines()]
In [14]:
def tokenize_with_rsw(string):
    """Segment ``string`` with jieba and return the kept tokens joined by spaces.

    A token is kept only when it is longer than one character and does not
    appear in the module-level ``stop_words`` list.

    Args:
        string: The text to segment.

    Returns:
        A single string containing the kept tokens separated by one space
        (an empty string when nothing is kept).
    """
    # Build a set once per call: testing membership directly against the
    # ``stop_words`` list rescans the whole list for every single token.
    stop_set = set(stop_words)
    return ' '.join(
        word
        for word in jieba.cut(string)
        if len(word) > 1 and word not in stop_set
    )
In [15]:
# Fetch from the API and write the JSON file only when that file is missing
# (presumably a local cache — confirm in the zhihu module); the data is then
# always loaded from the JSON file.
if not q.is_json_exist():
    q.get_data_from_api()
    q.save_to_json()
q.get_data_from_json()

# Corpus of tokenised documents: the question title (with anything that looks
# like an HTML tag removed) comes first, followed by one entry per answer.
# NOTE(review): answer content is tokenised without the tag stripping that is
# applied to the title — confirm whether that is intentional.
title_answers = [
    tokenize_with_rsw(re.sub('<[^>]+>', '', q.questions['title']))
]
title_answers.extend(
    tokenize_with_rsw(a['content']) for a in q.answers['items']
)
In [16]:
from __future__ import print_function
In [17]:
import pyLDAvis
import pyLDAvis.sklearn
# Switch on pyLDAvis's notebook mode; presumably this is what lets the prepared
# visualisations in the later cells render inline — confirm in the pyLDAvis docs.
pyLDAvis.enable_notebook()
In [18]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
In [19]:
# The corpus is the list of space-joined token strings built earlier.
docs_raw = title_answers

# TF-IDF document-term matrix.
#   token_pattern: a token is a run of characters in U+4E00-U+9FFF, so
#                  anything outside that range never becomes a term.
#   min_df/max_df: document-frequency bounds (per scikit-learn, an int is an
#                  absolute document count and a float is a proportion).
tfidf_vectorizer = TfidfVectorizer(
    min_df=10,
    max_df=0.5,
    token_pattern=u'[\u4E00-\u9FFF]+',
    stop_words=stop_words,
)
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)

# Report the matrix shape.
print(dtm_tfidf.shape)
In [20]:
# Fit a 20-topic LDA model on the TF-IDF matrix; random_state is fixed so the
# fit is repeatable.
# `n_components` is the current name of the topic-count argument: the older
# `n_topics` spelling was deprecated in scikit-learn 0.19 and later removed,
# so passing it raises a TypeError on recent releases.
lda_tfidf = LatentDirichletAllocation(n_components=20, random_state=0)
lda_tfidf.fit(dtm_tfidf)
Out[20]:
In [21]:
# Build the pyLDAvis visualisation for the TF-IDF-based model; being the last
# expression of the cell, the result is shown as the cell output.
pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)
Out[21]:
In [22]:
# Raw term-count document-term matrix over the same corpus (`docs_raw`),
# configured exactly like the TF-IDF vectoriser: tokens are runs of characters
# in U+4E00-U+9FFF, and terms are bounded by min_df / max_df.
tf_vectorizer = CountVectorizer(
    min_df=10,
    max_df=0.5,
    token_pattern=u'[\u4E00-\u9FFF]+',
    stop_words=stop_words,
)
dtm_tf = tf_vectorizer.fit_transform(docs_raw)

# Report the matrix shape.
print(dtm_tf.shape)
In [23]:
# Fit a 20-topic LDA model on the raw term-count matrix; random_state is fixed
# so the fit is repeatable.
# `n_components` is the current name of the topic-count argument: the older
# `n_topics` spelling was deprecated in scikit-learn 0.19 and later removed,
# so passing it raises a TypeError on recent releases.
lda_tf = LatentDirichletAllocation(n_components=20, random_state=0)
lda_tf.fit(dtm_tf)
Out[23]:
In [24]:
# Build the pyLDAvis visualisation for the count-based model, leaving the
# `mds` argument at its default; the result is shown as the cell output.
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
Out[24]:
In [25]:
# Same count-based model, but with mds='mmds' — presumably pyLDAvis's metric
# MDS option for laying out the topics; confirm against the pyLDAvis docs.
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='mmds')
Out[25]:
In [26]:
# Same count-based model, but with mds='tsne' — presumably pyLDAvis's t-SNE
# option for laying out the topics; confirm against the pyLDAvis docs.
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='tsne')
Out[26]: