notebook.community

Edit and run



In [1]:

    
import re
import codecs
import jieba
from zhihu import Question

为什么大多数高中宁愿学习衡中，而不愿学习理念更加先进的人大附中等？https://www.zhihu.com/question/47145647



In [2]:

    
# Handle for the Zhihu question whose id (47145647) appears in the URL above.
q = Question('47145647')



In [3]:

    
# Read the stop-word file: one entry per line, surrounding whitespace removed,
# blank lines skipped so no empty string ends up in the list.
# The result stays a plain list because it is also handed to the vectorizers'
# `stop_words` argument further down.
with codecs.open('stop_words.txt', 'r', encoding='utf-8') as f:
    stop_words = [sw for sw in (line.strip() for line in f.readlines()) if sw]



In [4]:

    
def tokenize_with_rsw(string):
    """Segment ``string`` with jieba and return the kept words joined by spaces.

    A word is kept only when it is longer than one character and does not
    appear in the notebook-level ``stop_words`` collection.

    Args:
        string: Text to segment.

    Returns:
        A single string with the surviving words separated by one space.
    """
    # Snapshot the stop words into a set once per call so each membership
    # test is a hash lookup rather than a linear scan of the list per token.
    blocked = set(stop_words)
    return ' '.join(word for word in jieba.cut(string)
                    if len(word) > 1 and word not in blocked)



In [5]:

    
# Alternative path kept for reference (presumably fetches from the API and
# writes the JSON file that is read below -- confirm against the zhihu module):
#q.get_data_from_api()
#q.save_to_json()
q.get_data_from_json()

# Document 0 is the question title with anything matching <...> removed;
# every following document is one tokenized answer.
# NOTE(review): only the title goes through the tag-stripping regex; answer
# content is tokenized as-is -- confirm whether a['content'] contains markup.
title_text = re.sub('<[^>]+>','',q.questions['title'])
title_answers = [tokenize_with_rsw(title_text)]
title_answers.extend(tokenize_with_rsw(a['content']) for a in q.answers['items'])









    



Building prefix dict from the default dictionary ...
Loading model from cache c:\users\lxp\appdata\local\temp\jieba.cache
Loading model cost 0.451 seconds.
Prefix dict has been built succesfully.



In [6]:

    
from __future__ import print_function



In [7]:

    
import pyLDAvis
import pyLDAvis.sklearn
# Put pyLDAvis into notebook mode before any visualization is prepared
# (per the pyLDAvis docs this makes prepared visualizations render inline).
pyLDAvis.enable_notebook()



In [8]:

    
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

TF-IDF



In [9]:

    
# Corpus: the tokenized title followed by the tokenized answers.
docs_raw = title_answers

# Tokens are runs of characters in the U+4E00..U+9FFF range only; terms are
# dropped when they occur in more than half of the documents (max_df) or in
# fewer than 10 documents (min_df).
tfidf_vectorizer = TfidfVectorizer(
    stop_words=stop_words,
    token_pattern=u'[\u4E00-\u9FFF]+',
    max_df=0.5,
    min_df=10,
)

# Document-term matrix with TF-IDF weights; print its (documents, terms) shape.
dtm_tfidf = tfidf_vectorizer.fit_transform(docs_raw)
print(dtm_tfidf.shape)









    



(1365, 728)



In [10]:

    
# `n_topics` was renamed to `n_components` in scikit-learn 0.19 and removed in
# 0.21. Try the current keyword first and fall back to the legacy one on old
# installs, where the constructor rejects the unknown keyword with TypeError.
# `learning_method='online'` is pinned because that is the value shown in the
# recorded estimator repr, and the library default changed in later releases.
try:
    lda_tfidf = LatentDirichletAllocation(n_components=20,
                                          learning_method='online',
                                          random_state=0)
except TypeError:
    lda_tfidf = LatentDirichletAllocation(n_topics=20,
                                          learning_method='online',
                                          random_state=0)
# Keep the fit call as the cell's last expression so the fitted estimator is displayed.
lda_tfidf.fit(dtm_tfidf)









    Out[10]:





LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_jobs=1, n_topics=20, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)



In [11]:

    
# Build the pyLDAvis visualization for the LDA model fitted on the TF-IDF
# matrix; as the cell's last expression, the result is what the cell displays.
pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)









    Out[11]:

TF



In [12]:

    
# Raw term counts over the same corpus, with the same token pattern and
# document-frequency cut-offs as the TF-IDF vectorizer.
tf_vectorizer = CountVectorizer(
    stop_words=stop_words,
    token_pattern=u'[\u4E00-\u9FFF]+',
    max_df=0.5,
    min_df=10,
)

# Document-term count matrix; print its (documents, terms) shape.
dtm_tf = tf_vectorizer.fit_transform(docs_raw)
print(dtm_tf.shape)









    



(1365, 728)



In [13]:

    
# `n_topics` was renamed to `n_components` in scikit-learn 0.19 and removed in
# 0.21. Try the current keyword first and fall back to the legacy one on old
# installs, where the constructor rejects the unknown keyword with TypeError.
# `learning_method='online'` is pinned because that is the value shown in the
# recorded estimator repr, and the library default changed in later releases.
try:
    lda_tf = LatentDirichletAllocation(n_components=20,
                                       learning_method='online',
                                       random_state=0)
except TypeError:
    lda_tf = LatentDirichletAllocation(n_topics=20,
                                       learning_method='online',
                                       random_state=0)
# Keep the fit call as the cell's last expression so the fitted estimator is displayed.
lda_tf.fit(dtm_tf)









    Out[13]:





LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_jobs=1, n_topics=20, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)



In [14]:

    
# Build the pyLDAvis visualization for the LDA model fitted on the raw
# term-count matrix, using the library's default layout settings.
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)









    Out[14]:

MDS



In [15]:

    
# Same model, matrix and vectorizer as the TF visualization, but with
# mds='mmds' -- presumably selects metric MDS for the topic layout; confirm
# against the pyLDAvis documentation.
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='mmds')









    Out[15]:



In [16]:

    
# Same model, matrix and vectorizer as the TF visualization, but with
# mds='tsne' -- presumably selects t-SNE for the topic layout; confirm
# against the pyLDAvis documentation.
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer, mds='tsne')









    Out[16]: