In [1]:
import re
import jieba
import jieba.analyse
from zhihu import *
#from textrank_ch import *
为什么大多数高中宁愿学习衡中,而不愿学习理念更加先进的人大附中等? https://www.zhihu.com/question/47145647
In [2]:
# Load question 47145647.  When a JSON copy already exists it is read back;
# otherwise the data is requested through the API and then saved to JSON
# (presumably so later runs can skip the request -- confirm in zhihu.Question).
q = Question('47145647')
if q.is_json_exist():
    q.get_data_from_json()
else:
    q.get_data_from_api()
    q.save_to_json()
    #q.save_to_pickle()
In [3]:
# Start the combined text with the question title on a line of its own.
title_text = q.questions['title']
question_title_answers_all_content = title_text + '\n'
In [4]:
# Combined text = question title followed by every answer body, one per line.
# The text is rebuilt from the title on each run so that re-executing this
# cell does not append the answers a second time, and str.join avoids
# re-copying the growing string once per answer.
question_title_answers_all_content = q.questions['title'] + '\n' + ''.join(
    a['content'] + '\n' for a in q.answers['items']
)
In [5]:
# Remove every "<...>" span (HTML tags) from the combined text.
html_tag_pattern = re.compile('<[^>]+>')
question_title_answers_all_content = html_tag_pattern.sub('', question_title_answers_all_content)
In [6]:
# Preview the first 200 characters of the combined text.  The parenthesised
# form runs under both Python 2 and Python 3 and matches the print(...) call
# already used in the textrank cell.
print(question_title_answers_all_content[:200])
In [7]:
# tf-idf: register the stop-word file with jieba, then extract the top 20
# tags together with their weights.
stop_words_file = "stop_words.txt"
jieba.analyse.set_stop_words(stop_words_file)
kw = jieba.analyse.extract_tags(
    question_title_answers_all_content,
    topK=20,
    withWeight=True,
)
In [8]:
# Show each tf-idf keyword next to its weight.  The formatted print(...) call
# works on Python 2 and Python 3 and uses the same '%s %s' layout as the
# textrank cell.
for k, w in kw:
    print('%s %s' % (k, w))
In [9]:
# textrank: run jieba's textrank extraction over the same text and print each
# keyword with its weight.
textrank_keywords = jieba.analyse.textrank(
    question_title_answers_all_content,
    withWeight=True,
)
for keyword, weight in textrank_keywords:
    print('%s %s' % (keyword, weight))