In [4]:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
In [5]:
# Browser-like User-Agent so Zhihu serves the normal desktop HTML page.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'}
In [3]:
# Site root; relative answer hrefs (e.g. '/question/.../answer/...') are appended to it.
zhihu_url = 'https://www.zhihu.com'
In [4]:
class Answer(object):
    """Scrape a single Zhihu answer page and extract the author's profile
    link and the answer body text.

    Parameters
    ----------
    href : str
        Path part of the answer URL, e.g. '/question/21789715/answer/19345625';
        it is appended to the module-level ``zhihu_url``.

    Attributes set: url, res, userlink (''), text ('') — the latter two are
    always present, even when the HTTP request or the parse fails.
    """
    def __init__(self, href):
        self.url = zhihu_url + href
        # Safe defaults so the attributes exist even on a non-200 response
        # (the original left them unset, causing AttributeError downstream).
        self.userlink = ''
        self.text = ''
        self.res = requests.get(self.url, headers=headers)
        if self.res.status_code == 200:
            self.soup = BeautifulSoup(self.res.text, 'lxml')
            # Drill down: answer container -> answer item -> author box.
            qs = self.soup.find_all('div', attrs={'class': 'QuestionAnswer-content'})[0]
            ca = qs.find_all('div', attrs={'class': 'ContentItem AnswerItem'})[0]
            ac = ca.find_all('div', attrs={'class': 'AuthorInfo-content'})[0]
            try:
                ul = ac.find_all('a', attrs={'class': 'UserLink-link'})[0]
                self.userlink = ul['href']
            except (IndexError, KeyError):
                # Anonymous authors have no UserLink anchor / href.
                self.userlink = ''
            ri = ca.find_all('div', attrs={'class': 'RichContent-inner'})[0]
            self.text = ri.text
In [5]:
# Fetch one concrete answer page (question 21789715, answer 19345625) — network call.
a = Answer('/question/21789715/answer/19345625')
In [6]:
print a.text  # Python 2 print statement; use print(a.text) under Python 3
In [26]:
# v4 topics endpoint template; '{}' is filled with a topic id.
zhihu_api_url = 'https://www.zhihu.com/api/v4/topics/{}'
In [27]:
# Headers for the v4 API: UA plus an oauth client token.
# NOTE(review): hardcoded 'authorization' token committed to the notebook —
# presumably Zhihu's public anonymous client token, but confirm; never
# hardcode real credentials.
headers2 = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
#'accept':'application/json, text/plain, */*',
'authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
#'x-udid':'AECCYdU9KwuPTh4DahhnpQ6AhG5JM4h5rdY='
}
In [34]:
# Query topic 19552439. The original referenced ``zhihu_api_topics``, which is
# only defined in a LATER cell (the util.py cell) and therefore raises
# NameError on Restart-&-Run-All; ``zhihu_api_url`` from the cell above is the
# template meant for this call.
r = requests.get(zhihu_api_url.format('19552439'), headers=headers2)
In [35]:
# Raw response body (bytes).
r.content
Out[35]:
In [36]:
# Decode the JSON payload into a dict.
j = r.json()
In [37]:
j
Out[37]:
In [33]:
# util.py
# Shared request headers and URL templates for Zhihu's v4 web API.
import requests
# Browser-like UA plus an oauth client token.
# NOTE(review): hardcoded 'authorization' token — presumably Zhihu's public
# anonymous client token; confirm before committing/sharing.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)\
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
'authorization':'oauth c3cef7c66a1843f8b3a9e6a1e3160e20',
}
# HTML page URL for a question (not the JSON API).
zhihu_page_question_url = 'https://www.zhihu.com/question/{}'
# Root of the v4 API; every template below is derived from it.
zhihu_api = 'https://www.zhihu.com/api/v4/'
# Topic metadata: introduction, question/answer/follower counts, follow state.
zhihu_api_topics = 'https://www.zhihu.com/api/v4/topics/{}?include=introduction%2Cquestions_count%2Cbest_answers_count%2Cfollowers_count%2Cis_following'
zhihu_api_questions = zhihu_api+'questions/{}'
zhihu_api_answers = zhihu_api+'answers/{}'
# First page (limit=20, offset=0) of comments on an answer.
zhihu_api_answers_comments = zhihu_api+'''answers/{}/comments?include=data%5B*%5D.author%2Ccollapsed%2C
reply_to_author%2Cdisliked%2Ccontent%2Cvoting%2Cvote_count%2Cis_parent_author%2C
is_author&order=normal&limit=20&offset=0&status=open'''
# URL-encoded 'include=' field list requested for each answer of a question.
questions_answers_include = '''include=data%5B*%5D.is_normal%2Cis_sticky%2Ccollapsed_by%2Csuggest_edit%2Ccomment_count%2C\
can_comment%2Ccontent%2Ceditable_content%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2Cmark_infos%2C\
created_time%2Cupdated_time%2Crelationship.is_authorized%2Cis_author%2Cvoting%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3B\
data%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics'''
zhihu_api_questions_answers = zhihu_api+'questions/{}/answers?limit=20&offset=0'+'&'+questions_answers_include
zhihu_api_new_question_up = zhihu_api+'banners/new_question_up?question_token={}'
zhihu_api_new_question_down = zhihu_api+'banners/new_question_down?question_token={}'
zhihu_api_similar_questions = zhihu_api+'questions/{}/similar-questions?include=data%5B%2A%5D.answer_count%2Cauthor&limit=5&offset=0'
zhihu_api_people = zhihu_api+'people/{}'
# URL-encoded 'include=' field list for member-profile queries.
members_include = '''include=locations%2Cemployments%2Cgender%2Ceducations%2Cbusiness%2Cvoteup_count%2Cthanked_Count%2C\
follower_count%2Cfollowing_count%2Ccover_url%2Cfollowing_topic_count%2Cfollowing_question_count%2Cfollowing_favlists_count%2C\
following_columns_count%2Cavatar_hue%2Canswer_count%2Carticles_count%2Cpins_count%2Cquestion_count%2C\
commercial_question_count%2Cfavorite_count%2Cfavorited_count%2Clogs_count%2Cmarked_answers_count%2Cmarked_answers_text%2C\
message_thread_token%2Caccount_status%2Cis_active%2Cis_force_renamed%2Cis_bind_sina%2Csina_weibo_url%2C\
sina_weibo_name%2Cshow_sina_weibo%2Cis_blocking%2Cis_blocked%2Cis_following%2Cis_followed%2Cmutual_followees_count%2Cvote_to_count%2C\
vote_from_count%2Cthank_to_count%2Cthank_from_count%2Cthanked_count%2Cdescription%2Chosted_live_count%2C\
participated_live_count%2Callow_message%2Cindustry_category%2Corg_name%2Corg_homepage%2C\
badge%5B%3F(type%3Dbest_answerer)%5D.topics'''
zhihu_api_members = zhihu_api+'members/{}'+'?'+members_include
# Follower/followee listings, first page of 20.
zhihu_api_members_followees = zhihu_api+'members/{}/followees?offset=0&limit=20'+'&'+members_include
zhihu_api_members_followers = zhihu_api+'members/{}/followers?offset=0&limit=20'+'&'+members_include
zhihu_api_members_activities = zhihu_api+'members/{}/activities?offset=0&limit=20' #&after_id=1418481629&desktop=True
# URL-encoded 'include=' field list for a member's own answers.
members_answers_include = '''include=data%5B*%5D.is_normal%2Csuggest_edit%2Ccomment_count%2C\
can_comment%2Ccontent%2Cvoteup_count%2Creshipment_settings%2Ccomment_permission%2C\
mark_infos%2Ccreated_time%2Cupdated_time%2Crelationship.is_authorized%2Cvoting%2C\
is_author%2Cis_thanked%2Cis_nothelp%2Cupvoted_followees%3Bdata%5B*%5D.author.badge%5B%3F\
(type%3Dbest_answerer)%5D.topics'''
zhihu_api_members_answers = zhihu_api+'members/{}/answer?offset=0&limit=20&sort_by=created'+'&'+members_answers_include
zhihu_api_members_questions = zhihu_api+'members/{}/questions?include=data%5B*%5D.created%2Canswer_count%2Cfollower_count%2Cauthor&offset=0&limit=20'
zhihu_api_members_pins = zhihu_api+'members/{}/pins?offset=0&limit=20&includes=data%5B*%5D.upvoted_followees'
zhihu_api_members_articles = zhihu_api+'members/{}/articles?include=data%5B*%5D.comment_count%2Ccan_comment%2Ccomment_permission%2Ccontent%2Cvoteup_count%2Ccreated%2Cupdated%2Cupvoted_followees%2Cvoting%3Bdata%5B*%5D.author.badge%5B%3F(type%3Dbest_answerer)%5D.topics&offset=0&limit=20&sort_by=created'
zhihu_api_members_column_contributions = zhihu_api+'members/{}/column-contributions?include=data%5B*%5D.column.title%2Cintro%2Cdescription%2Cfollowers%2Carticles_count&offset=0&limit=20'
zhihu_api_members_favlists = zhihu_api+'members/{}/favlists?include=data%5B*%5D.updated_time%2Canswer_count%2Cfollower_count%2Ccreator%2Cis_public&offset=0&limit=20'
zhihu_api_members_following_columns = zhihu_api+'members/{}/following-columns?include=data%5B*%5D.intro%2Cfollowers%2Carticles_count%2Cimage_url%2Cimage_width%2Cimage_height%2Cis_following%2Clast_article.created&offset=0&limit=20'
zhihu_api_members_following_topic_contributions = zhihu_api+'members/{}/following-topic-contributions?include=data%5B*%5D.topic.introduction&offset=0&limit=20'
zhihu_api_members_following_questions = zhihu_api+'members/{}/following-questions?include=data%5B*%5D.created%2Canswer_count%2Cfollower_count%2Cauthor&offset=0&limit=20'
zhihu_api_members_following_favlists = zhihu_api+'members/{}/following-favlists?include=data%5B*%5D.updated_time%2Canswer_count%2Cfollower_count%2Ccreator&offset=0&limit=20'
def get_data_from_zhihu_api(api, token):
    """Fetch JSON from a Zhihu v4 endpoint, following pagination when present.

    Parameters
    ----------
    api : str
        URL template with one '{}' placeholder.
    token : str or int
        Id substituted into the template (question/member/answer token).

    Returns
    -------
    dict
        For non-paginated endpoints (no 'offset' in the URL): the raw JSON
        object. Otherwise: {'totals': int or None, 'items': [...]} with the
        'data' arrays of every page concatenated.
    """
    res = requests.get(api.format(token), headers=headers)
    j = res.json()
    if 'offset' not in api:
        # Single-object endpoint: no pagination envelope to unwrap.
        return j
    data = {'totals': 0, 'items': []}
    try:
        data['totals'] = j['paging']['totals']
    except KeyError:
        # Some endpoints omit 'totals' from the paging block.
        data['totals'] = None
    data['items'].extend(j['data'])
    # Follow the server-supplied 'next' links until the last page.
    while not j['paging']['is_end']:
        res = requests.get(j['paging']['next'], headers=headers)
        j = res.json()
        data['items'].extend(j['data'])
    return data
def get_questions(question_token):
    """Return (question, answers, similar_questions) for one question token,
    each fetched via get_data_from_zhihu_api."""
    endpoints = (zhihu_api_questions,
                 zhihu_api_questions_answers,
                 zhihu_api_similar_questions)
    return tuple(get_data_from_zhihu_api(url, question_token) for url in endpoints)
def get_members(members_token):
    """Return (profile, followees, followers) for one member token,
    each fetched via get_data_from_zhihu_api."""
    profile = get_data_from_zhihu_api(zhihu_api_members, members_token)
    followee_list = get_data_from_zhihu_api(zhihu_api_members_followees, members_token)
    follower_list = get_data_from_zhihu_api(zhihu_api_members_followers, members_token)
    return profile, followee_list, follower_list
In [8]:
import os
import codecs
import json
import pickle
# NOTE(review): hardcoded absolute Windows path — works only on this machine;
# consider deriving it from a configurable base directory or pathlib.Path.
program_dir=r'C:\Users\lxp\jupyter_notebooks\kaggle\quora\quora-mining'
class Question(object):
    """Fetch a Zhihu question (plus its answers and similar questions) via the
    v4 API and persist the result under ``program_dir`` as JSON or pickle.

    Parameters
    ----------
    question_token : str or int
        The question id used by the API templates.
    """
    def __init__(self, question_token):
        self.question_token = question_token
        # Three network calls via get_questions (question, answers, similar).
        self.questions, self.answers, self.similar_questions = get_questions(self.question_token)

    def save_to_json(self):
        """Write the fetched data to db/json/zhihu/question/<token>.json
        (skipped when the file already exists). Sets self.json_path."""
        print(program_dir)
        # Build the path from components (portable) and create ALL missing
        # directories — the original os.mkdir fails on missing intermediates.
        question_dir = os.path.join(program_dir, 'db', 'json', 'zhihu', 'question')
        if not os.path.exists(question_dir):
            os.makedirs(question_dir)
        self.json_path = os.path.join(question_dir, str(self.question_token) + '.json')
        if not os.path.exists(self.json_path):
            # json.dump emits text, so a utf-8 codecs writer is appropriate;
            # the 'with' block closes the file — no explicit close() needed.
            with codecs.open(self.json_path, 'w', encoding='utf-8') as f:
                json.dump({'questions': self.questions,
                           'answers': self.answers,
                           'similar_questions': self.similar_questions}, f)

    def save_to_pickle(self):
        """Pickle this object to db/pickle/zhihu/question/<token>.pickle
        (skipped when the file already exists). Sets self.pickle_path."""
        question_dir = os.path.join(program_dir, 'db', 'pickle', 'zhihu', 'question')
        if not os.path.exists(question_dir):
            os.makedirs(question_dir)
        self.pickle_path = os.path.join(question_dir, str(self.question_token) + '.pickle')
        if not os.path.exists(self.pickle_path):
            # Pickle output is binary: use plain open(..., 'wb'). The original
            # codecs.open(..., encoding='utf-8') wraps the file in a text
            # encoder, which breaks on pickle's byte stream.
            with open(self.pickle_path, 'wb') as f:
                pickle.dump(self, f)
In [9]:
# Fetch question 20277085 via the API (three network calls).
q = Question(20277085)
In [10]:
# Persist the fetched data as JSON.
q.save_to_json()
In [11]:
q.json_path
Out[11]:
In [12]:
# Persist the whole object as a pickle.
q.save_to_pickle()
In [13]:
q.pickle_path
Out[13]:
In [14]:
q.questions
Out[14]:
In [16]:
# First answer record from the paginated answers result.
q.answers['items'][0]
Out[16]:
In [19]:
# Standalone call against the answers endpoint for the same question.
ans = get_data_from_zhihu_api(zhihu_api_questions_answers,20277085)
In [27]:
print ans['items'][0]['content']  # Python 2 print statement
In [ ]: