In [1]:
import requests
import re
import html
import json
from bs4 import BeautifulSoup
In [2]:
def getPostInfosFromContent(content):
    """Extract post engagement info from the JSON blobs embedded in a
    Facebook page's HTML source.

    Each embedded blob looks like ``{"result":{"data":...},"variables":...}``;
    the regex captures from ``{"result":{"data`` up to ``,"variables`` and the
    slice below trims the wrapper so only the inner ``{"data":...}`` JSON
    remains.

    Parameters
    ----------
    content : str
        Raw (HTML-unescaped) page source or AJAX response text.

    Returns
    -------
    dict
        Maps ``'<actorId>_<postId>'`` to ``{'like', 'share', 'comment',
        'commentsHighlight'}``, where ``commentsHighlight`` is a list of
        ``{'id', 'time', 'message', 'like'}`` dicts for displayed comments.
        Blobs that do not match the expected schema are skipped.
    """
    jsonDataArray = re.findall(r'\{\"result\"\:\{\"data.*?,\"variables', content)
    # ex. {"result": [get this result], "variables": [ignored]}
    posts = {}
    for jsonData in jsonDataArray:
        # Strip the leading '{"result":' (10 chars) and the trailing
        # ',"variables' (11 chars) so the remainder parses as standalone JSON.
        jsonData = jsonData[10:-11]
        try:
            data = json.loads(jsonData)['data']['feedback']
            commentsInfo = data['display_comments']['edges']
            comments = []
            for comment in commentsInfo:
                comment = comment['node']
                # body is None for comments with no text (e.g. sticker-only).
                if comment['body'] is not None:
                    comments.append({
                        'id': comment['id'],
                        'time': comment['created_time'],
                        # 'author': comment['author'],  # deep information of user
                        'message': comment['body']['text'],
                        'like': comment['feedback']['reactors']['count']
                    })
            actorId = data['owning_profile']['id']
            postId = data['share_fbid']
            posts['{}_{}'.format(actorId, postId)] = {
                'like': data['reaction_count']['count'],
                'share': data['share_count']['count'],
                'comment': data['display_comments_count']['count'],
                'commentsHighlight': comments
            }
        except (json.JSONDecodeError, KeyError, TypeError):
            # Blob is not a post-feedback payload (or schema changed) — skip it
            # rather than abort the whole scrape. (Was a bare `except:`, which
            # also swallowed KeyboardInterrupt/SystemExit.)
            continue
    return posts
In [3]:
def getPostsFromContent(content, postInfos):
    """Parse post wrappers out of the page HTML and join them with the
    engagement info previously extracted by ``getPostInfosFromContent``.

    Parameters
    ----------
    content : str
        Raw (HTML-unescaped) page source.
    postInfos : dict
        Output of ``getPostInfosFromContent``, keyed by ``'<actorId>_<postId>'``.

    Returns
    -------
    list of dict
        One entry per post: ``id``, ``time`` (unix timestamp), ``link``,
        ``message``, ``like``, ``share``, ``comment``, ``commentsHighlight``.
        Posts whose markup or engagement info is missing are skipped.
    """
    soup = BeautifulSoup(content, 'html.parser')
    # NOTE(review): these CSS class names are Facebook-internal and change
    # over time; the scraper degrades silently (empty results) when they do.
    divs = soup.find_all('div', {'class': '_5pcr userContentWrapper'})
    posts = []
    for div in divs:
        try:
            posts_id = div.find_all('div', {'class': '_5pcp _5lel _2jyu _232_'})
            posts_time = div.find_all('span', {'class': 'fsm fwn fcg'})
            posts_link = div.find_all('a', {'class': '_5pcq'})
            posts_text = div.find_all('div', {'class': '_5pbx userContent _3576'})
            # The wrapper id embeds '_<actorId>;<postId>;'; normalize it to the
            # '<actorId>_<postId>' key format used by postInfos. (Renamed the
            # comprehension variable: the original shadowed the outer `div`.)
            postIds = [re.search(r'_[0-9]+;[0-9]+;', idDiv['id']).group()[1:-1].replace(';', '_')
                       for idDiv in posts_id]
            postTimes = [int(span.abbr['data-utime']) for span in posts_time]
            postLinks = ['https://www.facebook.com' + a['href'] for a in posts_link]
            postMessages = []
            for textDiv in posts_text:
                # Remove the "see more" truncation helpers so get_text()
                # returns the full, clean message.
                for span in textDiv.find_all('span', {'class': 'text_exposed_hide'}):
                    span.decompose()
                for span in textDiv.find_all('span', {'class': 'see_more_link_inner'}):
                    span.decompose()
                postMessages.append(textDiv.get_text())
            posts.append({
                'id': postIds[0],
                'time': postTimes[0],
                'link': postLinks[0],
                'message': postMessages[0],
                'like': postInfos[postIds[0]]['like'],
                'share': postInfos[postIds[0]]['share'],
                'comment': postInfos[postIds[0]]['comment'],
                'commentsHighlight': postInfos[postIds[0]]['commentsHighlight']
            })
        except (AttributeError, IndexError, KeyError, TypeError):
            # Markup didn't match (class renamed, regex miss, missing abbr tag,
            # or no engagement entry for this post) — skip it. (Was a bare
            # `except:`, which also swallowed KeyboardInterrupt/SystemExit.)
            continue
    return posts
In [4]:
# --- Cell 4: fetch the first batch of posts from the page's /posts/ feed ---
pageNameOrId = 'CH3Thailand' # <====== change the page name here (a numeric page id such as 638480949626833 also works)
source_url = 'https://www.facebook.com/pg/%s/posts/' % pageNameOrId
# Unescape HTML entities so the embedded JSON blobs parse cleanly downstream.
content = html.unescape(requests.get(source_url).text)
postInfos = getPostInfosFromContent(content)
posts = getPostsFromContent(content, postInfos)
# Pagination endpoint for the next batch of posts, consumed by the next cell.
# NOTE(review): .group() raises AttributeError if the marker is absent
# (e.g. Facebook changed the page layout) — TODO confirm/handle.
urlResult = re.search(r'\/pages_reaction_units.*?unit_count=8', content).group()
print(len(posts), posts[-1])
In [5]:
# --- Cell 5: fetch additional batches of posts via the AJAX endpoint ---
# Each iteration requests one more batch; raise range(2) to crawl deeper.
for i in range(2):
    try:
        # '__a=1' asks Facebook for a JSON response instead of a full page.
        next_url = 'https://www.facebook.com%s&__a=1' % urlResult
        # The response is prefixed with the 9-character anti-hijacking guard
        # 'for (;;);' — strip it before parsing.
        tmp_content = requests.get(next_url).text[9:]
        next_content = json.loads(tmp_content)
        # The rendered post HTML lives inside the 'domops' payload.
        content = html.unescape(next_content['domops'][0][3]['__html'])
        postInfos = getPostInfosFromContent(tmp_content)
        next_posts = getPostsFromContent(content, postInfos)
        posts += next_posts
        try:
            # Cursor for the following batch; absence means the last page
            # has been reached.
            urlResult = re.search(r'\/pages_reaction_units.*?unit_count=8', content).group()
        except:
            print('end')
            break
    except:
        # Best-effort scrape: report the failing URL and try the next batch.
        print(next_url)
        continue
print(len(posts), posts[-1])
In [ ]: