In [1]:
import requests
import re
import html
import json
from bs4 import BeautifulSoup
In [2]:
def getPostInfosFromContent(content):
    """Extract post engagement info from the JSON blobs embedded in a
    Facebook page's HTML source.

    Each embedded blob looks like ``{"result":{"data":...},"variables":...}``;
    the regex captures from ``{"result":{"data`` up to ``,"variables`` and the
    slice below trims the wrapper so only the inner ``{"data":...}`` JSON
    remains.

    Parameters
    ----------
    content : str
        Raw (HTML-unescaped) page source or AJAX response text.

    Returns
    -------
    dict
        Maps ``'<actorId>_<postId>'`` to ``{'like', 'share', 'comment',
        'commentsHighlight'}``, where ``commentsHighlight`` is a list of
        ``{'id', 'time', 'message', 'like'}`` dicts for displayed comments.
        Blobs that do not match the expected schema are skipped.
    """
    jsonDataArray = re.findall(r'\{\"result\"\:\{\"data.*?,\"variables', content)
    # ex. {"result": [get this result], "variables": [ignored]}
    posts = {}
    for jsonData in jsonDataArray:
        # Strip the leading '{"result":' (10 chars) and the trailing
        # ',"variables' (11 chars) so the remainder parses as standalone JSON.
        jsonData = jsonData[10:-11]
        try:
            data = json.loads(jsonData)['data']['feedback']
            commentsInfo = data['display_comments']['edges']
            comments = []
            for comment in commentsInfo:
                comment = comment['node']
                # body is None for comments with no text (e.g. sticker-only).
                if comment['body'] is not None:
                    comments.append({
                        'id': comment['id'],
                        'time': comment['created_time'],
                        # 'author': comment['author'],  # deep information of user
                        'message': comment['body']['text'],
                        'like': comment['feedback']['reactors']['count']
                    })
            actorId = data['owning_profile']['id']
            postId = data['share_fbid']
            posts['{}_{}'.format(actorId, postId)] = {
                'like': data['reaction_count']['count'],
                'share': data['share_count']['count'],
                'comment': data['display_comments_count']['count'],
                'commentsHighlight': comments
            }
        except (json.JSONDecodeError, KeyError, TypeError):
            # Blob is not a post-feedback payload (or schema changed) — skip it
            # rather than abort the whole scrape. (Was a bare `except:`, which
            # also swallowed KeyboardInterrupt/SystemExit.)
            continue
    return posts
In [3]:
def getPostsFromContent(content, postInfos):
    """Parse post wrappers out of the page HTML and join them with the
    engagement info previously extracted by ``getPostInfosFromContent``.

    Parameters
    ----------
    content : str
        Raw (HTML-unescaped) page source.
    postInfos : dict
        Output of ``getPostInfosFromContent``, keyed by ``'<actorId>_<postId>'``.

    Returns
    -------
    list of dict
        One entry per post: ``id``, ``time`` (unix timestamp), ``link``,
        ``message``, ``like``, ``share``, ``comment``, ``commentsHighlight``.
        Posts whose markup or engagement info is missing are skipped.
    """
    soup = BeautifulSoup(content, 'html.parser')
    # NOTE(review): these CSS class names are Facebook-internal and change
    # over time; the scraper degrades silently (empty results) when they do.
    divs = soup.find_all('div', {'class': '_5pcr userContentWrapper'})
    posts = []
    for div in divs:
        try:
            posts_id = div.find_all('div', {'class': '_5pcp _5lel _2jyu _232_'})
            posts_time = div.find_all('span', {'class': 'fsm fwn fcg'})
            posts_link = div.find_all('a', {'class': '_5pcq'})
            posts_text = div.find_all('div', {'class': '_5pbx userContent _3576'})
            # The wrapper id embeds '_<actorId>;<postId>;'; normalize it to the
            # '<actorId>_<postId>' key format used by postInfos. (Renamed the
            # comprehension variable: the original shadowed the outer `div`.)
            postIds = [re.search(r'_[0-9]+;[0-9]+;', idDiv['id']).group()[1:-1].replace(';', '_')
                       for idDiv in posts_id]
            postTimes = [int(span.abbr['data-utime']) for span in posts_time]
            postLinks = ['https://www.facebook.com' + a['href'] for a in posts_link]
            postMessages = []
            for textDiv in posts_text:
                # Remove the "see more" truncation helpers so get_text()
                # returns the full, clean message.
                for span in textDiv.find_all('span', {'class': 'text_exposed_hide'}):
                    span.decompose()
                for span in textDiv.find_all('span', {'class': 'see_more_link_inner'}):
                    span.decompose()
                postMessages.append(textDiv.get_text())
            posts.append({
                'id': postIds[0],
                'time': postTimes[0],
                'link': postLinks[0],
                'message': postMessages[0],
                'like': postInfos[postIds[0]]['like'],
                'share': postInfos[postIds[0]]['share'],
                'comment': postInfos[postIds[0]]['comment'],
                'commentsHighlight': postInfos[postIds[0]]['commentsHighlight']
            })
        except (AttributeError, IndexError, KeyError, TypeError):
            # Markup didn't match (class renamed, regex miss, missing abbr tag,
            # or no engagement entry for this post) — skip it. (Was a bare
            # `except:`, which also swallowed KeyboardInterrupt/SystemExit.)
            continue
    return posts
In [4]:
# --- Cell 4: fetch the first batch of posts from the page's /posts/ feed ---
pageNameOrId = 'CH3Thailand' # <====== change the page name here (a numeric page id such as 638480949626833 also works)
source_url = 'https://www.facebook.com/pg/%s/posts/' % pageNameOrId
# Unescape HTML entities so the embedded JSON blobs parse cleanly downstream.
content = html.unescape(requests.get(source_url).text)
postInfos = getPostInfosFromContent(content)
posts = getPostsFromContent(content, postInfos)
# Pagination endpoint for the next batch of posts, consumed by the next cell.
# NOTE(review): .group() raises AttributeError if the marker is absent
# (e.g. Facebook changed the page layout) — TODO confirm/handle.
urlResult = re.search(r'\/pages_reaction_units.*?unit_count=8', content).group()
print(len(posts), posts[-1])
In [5]:
# --- Cell 5: fetch additional batches of posts via the AJAX endpoint ---
# Each iteration requests one more batch; raise range(2) to crawl deeper.
for i in range(2):
    try:
        # '__a=1' asks Facebook for a JSON response instead of a full page.
        next_url = 'https://www.facebook.com%s&__a=1' % urlResult
        # The response is prefixed with the 9-character anti-hijacking guard
        # 'for (;;);' — strip it before parsing.
        tmp_content = requests.get(next_url).text[9:]
        next_content = json.loads(tmp_content)
        # The rendered post HTML lives inside the 'domops' payload.
        content = html.unescape(next_content['domops'][0][3]['__html'])
        postInfos = getPostInfosFromContent(tmp_content)
        next_posts = getPostsFromContent(content, postInfos)
        posts += next_posts
        try:
            # Cursor for the following batch; absence means the last page
            # has been reached.
            urlResult = re.search(r'\/pages_reaction_units.*?unit_count=8', content).group()
        except:
            print('end')
            break
    except:
        # Best-effort scrape: report the failing URL and try the next batch.
        print(next_url)
        continue
print(len(posts), posts[-1])
In [ ]: