In [1]:
import requests
import re
import urllib
import html
import json
from bs4 import BeautifulSoup
from pymongo import MongoClient
from datetime import datetime
from datetime import date
from datetime import timedelta
import pytz

In [5]:
def updateInfoToCollection(sinceTime, untilTime, pageCollection):
    """Persist the fetched time window onto the page's 'time' metadata doc.

    Overwrites startTimeFetched / endTimeFetched on the single document
    with type == 'time' in pageCollection.
    """
    windowFields = {
        'startTimeFetched': sinceTime,
        'endTimeFetched': untilTime
    }
    pageCollection.update_one({'type': 'time'}, {'$set': windowFields})
#     print('updated time info')

In [6]:
def updateUrlToCollection(result, collection=None):
    """Save the latest pagination URL onto the page's 'url' metadata doc.

    Uses a single upsert instead of the previous find/insert/update
    sequence, which was non-atomic (a concurrent writer could insert
    between the find_one and insert_one).

    Parameters:
        result: pagination URL string to store under the 'result' field.
        collection: target pymongo collection. Defaults to the notebook's
            module-level `pageCollection` so existing one-argument callers
            keep working (the other helpers here take the collection
            explicitly; this parameter brings the function in line).
    """
    target = collection if collection is not None else pageCollection
    # upsert=True creates {'type': 'url', 'result': result} when missing,
    # matching the old insert branch's document shape.
    target.update_one({'type': 'url'}, {'$set': {'result': result}}, upsert=True)

In [7]:
def insertIntoDatabase(pid, ptime, pmessage, pageCollection):
    """Insert one scraped post into pageCollection if not already stored.

    Parameters:
        pid: post id string ('<pageid>_<postid>').
        ptime: post creation time as a Unix epoch timestamp (int).
        pmessage: post text content.
        pageCollection: pymongo collection for this page.

    Returns:
        1 if a new document was inserted, 0 if a post with this id exists.
    """
    tzThai = pytz.timezone('Asia/Bangkok')
    oldPost = pageCollection.find_one({
        'type': 'post',
        'id': pid
    })
    if oldPost:
        # Duplicate post ids arrive often because pagination windows overlap.
        return 0
    # Convert the epoch timestamp once and reuse it for both derived fields
    # (the original converted twice).
    postedAt = datetime.fromtimestamp(ptime, tzThai)
    pageCollection.insert_one({
        'type': 'post',
        'id': pid,
        'created_time': ptime,
        'date': postedAt.date().isoformat(),
        'time': postedAt.time().isoformat(),
        'message': pmessage
    })
    return 1

In [8]:
def savePostToDatabase(content, pageCollection):
    """Parse a chunk of Facebook page HTML and store every post found.

    Extracts (id, timestamp, message) from each user-content wrapper div,
    inserts new posts via insertIntoDatabase, and widens the stored
    [startTimeFetched, endTimeFetched] window to cover them.

    Parameters:
        content: HTML fragment (already html.unescape'd by the caller).
        pageCollection: pymongo collection for this page.

    Returns:
        Number of newly inserted posts in this chunk.
    """
    pageInfo = pageCollection.find_one({'type': 'time'})
    sinceTime = pageInfo['startTimeFetched']
    untilTime = pageInfo['endTimeFetched']

    soup = BeautifulSoup(content, 'html.parser')
    # NOTE(review): these CSS class names are tied to a specific Facebook
    # DOM layout and silently match nothing if the markup changes.
    postWrappers = soup.find_all('div', {'class': '_5pcr userContentWrapper'})

    insertCount = 0
    # Renamed the loop variable: the original reused `div` for the outer
    # wrapper loop AND the inner message loop, clobbering the outer value.
    for wrapper in postWrappers:
        try:
            idDivs = wrapper.find_all('div', {'class': '_5pcp _5lel _2jyu _232_'})
            timeSpans = wrapper.find_all('span', {'class': 'fsm fwn fcg'})
            textDivs = wrapper.find_all('div', {'class': '_5pbx userContent _3576'})
            # id attribute embeds '_<pageid>;<postid>;'; normalize to '<pageid>_<postid>'.
            postIds = [re.search(r'_[0-9]+;[0-9]+;', d['id']).group()[1:-1].replace(';', '_') for d in idDivs]
            postTimes = [int(s.abbr['data-utime']) for s in timeSpans]
            postMessages = []
            for textDiv in textDivs:
                # Strip "see more" UI artifacts before extracting the text.
                for span in textDiv.find_all('span', {'class': 'text_exposed_hide'}):
                    span.decompose()
                for span in textDiv.find_all('span', {'class': 'see_more_link_inner'}):
                    span.decompose()
                postMessages.append(textDiv.get_text())
        except (AttributeError, KeyError, TypeError, ValueError):
            # Malformed/unexpected markup for this wrapper (failed regex
            # match, missing attribute, non-numeric timestamp) — skip it.
            # Was a bare `except:`, which also hid programming errors.
            continue

        # Only trust wrappers that yielded exactly one id/time/message triple.
        if len(postIds) == 1 and len(postTimes) == 1 and len(postMessages) == 1:
            insertCount += insertIntoDatabase(postIds[0], postTimes[0], postMessages[0], pageCollection)
            # Widen the fetched window (None means "not set yet").
            if not sinceTime or sinceTime > postTimes[0]:
                sinceTime = postTimes[0]
            if not untilTime or untilTime < postTimes[0]:
                untilTime = postTimes[0]

    updateInfoToCollection(sinceTime, untilTime, pageCollection)
    return insertCount

In [2]:
# Connect to the local MongoDB instance; each scraped page gets its own
# collection inside the 'pages' database.
client = MongoClient('localhost', 27017)
pagesDB = client.pages

In [4]:
# Scratch cell (kept disabled): date-range bounds for limiting a crawl to a
# time window — not used by the current flow below.
# NOTE(review): passing a pytz tzinfo directly to the datetime() constructor
# yields LMT offsets; pytz recommends tz.localize() instead if this is revived.
# mode = 'R'
# tzThai = pytz.timezone('Asia/Bangkok')
# maxDate = int(datetime.now(tzThai).timestamp())
# minDate = int(datetime(2018, 1, 1, 0, 0, 0, 0, tzThai).timestamp())
# print(maxDate, minDate)

In [37]:
# Target page slug; the Mongo collection for this page is named after it.
pageNameOrId = 'longtunman'
pageCollection = pagesDB[pageNameOrId]

# Ensure the singleton 'time' metadata document exists (tracks the
# [startTimeFetched, endTimeFetched] window of posts fetched so far).
if not pageCollection.find_one({'type': 'time'}):
    initData = {
        'type': 'time',
        'startTimeFetched': None,
        'endTimeFetched': None
    }
    pageCollection.insert_one(initData)
pageInfo = pageCollection.find_one({'type': 'time'})

# Ensure the singleton 'url' document exists (stores the last pagination
# URL so a crawl can be resumed with mode 'C' in the next cell).
if not pageCollection.find_one({'type': 'url'}):
    initData = {
        'type': 'url',
        'result': None
    }
    pageCollection.insert_one(initData)
urlInfo = pageCollection.find_one({'type': 'url'})

print(pageNameOrId)
print(pageInfo)
print(urlInfo)


longtunman
{'_id': ObjectId('5b3e02e56078bd34ace6ff13'), 'type': 'time', 'startTimeFetched': None, 'endTimeFetched': None}
{'_id': ObjectId('5b3e02e56078bd34ace6ff14'), 'type': 'url', 'result': None}

In [38]:
insertCount = 0
mode = 'R'
if mode == 'R': #Restart
    source_url = 'https://www.facebook.com/pg/%s/posts/' % pageNameOrId
    content = html.unescape(requests.get(source_url).text)
    insertCount += savePostToDatabase(content, pageCollection)
    result = re.search(r'\/pages_reaction_units.*?ดูเพิ่มเติม', content).group()
    result = re.search(r'\/pages_reaction_units.*?unit_count=8', result).group()
    updateUrlToCollection(result)
    
elif mode == 'C': #Continue
    result = urlInfo['result']
    
print('insert posts =', insertCount)


insert posts = 18

In [39]:
# Follow Facebook's AJAX pagination until no "See more" link remains.
while True:
    try:
        next_url = 'https://www.facebook.com%s&__a=1' % result
        # The response starts with a 'for (;;);'-style anti-JSON-hijacking
        # prefix; the first 9 characters are stripped before parsing.
        next_content = json.loads(requests.get(next_url).text[9:])
        content = html.unescape(next_content['domops'][0][3]['__html'])
        insertCount += savePostToDatabase(content, pageCollection)
        try:
            # The Thai text is the "See more" link label; its absence means
            # the oldest post has been reached.
            next_result = re.search(r'"\/pages_reaction_units.*?ดูเพิ่มเติม', content).group()
        except:
            print('end')
            break
        result = re.search(r'\/pages_reaction_units.*?unit_count=8', next_result).group()
        updateUrlToCollection(result)
        print('insert posts =', insertCount)
    except:
        # NOTE(review): bare except with no break means a persistent failure
        # (network down, layout change) retries forever, re-printing the same
        # URL. Consider a retry cap and/or a sleep between attempts.
        updateUrlToCollection(result)
        print(result)


insert posts = 25
insert posts = 33
insert posts = 40
insert posts = 48
insert posts = 56
insert posts = 63
insert posts = 69
insert posts = 77
insert posts = 85
insert posts = 92
insert posts = 98
insert posts = 106
insert posts = 114
insert posts = 122
insert posts = 129
insert posts = 137
insert posts = 145
insert posts = 152
insert posts = 160
insert posts = 167
insert posts = 175
insert posts = 183
insert posts = 190
insert posts = 198
insert posts = 205
insert posts = 213
insert posts = 221
insert posts = 229
insert posts = 236
insert posts = 244
insert posts = 251
insert posts = 259
insert posts = 267
insert posts = 275
insert posts = 280
insert posts = 288
insert posts = 296
insert posts = 304
insert posts = 312
insert posts = 320
insert posts = 328
insert posts = 336
insert posts = 344
insert posts = 352
insert posts = 360
insert posts = 368
insert posts = 376
insert posts = 384
insert posts = 392
insert posts = 399
insert posts = 407
insert posts = 415
insert posts = 423
insert posts = 431
insert posts = 439
insert posts = 446
insert posts = 454
insert posts = 462
insert posts = 470
insert posts = 478
insert posts = 486
insert posts = 494
insert posts = 502
insert posts = 510
insert posts = 518
insert posts = 526
insert posts = 534
insert posts = 542
insert posts = 550
insert posts = 558
insert posts = 565
insert posts = 573
insert posts = 581
insert posts = 589
insert posts = 595
insert posts = 601
insert posts = 608
insert posts = 615
insert posts = 623
insert posts = 630
insert posts = 638
insert posts = 646
insert posts = 654
insert posts = 662
insert posts = 670
insert posts = 677
insert posts = 684
insert posts = 692
insert posts = 699
insert posts = 707
insert posts = 715
insert posts = 723
insert posts = 731
insert posts = 739
insert posts = 747
insert posts = 755
insert posts = 763
insert posts = 771
insert posts = 779
insert posts = 787
insert posts = 795
insert posts = 803
/pages_reaction_units/more/?page_id=113397052526245&cursor=%7B%22timeline_cursor%22%3A%22timeline_unit%3A1%3A00000000001507510800%3A04611686018427387904%3A09223372036854775597%3A04611686018427387904%22%2C%22timeline_section_cursor%22%3A%7B%22profile_id%22%3A113397052526245%2C%22start%22%3A1514793600%2C%22end%22%3A1546329599%2C%22query_type%22%3A8%2C%22filter%22%3A1%2C%22filter_after_timestamp%22%3A1525261740%7D%2C%22has_next_page%22%3Atrue%7D&surface=www_pages_posts&unit_count=8
insert posts = 811
insert posts = 819
insert posts = 827
insert posts = 835
insert posts = 843
insert posts = 851
insert posts = 858
insert posts = 866
insert posts = 873
insert posts = 881
insert posts = 889
insert posts = 897
insert posts = 904
insert posts = 912
insert posts = 920
insert posts = 928
insert posts = 936
insert posts = 942
insert posts = 948
insert posts = 956
insert posts = 964
insert posts = 971
insert posts = 979
insert posts = 986
insert posts = 994
insert posts = 1001
insert posts = 1008
insert posts = 1016
insert posts = 1024
insert posts = 1032
insert posts = 1040
insert posts = 1048
insert posts = 1056
insert posts = 1064
insert posts = 1072
insert posts = 1080
insert posts = 1086
insert posts = 1094
insert posts = 1101
insert posts = 1109
insert posts = 1113
end

In [ ]: