In [1]:
# import pymongo
from pymongo import MongoClient
import requests
import time
from datetime import datetime
from datetime import date
from datetime import timedelta
from nltk import ngrams
from itertools import groupby
from functools import reduce

In [2]:
def insertPostToCollection(posts, pageCollection):
    for post in posts:
        if 'message' in post:
            oldPost = pageCollection.find_one({
                'type': 'post',
                'id': post['id']
            })
            if not oldPost:
                pageCollection.insert_one({
                    'type': 'post',
                    'id': post['id'],
                    'date': post['created_time'].split('T')[0],
                    'created_time': post['created_time'],
                    'message': post['message']
                })

In [3]:
def updateInfoToCollection(mode, sinceTime, untilTime, pageCollection):
    if mode == 'R' or mode == 'Restart':
        pageCollection.update_one({
            'type': 'time'
        }, {
            '$set': {
                'startTimeFetched': sinceTime.isoformat(),
                'endTimeFetched': untilTime.isoformat()
            }
        })
    elif mode == 'F' or mode == 'Forward':
        pageCollection.update_one({
            'type': 'time'
        }, {
            '$set': {
                'endTimeFetched': untilTime.isoformat()
            }
        })
    elif mode == 'B' or mode == 'Backward':
        pageCollection.update_one({
            'type': 'time'
        }, {
            '$set': {
                'startTimeFetched': sinceTime.isoformat()
            }
        })

In [4]:
client = MongoClient('localhost', 27017)
pagesDB = client.pages

In [5]:
pageNameOrId = 'thairath'
pageCollection = pagesDB[pageNameOrId]
if not pageCollection.find_one({'type': 'time'}):
    initData = {
        'type': 'time',
        'startTimeFetched': None,
        'endTimeFetched': None,
    }
    pageCollection.insert_one(initData)
pageInfo = pageCollection.find_one({'type': 'time'})
print(pageInfo)


{'_id': ObjectId('5b335f796078bd084c78d551'), 'type': 'time', 'startTimeFetched': '2018-01-01', 'endTimeFetched': '2018-06-27'}

In [17]:
mode = 'F'
maxDate = date.today()
minDate = date(2018, 1, 1)
ACCESS_TOKEN = 'EAAbMtZBGZCKw4BAC8ZBKUvOCE7PiiJQ0ccw4bmb7TBQl0hznY5ySbiy5uTmZAEUVfkxx29P8Cq4LIS2hD5jJyQBlwNzrqEA9eFaJZB51JBbWp6EoCKNpCVWM8hVDmoPB4fogRfWmb6h47iAiabioKo31QmPlMzCh30yLwiZBB4sCM2cJdK7ZBftZBfmrjzzeBTf3ABMJucn2s3ut70bTWgQn'
base_url = 'https://graph.facebook.com/v3.0/'

In [18]:
start_time = time.time()
for dayLimit in range(100):
    pageInfo = pageCollection.find_one({'type': 'time'})
    startTimeFetched = pageInfo['startTimeFetched']
    endTimeFetched = pageInfo['endTimeFetched']

    if mode == 'R' or mode == 'Reset' or not startTimeFetched or not endTimeFetched:
        mode = 'B'
        untilTime = maxDate
        sinceTime = untilTime + timedelta(days=-1)

    elif mode == 'F' or mode == 'Forward':
        sinceTime = datetime.strptime(endTimeFetched, '%Y-%m-%d').date()
        untilTime = sinceTime + timedelta(days=1)
        if untilTime > maxDate:
            untilTime = maxDate

    elif mode == 'B' or mode == 'Backward':
        untilTime = datetime.strptime(startTimeFetched, '%Y-%m-%d').date()
        sinceTime = untilTime + timedelta(days=-1)
        if sinceTime < minDate:
            sinceTime = minDate
            
    fields = 'id,name,posts.since(%s).until(%s){id,message,created_time}' % (sinceTime, untilTime)
    url = '%s?fields=%s&access_token=%s' % (base_url + pageNameOrId, fields, ACCESS_TOKEN)

    try:
        content = requests.get(url).json()
        if 'error' in content:
            print(url)
            print(content['error'])
            raise
        if 'posts' in content and 'data' in content['posts']:
            insertPostToCollection(content['posts']['data'], pageCollection)
            if 'paging' in content['posts'] and 'next' in content['posts']['paging']:
                content = content['posts']
                while True:
                    for attempt in range(11):
                        try:
                            content = requests.get(content['paging']['next']).json()
                        except Exception as error:
                            if attempt == 10:
                                raise
                            print(attempt, type(error), error)
                            continue
                        else:
                            break
                    insertPostToCollection(content['data'], pageCollection)
                    if 'paging' not in content or 'next' not in content['paging']:
                        break
                    
        updateInfoToCollection(mode, sinceTime, untilTime, pageCollection)
        print('Update Time Completed', sinceTime, untilTime)

    except Exception as error:
        print('Have some error, save partial data')
        print(type(error), error)
        break

print("--- %s seconds ---" % (time.time() - start_time))


https://graph.facebook.com/v3.0/thairath?fields=id,name,posts.since(2018-06-27).until(2018-06-28){id,message,created_time}&access_token=EAAbMtZBGZCKw4BAC8ZBKUvOCE7PiiJQ0ccw4bmb7TBQl0hznY5ySbiy5uTmZAEUVfkxx29P8Cq4LIS2hD5jJyQBlwNzrqEA9eFaJZB51JBbWp6EoCKNpCVWM8hVDmoPB4fogRfWmb6h47iAiabioKo31QmPlMzCh30yLwiZBB4sCM2cJdK7ZBftZBfmrjzzeBTf3ABMJucn2s3ut70bTWgQn
{'message': "(#10) To use 'Page Public Content Access', your use of this endpoint must be reviewed and approved by Facebook. To submit this 'Page Public Content Access' feature for review please read our documentation on reviewable features: https://developers.facebook.com/docs/apps/review.", 'type': 'OAuthException', 'code': 10, 'fbtrace_id': 'AoBfsZaZEvZ'}
Have some error, save partial data
<class 'RuntimeError'> No active exception to reraise
--- 0.3998837471008301 seconds ---

In [ ]: