In [1]:
# import pymongo
from pymongo import MongoClient
import requests
import time
from datetime import datetime
from datetime import date
from datetime import timedelta
from nltk import ngrams
from itertools import groupby
from functools import reduce
In [2]:
def insertPostToCollection(posts, pageCollection):
    """Store Facebook posts in `pageCollection`, skipping duplicates.

    Only posts that carry a 'message' field are stored. A post is considered
    a duplicate when a document with {'type': 'post', 'id': <post id>}
    already exists in the collection.

    Parameters
    ----------
    posts : iterable of dict
        Graph API post objects; each used entry has 'id', 'message' and an
        ISO-8601 'created_time' (e.g. '2018-01-02T03:04:05+0000').
    pageCollection : pymongo collection-like object
        Target collection providing find_one / insert_one.
    """
    for post in posts:
        # Skip posts without text content (e.g. pure photo/share posts).
        if 'message' not in post:
            continue
        # Skip posts we have already stored.
        if pageCollection.find_one({'type': 'post', 'id': post['id']}):
            continue
        created = post['created_time']
        pageCollection.insert_one({
            'type': 'post',
            'id': post['id'],
            # 'date' is the calendar-day part of the ISO timestamp.
            'date': created.split('T')[0],
            'created_time': created,
            'message': post['message'],
        })
In [3]:
def updateInfoToCollection(mode, sinceTime, untilTime, pageCollection):
    """Persist the fetched time-window boundaries on the 'time' document.

    Modes (case-sensitive):
      'R' / 'Restart' / 'Reset' -- record BOTH boundaries (fresh window).
        NOTE: the fetch loop elsewhere in this notebook spells this mode
        'Reset' while this function originally only accepted 'Restart';
        both spellings are now accepted for consistency.
      'F' / 'Forward'  -- extend only endTimeFetched (moving toward today).
      'B' / 'Backward' -- extend only startTimeFetched (moving into the past).

    Parameters
    ----------
    mode : str
        One of the mode strings above; any other value is a silent no-op
        (matching the original behavior).
    sinceTime, untilTime : datetime.date
        Window boundaries; stored as ISO-8601 strings via .isoformat().
    pageCollection : pymongo collection-like object
        Collection holding the single {'type': 'time'} bookkeeping document.
    """
    if mode in ('R', 'Restart', 'Reset'):
        fields = {
            'startTimeFetched': sinceTime.isoformat(),
            'endTimeFetched': untilTime.isoformat(),
        }
    elif mode in ('F', 'Forward'):
        fields = {'endTimeFetched': untilTime.isoformat()}
    elif mode in ('B', 'Backward'):
        fields = {'startTimeFetched': sinceTime.isoformat()}
    else:
        return  # unknown mode: do nothing, as the original did
    pageCollection.update_one({'type': 'time'}, {'$set': fields})
In [4]:
# Connect to a MongoDB server on the default local host/port and select the
# `pages` database that caches fetched Facebook page posts.
# NOTE(review): assumes a mongod instance is already running locally.
client = MongoClient('localhost', 27017)
pagesDB = client.pages
In [5]:
# One MongoDB collection per Facebook page, keyed by page name or numeric id.
pageNameOrId = 'thairath'
pageCollection = pagesDB[pageNameOrId]
# Ensure the single bookkeeping document {'type': 'time'} exists; it tracks
# the boundaries of the date range fetched so far (None = nothing fetched).
if not pageCollection.find_one({'type': 'time'}):
    initData = {
        'type': 'time',
        'startTimeFetched': None,
        'endTimeFetched': None,
    }
    pageCollection.insert_one(initData)
pageInfo = pageCollection.find_one({'type': 'time'})
print(pageInfo)
In [17]:
import os

# Fetch direction: 'F'/'Forward' (toward today), 'B'/'Backward' (into the
# past), or 'R'/'Reset' (start over from the most recent day).
mode = 'F'
# Hard bounds of the crawl window.
maxDate = date.today()
minDate = date(2018, 1, 1)
# SECURITY: never hardcode credentials in a notebook — the previous revision
# embedded a long-lived Graph API token in this cell; that token should be
# revoked. Supply the token via the environment instead:
#   export FB_GRAPH_ACCESS_TOKEN=<your token>
ACCESS_TOKEN = os.environ.get('FB_GRAPH_ACCESS_TOKEN', '')
base_url = 'https://graph.facebook.com/v3.0/'
In [18]:
start_time = time.time()
# Crawl up to 100 one-day windows of posts, persisting progress in the
# {'type': 'time'} document after each completed day so the crawl can resume.
for dayLimit in range(100):
    pageInfo = pageCollection.find_one({'type': 'time'})
    startTimeFetched = pageInfo['startTimeFetched']
    endTimeFetched = pageInfo['endTimeFetched']
    if mode == 'R' or mode == 'Reset' or not startTimeFetched or not endTimeFetched:
        # First run (or explicit reset): fetch the most recent day.
        # BUG FIX: the original set mode to 'B' before recording progress, so
        # updateInfoToCollection only ever wrote startTimeFetched; with
        # endTimeFetched left as None this branch re-triggered every
        # iteration, re-fetching the same day 100 times. Track the mode used
        # for the progress update ('R' writes BOTH boundaries) separately
        # from the direction used on subsequent iterations.
        updateMode = 'R'
        mode = 'B'
        untilTime = maxDate
        sinceTime = untilTime + timedelta(days=-1)
    elif mode == 'F' or mode == 'Forward':
        # Move forward: continue from the last fetched day toward today.
        updateMode = mode
        sinceTime = datetime.strptime(endTimeFetched, '%Y-%m-%d').date()
        untilTime = sinceTime + timedelta(days=1)
        if untilTime > maxDate:
            untilTime = maxDate
    elif mode == 'B' or mode == 'Backward':
        # Move backward: continue from the earliest fetched day into the past.
        updateMode = mode
        untilTime = datetime.strptime(startTimeFetched, '%Y-%m-%d').date()
        sinceTime = untilTime + timedelta(days=-1)
        if sinceTime < minDate:
            sinceTime = minDate
    if sinceTime >= untilTime:
        # Clamping produced an empty window: the whole [minDate, maxDate]
        # range has been fetched in this direction — stop instead of issuing
        # pointless identical requests.
        print('Nothing left to fetch between', minDate, 'and', maxDate)
        break
    fields = 'id,name,posts.since(%s).until(%s){id,message,created_time}' % (sinceTime, untilTime)
    url = '%s?fields=%s&access_token=%s' % (base_url + pageNameOrId, fields, ACCESS_TOKEN)
    try:
        content = requests.get(url).json()
        if 'error' in content:
            print(url)
            print(content['error'])
            # Explicit exception: a bare `raise` here had no active exception
            # and produced an unrelated RuntimeError with no context.
            raise RuntimeError(content['error'])
        if 'posts' in content and 'data' in content['posts']:
            insertPostToCollection(content['posts']['data'], pageCollection)
            # Follow Graph API cursor pagination for the rest of the day.
            if 'paging' in content['posts'] and 'next' in content['posts']['paging']:
                content = content['posts']
                while True:
                    # Retry transient network failures up to 10 times.
                    for attempt in range(11):
                        try:
                            content = requests.get(content['paging']['next']).json()
                        except Exception as error:
                            if attempt == 10:
                                raise
                            print(attempt, type(error), error)
                            continue
                        else:
                            break
                    insertPostToCollection(content['data'], pageCollection)
                    if 'paging' not in content or 'next' not in content['paging']:
                        break
        # Record progress only after the whole day was fetched successfully.
        updateInfoToCollection(updateMode, sinceTime, untilTime, pageCollection)
        print('Update Time Completed', sinceTime, untilTime)
    except Exception as error:
        # Posts already inserted for this day are kept; the time document is
        # NOT advanced, so the failed day is retried on the next run.
        print('Have some error, save partial data')
        print(type(error), error)
        break
print("--- %s seconds ---" % (time.time() - start_time))
In [ ]: