In [1]:
import html
import json
import re
import urllib
from datetime import date, datetime, timedelta, timezone

import pytz
import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient
In [5]:
def updateInfoToCollection(sinceTime, untilTime, pageCollection):
    """Persist the fetched time window on the collection's 'time' document.

    sinceTime/untilTime: Unix timestamps (or None) bounding the posts
    fetched so far; written to startTimeFetched/endTimeFetched.
    """
    timeFilter = {'type': 'time'}
    timeUpdate = {
        '$set': {
            'startTimeFetched': sinceTime,
            'endTimeFetched': untilTime
        }
    }
    pageCollection.update_one(timeFilter, timeUpdate)
In [6]:
def updateUrlToCollection(result, collection=None):
    """Upsert the saved pagination URL into the 'url' document.

    result: the pages_reaction_units path used to fetch the next page.
    collection: target MongoDB collection. Defaults to the module-level
        ``pageCollection`` global for backward compatibility with existing
        callers (every other helper in this notebook takes the collection
        explicitly; prefer passing it).
    """
    if collection is None:
        collection = pageCollection  # legacy behavior: notebook-global collection
    # upsert=True replaces the original find_one-then-insert/update pair with
    # a single atomic operation — no race and one fewer round trip.
    collection.update_one(
        {'type': 'url'},
        {'$set': {'result': result}},
        upsert=True
    )
In [7]:
def insertIntoDatabase(pid, ptime, pmessage, pageCollection):
    """Insert a post document if one with this id does not already exist.

    pid: normalized post id ("pageid_postid").
    ptime: post creation time as a Unix timestamp.
    pmessage: plain-text post message.
    pageCollection: MongoDB collection for this page.
    Returns 1 if a new document was inserted, 0 if the post was already stored.
    """
    # Asia/Bangkok is a fixed UTC+7 offset with no DST, so the stdlib
    # timezone suffices — no need for the pytz dependency here.
    tzThai = timezone(timedelta(hours=7))
    oldPost = pageCollection.find_one({
        'type': 'post',
        'id': pid
    })
    if oldPost:
        return 0
    # Convert once; the original recomputed fromtimestamp for date and time.
    localTime = datetime.fromtimestamp(ptime, tzThai)
    pageCollection.insert_one({
        'type': 'post',
        'id': pid,
        'created_time': ptime,
        'date': localTime.date().isoformat(),
        'time': localTime.time().isoformat(),
        'message': pmessage
    })
    return 1
In [8]:
def savePostToDatabase(content, pageCollection):
    """Parse rendered Facebook page HTML and store every post found.

    content: HTML fragment containing '_5pcr userContentWrapper' post divs.
    pageCollection: MongoDB collection holding this page's posts plus the
        'time' bookkeeping document (startTimeFetched/endTimeFetched).
    Returns the number of newly inserted posts; also widens the stored
    fetched-time window to cover every post seen in this chunk.
    """
    pageInfo = pageCollection.find_one({'type': 'time'})
    sinceTime = pageInfo['startTimeFetched']  # oldest post time seen so far (or None)
    untilTime = pageInfo['endTimeFetched']    # newest post time seen so far (or None)
    soup = BeautifulSoup(content, 'html.parser')
    # Each wrapper div is one post in Facebook's obfuscated class scheme;
    # all class selectors below are brittle against Facebook markup changes.
    divs = soup.find_all('div', {'class': '_5pcr userContentWrapper'})
    insertCount = 0
    for div in divs:
        try:
            posts_id = div.find_all('div', {'class': '_5pcp _5lel _2jyu _232_'})
            posts_time = div.find_all('span', {'class': 'fsm fwn fcg'})
            posts_text = div.find_all('div', {'class': '_5pbx userContent _3576'})
            # The id attribute embeds "_<num>;<num>;" — normalize to "num_num".
            # NOTE(review): the comprehension variable shadows the outer `div`
            # (harmless under Python 3 comprehension scoping, but easy to misread).
            postIds = [re.search(r'_[0-9]+;[0-9]+;', div['id']).group()[1:-1].replace(';', '_') for div in posts_id]
            # data-utime carries the post's Unix timestamp.
            postTimes = [int(span.abbr['data-utime']) for span in posts_time]
            postMessages = []
            for div in posts_text:  # reuses `div` as the message container
                # Remove truncation helpers so get_text() yields the full message.
                for span in div.find_all('span', {'class': 'text_exposed_hide'}):
                    span.decompose()
                for span in div.find_all('span', {'class': 'see_more_link_inner'}):
                    span.decompose()
                postMessages.append(div.get_text())
        except:
            # NOTE(review): bare except silently skips malformed posts; consider
            # narrowing (AttributeError/KeyError/TypeError) and logging the div.
            continue
        # Only store unambiguous posts: exactly one id, one time, one message.
        if len(postIds) == 1 and len(postTimes) == 1 and len(postMessages) == 1:
            insertCount += insertIntoDatabase(postIds[0], postTimes[0], postMessages[0], pageCollection)
            # Widen the recorded fetch window to include this post.
            if not sinceTime or sinceTime > postTimes[0]:
                sinceTime = postTimes[0]
            if not untilTime or untilTime < postTimes[0]:
                untilTime = postTimes[0]
    updateInfoToCollection(sinceTime, untilTime, pageCollection)
    return insertCount
In [2]:
# Connect to the local MongoDB instance; each scraped page gets its own
# collection inside the `pages` database.
client = MongoClient('localhost', 27017)
pagesDB = client.pages
In [4]:
# mode = 'R'
# tzThai = pytz.timezone('Asia/Bangkok')
# maxDate = int(datetime.now(tzThai).timestamp())
# minDate = int(datetime(2018, 1, 1, 0, 0, 0, 0, tzThai).timestamp())
# print(maxDate, minDate)
In [37]:
# Select the target Facebook page and ensure its bookkeeping documents exist.
pageNameOrId = 'longtunman'
pageCollection = pagesDB[pageNameOrId]
# The 'time' document tracks the oldest/newest post timestamps fetched so far.
if not pageCollection.find_one({'type': 'time'}):
    initData = {
        'type': 'time',
        'startTimeFetched': None,
        'endTimeFetched': None
    }
    pageCollection.insert_one(initData)
pageInfo = pageCollection.find_one({'type': 'time'})
# The 'url' document stores the pagination URL so a crawl can be resumed.
if not pageCollection.find_one({'type': 'url'}):
    initData = {
        'type': 'url',
        'result': None
    }
    pageCollection.insert_one(initData)
urlInfo = pageCollection.find_one({'type': 'url'})
print(pageNameOrId)
print(pageInfo)
print(urlInfo)
In [38]:
# Fetch the first page of posts and seed the pagination cursor.
insertCount = 0
mode = 'R'  # 'R' = restart from the page's posts page, 'C' = continue from saved URL
if mode == 'R': #Restart
    source_url = 'https://www.facebook.com/pg/%s/posts/' % pageNameOrId
    content = html.unescape(requests.get(source_url).text)
    insertCount += savePostToDatabase(content, pageCollection)
    # The next-page link is a pages_reaction_units URL ending just before the
    # Thai "see more" label; trim it down to the unit_count=8 query parameter.
    result = re.search(r'\/pages_reaction_units.*?ดูเพิ่มเติม', content).group()
    result = re.search(r'\/pages_reaction_units.*?unit_count=8', result).group()
    updateUrlToCollection(result)
elif mode == 'C': #Continue
    # Resume from the pagination URL persisted by a previous run.
    # NOTE(review): assumes urlInfo['result'] is not None — confirm a previous
    # 'R' run stored a URL before using mode 'C'.
    result = urlInfo['result']
print('insert posts =', insertCount)
In [39]:
# Follow the pagination cursor until Facebook stops returning a "see more" link.
while True:
    try:
        # __a=1 makes Facebook return the page fragment as (prefixed) JSON.
        next_url = 'https://www.facebook.com%s&__a=1' % result
        # The response starts with a 9-char anti-hijacking prefix ("for (;;);")
        # that must be stripped before json.loads.
        next_content = json.loads(requests.get(next_url).text[9:])
        content = html.unescape(next_content['domops'][0][3]['__html'])
        insertCount += savePostToDatabase(content, pageCollection)
        try:
            # No "see more" (Thai label) link means this was the last page.
            next_result = re.search(r'"\/pages_reaction_units.*?ดูเพิ่มเติม', content).group()
        except:
            print('end')
            break
        result = re.search(r'\/pages_reaction_units.*?unit_count=8', next_result).group()
        updateUrlToCollection(result)
        print('insert posts =', insertCount)
    except:
        # NOTE(review): bare except retries the same URL forever on persistent
        # failures (network outage, payload schema change) — consider adding a
        # retry limit or backoff. The cursor is saved so 'C' mode can resume.
        updateUrlToCollection(result)
        print(result)
In [ ]: