In [5]:
import facebook  # for connecting to the Facebook Graph API
import pprint
import datetime
import pandas as pd
import logging

logger = logging.getLogger('catch_all')
# send a request to the Facebook Graph API, fetching the last `lim` posts of a page :
def collector(page, token, lim):
    graph = facebook.GraphAPI(access_token=token, version='2.7')
    # fetch posts and transform the JSON response into a dict :
    posts = graph.get_connections(id=page, connection_name='posts',
                                  fields='id,message,link,shares,from,type,created_time,updated_time,'
                                         'comments.limit(0).summary(true),likes.limit(0).summary(true),reactions.limit(0).summary(true)',
                                  limit=lim)
    data = posts['data']  # list of dicts, one per post
    mylist = []
    # parse the fields of each post dict :
    for el in data:
        mydict = {}
        for key in el:
            if key == 'comments':
                mydict['comment_count'] = el[key]['summary']['total_count']
            elif key == 'from':
                mydict['from'] = el[key]['name']
            elif key == 'likes':
                mydict['like_count'] = el[key]['summary']['total_count']
            elif key == 'reactions':
                mydict['reaction_count'] = el[key]['summary']['total_count']
            elif key == 'shares':
                mydict['share_count'] = el[key]['count']
            else:
                mydict[key] = el[key]
        mydict['time_checked'] = str(datetime.datetime.now())  # add a timestamp
        mylist.append(mydict)
    df = pd.DataFrame(mylist)  # pandas DataFrame, one row per post
    return df
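
# For reference, a single element of posts['data'] looks roughly like the dict sketched below
# (the field names follow the request above, but the values and the post id are made up for
# illustration), and collector() flattens it into one row :
#   {'id': '12345_67890', 'message': 'hello', 'type': 'status',
#    'shares': {'count': 7},
#    'comments': {'summary': {'total_count': 3}},
#    'likes': {'summary': {'total_count': 42}},
#    'reactions': {'summary': {'total_count': 45}}}
#   -> {'id': '12345_67890', 'message': 'hello', 'type': 'status',
#       'share_count': 7, 'comment_count': 3, 'like_count': 42, 'reaction_count': 45,
#       'time_checked': '...'}
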
# create a new csv file for each page :
def initiator(page, token, lim):
    df = collector(page, token, lim)
    df.to_csv(page + '.csv', index=False, encoding='utf_8_sig')
# receive post ids and collect fresh data for these extra posts :
def collector_post(post_ids, token):
    graph = facebook.GraphAPI(access_token=token, version='2.7')
    # drop the error-marker ids written by earlier failed runs
    # (filter into a new list rather than removing items while iterating)
    error_markers = {'post deleted', 'ERROR in exception', 'Something wrong',
                     'Unexpected condition in appending()'}
    post_ids = [idvalue for idvalue in post_ids if idvalue not in error_markers]
    try:
        posts = graph.get_objects(ids=post_ids,
                                  fields='id,message,link,shares,from,type,created_time,updated_time,'
                                         'comments.limit(0).summary(true),likes.limit(0).summary(true),reactions.limit(0).summary(true)')
    except Exception as e:
        logger.error(e, exc_info=True)
        time = str(datetime.datetime.now())
        return pd.DataFrame([{'id': 'ERROR in exception', 'time_checked': time}])
    #--------------------------------------------
    if len(posts) > 0:
        mylist = []
        for id_key in posts:
            mydict = {}
            for key in posts[id_key]:
                if key == 'comments':
                    mydict['comment_count'] = posts[id_key][key]['summary']['total_count']
                elif key == 'from':
                    mydict['from'] = posts[id_key][key]['name']
                elif key == 'likes':
                    mydict['like_count'] = posts[id_key][key]['summary']['total_count']
                elif key == 'reactions':
                    mydict['reaction_count'] = posts[id_key][key]['summary']['total_count']
                elif key == 'shares':
                    mydict['share_count'] = posts[id_key][key]['count']
                else:
                    mydict[key] = posts[id_key][key]
            mydict['time_checked'] = str(datetime.datetime.now())
            mylist.append(mydict)
        df = pd.DataFrame(mylist)
        return df
    elif len(posts) == 0:
        return pd.DataFrame([{'id': 'post deleted', 'time_checked': str(datetime.datetime.now())}])
    else:
        return pd.DataFrame([{'id': 'Something wrong', 'time_checked': str(datetime.datetime.now())}])
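
# Note : unlike get_connections(), get_objects() returns a dict keyed by post id,
# e.g. (with hypothetical ids) {'111_222': {...post fields...}, '111_333': {...}},
# which is why the loop above iterates over posts[id_key] instead of over a 'data' list.
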
# fetch new data and update an existing csv file each time this function is called :
def appending(page, token, lim):
    df = collector(page, token, lim)  # fetch the most recent posts
    df_old = pd.read_csv(page + '.csv', encoding='utf_8_sig')
    df_new = pd.concat([df_old, df], axis=0)
    # find the extra post ids that already exist in the csv file but are not among the newly fetched posts :
    id_old = df_old['id'].unique()
    id_extra = list(set(id_old) - set(df['id']))
    # slice the extra ids into chunks of at most 50 and send each chunk to collector_post()
    if len(id_extra) > 50:
        chunks = [id_extra[x:x + 50] for x in range(0, len(id_extra), 50)]
        for ids in chunks:
            extrarow = collector_post(ids, token)
            df_new = pd.concat([df_new, extrarow], axis=0)
    elif len(id_extra) > 0:
        extrarow = collector_post(id_extra, token)
        df_new = pd.concat([df_new, extrarow], axis=0)
    elif len(id_extra) == 0:
        pass
    else:
        time = str(datetime.datetime.now())
        extrarow = pd.DataFrame([{'id': 'Unexpected condition in appending()', 'time_checked': time}])
        df_new = pd.concat([df_new, extrarow], axis=0)
    # update the csv file
    df_new.to_csv(page + '.csv', index=False, encoding='utf_8_sig')
    print('to csv page : {} done !'.format(page))
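The slicing expression in appending() splits the extra ids into chunks of at most 50, since the Graph API limits how many ids can be requested in one call. A minimal sketch of the same idiom, with a chunk size of 3 and made-up ids just for illustration:

ids = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8']
chunks = [ids[x:x + 3] for x in range(0, len(ids), 3)]
print(chunks)  # [['p1', 'p2', 'p3'], ['p4', 'p5', 'p6'], ['p7', 'p8']]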
In [6]:
# all page ids to collect data from :
pages = ['DramaAdd', 'ejeab', 'cartooneggcat', 'BBCThai', 'in.one.zaroop', 'HighlightsHD.tv', 'khobsanam', '1447102878929950',
'powerofhusbands', 'basementkaraoke', 'cartoon5natee', 'AjahnBuddhadasa', 'Toodsdiary', 'ceclip', 'beargirlfriend',
'jaytherabbitofficial', 'Darlingboredom', 'v.vajiramedhi', '334236760084743', 'kingdomoftigers', 'underbedstar', 'pantipded',
'Pantip.KratooDed', 'nut.ped', '9gaginthai']
# a Facebook access token :
token = 'Your Secret Token'
# number of posts to collect from each page on the first run
lim = 50
In [7]:
print(str(datetime.datetime.now()))
for page in pages:
    initiator(page, token, lim)
print(str(datetime.datetime.now()))
In [9]:
df = pd.read_csv('BBCThai.csv', encoding = 'utf_8_sig')
df.head(5)
Out[9]:
In [10]:
print(str(datetime.datetime.now()))
for page in pages:
    appending(page, token, lim)
print(str(datetime.datetime.now()))
In [11]:
df = pd.read_csv('BBCThai.csv', encoding = 'utf_8_sig')
df.head(5)
Out[11]:
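Because appending() stacks a fresh snapshot on top of the old rows, the CSV accumulates several rows per post id over time. If only the latest numbers per post are needed, one way to collapse the history (a sketch, assuming the string timestamps in time_checked sort chronologically, which they do for str(datetime.now())) is:

latest = (df.sort_values('time_checked')
            .drop_duplicates(subset='id', keep='last'))
latest.head(5)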