Collecting Facebook pages' data hourly

Import libraries and create functions:


In [5]:
import facebook # for connecting to Facebook Graph API
import pprint
import datetime
import pandas as pd
import logging

logger = logging.getLogger('catch_all') # getLogger() registers the logger with the logging system

# send a request to the Facebook Graph API, fetching the last `lim` posts of a page:
def collector(page, token, lim):
    graph = facebook.GraphAPI(access_token = token, version = '2.7')
    # fetch and transform JSON to dict :
    posts = graph.get_connections(id = page, connection_name = 'posts',
                                  fields = 'id,message,link,shares,from,type,created_time,updated_time,'
                                  'comments.limit(0).summary(true),likes.limit(0).summary(true),reactions.limit(0).summary(true)',
                                  limit = lim)
    data = posts['data'] # list of post dicts
    mylist = []
    
    # parse data from dict :
    for el in data:
        mydict = {}
        for key in el:
            if key == 'comments':
                commentcount = el[key]['summary']['total_count']
                mydict['comment_count'] = commentcount
            elif key == 'from':
                creator = el[key]['name']
                mydict['from'] = creator
            elif key == 'likes':
                likecount = el[key]['summary']['total_count']
                mydict['like_count'] = likecount
            elif key == 'reactions':
                reactcount = el[key]['summary']['total_count']
                mydict['reaction_count'] = reactcount
            elif key == 'shares':
                share = el[key]['count']
                mydict['share_count'] = share
            else:
                mydict[key] = el[key]
        time = datetime.datetime.now() # add timestamp
        mydict['time_checked'] = str(time)
        mylist.append(mydict)
    df = pd.DataFrame(mylist) # pandas dataframe
    return df

# create a new csv file for a page:
def initiator(page, token, lim):
    df = collector(page, token, lim)
    df.to_csv(page + '.csv', index = False, encoding = 'utf_8_sig')

# receive post ids and collect fresh data for these extra posts:
def collector_post(post_ids, token):
    graph = facebook.GraphAPI(access_token = token, version = '2.7')
    # filter out sentinel ids written by earlier error handling; a list
    # comprehension avoids removing items from a list while iterating over it:
    sentinels = {'post deleted', 'ERROR in exception',
                 'Something wrong', 'Unexpected condition in appending()'}
    post_ids = [idvalue for idvalue in post_ids if idvalue not in sentinels]
    try:
        posts = graph.get_objects(ids = post_ids,
                            fields = 'id,message,link,shares,from,type,created_time,updated_time,'
                            'comments.limit(0).summary(true),likes.limit(0).summary(true),reactions.limit(0).summary(true)')
    except Exception as e:
        logger.error(e, exc_info=True)
        time = str(datetime.datetime.now())
        return pd.DataFrame([{'id' : 'ERROR in exception', 'time_checked' : time}])
    #--------------------------------------------
    if len(posts) > 0:
        mylist = []
        for id_key in posts:
            mydict = {}
            for key in posts[id_key]:
                if key == 'comments':
                    commentcount = posts[id_key][key]['summary']['total_count']
                    mydict['comment_count'] = commentcount
                elif key == 'from':
                    creator = posts[id_key][key]['name']
                    mydict['from'] = creator
                elif key == 'likes':
                    likecount = posts[id_key][key]['summary']['total_count']
                    mydict['like_count'] = likecount
                elif key == 'reactions':
                    reactcount = posts[id_key][key]['summary']['total_count']
                    mydict['reaction_count'] = reactcount
                elif key == 'shares':
                    share = posts[id_key][key]['count']
                    mydict['share_count'] = share
                else:
                    mydict[key] = posts[id_key][key]
            time = datetime.datetime.now()
            mydict['time_checked'] = str(time)
            mylist.append(mydict)
        df = pd.DataFrame(mylist)
        return df
    else: # an empty response means the requested posts no longer exist
        return pd.DataFrame([{'id' : 'post deleted', 'time_checked' : str(datetime.datetime.now())}])

# fetch new data and update an existing csv file when this function is called:
def appending(page, token, lim):
    df = collector(page, token, lim) # fetch the last `lim` posts
    df_old = pd.read_csv(page + '.csv', encoding = 'utf_8_sig')
    df_new = pd.concat([df_old, df], axis = 0)
    # find ids of posts that exist in the csv file but are not among the latest posts:
    id_old = df_old['id'].unique()
    id_extra = list(set(id_old) - set(df['id']))
    # slice the extra ids into chunks of at most 50 ids per request
    # and send each chunk to collector_post():
    if len(id_extra) > 0:
        chunks = [id_extra[x:x+50] for x in range(0, len(id_extra), 50)]
        for ids in chunks:
            extrarow = collector_post(ids, token)
            df_new = pd.concat([df_new, extrarow], axis = 0)
    # update csv
    df_new.to_csv(page + '.csv', index = False, encoding = 'utf_8_sig')
    print('to csv page : {} done !'.format(page))
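
If more history is needed than a single request returns, recent versions of the facebook-sdk library also provide get_all_connections(), a generator that follows the paging cursors automatically. A minimal sketch, not part of the original script: collector_all and max_posts are names invented here, and the fields list is shortened for brevity.

def collector_all(page, token, max_posts):
    graph = facebook.GraphAPI(access_token = token, version = '2.7')
    mylist = []
    # get_all_connections() yields one post at a time, following 'paging' links:
    for post in graph.get_all_connections(id = page, connection_name = 'posts',
                                          fields = 'id,message,created_time'):
        mylist.append(post)
        if len(mylist) >= max_posts: # stop after max_posts posts
            break
    return pd.DataFrame(mylist)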

Set up variables:

Note: Put your Facebook access token into the 'token' variable before running.


In [6]:
# all page ids I want to collect data from:
pages = ['DramaAdd', 'ejeab', 'cartooneggcat', 'BBCThai', 'in.one.zaroop', 'HighlightsHD.tv', 'khobsanam', '1447102878929950',
         'powerofhusbands', 'basementkaraoke', 'cartoon5natee', 'AjahnBuddhadasa', 'Toodsdiary', 'ceclip', 'beargirlfriend',
         'jaytherabbitofficial', 'Darlingboredom', 'v.vajiramedhi', '334236760084743', 'kingdomoftigers', 'underbedstar', 'pantipded',
         'Pantip.KratooDed', 'nut.ped', '9gaginthai']
# a Facebook access token:
token = 'Your Secret Token'

# number of posts to collect on the first run:
lim = 50
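
Hard-coding the token works for a quick notebook session, but a safer option is to read it from an environment variable. A minimal sketch, assuming the token was exported beforehand as FB_ACCESS_TOKEN (a name chosen here for illustration):

import os

# fall back to the hard-coded value if the environment variable is not set:
token = os.environ.get('FB_ACCESS_TOKEN', token)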

First, run this code to create a new csv file for each page:

Note: All files will be created in the same directory as this script.


In [7]:
print(str(datetime.datetime.now()))
for page in pages:
    initiator(page, token, lim)
print(str(datetime.datetime.now()))


2017-01-08 18:51:47.276207
2017-01-08 18:52:08.342411

An example csv file:


In [9]:
df = pd.read_csv('BBCThai.csv', encoding = 'utf_8_sig')
df.head(5)


Out[9]:
comment_count created_time from id like_count link message reaction_count share_count time_checked type updated_time
0 3 2017-01-08T11:36:18+0000 บีบีซีไทย - BBC Thai 1526071940947174_1872082379679460 205 https://www.facebook.com/BBCThai/videos/187208... มิเชล โอบามา กล่าวสุนทรพจน์อย่างเป็นทางการครั้... 213 30 2017-01-08 18:51:50.754405 video 2017-01-08T11:45:06+0000
1 8 2017-01-08T11:13:47+0000 บีบีซีไทย - BBC Thai 1526071940947174_1872076769680021 370 http://bbc.in/2i2CjZQ พระสงฆ์รูปหนึ่งจุดไฟเผาตัวเองประท้วงรัฐบาลเกาห... 414 25 2017-01-08 18:51:50.754405 link 2017-01-08T11:46:32+0000
2 2 2017-01-08T10:06:32+0000 บีบีซีไทย - BBC Thai 1526071940947174_1872059309681767 182 https://www.facebook.com/BBCThai/videos/187205... แก้ไข : คำผิดในวิดีโอช่วงนาทีที่ 0.56 \r\nจากค... 186 13 2017-01-08 18:51:50.754405 video 2017-01-08T10:28:13+0000
3 30 2017-01-08T09:44:38+0000 บีบีซีไทย - BBC Thai 1526071940947174_1872055029682195 723 http://bbc.in/2j50LGO หนังสือ “การต่อสู้ของข้าพเจ้า” (Mein Kampf) ขอ... 756 78 2017-01-08 18:51:50.754405 link 2017-01-08T11:39:12+0000
4 8 2017-01-08T09:24:53+0000 บีบีซีไทย - BBC Thai 1526071940947174_1872050469682651 434 https://www.facebook.com/BBCThai/videos/187205... อนาคตพนักงานพิมพ์ดีดริบหรี่ ในยุคไอทีรุกเมียนม... 456 59 2017-01-08 18:51:50.754405 video 2017-01-08T10:25:27+0000

Then, run this code hourly to update all of the csv files:


In [10]:
print(str(datetime.datetime.now()))
for page in pages :
    appending(page, token, lim)
print(str(datetime.datetime.now()))


2017-01-08 18:54:48.491571
to csv page : DramaAdd done !
to csv page : ejeab done !
to csv page : cartooneggcat done !
to csv page : BBCThai done !
to csv page : in.one.zaroop done !
to csv page : HighlightsHD.tv done !
to csv page : khobsanam done !
to csv page : 1447102878929950 done !
to csv page : powerofhusbands done !
to csv page : basementkaraoke done !
to csv page : cartoon5natee done !
to csv page : AjahnBuddhadasa done !
to csv page : Toodsdiary done !
to csv page : ceclip done !
to csv page : beargirlfriend done !
to csv page : jaytherabbitofficial done !
to csv page : Darlingboredom done !
to csv page : v.vajiramedhi done !
to csv page : 334236760084743 done !
to csv page : kingdomoftigers done !
to csv page : underbedstar done !
to csv page : pantipded done !
to csv page : Pantip.KratooDed done !
to csv page : nut.ped done !
to csv page : 9gaginthai done !
2017-01-08 18:55:11.133865
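
To automate these hourly rounds, one option is a simple loop that sleeps between rounds; a cron job or OS task scheduler is usually more robust for long-running collection. A minimal sketch, assuming the cells above have already been run in the same session:

import time

while True:
    print(str(datetime.datetime.now()))
    for page in pages:
        appending(page, token, lim)
    time.sleep(3600) # wait one hour before the next round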

An updated csv example:


In [11]:
df = pd.read_csv('BBCThai.csv', encoding = 'utf_8_sig')
df.head(5)


Out[11]:
comment_count created_time from id like_count link message reaction_count share_count time_checked type updated_time
0 3 2017-01-08T11:36:18+0000 บีบีซีไทย - BBC Thai 1526071940947174_1872082379679460 205 https://www.facebook.com/BBCThai/videos/187208... มิเชล โอบามา กล่าวสุนทรพจน์อย่างเป็นทางการครั้... 213 30 2017-01-08 18:51:50.754405 video 2017-01-08T11:45:06+0000
1 8 2017-01-08T11:13:47+0000 บีบีซีไทย - BBC Thai 1526071940947174_1872076769680021 370 http://bbc.in/2i2CjZQ พระสงฆ์รูปหนึ่งจุดไฟเผาตัวเองประท้วงรัฐบาลเกาห... 414 25 2017-01-08 18:51:50.754405 link 2017-01-08T11:46:32+0000
2 2 2017-01-08T10:06:32+0000 บีบีซีไทย - BBC Thai 1526071940947174_1872059309681767 182 https://www.facebook.com/BBCThai/videos/187205... แก้ไข : คำผิดในวิดีโอช่วงนาทีที่ 0.56 \r\r\nจา... 186 13 2017-01-08 18:51:50.754405 video 2017-01-08T10:28:13+0000
3 30 2017-01-08T09:44:38+0000 บีบีซีไทย - BBC Thai 1526071940947174_1872055029682195 723 http://bbc.in/2j50LGO หนังสือ “การต่อสู้ของข้าพเจ้า” (Mein Kampf) ขอ... 756 78 2017-01-08 18:51:50.754405 link 2017-01-08T11:39:12+0000
4 8 2017-01-08T09:24:53+0000 บีบีซีไทย - BBC Thai 1526071940947174_1872050469682651 434 https://www.facebook.com/BBCThai/videos/187205... อนาคตพนักงานพิมพ์ดีดริบหรี่ ในยุคไอทีรุกเมียนม... 456 59 2017-01-08 18:51:50.754405 video 2017-01-08T10:25:27+0000
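
Note that every hourly round appends a fresh snapshot of each post, so one post id accumulates one row per round; this is what makes the counts trackable over time. To analyse only the latest state of each post, a small sketch using the column names shown above:

df = pd.read_csv('BBCThai.csv', encoding = 'utf_8_sig')
# keep only the most recent snapshot of each post id:
latest = df.sort_values('time_checked').drop_duplicates(subset = 'id', keep = 'last')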
