練習


In [1]:
import os
import requests
import pandas as pd

from datetime import datetime

In [2]:
# Post ID and access token, both looked up through the Graph API.
article_id = '232633627068_10156769966527069'
# Read the token from the FB_ACCESS_TOKEN environment variable so the secret is
# never saved inside the notebook; falls back to '' (the previous value) when unset.
token = os.environ.get('FB_ACCESS_TOKEN', '')

In [3]:
comments = []
pages = 0

# Nested field query + cursor-based pagination.
# The braces and commas of the nested field list are percent-encoded by hand:
#   %7B => {
#   %7D => }
#   %2C => ,
# reference: https://www.w3schools.com/tags/ref_urlencode.asp

base_url = 'https://graph.facebook.com/v2.11/{}'.format(article_id)


def _comments_query(after=None):
    """Return the query string requesting one page of comments.

    after: value of paging.cursors.after from the previous page, or None
    for the first page (no .after(...) clause is emitted in that case).
    """
    after_clause = '.after({})'.format(after) if after is not None else ''
    return '?fields=comments.limit({}){}%7Battachment%2Capplication%2Cmessage.limit({})%7D&access_token={}'.format(
        10, after_clause, 100, token
    )


query = _comments_query()
url = '{}/{}'.format(base_url, query)

while True:
    pages += 1
    # A timeout keeps a stalled connection from hanging the cell forever.
    resp = requests.get(url, timeout=30)
    data = resp.json()

    # An error payload has no 'comments' key; without this check it would be
    # indistinguishable from "no more comments" and silently yield an empty result.
    if 'error' in data:
        error = data['error']
        detail = error.get('message', error) if isinstance(error, dict) else error
        raise RuntimeError('Graph API error: {}'.format(detail))

    # A response without a 'comments' field ends the crawl.
    if 'comments' not in data:
        break

    comments += data['comments'].get('data', [])

    # Tolerate a page without a 'paging'/'cursors' block instead of raising KeyError.
    cursors = data['comments'].get('paging', {}).get('cursors', {})
    if 'after' not in cursors:
        print('EOF')
        break

    # Request the next page, starting after the last comment of this one.
    cursors_after = cursors['after']
    query = _comments_query(cursors_after)
    url = '{}/{}'.format(base_url, query)
    print('pages {}'.format(pages))

print('comments length = {}'.format(len(comments)))


pages 1
pages 2
pages 3
pages 4
pages 5
pages 6
pages 7
comments length = 63

In [4]:
# Flatten the nested 'application' and 'attachment' objects of every comment
# into top-level, prefixed keys so each comment becomes one flat table row.
# The dicts in `comments` are modified in place.
for comment in comments:
    if 'application' in comment:
        # Every field of the application object becomes an 'application_<field>' key.
        application_info = comment.pop('application')
        comment.update({'application_{}'.format(k): v for k, v in application_info.items()})
    if 'attachment' in comment:
        # Only the type and url of the attachment are kept; .get() yields None
        # instead of raising KeyError when an attachment lacks one of them.
        attachment_info = comment.pop('attachment')
        comment['attachment_type'] = attachment_info.get('type')
        comment['attachment_url'] = attachment_info.get('url')

df = pd.DataFrame.from_records(comments)
df.head()


Out[4]:
application_category application_id application_link application_name application_namespace attachment_type attachment_url id message
0 Utilities 350685531728 /android Facebook for Android fbandroid NaN NaN 10156769966527069_10156771068602069 又要連PO好幾天\n一天好幾篇\nPO到有人反感\n留言開始有人吵架鬥嘴\n最後一面倒開始噴這遊戲
1 Utilities 6628568379 /iphone Facebook for iPhone fbiphone photo https://www.facebook.com/photo.php?fbid=164774... 10156769966527069_10156771204372069 水溝是怎樣
2 Utilities 350685531728 /android Facebook for Android fbandroid photo https://www.facebook.com/photo.php?fbid=201326... 10156769966527069_10156771212477069 我的🐸兒子好久才回家本來很生氣(找不到罵兒子的選項XD\n\n結果看到他帶回來的名產\n以及...
3 Utilities 350685531728 /android Facebook for Android fbandroid NaN NaN 10156769966527069_10156771109777069 重複報導是不會膩喔
4 Utilities 350685531728 /android Facebook for Android fbandroid photo https://www.facebook.com/photo.php?fbid=537644... 10156769966527069_10156771833147069 我家的青蛙在我肚子裡跟我一起去旅行了

In [5]:
# Save the flattened comments as <article_id>.csv inside ../results
# (resolved against the current working directory).
results = os.path.abspath('../results')
# exist_ok=True creates the directory only when needed, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(results, exist_ok=True)

filename = os.path.join(results, '{}.csv'.format(article_id))
# index=False leaves the DataFrame's row index out of the file.
df.to_csv(filename, index=False)
print('Save file - {}'.format(filename))


Save file - /home/dirl/github/Python-Crawling-Tutorial/results/232633627068_10156769966527069.csv