Download status updates and comments from Facebook pages and Facebook groups.
Uses the Facebook GraphAPI.
To use the Facebook GraphAPI, you need an access token. It's basically a key that unlocks the service.
How to get access token:
E3OGYAACENWbS6CRz7qiFudEose0cBAMdqw...
Replace XXX below with your access token.
In [ ]:
# Replace XXX with your own Facebook Graph API access token.
accesstoken = "XXX"
Create some functions that will scrape Facebook pages.
There is a lot of code here, but you'll only interact with these functions:
- getme() will get information about yourself.
- getpage(id) will get information about a page by its page ID, slug name or URL, and return page information.
- getstatuses(pageid) will get status updates from a page by its page ID, and return a list of statuses.
In [ ]:
# Install the facepy package to connect to the Facebook API. If it doesn't work, try pip3 instead.
!pip install facepy
In [ ]:
# Import the "facepy" library that talks to Facebook's API.
from facepy.exceptions import OAuthError
from facepy import GraphAPI
import datetime
# Function that connects to the Facebook GraphAPI and returns information about you.
def getme():
    """Fetch the profile of the user the access token belongs to.

    Requests the ``id``, ``name``, ``email`` and ``birthday`` fields of the
    ``me`` node through the Graph API (version 2.11), printing progress
    messages along the way.

    Returns:
        The first item yielded by the paged request, or ``None`` (after
        printing a notice) when the request yields nothing.
    """
    print("Fetching yourself...")
    api = GraphAPI(accesstoken, version="2.11")
    results = api.get("me?fields=id,name,email,birthday", page=True, retry=2, limit=1)

    # Only the first yielded item matters; a private sentinel tells
    # "nothing came back" apart from any real value.
    nothing = object()
    profile = next(iter(results), nothing)
    if profile is nothing:
        print("Couldn't find you...")
        return None

    print("Done.")
    return profile
# Function that gets information about a Facebook page.
def getpage(id):
    """Look up a Facebook page and return its basic information.

    Args:
        id: Identifier of the page. It is converted with ``str()`` and used
            as the Graph API path.

    Returns:
        Whatever the non-paged Graph API request returns for the fields
        ``id``, ``name``, ``link`` and ``fan_count``.
    """
    api = GraphAPI(accesstoken, version="2.11")
    field_query = "?fields=id,name,link,fan_count"
    path = str(id) + field_query
    return api.get(path, page=False, retry=2, limit=1)
# Function to get statuses from a Facebook page.
def getstatuses(id, limit=0):
    """Download status updates from the feed of a Facebook page.

    Args:
        id: Identifier of the page. It is converted with ``str()`` and used
            as the first part of the Graph API path.
        limit: Maximum number of statuses to collect. ``0`` (the default)
            collects everything the feed yields.

    Returns:
        The list of processed statuses built by ``process_pager``.
    """
    if limit != 0:
        print("Fetching statuses (limited to latest {0})...".format(limit))
    else:
        print("Fetching statuses (this might take some while, consider changing the limit to speed things up)...")

    fields = "permalink_url,message,link,created_time,type,from,name,id,likes.limit(1).summary(true),comments.limit(1).summary(true),shares"
    api = GraphAPI(accesstoken, version="2.11")
    feed_path = str(id) + "/feed?fields=" + fields
    pager = api.get(feed_path, page=True, retry=2, limit=1)

    statuses = process_pager(pager, limit)
    print("Done.")
    print("Got {0} statuses.".format(len(statuses)))
    return statuses
# Function that processes the pager from facepy and cycles through each status message.
def process_pager(pages, limit):
    """Collect processed statuses from an iterable of feed pages.

    Args:
        pages: Iterable of pages, each a mapping whose ``"data"`` entry is
            an iterable of raw statuses.
        limit: Stop as soon as this many statuses have been collected;
            ``0`` means no limit.

    Returns:
        A list with the result of ``process_status`` for each raw status,
        in the order they were encountered.
    """
    collected = []
    capped = limit != 0
    for page in pages:
        for raw_status in page["data"]:
            collected.append(process_status(raw_status))
            if capped and len(collected) >= limit:
                return collected
        # Re-checked after every page so that no further page is requested
        # once the cap is met.
        if capped and len(collected) >= limit:
            return collected
    return collected
# Function that processes a status message into a more easy-to-use dictionary.
def process_status(status):
    """Flatten a raw Graph API status into an easy-to-use dictionary.

    Args:
        status: Mapping describing one feed item. The keys ``from`` (with
            ``name`` and ``id``), ``id``, ``type``, ``created_time`` and
            ``permalink_url`` must be present. ``message``, ``link``,
            ``name``, ``likes``, ``comments`` and ``shares`` are optional.

    Returns:
        A dictionary with the keys ``fromname``, ``fromid``, ``id``,
        ``type``, ``created``, ``message``, ``link``, ``linkname``,
        ``likes``, ``comments``, ``shares`` and ``permalink``. Missing text
        fields become ``""`` and missing counters become ``0``.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    sender = status["from"]
    return {
        "fromname": sender["name"],
        "fromid": sender["id"],
        "id": status["id"],
        "type": status["type"],
        "created": process_date(status["created_time"]),
        # Text stays as str. Encoding to UTF-8 and wrapping the bytes in
        # str() produces the literal "b'...'" representation on Python 3,
        # which corrupts the message and breaks slicing/printing later on.
        "message": status.get("message", ""),
        "link": status.get("link", ""),
        "linkname": status.get("name", ""),
        "likes": status["likes"]["summary"]["total_count"] if "likes" in status else 0,
        "comments": status["comments"]["summary"]["total_count"] if "comments" in status else 0,
        "shares": status["shares"]["count"] if "shares" in status else 0,
        "permalink": status["permalink_url"],
    }
# Function that converts dates from Facebook to yyyy-mm-dd hh:mm:ss.
def process_date(strdate):
    """Reformat a Facebook timestamp as ``YYYY-MM-DD HH:MM:SS``.

    Args:
        strdate: Timestamp such as ``2018-01-02T03:04:05+0000``. The
            ``+0000`` suffix is matched literally.

    Returns:
        The same date and time as a string. No time-zone shift is applied.

    Raises:
        ValueError: If ``strdate`` does not match the expected format.
    """
    parsed = datetime.datetime.strptime(strdate, "%Y-%m-%dT%H:%M:%S+0000")
    return parsed.strftime("%Y-%m-%d %H:%M:%S")
In [ ]:
# Fetch your own profile through the Graph API and keep it for the next cells.
me = getme()
In [ ]:
# Show everything that getme() returned.
me
In [ ]:
# Print your name, your ID and your birthday, one value per line.
for field in ("name", "id", "birthday"):
    print(me[field])
In [ ]:
# Look up The Guardian's Facebook page by its URL.
guardian = getpage("http://facebook.com/theguardian")
In [ ]:
# Show everything that getpage() returned.
guardian
In [ ]:
# Show only the number of fans of the page.
guardian["fan_count"]
In [ ]:
# Summarise the page in one sentence: name, link, number of fans and ID.
print("{0} ({1}) has {2} fans and ID {3}.".format(
    *[guardian[key] for key in ("name", "link", "fan_count", "id")]
))
The status messages are stored as a dictionary object.
You can see the contents by printing them, like print(status["id"]). Here are all the available names:
| Status | Description |
|---|---|
| status["fromname"] | name of sender |
| status["fromid"] | ID of sender |
| status["id"] | ID of status message |
| status["type"] | type of status message (e.g., link, event, picture) |
| status["created"] | date when message was published |
| status["message"] | status message |
| status["link"] | URL link in the status message |
| status["linkname"] | name of link that status message may contain |
| status["likes"] | number of likes status message got |
| status["comments"] | number of comments status message got |
| status["shares"] | number of shares status message got |
| status["permalink"] | URL link to Facebook post |
In [ ]:
# Get Facebook status updates from The Guardian (PageID: 10513336322).
# limit=20 stops after 20 statuses; leave it out (or pass 0) to fetch everything.
guardian_statuses = getstatuses(10513336322, limit=20)
In [ ]:
# Print a short summary of each status message, followed by a blank line.
for status in guardian_statuses:
    # Only the first 60 characters of the message are shown.
    preview = status["message"][:60]
    counts = "Info: {0} likes, {1} shares, {2} comments".format(
        status["likes"], status["shares"], status["comments"]
    )
    print("Created: " + status["created"])
    print("Permalink: " + status["permalink"])
    print("Message: " + preview)
    print(counts)
    print()
In [ ]:
# Let's count how many of the status messages are links.
# len() gives the length of the list, i.e. the total number of status messages.
total_statuses = len(guardian_statuses)
# The counter starts at zero and goes up by one for every link we find.
i = 0
for status in guardian_statuses:
    # Anything that is not a link is skipped.
    if status["type"] != "link":
        continue
    i += 1
print("There are {0} status messages and {1} of them are links.".format(total_statuses, i))
In [ ]:
# Descriptive statistics: add up the likes of every status message.
total_likes = 0
for status in guardian_statuses:
    total_likes += status["likes"]
print("Total {0} likes.".format(total_likes))