A basic parser for Facebook's weird message format

Import the libraries used below. Parsing with the "lxml" backend additionally requires the lxml package to be installed alongside BeautifulSoup.


In [ ]:
%matplotlib inline
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import re

Open the file and parse it as HTML using BeautifulSoup


In [ ]:
# Open the export as raw bytes: BeautifulSoup then detects the document's
# encoding itself instead of relying on the platform's default text encoding.
with open("./messages.htm", "rb") as html_file:
    soup = BeautifulSoup(html_file, "lxml")

Enumerate each thread


In [ ]:
# Collect every <div> whose CSS class is "thread".
threads = soup.find_all("div", class_="thread")

Parse each message's thread participants, sender, date, and text into a DataFrame


In [ ]:
def _plain_text(node):
    """Return ``node.string`` as a built-in ``str``, or ``None`` if unavailable.

    ``.string`` yields a BeautifulSoup ``NavigableString``, which keeps a
    reference to the whole parse tree. Converting it to ``str`` means the
    DataFrame does not pin the parsed document in memory.

    ``node`` may be ``None`` (e.g. a message with no following sibling); in
    that case, or when the node has no single string, ``None`` is returned.
    """
    value = node.string if node is not None else None
    return None if value is None else str(value)


# Flatten every thread into one record per message.
thread_list = []
for thread in threads:
    # The first child of the thread element is treated as a comma-separated
    # list of participant names.
    participants = thread.contents[0].split(",")
    participants = [name.strip() for name in participants]
    messages = thread.select(".message")
    for message in messages:
        user = _plain_text(message.select(".user")[0])
        date = _plain_text(message.select(".meta")[0])
        # The message body is whatever node directly follows the message header.
        text = _plain_text(message.next_sibling)
        thread_list.append({
            "thread": participants,
            "from": user,
            "date": date,
            "message": text,
        })
df = pd.DataFrame(thread_list, columns=["thread", "from", "date", "message"])

Attempt to Parse the Dates


In [ ]:
# Replace the scraped date strings with parsed pandas timestamps.
df["date"] = pd.to_datetime(df["date"])

In [ ]:
# Index the frame by message timestamp so rows can be grouped by date.
df.index = df["date"]

In [ ]:
# Remove the "date" column from the frame.
del df["date"]

Show the Data


In [ ]:
# Bare last expression: the notebook renders the DataFrame as this cell's output.
df

Some Pretty Visualizations


In [ ]:
# Bucket rows by the calendar day of their index timestamp and count the
# non-null entries of every column within each day.
message_days = df.index.date
ts = df.groupby(message_days).count()

In [ ]:
# Discard the per-day "thread" and "from" counts; the remaining column(s)
# are what gets plotted.
ts = ts.drop(columns=["thread", "from"])

In [ ]:
# Title and label the chart so it can be read on its own; the trailing
# semicolon keeps the return value's repr out of the cell output.
ax = ts.plot(title="Messages per day")
ax.set(xlabel="Date", ylabel="Messages");

In [ ]:
# Rank senders by message count and show positions 2-9. The top sender is
# skipped (presumably the account owner — TODO confirm).
df["from"].value_counts().iloc[1:9]