In [1]:
%matplotlib inline
import matplotlib
# Render every figure in this notebook with the 'ggplot' style sheet.
matplotlib.style.use('ggplot')
# Path of the Facebook chat history export (an HTML file downloaded from
# Facebook.com via its personal data archival functionality); parse_html() opens it.
FILENAME = 'messages-2016-02.htm'
In [2]:
# Utility functions
import lxml
import itertools
import dateparser
import csv
import pandas
import tqdm
import numpy
import datetime
from lxml import etree
def parse_html(filename=None):
    """Parse the chat-history HTML export into an lxml tree.

    Args:
        filename: Path of the HTML file to read. When omitted, the
            module-level ``FILENAME`` is used.

    Returns:
        The tree returned by ``etree.parse`` using an ``etree.HTMLParser``.
    """
    if filename is None:
        filename = FILENAME
    parser = etree.HTMLParser()
    # The context manager closes the handle even when parsing raises.
    with open(filename, 'rb') as f:
        return etree.parse(f, parser)
def get_threads(node):
    """Return the result of querying ``node`` for ``div`` elements of class ``thread``.

    The XPath expression starts with ``//``, so the search is not limited
    to direct children of ``node``.
    """
    thread_query = "//div[@class='thread']"
    return node.xpath(thread_query)
def get_messages(thread):
    """Extract the title and every parsed message of one thread element.

    Args:
        thread: An element exposing ``xpath``; the notebook passes the
            nodes returned by ``get_threads``.

    Returns:
        dict with two keys: ``'title'`` (the first direct text node of
        ``thread``) and ``'messages'`` (a list with one
        ``get_single_message`` record per message, in document order).

    Raises:
        IndexError: if ``thread`` has no direct text node.
        AssertionError: if the number of ``class='message'`` nodes and
            ``<p>`` nodes differ.
    """
    title = thread.xpath("./text()")[0]
    messages = thread.xpath(".//*[@class='message']")
    ps = thread.xpath(".//p")
    # Headers and bodies are paired purely by position, so a count mismatch
    # would silently attach text to the wrong sender.
    assert len(messages) == len(ps), (
        "thread has %d message headers but %d <p> bodies" % (len(messages), len(ps)))
    # Builtin zip rather than itertools.izip: izip only exists on Python 2,
    # while zip pairs the two lists identically on Python 2 and 3.
    return {
        'title': title,
        'messages': [get_single_message(message, p)
                     for message, p in zip(messages, ps)],
    }
def get_single_message(message, p):
    """Build the flat record for a single chat message.

    Args:
        message: Element holding the message header; queried for the
            first text under ``class='user'`` and under ``class='meta'``.
        p: Element whose ``.text`` is stored as the message body.

    Returns:
        dict with keys ``'user'``, ``'date'`` (the ``meta`` text run
        through ``parse_date``) and ``'text'``. The raw ``meta`` text is
        not kept.
    """
    sender = message.xpath(".//*[@class='user']/text()")[0]
    timestamp_text = message.xpath(".//*[@class='meta']/text()")[0]
    record = {}
    record['user'] = sender
    record['date'] = parse_date(timestamp_text)
    record['text'] = p.text
    return record
def parse_date(raw_text):
    """Parse a message timestamp such as
    ``"Monday, February 1, 2016 at 10:30pm PST"`` into a naive datetime.

    The final space-separated token (the timezone label) is discarded, so
    the result carries no timezone information.

    Args:
        raw_text: Timestamp text followed by a single timezone token.

    Returns:
        ``datetime.datetime`` without tzinfo.

    Raises:
        ValueError: if the text before the timezone token does not match
            ``"%A, %B %d, %Y at %I:%M%p"``.
    """
    # dateparser.parse(raw_text, settings={'TIMEZONE': 'US/Pacific'}) was
    # tried here and found far too slow for a full archive - do not use.
    # TODO - timezone support
    # Drop the trailing token instead of a fixed four characters so labels
    # of any length ("PST", "UTC+01") and surrounding whitespace are handled.
    stamp, _, _tz_label = raw_text.strip().rpartition(' ')
    return datetime.datetime.strptime(stamp, "%A, %B %d, %Y at %I:%M%p")
def convert_to_dataframe(parsed_thread):
    """Turn one parsed thread into a DataFrame with one row per message.

    Args:
        parsed_thread: dict with a ``'messages'`` list of record dicts and
            a ``'title'`` value.

    Returns:
        DataFrame built from the message records, plus a ``'thread'``
        column holding the thread title on every row.
    """
    records = parsed_thread['messages']
    thread_title = parsed_thread['title']
    return pandas.DataFrame.from_dict(records).assign(thread=thread_title)
def pretty_print(node, limit=10000):
    """Print the pretty-printed HTML serialization of ``node``, truncated.

    Args:
        node: Element or tree accepted by ``etree.tostring``.
        limit: Maximum length of the serialized output to print;
            defaults to the previously hard-coded 10000.
    """
    rendered = etree.tostring(node, pretty_print=True, method='html')
    # Call form of print: valid on Python 2 and 3, unlike the print statement.
    # NOTE(review): tostring presumably returns bytes by default, so on
    # Python 3 this shows a bytes repr - confirm before relying on the output.
    print(rendered[:limit])
In [3]:
# Read & parse the HTML export, then collect every thread element found in it.
# `tree` and `threads` are consumed by the cells that follow.
tree = parse_html()
threads = get_threads(tree)
In [4]:
# One {'title': ..., 'messages': [...]} dict per thread; tqdm reports progress.
parsed_messages = [get_messages(thread) for thread in tqdm.tqdm(threads)]
In [5]:
# Build one frame per thread and concatenate them in a single call.
# Appending inside the loop copied the accumulated frame on every iteration
# (quadratic), and DataFrame.append no longer exists in pandas 2.x.
frames = [convert_to_dataframe(parsed_message)
          for parsed_message in tqdm.tqdm(parsed_messages)]
# As before, df stays None when there are no threads at all; row labels are
# kept per thread (no ignore_index), matching what append produced.
df = pandas.concat(frames) if frames else None
In [6]:
# Character count of each message body; messages without text count as 0.
df['textlen'] = [0 if body is None else len(body) for body in df['text']]
In [7]:
# Bucket each message into its calendar month (midnight on the 1st).
TOP = 20
df['yearmonth'] = df['date'].map(lambda stamp: datetime.datetime(stamp.year, stamp.month, 1))
# Total characters per thread, largest first, keeping only the TOP threads
# in a one-column ('count') frame indexed by thread.
top_counts = (
    df.groupby('thread')['textlen']
    .sum()
    .sort_values(ascending=False)
    .to_frame('count')
    .head(TOP)
)
In [8]:
# Everything other than the threads in `top_counts` is categorized as "etc".
# `highest` has one row per top thread; its index is later read via
# highest.index to order the plot columns.
highest = pandas.DataFrame({
    'thread': top_counts.index,
    'count': top_counts['count'],
    'thread_grouped': top_counts.index,
})
# Merge on the 'thread' column only. The index is reset for the merge because
# `highest` presumably also carries 'thread' as its index name, which newer
# pandas reports as an ambiguous key - verify against the pandas in use.
only_highest = pandas.merge(df, highest.reset_index(drop=True), on='thread', how='outer')
# Assign the filled column back: an inplace fillna on a column selection is
# not guaranteed to write through to the frame.
only_highest['thread_grouped'] = only_highest['thread_grouped'].fillna(value='etc')
In [24]:
# Monthly character totals per thread group (top 20 + etc.).
# The grouping keys are given as a list: a tuple is treated as ONE key by
# newer pandas instead of as several keys.
counts = only_highest.groupby(['yearmonth', 'thread_grouped'])[['textlen']].sum()
counts = counts.rename(columns={'textlen': 'count'})
# Insert a row for every missing month and zero-fill the gaps.
# asfreq('MS') replaces `.resample('MS').fillna(0)`, which depended on the
# implicit aggregation that resample performed before pandas 0.18.
monthly = counts.unstack().asfreq('MS').fillna(0)
# The plotting cell pivots on 'yearmonth', so make sure the name survives.
monthly.index.name = 'yearmonth'
counts = monthly.stack()
In [25]:
# Area chart of characters per month: one band per top thread, then "etc".
column_order = highest.index.tolist() + ['etc']
pivot = (
    counts.reset_index()
    .pivot(index='yearmonth', columns='thread_grouped', values='count')
    .fillna(value=0)
)
pivot = pivot[column_order]
plot = pivot.plot(figsize=(30, 5), kind='area', colormap='Paired', legend=False, title='Facebook Chat Volume')
plot.set_ylabel("char / month")
plot.set_xlabel("time")
Out[25]:
In [26]:
# Messages per month, split by whether the archive owner sent them.
# Name of the archive owner as it appears in the 'user' column.
MY_NAME = 'Jeeyoung Kim'
sent_vs_received = pandas.DataFrame({'is_me': (df['user'] == MY_NAME), 'date': df['date']})
sent_vs_received['yearmonth'] = sent_vs_received['date'].apply(lambda dt: datetime.datetime(dt.year, dt.month, 1, 0, 0))
# Keys as a list: a tuple is treated as ONE key by newer pandas.
sent_vs_received_aggregated = sent_vs_received.groupby(['yearmonth', 'is_me']).count()
sent_vs_received_aggregated = sent_vs_received_aggregated.rename(columns={'date': 'count'})
# Insert missing months and zero-fill them. asfreq('MS') replaces
# `.resample('MS').fillna(0)`, which depended on the implicit aggregation
# that resample performed before pandas 0.18.
monthly_sent = sent_vs_received_aggregated.unstack().asfreq('MS').fillna(0)
# The plotting cell pivots on 'yearmonth', so make sure the name survives.
monthly_sent.index.name = 'yearmonth'
sent_vs_received_aggregated = monthly_sent.stack()
In [27]:
# Line chart of messages per month: one line for is_me == True (sent) and
# one for is_me == False (received).
pivot = sent_vs_received_aggregated.reset_index().pivot(index='yearmonth', columns='is_me', values='count')
pivot = pivot.fillna(value=0)
# Plot the frame built above; `pivot2` was never defined in this notebook,
# so the cell raised NameError on a fresh kernel.
plot = pivot.plot(figsize=(30, 5), kind='line', colormap='Paired', legend=True, title='Sent vs Received')
plot.set_ylabel("message / month")
plot.set_xlabel("time")
Out[27]: