In [ ]:
%matplotlib inline
import pandas as pd
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import re
In [ ]:
# Parse the exported message archive into a BeautifulSoup tree.
# The file is opened in binary mode so that BeautifulSoup detects the
# document's encoding itself (e.g. from its <meta charset> declaration)
# instead of relying on the platform's default text encoding, which can
# garble or reject non-ASCII characters.
with open("./messages.htm", "rb") as html_file:
    soup = BeautifulSoup(html_file, "lxml")
In [ ]:
# Collect every <div> whose CSS class is "thread".
threads = soup.find_all("div", class_="thread")
In [ ]:
# Flatten the parsed threads into one record per message.
thread_list = []
for thread in threads:
    # The first child of a thread node is a comma-separated string
    # (presumably the participant names -- confirm against the archive).
    participants = [name.strip() for name in thread.contents[0].split(",")]
    for message in thread.select(".message"):
        # Sender and timestamp come from the ".user" and ".meta" elements
        # inside the message node; the text is read from the node that
        # immediately follows it.
        thread_list.append({
            "thread": participants,
            "from": message.select(".user")[0].string,
            "date": message.select(".meta")[0].string,
            "message": message.next_sibling.string,
        })

# One row per message, with a fixed column order.
df = pd.DataFrame(thread_list, columns=["thread", "from", "date", "message"])
In [ ]:
# Convert the raw date strings into pandas datetime values.
df["date"] = pd.to_datetime(df["date"])
In [ ]:
# Index the rows by their "date" column values.
df.index = df["date"]
In [ ]:
# Remove the "date" column from the frame.
del df["date"]
In [ ]:
# Bare expression: in a notebook cell this displays the DataFrame.
df
In [ ]:
# Group rows by the calendar date of their index and count the non-null
# entries of every column within each day.
ts = df.groupby(by=df.index.date).count()
In [ ]:
# Discard the "thread" and "from" count columns from the daily counts.
del ts["thread"], ts["from"]
In [ ]:
# Plot the per-day counts held in ts.
ts.plot()
In [ ]:
# Message counts per sender, most frequent first; position 0 is skipped
# (presumably the archive owner -- confirm) and the next eight are shown.
df["from"].value_counts().iloc[1:9]