In [1]:
from bigbang.archive import Archive
from bigbang.archive import load as load_archive
import pandas as pd
import datetime
import pytz
from bigbang.thread import Thread
from bigbang.thread import Node
from collections import defaultdict
In [2]:
# Load the archives data.
archives_names = ["hrpc"]
# Build one CSV path per archive name.
archives_paths = ['../../archives/' + archive_name + '.csv' for archive_name in archives_names]
# Read every archive and keep only its underlying data.
archives_list = []
for archive_path in archives_paths:
    archives_list.append(load_archive(archive_path).data)
# Concatenate everything that was loaded into a single Archive.
archives = Archive(pd.concat(archives_list))
# resolve_entities() should also merge very similar email addresses;
# if you want to drop this step, replace archives.resolve_entities() with archives.data.
archives_data = archives.resolve_entities()
In [3]:
# Here you can set the time frame used by the date-filtered cells below.
# datetime.datetime is called directly: the pd.datetime alias was deprecated in
# pandas 1.0 and is not available in newer pandas releases.
date_from = datetime.datetime(1997, 11, 1, tzinfo=pytz.utc)
date_to = datetime.datetime(2018, 3, 3, tzinfo=pytz.utc)
def filter_by_date(df, d_from, d_to):
    """Return the rows of ``df`` whose 'Date' lies strictly between the two bounds.

    Both bounds are exclusive: a row whose 'Date' equals ``d_from`` or ``d_to``
    is not part of the result.

    Parameters:
        df: table with a 'Date' column that can be compared to the bounds.
        d_from: lower bound (exclusive).
        d_to: upper bound (exclusive).

    Returns:
        The subset of ``df`` selected by the boolean mask.
    """
    dates = df['Date']
    after_start = dates > d_from
    before_end = dates < d_to
    return df[after_start & before_end]
# archives_data_filtered holds only the messages inside the time frame set above;
# use it (instead of archives_data) whenever the analysis should be date-filtered.
archives_data_filtered = filter_by_date(df=archives_data, d_from=date_from, d_to=date_to)
In [4]:
# Q1: Top senders over time period
# Number of top senders to display.
n_top_senders = 5
# Activity table for the date-filtered messages.
filtered_archive = Archive(archives_data_filtered)
activity = filtered_archive.get_activity()
# Collapse axis 0 into one total per column (presumably one column per sender -
# confirm against Archive.get_activity), sorted ascending so the largest totals come last.
tot_activity = activity.sum(axis=0).sort_values()
print(tot_activity.iloc[-n_top_senders:])
In [5]:
# Q3: Number of emails in a time frame
# Counts the rows currently held in archives_data_filtered; set the time frame in the cell above.
n_emails_in_time_frame = len(archives_data_filtered)
print(n_emails_in_time_frame)
In [6]:
# Q4: I would be interested for instance in the average number of emails per user, across multiple years.
# For every year covered by the archive, print that year's top senders.
n_top_senders = 5
all_dates = archives_data["Date"]
first_year = all_dates.min().year
last_year = all_dates.max().year
# range() excludes its stop value, so add 1 to include the final year of the archive.
for year in range(first_year, last_year + 1):
    # Half-open window [Jan 1 of this year, Jan 1 of the next year) in UTC, so that
    # messages sent on Dec 31 are counted and no message falls into two years.
    year_start = datetime.datetime(year, 1, 1, tzinfo=pytz.utc)
    next_year_start = datetime.datetime(year + 1, 1, 1, tzinfo=pytz.utc)
    # A separate name is used on purpose: archives_data_filtered holds the
    # user-selected time frame and must not be overwritten by this loop.
    year_data = archives_data[(all_dates >= year_start) & (all_dates < next_year_start)]
    if len(year_data) > 0:
        print(str(year)+':')
        activity = Archive(year_data).get_activity()
        # Ascending sort, so the last n_top_senders entries are the largest totals.
        tot_activity = activity.sum(0).sort_values()
        print(tot_activity[-n_top_senders:])
        print('______________________')
In [7]:
# Q4b I would also be interested to know, when I know what the peak number of emails
# is across a year, how many emails were sent to the list per month, across multiple numbers of years.
# Here you can take a look at the number of emails per year.
all_dates = archives_data["Date"]
first_year = all_dates.min().year
last_year = all_dates.max().year
# range() excludes its stop value, so add 1 to include the final year of the archive.
for year in range(first_year, last_year + 1):
    # Half-open window [Jan 1 of this year, Jan 1 of the next year) in UTC, so that
    # messages sent on Dec 31 are counted and no message falls into two years.
    year_start = datetime.datetime(year, 1, 1, tzinfo=pytz.utc)
    next_year_start = datetime.datetime(year + 1, 1, 1, tzinfo=pytz.utc)
    # A separate name is used on purpose: archives_data_filtered holds the
    # user-selected time frame and must not be overwritten by this loop.
    year_data = archives_data[(all_dates >= year_start) & (all_dates < next_year_start)]
    print(str(year)+':'+str(len(year_data)))
In [8]:
# Then you can specify some years and have the breakdown per month.
years = [2003, 2009]
all_dates = archives_data["Date"]
for year in years:
    # Half-open window [Jan 1 of this year, Jan 1 of the next year) in UTC, so that
    # messages sent on Dec 31 are part of the yearly total.
    year_start = datetime.datetime(year, 1, 1, tzinfo=pytz.utc)
    next_year_start = datetime.datetime(year + 1, 1, 1, tzinfo=pytz.utc)
    # Separate names are used on purpose: archives_data_filtered holds the
    # user-selected time frame and must not be overwritten by this loop.
    year_data = archives_data[(all_dates >= year_start) & (all_dates < next_year_start)]
    print(str(year)+': tot '+str(len(year_data)))
    for month in range(1, 13):
        month_start = datetime.datetime(year, month, 1, tzinfo=pytz.utc)
        # The window ends at the first day of the following month, which covers the
        # whole last day of the month and Feb 29 in leap years without a
        # days-per-month table. December rolls over into January of the next year.
        if month == 12:
            next_month_start = next_year_start
        else:
            next_month_start = datetime.datetime(year, month + 1, 1, tzinfo=pytz.utc)
        month_data = archives_data[(all_dates >= month_start) & (all_dates < next_month_start)]
        print(' '+str(month)+': '+str(len(month_data)))
    print('____________________')
In [9]:
# Q5: get threads with most replies
# Number of top threads to display (-1 for all).
n_top_threads = 5
# Maps the root message's "Subject" to the thread's message count.
# NOTE(review): threads whose roots share a subject overwrite each other here - confirm this is intended.
threads_length = defaultdict(int)
n_skipped_threads = 0
for thread in archives.get_threads():
    try:
        threads_length[thread.get_root().get_data()["Subject"]] = thread.get_num_messages()
    except Exception:
        # Best effort: a thread whose root subject or size cannot be read is skipped,
        # but it is counted so the omission is visible (and Ctrl-C is no longer swallowed).
        n_skipped_threads += 1
# Rank by (message count, subject), largest first.
ranked_threads = sorted(threads_length.items(), key=lambda k_v: (k_v[1], k_v[0]), reverse=True)
# A negative n_top_threads means "no limit"; slicing with [:-1] would silently drop the last thread.
top_limit = None if n_top_threads < 0 else n_top_threads
for subject, n_messages in ranked_threads[:top_limit]:
    print(subject+' '+str(n_messages))
if n_skipped_threads:
    print('skipped '+str(n_skipped_threads)+' thread(s) whose subject or message count could not be read')
In [ ]:
In [ ]:
In [ ]:
In [ ]: