notebook.community

Edit and run



In [1]:

    
from bigbang.archive import Archive
from bigbang.archive import load as load_archive
import pandas as pd
import datetime
import pytz
from bigbang.thread import Thread
from bigbang.thread import Node
from collections import defaultdict



In [2]:

    
#load  archives data

archives_names = ["hrpc"]

archives_paths = list()
for archive_name in archives_names:
    archives_paths.append('../../archives/'+archive_name+'.csv')

archives_list = [load_archive(archive_path).data for archive_path in archives_paths]
    
archives = Archive(pd.concat(archives_list))

archives_data = archives.resolve_entities() #this should also merge very similar email addresses; if you wanna drop this function, replace .resolve_entities() with .data



In [3]:

    
#here you can set the time frame 

date_from = pd.datetime(1997,11,1,tzinfo=pytz.utc)
date_to = pd.datetime(2018,3,3,tzinfo=pytz.utc)
def filter_by_date(df,d_from,d_to):
    return df[(df['Date'] > d_from) & (df['Date'] < d_to)]

#if you use archives_data_filtered, it will be filtered by date
archives_data_filtered = filter_by_date(archives_data, date_from, date_to)



In [4]:

    
#Q1: Top senders over time period

#set how many top senders to display
n_top_senders = 5 


activity = Archive(archives_data_filtered).get_activity()
tot_activity = activity.sum(0).sort_values()

print(tot_activity[-n_top_senders:])









    



From
Joseph Lorenzo Hall <joe@cdt.org>               54.0
avri doria <avri@acm.org>                       77.0
Stephen Farrell <stephen.farrell@cs.tcd.ie>     83.0
Stephane Bortzmeyer <bortzmeyer@nic.fr>        134.0
Niels ten Oever <niels@article19.org>          347.0
dtype: float64



In [5]:

    
#Q3: Number of emails in a time frame

print(len(archives_data_filtered)) #set the time frame in the cell above



In [6]:

    
#Q4: I would be interested for instance in the average number of emails per user, across multiple years.

def filter_by_date(df,d_from,d_to):
    return df[(df['Date'] > d_from) & (df['Date'] < d_to)]

n_top_senders = 5 

for year in range(archives_data["Date"].min().year, archives_data["Date"].max().year):
    archives_data_filtered = filter_by_date(archives_data, pd.datetime(year,1,1,tzinfo=pytz.utc),pd.datetime(year,12,31,tzinfo=pytz.utc))
    if len(archives_data_filtered) > 0:
        print(str(year)+':')
        activity = Archive(archives_data_filtered).get_activity()
        tot_activity = activity.sum(0).sort_values()
        print(tot_activity[-n_top_senders:])
        print('______________________')









    



2014:
From
Ludo <ludo@greenhost.nl>                    1.0
Stephane Bortzmeyer <bortzmeyer@nic.fr>     1.0
avri doria <avri@acm.org>                   1.0
Niels ten Oever <mail@nielstenoever.net>    2.0
Niels ten Oever <niels@article19.org>       2.0
dtype: float64
______________________
2015:
From
Joseph Lorenzo Hall <joe@cdt.org>          16.0
avri doria <avri@acm.org>                  20.0
Corinne Cath <cattekwaad@gmail.com>        22.0
Stephane Bortzmeyer <bortzmeyer@nic.fr>    29.0
Niels ten Oever <niels@article19.org>      99.0
dtype: float64
______________________
2016:
From
Stephane Bortzmeyer <bortzmeyer@nic.fr>         31.0
Joseph Lorenzo Hall <joe@cdt.org>               33.0
John Curran <jcurran@istaff.org>                37.0
Stephen Farrell <stephen.farrell@cs.tcd.ie>     43.0
Niels ten Oever <niels@article19.org>          124.0
dtype: float64
______________________
2017:
From
Eliot Lear <lear@cisco.com>                     16.0
Andrew Sullivan <ajs@anvilwalrusden.com>        17.0
Stephen Farrell <stephen.farrell@cs.tcd.ie>     28.0
Stephane Bortzmeyer <bortzmeyer@nic.fr>         45.0
Niels ten Oever <niels@article19.org>          104.0
dtype: float64
______________________



In [7]:

    
#Q4b I would also be interested to know, when I know what the peak number of emails
#is across a year, how many emails were sent to the list per month, across multiple numbers of years. 

#here you can take a look at the number of emails per year

def filter_by_date(df,d_from,d_to):
    return df[(df['Date'] > d_from) & (df['Date'] < d_to)]

activity = Archive(archives_data).get_activity()

for year in range(archives_data["Date"].min().year, archives_data["Date"].max().year):
    archives_data_filtered = filter_by_date(archives_data, pd.datetime(year,1,1,tzinfo=pytz.utc),pd.datetime(year,12,31,tzinfo=pytz.utc))
    print(str(year)+':'+str(len(archives_data_filtered)))



In [8]:

    
#then you can specify some years and have the break down per month

def filter_by_date(df,d_from,d_to):
    return df[(df['Date'] > d_from) & (df['Date'] < d_to)]

years = [2003, 2009]

for year in years:
    archives_data_filtered = filter_by_date(archives_data, pd.datetime(year,1,1,tzinfo=pytz.utc),pd.datetime(year,12,31,tzinfo=pytz.utc))
    print(str(year)+':  tot '+str(len(archives_data_filtered)))
    for month in range(1,13):
        if month in (1,3,5,7,8,10,12): archives_data_filtered = filter_by_date(archives_data, pd.datetime(year,month,1,tzinfo=pytz.utc),pd.datetime(year,month,31,tzinfo=pytz.utc))
        elif month in (4,6,9,11): archives_data_filtered = filter_by_date(archives_data, pd.datetime(year,month,1,tzinfo=pytz.utc),pd.datetime(year,month,30,tzinfo=pytz.utc))
        else: archives_data_filtered = filter_by_date(archives_data, pd.datetime(year,month,1,tzinfo=pytz.utc),pd.datetime(year,month,28,tzinfo=pytz.utc))     
        print('    '+str(month)+':   '+str(len(archives_data_filtered)))
    print('____________________')









    



2003:  tot 0
    1:   0
    2:   0
    3:   0
    4:   0
    5:   0
    6:   0
    7:   0
    8:   0
    9:   0
    10:   0
    11:   0
    12:   0
____________________
2009:  tot 0
    1:   0
    2:   0
    3:   0
    4:   0
    5:   0
    6:   0
    7:   0
    8:   0
    9:   0
    10:   0
    11:   0
    12:   0
____________________



In [9]:

    
#Q5 get threads with most replies


#set the number of top threads (-1 for all)
n_top_threads = 5

threads_length = defaultdict(int)
for t in archives.get_threads():
    try: threads_length[t.get_root().get_data()["Subject"]] = t.get_num_messages()
    except: pass

for t,n in sorted(iter(threads_length.items()), reverse = True, key= lambda k_v:(k_v[1],k_v[0]))[:n_top_threads]:
    print(t+'   '+str(n))









    



[hrpc] Examining existing Venue Selection criteria   71
[hrpc] Case three: DDoS   55
[hrpc] Human Rights Research Group Call on draft-irtf-hrpc-research-07   53
Re: [hrpc] draft-tenoever-hrpc-research-02   32
[hrpc] Comments about draft-irtf-hrpc-research-07   26



In [ ]:



In [ ]:



In [ ]:



In [ ]: