In [1]:
%matplotlib inline

In [51]:
import bigbang.mailman as mailman
import bigbang.graph as graph
import bigbang.process as process
from bigbang.parse import get_date
from bigbang.archive import Archive
import bigbang.twopeople as twoppl
import imp
import pandas as pd
import datetime
import matplotlib.pyplot as plt
import numpy as np
import math
import pytz
import pickle
import os
pd.options.display.mpl_style = 'default' # pandas has a set of preferred graph formatting options

In [109]:

#insert one or more urls of the mailing lists you want to include in the analysis
#if more mailing lists are included, the data are aggregated and treated as a single object of analysis
#e.g. urls  = []
urls = ["", 

archives = [mailman.open_list_archives(url,"../archives") for url in urls]
archives = pd.concat(archives)

Opening 19 archive files

In [145]:
#comput and plot top senders (people sending out emails)

#set the number of top senders to be displayed
n_top_senders = 5

activity = Archive.get_activity(Archive(archives))

tot_activity = activity.sum(0)


tot_activity[-n_top_senders:].plot(kind = 'barh', width = 1)

C:\Anaconda2\lib\site-packages\ipykernel\ FutureWarning: sort is deprecated, use sort_values(inplace=True) for INPLACE sorting
rafik.dammak at (Rafik Dammak)                    33.0
stephanie.perrin at (Stephanie Perrin)     35.0
mariliamaciel at (Marilia Maciel)                 47.0
maryam.bakoshi at (Maryam Bakoshi)                62.0
niels at (Niels ten Oever)                   189.0
dtype: float64
<matplotlib.axes._subplots.AxesSubplot at 0x15a93f30>

In [111]:
#compute replies list (sender+replier)
arc_data = Archive(archives).data
from_users = arc_data[['From']]
to_users = arc_data[arc_data['In-Reply-To'] > 0][['From','Date','In-Reply-To']]
replies = pd.merge(from_users, to_users, how='inner', 

In [117]:
#compute and plot top repliers (people responding to mails)

#set the number of top repliers to be displayed
n_top_repliers = 10

from collections import defaultdict
repliers_count = defaultdict(int)
for reply in replies['From_response']:
    repliers_count[reply] += 1
repliers_count = sorted(iter(repliers_count.items()), key = lambda k_v:(k_v[1],k_v[0]))
for replier_count in repliers_count[-n_top_repliers:]:
    print(replier_count[0]+'  '+str(replier_count[1])) 
repliers_count = pd.DataFrame.from_records(repliers_count, index = 0)
repliers_count[-n_top_repliers:].plot(kind = 'barh', width = 1)

wjdrake at (William Drake)  11
tatiana.tropina at (Tatiana Tropina)  12
avri at (Avri Doria)  14
Lee.HIBBARD at (HIBBARD Lee)  15
mshears at (Matthew Shears)  17
michele at (Michele Neylon - Blacknight)  21
rafik.dammak at (Rafik Dammak)  30
mariliamaciel at (Marilia Maciel)  32
stephanie.perrin at (Stephanie Perrin)  35
niels at (Niels ten Oever)  91
<matplotlib.axes._subplots.AxesSubplot at 0x149660d0>

In [144]:
#compute and plot top dyads (pairs of replier-receiver)

#select the number of top dyads to be desplayed
n_top_dyads = 10

dyads = twoppl.panda_allpairs(replies, twoppl.unique_pairs(replies))
dyads = dyads.sort("num_replies", ascending = False)
print(dyads[:n_top_dyads]["A"]+' '+dyads[:n_top_dyads]["B"]+' '+str(dyads[:n_top_dyads]["num_replies"]))
dyads['dyad'] = dyads['A']+dyads['B']
dyads[:n_top_dyads].plot(kind = 'barh', width = 1, x = 'dyad', y = 'num_replies')

C:\Anaconda2\lib\site-packages\ipykernel\ FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
141    niels at (Niels ten Oever) niels...
126    niels at (Niels ten Oever) steph...
57     niels at (Niels ten Oever) maril...
132    niels at (Niels ten Oever) tatia...
74     niels at (Niels ten Oever) rafik...
147    mshears at (Matthew Shears) niels at a...
112    niels at (Niels ten Oever) miche...
176    michele at (Michele Neylon - Bl...
38     niels at (Niels ten Oever) paf a...
71     niels at (Niels ten Oever) wjdra...
dtype: object
<matplotlib.axes._subplots.AxesSubplot at 0x14d1a2f0>

In [ ]: