This notebook computes and plots the senders who have talked the most about a given word or sentence

What it does:

-given a word / sentence, it scans the emails in one or more mailing lists and computes the list of users who include that word or sentence in their emails

-it plots the top folks using that word / sentence


Parameters to set:

-insert one or more urls of mailing lists (mls are aggregated)

-insert one word or a sentence that you want to check for

In [ ]:
%matplotlib inline

In [6]:
import bigbang.mailman as mailman
from bigbang.archive import load as load_archive
from bigbang.parse import get_date
from bigbang.archive import Archive
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from collections import defaultdict
import numpy as np
import math
import pytz
import pickle
import os
# Plot styling. ``display.mpl_style`` was deprecated and later removed from
# pandas; on versions without it the assignment raises OptionError (a subclass
# of both AttributeError and KeyError). Fall back to matplotlib's own style
# sheets so the setup cell keeps working on any pandas version.
try:
    pd.options.display.mpl_style = 'default'
except (AttributeError, KeyError):
    plt.style.use('ggplot')

In [9]:
#insert one or more urls of the mailing lists you want to include in the analysis
#(if more mailing lists are included, the data are aggregated and treated as a single object of analysis)

urls = ["http://mm.icann.org/pipermail/cc-humanrights/", 
        "http://mm.icann.org/pipermail/wp4/", 
        "http://mm.icann.org/pipermail/ge/"]


def _archive_paths(urls, old, new):
    """Map mailing-list urls to local csv paths under ../archives/.

    The trailing slash of each url is dropped and the first occurrence-pattern
    ``old`` inside the url is replaced by ``new`` before '.csv' is appended.
    ``rstrip('/')`` is used instead of ``url[:-1]`` so that a url written
    without a trailing slash does not lose its last character.
    """
    return ['../archives/' + url.rstrip('/').replace(old, new) + '.csv'
            for url in urls]


# Two on-disk naming layouts are tried, in the same order as before:
# first 'http_/host/...', then 'http:/host/...'. If loading with the first
# layout fails for any list, every list is reloaded with the second layout.
# ``except Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
# and SystemExit propagate instead of silently triggering the fallback.
try:
    arch_paths = _archive_paths(urls, '://', '_/')
    archives = [load_archive(arch_path).data for arch_path in arch_paths]
except Exception:
    arch_paths = _archive_paths(urls, '//', '/')
    archives = [load_archive(arch_path).data for arch_path in arch_paths]

# All mailing lists are aggregated into a single table of emails.
mails = pd.concat(archives)

In [32]:
#insert the word or sentence to search for in the email bodies
#(the match is a plain, case-sensitive substring test)

sub_text = 'dio bono'

In [33]:
#counting, for each sender, how many of their emails contain the text

people_count = defaultdict(int)
for _, mail in mails.iterrows():
    text = mail['Body']
    # A missing body (NaN/None) is not a string, and `sub_text in text` would
    # raise TypeError on it; such emails cannot match, so skip them.
    if pd.isnull(text):
        continue
    if sub_text in text:
        people_count[mail['From']] += 1

In [34]:
#insert how many of the top senders (ranked by number of matching emails) you want to list

top_people = 5

In [38]:
# Report how many distinct senders matched, then list the top senders.
# Written so the cell runs unchanged on Python 2 and Python 3: print is called
# with a single parenthesised string, .items() replaces .iteritems(), and the
# sort key indexes the pair instead of using a tuple-unpacking lambda.
print(str(len(people_count)) + ' people are talking about "' + sub_text + '"')

# Rank by (count, sender) in descending order, i.e. most matching emails first.
ranked = sorted(people_count.items(),
                key=lambda item: (item[1], item[0]),
                reverse=True)

# Slice instead of a manual counter; a non-positive top_people lists nobody.
for people, count in ranked[:max(top_people, 0)]:
    print(people + '   ' + str(count))


0 people are talking about "dio bono"

In [ ]: