This notebook computes and plots the senders who have talked the most about a given word or sentence.
What it does:
-given a word / sentence, it scans the emails in one or more mailing lists and computes the list of users who include that word or sentence in their emails
-it plots the top folks using that word / sentence
Parameters to set options:
-insert one or more URLs of mailing lists (the mailing lists are aggregated)
-insert one word or a sentence that you want to check for
In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
In [2]:
import bigbang.mailman as mailman
from bigbang.archive import load as load_archive
from bigbang.parse import get_date
from bigbang.archive import Archive
import pandas as pd
import datetime
import matplotlib.pyplot as plt
from collections import defaultdict
import numpy as np
import math
import pytz
import pickle
import os
# Plot styling. The pandas option `display.mpl_style` was deprecated and later
# removed from pandas, so assigning to it raises on current versions. Select a
# matplotlib style sheet directly instead ('ggplot' is the replacement the
# pandas release notes suggest).
plt.style.use('ggplot')
In [3]:
# Matching mode: True would stem words (no prefixes, plurals, etc.), False keeps them literal.
# NOTE(review): `stem` is not read by any cell visible in this notebook.
stem = False
cwd = os.getcwd()  # current working directory; not used by the visible cells

# Names of the mailing-list archives to aggregate; each one is read from
# ../../archives/<name>.csv (relative to the working directory).
archives_names = ["6lo"]
arch_paths = ['../../archives/' + name + '.csv' for name in archives_names]

# Load the data of every archive, concatenate it, and wrap the result in a
# single Archive so the lists are analysed together.
archives_list = [load_archive(path).data for path in arch_paths]
archives = Archive(pd.concat(archives_list))
archives_data = archives.data
In [13]:
# Insert the word or sentence to look for.
# The counting cell matches it as a literal, case-sensitive substring of each email body.
sub_text = 'thanks'
In [36]:
# Count, per sender, how many emails contain `sub_text` in their body.
# Keys are the 'From' values of the matching emails; values are email counts.
people_count = defaultdict(int)
for _, mail in archives_data.iterrows():
    body = mail['Body']
    # A missing body may come back from pandas as NaN (a float) rather than
    # None, and `sub_text in <float>` raises TypeError, so only string bodies
    # are searched.
    if isinstance(body, str) and sub_text in body:
        people_count[mail['From']] += 1
In [37]:
# Number of top senders (those who used the text in the most emails) to display.
top_people = 5
In [38]:
# Report how many distinct senders used `sub_text`, then list the top senders.
print('{} people are talking about "{}"'.format(len(people_count), sub_text))

# Rank by number of matching emails, highest first; ties are broken by sender
# in descending order as well, because the (count, sender) pairs are sorted in
# reverse.
ranked = sorted(people_count.items(), key=lambda k_v: (k_v[1], k_v[0]), reverse=True)
for rank, (people, count) in enumerate(ranked):
    # Check the limit before printing so that top_people <= 0 prints nothing
    # instead of every sender.
    if rank >= top_people:
        break
    # format() instead of '+' so a non-string sender value does not raise.
    print('{} {}'.format(people, count))
In [ ]: