This notebook takes a list of users and outputs the most frequently used words for each user
In [18]:
from bigbang.archive import load as load_archive
from bigbang.archive import Archive
import pandas as pd
from nltk.corpus import stopwords
from nltk import tokenize
from collections import defaultdict
import csv
from pandas import DataFrame as df
from nltk.stem.lancaster import LancasterStemmer
import re
st = LancasterStemmer()
In [19]:
#insert the name(s) of the mailing list(s) to analyze in ml_names below
list_name = 'ietf'
#specify whether you want words stemmed (no prefixes, plurals, etc.) or literal
stem = False
import os
cwd = os.getcwd()
ml_names = ["6lo"]
arch_paths = list()
for ml_name in ml_names:
    arch_paths.append('../../archives/'+ml_name+'.csv')
archives_list = [load_archive(arch_path).data for arch_path in arch_paths]
archives = Archive(pd.concat(archives_list))
archives_data = archives.data
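As an optional sanity check (not part of the original workflow), the sketch below assumes the loaded archive DataFrame exposes the "From" and "Body" columns used later, and simply reports what was loaded.
In [ ]:
#optional sanity check: confirm the archives loaded and contain the expected columns
print(archives_data.shape)
print(archives_data.columns.tolist())
print(archives_data["From"].nunique(), "distinct senders")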
In [15]:
#preparing a function to count top words per user
def count_words(texts):
    wordcount = {}
    for text in texts:
        #strip apostrophes and replace non-word characters with spaces before tokenizing
        w = text.replace("'", "")
        k = re.sub(r'[^\w]', ' ', w)
        t = tokenize.word_tokenize(k)
        for g in t:
            try:
                if stem: word = st.stem(g)
                else: word = g
            except:
                #skip tokens that the stemmer cannot handle
                print(g)
                continue
            if word in stopwords.words('english'):
                continue
            if word not in wordcount:
                wordcount[word] = [1]
            else:
                wordcount[word][0] += 1
    return wordcount
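A minimal usage sketch of count_words on two made-up message bodies (not real archive data), just to show the shape of the output:
In [ ]:
#minimal usage sketch: count_words on two hypothetical message bodies
sample_texts = ["The draft looks good to me.", "Please review the updated draft."]
sample_counts = count_words(sample_texts)
print(sample_counts)
#expected form: a dict mapping each non-stopword token to a one-element count list, e.g. {'draft': [2], ...}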
In [16]:
#plotting the top-users for your convenience
#set the number of top-users that you want to see
max_users = 25
#set to True if the list of users is loaded from a file instead of typed in the cell below
users_from_file = False
if not users_from_file:
    activity = Archive.get_activity(archives)
    tot_activity = activity.sum(0).sort_values(ascending=False)
    try: print(tot_activity[:max_users])
    except: print(tot_activity)
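The cell above only prints the activity counts; if an actual chart is wanted, a minimal sketch using pandas' built-in plotting (assuming matplotlib is installed) could be:
In [ ]:
#optional: bar chart of the most active senders (assumes matplotlib is available)
%matplotlib inline
tot_activity[:max_users].plot(kind='barh', figsize=(8, 6), title='Most active senders')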
In [20]:
#insert in 'users' the list of users that you want to track
users = ['Rene Struik <rstruik.ext@gmail.com>', '"Ralph Droms (rdroms)" <rdroms@cisco.com>']
user_wordcount = defaultdict(dict)
for user in users:
    messages = archives_data[archives_data["From"] == user]["Body"]
    user_wordcount[user] = count_words(messages)
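The "From" match above is an exact string comparison, so a user with zero messages usually means the address is formatted differently in the archive. A quick check:
In [ ]:
#quick check: warn if an exact "From" match returned no messages for a user
for user in users:
    n = (archives_data["From"] == user).sum()
    if n == 0:
        print("No messages found for", user, "- check the exact 'From' formatting in the archive")
    else:
        print(n, "messages found for", user)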
In [22]:
#insert the number of top words you want to export
n_top_words = 10
for user, wordcount in user_wordcount.items():
    print(user)
    for word, count in sorted(iter(wordcount.items()), reverse=True, key=lambda k_v: (k_v[1], k_v[0]))[:n_top_words]:
        print(word+' '+str(count[0]))
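The cell above only prints the top words; since csv is already imported, a minimal sketch for writing the same output to a file (the filename 'top_words_per_user.csv' is just an example) could be:
In [ ]:
#optional: write the top words per user to a CSV file (filename is an arbitrary example)
with open('top_words_per_user.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['user', 'word', 'count'])
    for user, wordcount in user_wordcount.items():
        top = sorted(iter(wordcount.items()), reverse=True, key=lambda k_v: (k_v[1], k_v[0]))[:n_top_words]
        for word, count in top:
            writer.writerow([user, word, count[0]])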