This notebook takes a list of users and outputs the most frequently used words for each user.


In [18]:
from bigbang.archive import load as load_archive
from bigbang.archive import Archive
import pandas as pd
from nltk.corpus import stopwords
from nltk import tokenize
from collections import defaultdict
import csv
from pandas import DataFrame as df
from nltk.stem.lancaster import LancasterStemmer
import re
#shared stemmer instance; count_words calls st.stem() on each token when `stem` is True
st = LancasterStemmer()

In [19]:
#name of the mailing-list collection (not read again by the cells shown in this notebook)
list_name = 'ietf'


#True: count stemmed words (no prefixes, plurals, etc.); False: count the literal words
stem = False


import os
cwd = os.getcwd()

#mailing lists to analyse; each one is read from ../../archives/<name>.csv
ml_names = ["6lo"]


arch_paths = ['../../archives/' + ml_name + '.csv' for ml_name in ml_names]


#load every archive and keep only its message table
archives_list = []
for arch_path in arch_paths:
    archives_list.append(load_archive(arch_path).data)

#merge all message tables into a single Archive
archives = Archive(pd.concat(archives_list))
archives_data = archives.data

In [15]:
#preparing a function to count top words per user

def count_words(texts, stem_words=None, stop_words=None, tokenizer=None):
    """Count how often each non-stopword token occurs across `texts`.

    Apostrophes are deleted (so "don't" becomes "dont"), every other
    non-word character is replaced by a space, and the result is tokenized.
    The stopword comparison is case-sensitive, so capitalised forms such as
    "The" are still counted.

    Parameters
    ----------
    texts : iterable
        Message bodies. Entries that are not strings (e.g. None/NaN for a
        missing body) are skipped instead of raising.
    stem_words : bool, optional
        Whether to stem each token with the notebook-level stemmer `st`.
        Defaults to the notebook-level `stem` setting.
    stop_words : iterable of str, optional
        Words to leave out of the count. Defaults to
        `stopwords.words('english')`.
    tokenizer : callable, optional
        Maps a cleaned string to a sequence of tokens. Defaults to
        `tokenize.word_tokenize`.

    Returns
    -------
    dict
        Maps each word to a one-element list `[count]`; the list wrapper is
        kept because the printing cell reads `count[0]`.
    """
    if stem_words is None:
        stem_words = stem
    if stop_words is None:
        stop_words = stopwords.words('english')
    #build the lookup once: evaluating stopwords.words() for every token is
    #loop-invariant work, and a set gives constant-time membership tests
    excluded = frozenset(stop_words)
    if tokenizer is None:
        tokenizer = tokenize.word_tokenize

    wordcount = {}
    for text in texts:
        if not isinstance(text, str):
            continue
        cleaned = re.sub(r'[^\w]', ' ', text.replace("'", ""))
        for token in tokenizer(cleaned):
            #start from the raw token so a stemming failure can never leave
            #`word` unbound or holding the previous token's value
            word = token
            if stem_words:
                try:
                    word = st.stem(token)
                except Exception:
                    #report the offending token and count it unstemmed
                    print(token)
            if word in excluded:
                continue
            if word in wordcount:
                wordcount[word][0] += 1
            else:
                wordcount[word] = [1]
    return wordcount

In [16]:
#printing the most active senders for your convenience

#set the number of top-users that you want to see
max_users = 25

#set to True when the list of users is supplied from a file, to skip this overview
#(the flag is read below but was not defined anywhere else in the notebook,
# so a fresh kernel stopped here with a NameError)
users_from_file = False

if not users_from_file:
    activity = archives.get_activity()
    #column-wise totals, sorted so the largest total comes first
    #(one row per sender, judging by the printed output)
    tot_activity = activity.sum(0).sort_values(ascending=False)
    #head() simply returns everything when there are fewer than max_users rows
    print(tot_activity.head(max_users))


From
"Pascal Thubert (pthubert)" <pthubert@cisco.com>                        279.0
Samita Chakrabarti <samita.chakrabarti@ericsson.com>                    212.0
Carsten Bormann <cabo@tzi.org>                                          187.0
internet-drafts@ietf.org                                                136.0
Michael Richardson <mcr+ietf@sandelman.ca>                               93.0
Ulrich Herberg <ulrich@herberg.name>                                     91.0
Robert Cragie <robert.cragie@gridmerge.com>                              77.0
Alexandru Petrescu <alexandru.petrescu@gmail.com>                        72.0
Brian Haberman <brian@innovationslab.net>                                61.0
Gabriel Montenegro <Gabriel.Montenegro@microsoft.com>                    60.0
Ralph Droms <rdroms.ietf@gmail.com>                                      56.0
"6lo issue tracker" <trac+6lo@tools.ietf.org>                            55.0
Kerry Lynn <kerlyn@ieee.org>                                             49.0
Samita Chakrabarti <samitac.ietf@gmail.com>                              44.0
Behcet Sarikaya <sarikaya2012@gmail.com>                                 35.0
Juergen Schoenwaelder <j.schoenwaelder@jacobs-university.de>             32.0
peter van der Stok <stokcons@xs4all.nl>                                  31.0
James Woodyatt <jhw@nestlabs.com>                                        29.0
"Carles Gomez Montenegro" <carlesgo@entel.upc.edu>                       29.0
<teemu.savolainen@nokia.com>                                             28.0
The IESG <iesg-secretary@ietf.org>                                       23.0
"Savolainen Teemu (Nokia-TECH/Tampere)" <teemu.savolainen@nokia.com>     22.0
"Turner, Randy" <Randy.Turner@landisgyr.com>                             20.0
Thomas Watteyne <thomas.watteyne@inria.fr>                               18.0
Brian E Carpenter <brian.e.carpenter@gmail.com>                          17.0
dtype: float64

In [20]:
#list in 'users' the senders you want to track
#(each entry is compared for exact equality with the "From" column)


users = [
    'Rene Struik <rstruik.ext@gmail.com>',
    '"Ralph Droms (rdroms)" <rdroms@cisco.com>',
]

#user -> {word: [count]}
user_wordcount = defaultdict(dict)

for user in users:
    sent_by_user = archives_data["From"] == user
    messages = archives_data.loc[sent_by_user, "Body"]
    user_wordcount[user] = count_words(messages)

In [22]:
#number of top words to print for each user
n_top_words = 10


for user, wordcount in user_wordcount.items():
    print(user)
    #highest count first; equal counts are ordered by the word itself, also descending
    ranked = sorted(wordcount.items(), key=lambda pair: (pair[1], pair[0]), reverse=True)
    for word, count in ranked[:n_top_words]:
        #count is a one-element list, hence count[0]
        print(word+'   '+str(count[0]))


"Ralph Droms (rdroms)" <rdroms@cisco.com>
20   125
ietf   96
6lo   71
org   66
draft   53
I   48
document   40
6man   39
Ralph   37
IPv6   35
Rene Struik <rstruik.ext@gmail.com>
ietf   95
org   83
I   79
security   69
draft   67
6lo   60
1   38
The   35
one   33
bootstrapping   30

In [ ]:


In [ ]: