This notebook goes through every user (message sender) in the loaded mailing-list archives and saves each user's most frequently used words to a CSV file.


In [2]:
from bigbang.archive import load as load_archive
from bigbang.archive import Archive
import pandas as pd
from nltk.corpus import stopwords
from nltk import tokenize
from collections import defaultdict
import csv
from pandas import DataFrame as df
from nltk.stem.lancaster import LancasterStemmer
import re
# Shared stemmer instance; count_words() uses it only when `stem` is True.
st = LancasterStemmer()

In [3]:
# Set to True to count stemmed words (prefixes, plurals, etc. stripped);
# leave False to count the words literally as they were written.
stem = False


import os
cwd = os.getcwd()

# Names of the mailing-list archives to analyse.
archives_names = ["6lo"]

# Each archive is read from a CSV file located under ../../archives/.
archive_paths = ['../../archives/' + name + '.csv' for name in archives_names]

# Load the data of every archive, then merge everything into a single Archive.
archives_list = []
for arch_path in archive_paths:
    archives_list.append(load_archive(arch_path).data)

archives = Archive(pd.concat(archives_list))
archives_data = archives.data

In [4]:
#preparing a function to count top words per user

def count_words(texts):
    """Count occurrences of every non-stopword token found in ``texts``.

    Each text has its apostrophes removed and every other non-word character
    replaced by a space before it is tokenized with NLTK.  When the
    notebook-level flag ``stem`` is true, tokens are reduced with the
    Lancaster stemmer ``st`` before being counted.

    Parameters
    ----------
    texts : iterable
        Message bodies.  Entries that are not strings (for example a missing
        body stored as NaN) are skipped instead of aborting the whole count.

    Returns
    -------
    dict
        Maps each word to a one-element list ``[count]``.  The list wrapper is
        kept because the export cell reads ``count[0]``.
    """
    # Fetch the stopword list once and keep it as a set: asking NLTK for it on
    # every token repeated the same lookup and searched a list linearly.
    # NOTE: matching is case-sensitive, exactly as before.
    english_stopwords = set(stopwords.words('english'))

    wordcount = {}
    for text in texts:
        if not isinstance(text, str):
            continue
        no_apostrophes = text.replace("'", "")
        cleaned = re.sub(r'[^\w]', ' ', no_apostrophes)
        for token in tokenize.word_tokenize(cleaned):
            if stem:
                try:
                    word = st.stem(token)
                except Exception:
                    # Show the token that could not be stemmed and skip it;
                    # previously the word from the prior iteration was
                    # counted again (or a NameError was raised).
                    print(token)
                    continue
            else:
                word = token
            if word in english_stopwords:
                continue
            if word in wordcount:
                wordcount[word][0] += 1
            else:
                wordcount[word] = [1]
    return wordcount

In [9]:
#extract the list of users and compute the word count per each user (might take some time!)

# Values are per-user word-count dicts, so an unknown user defaults to an
# empty dict (the previous int default did not match the stored values).
user_wordcount = defaultdict(dict)

# Group the message bodies by sender in a single pass instead of filtering the
# whole frame once per user. Rows without a sender are left out by groupby's
# default behaviour; they never contributed any words before either.
for user, messages in archives_data.groupby("From")["Body"]:
    try:
        user_wordcount[user] = count_words(messages)
    except Exception as error:
        # Carry on with the remaining users, but say who was skipped and why
        # rather than dropping them silently.
        print('Skipping user {!r}: {}'.format(user, error))

In [9]:
#insert the number of top words you want to export
n_top_words = 10

#edit the file name in case...
# The csv module on Python 3 needs a text-mode file opened with newline='';
# the former binary mode ("wb") made writerow() fail with a TypeError.
# The with-block closes the file even if writing a row raises.
with open('users_topwords.csv', "w", newline='', encoding='utf-8') as users_topwords_f:
    users_w = csv.writer(users_topwords_f)

    for user, wordcount in user_wordcount.items():
        # Each value is a one-element list [count]; rank by count, breaking
        # ties by word, both in descending order.
        top_words = sorted(
            wordcount.items(),
            key=lambda k_v: (k_v[1][0], k_v[0]),
            reverse=True,
        )[:n_top_words]
        # One row per (user, word, count).
        for word, count in top_words:
            users_w.writerow([user, word, count[0]])
print('File exported!')


File exported!

In [ ]: