In [ ]:
<b>This notebook takes a list of users and outputs the most recurring words for each user.</b>
In [12]:
from bigbang.archive import load as load_archive
from nltk.corpus import stopwords
from nltk import tokenize
from collections import defaultdict
import csv
from pandas import DataFrame as df
from nltk.stem.lancaster import LancasterStemmer
import re
st = LancasterStemmer()
In [13]:
#insert ONE url of mailing list
url = '6lo/'
#specify if you want to have words stemmed (no prefixes, pluralrs, etc.) or literal
stem = False
try:
arch_path = '../archives/'+url[:-1].replace('://','_/')+'.csv'
archive = load_archive(arch_path).data
except:
arch_path = '../archives/'+url[:-1].replace('//','/')+'.csv'
archive = load_archive(arch_path).data
In [14]:
def count_words(texts):
    """Count word occurrences over an iterable of message-body strings.

    Apostrophes are removed, remaining non-word characters are replaced
    with spaces, and the text is tokenized with nltk. The module-level
    ``stem`` flag selects whether tokens are stemmed with the
    module-level ``st`` (LancasterStemmer) before counting; English
    stopwords are skipped.

    :param texts: iterable of strings (e.g. the "Body" column of the archive)
    :return: dict mapping word -> [count]  (count wrapped in a
        one-element list — the export cell reads ``count[0]``)
    """
    wordcount = {}
    # Build the stopword set once: the original called
    # stopwords.words('english') (a list) for EVERY token, making the
    # inner check a linear scan per word.
    stop_set = set(stopwords.words('english'))
    for text in texts:
        # Drop apostrophes so contractions collapse ("don't" -> "dont"),
        # then map every remaining non-word character to a space.
        cleaned = re.sub(r'[^\w]', ' ', text.replace("'", ""))
        for token in tokenize.word_tokenize(cleaned):
            try:
                word = st.stem(token) if stem else token
            except Exception:
                # BUG FIX: the original fell through after a stemming
                # failure, re-counting the previous loop's `word` (or
                # raising NameError on the very first token). Report the
                # problem token and skip it instead.
                print(token)
                continue
            if word in stop_set:
                continue
            if word not in wordcount:
                wordcount[word] = [1]
            else:
                wordcount[word][0] += 1
    return wordcount
In [15]:
#insert the name of the file with users (should be in the bigbang directory)
file_name = 'users_count.csv'
users_f = open('../'+file_name, "rb")
users_r = csv.reader(users_f, delimiter = ',')
user_wordcount = defaultdict(dict)
for user in users_r:
user_wordcount[user[0]]= count_words(archive[archive["From"] == user[0]]["Body"])
In [ ]:
#insert the number of top words you want to export
n_top_words = 10
#edit the file name in case...
users_topwords_f = open('../users_topwords.csv', "wb")
users_w = csv.writer(users_topwords_f)
for user, wordcount in user_wordcount.items():
for word, count in sorted(iter(wordcount.items()), reverse = True, key = lambda k_v:(k_v[1],k_v[0]))[:n_top_words]:
users_w.writerow([user]+[word]+[count[0]])
users_topwords_f.close()
print('File exported!')
In [ ]: