In [ ]:
<b>This notebook takes a list of users and outputs the most recurring words for each user.</b>

In [12]:
from bigbang.archive import load as load_archive
from nltk.corpus import stopwords
from nltk import tokenize
from collections import defaultdict
import csv
from pandas import DataFrame as df
from nltk.stem.lancaster import LancasterStemmer
import re
st = LancasterStemmer()

In [13]:
# insert ONE url of the mailing list
url = '6lo/'

# specify whether words should be stemmed (no prefixes, plurals, etc.) or kept literal
stem = False


# try the plain archive path first, then fall back to the alternate naming scheme
try:
    arch_path = '../archives/' + url[:-1].replace('://', '_/') + '.csv'
    archive = load_archive(arch_path).data
except Exception:
    arch_path = '../archives/' + url[:-1].replace('//', '/') + '.csv'
    archive = load_archive(arch_path).data
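
In [ ]:
<b>Optional sanity check:</b> a minimal sketch (not part of the original notebook) to confirm the archive loaded and exposes the "From" and "Body" columns used below.

In [ ]:
# quick look at the loaded archive (a pandas DataFrame, as returned by bigbang's load_archive)
print(arch_path)
print(len(archive), 'messages loaded')
archive[["From", "Body"]].head()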

In [14]:
def count_words(texts):
    wordcount = {}
    english_stopwords = set(stopwords.words('english'))
    for text in texts:
        # some messages have an empty Body; skip anything that is not a string
        if not isinstance(text, str):
            continue
        w = text.replace("'", "")
        k = re.sub(r'[^\w]', ' ', w)
        t = tokenize.word_tokenize(k)
        for g in t:
            try:
                word = st.stem(g) if stem else g
            except Exception:
                # tokens the stemmer cannot handle are printed and skipped
                print(g)
                continue
            if word in english_stopwords:
                continue
            if word not in wordcount:
                wordcount[word] = 1
            else:
                wordcount[word] += 1
    return wordcount
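
In [ ]:
<b>Example:</b> a quick, hedged check of count_words on a handful of message bodies before running it over every user (the sample size is arbitrary).

In [ ]:
# try count_words on the first few message bodies and show the ten most frequent words
sample_counts = count_words(archive["Body"][:5])
sorted(sample_counts.items(), key=lambda kv: kv[1], reverse=True)[:10]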

In [15]:
# insert the name of the file with users (should be in the bigbang directory)
file_name = 'users_count.csv'
users_f = open('../' + file_name, "r", newline='')
users_r = csv.reader(users_f, delimiter=',')
user_wordcount = defaultdict(dict)
for user in users_r:
    user_wordcount[user[0]] = count_words(archive[archive["From"] == user[0]]["Body"])
users_f.close()
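
In [ ]:
<b>Optional check:</b> a small sketch to inspect the counts computed for one of the users before exporting.

In [ ]:
# peek at the counts for the first user read from the file (if any)
if user_wordcount:
    some_user = next(iter(user_wordcount))
    print(some_user, len(user_wordcount[some_user]), 'distinct words')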


In [ ]:
# insert the number of top words you want to export
n_top_words = 10

# edit the file name if needed...
users_topwords_f = open('../users_topwords.csv', "w", newline='')
users_w = csv.writer(users_topwords_f)

for user, wordcount in user_wordcount.items():
    # sort by count (then alphabetically) and keep the n_top_words most frequent words
    top_words = sorted(wordcount.items(), reverse=True, key=lambda k_v: (k_v[1], k_v[0]))[:n_top_words]
    for word, count in top_words:
        users_w.writerow([user, word, count])
users_topwords_f.close()
print('File exported!')
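
In [ ]:
<b>Optional check:</b> a minimal sketch that reads the exported file back to verify its contents (user, word, count per row).

In [ ]:
# read back the first few exported rows
with open('../users_topwords.csv', 'r', newline='') as f:
    for row in list(csv.reader(f))[:10]:
        print(row)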

In [ ]: