In [ ]:
## This notebook shows the trends of multiple words across multiple mailing lists

What it does:
-it computes and plots word counts over time, on the aggregated data of the mailing lists.
-it exports the emails that contain the selected words

Parameters to set options:
-it can track one or more words, according to the number of words set in the variable 'checkwords' 
-it can look in one or more mailing lists, according to how many urls are set; word counts are aggregated across mailing lists
-it can look at literal words or at stemmed words, according to the 'stem' parameter

Useful extensions:
-export dictionary with wordcount trends on individual mailing lists
-look at compound words (e.g. 'human rights')
-give option to SUM word counts instead of treating words separately
-give possibility to normalize word counts

In [25]:
%matplotlib inline

In [26]:
from bigbang.archive import load as load_archive
from bigbang.archive import Archive
import bigbang.mailman as mailman
import bigbang.process as process
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint as pp
import pytz
import numpy as np
import math
import nltk
from itertools import repeat
# The module path was missing from this import, which made the line a syntax error.
from nltk.stem.lancaster import LancasterStemmer
# Shared stemmer instance; count_word applies it to every token when `stem` is True.
st = LancasterStemmer()
from nltk.corpus import stopwords
import re

In [27]:
# pd.options.display.mpl_style = 'default' # pandas has a set of preferred graph formatting options
# Select the 'ggplot' style sheet through the pyplot alias imported above.
# The bare name `matplotlib` is never imported in this notebook, which is what
# made this cell fail with "NameError: name 'matplotlib' is not defined".
plt.style.use('ggplot')

In [9]:
#insert a list of the urls of downloaded mailing lists that you want to include in the analysis. 
#data will be merged: multiple mailing lists are treated as a unique corpus
#e.g. urls  = ["", 
#              ""]

urls = ["6lo/",
        # add further mailing lists here, one entry per list
       ]

# NOTE(review): the expression that turned each url into an archive path was
# lost from this cell (both `for url in urls:` loops had no body). The layout
# below -- one CSV per list under ../archives/, named after the url without its
# trailing slash -- is an assumption; confirm it against where the archives
# were actually downloaded.
archive_dir = '../archives/'
arch_paths = [archive_dir + url.rstrip('/') + '.csv' for url in urls]
archives = [load_archive(arch_path) for arch_path in arch_paths]

# Merge every list into one corpus.
# NOTE(review): the element expression of this comprehension was missing;
# `arx.data` presumes each loaded archive exposes its emails as a DataFrame
# in a `data` attribute -- confirm against bigbang.archive.Archive.
df = pd.concat([arx.data for arx in archives])

In [10]:
#insert a list of words to be tracked e.g. checkwords = ['rights', 'economy', 'human']
#count_word only tokenizes (and optionally stems) entries that contain no space;
#an entry with a space, such as 'human rights', skips that tokenization step
checkwords = ["internet","right","human","human rights"]

In [11]:
#to stem or not to stem?
#if stem is set to True, count_word stems every token of the text with the stemmer `st`
#before counting, so checkwords should be stemmed words (no plurals, no suffixes, etc.)
#if stem is set to False, then checkwords are searched for their literal spelling
stem = False

In [12]:
#extension: filter by date?

In [13]:
def count_word(text,word):
    """Count the occurrences of `word` in `text`, ignoring case.

    Parameters
    ----------
    text : str or missing value
        The email body. None, an empty string and float NaN (how pandas
        represents a missing body) all count as "no text".
    word : str
        The checkword. An entry without spaces is matched against whole
        tokens of the text; an entry containing a space (e.g. 'human rights')
        is matched as a substring of the lower-cased text.

    Returns
    -------
    int
        Number of occurrences; 0 when there is no text.

    Notes
    -----
    Reads the notebook-level names `stem` (bool) and `st` (stemmer): when
    `stem` is true every token is stemmed before counting, so `word` must
    already be given in stemmed form.
    """
    # A missing body reaches us as None, '' or NaN; NaN is truthy, so it
    # needs its own check or `.replace` below would fail on a float.
    if not text or isinstance(text, float):
        return 0

    # The text is lower-cased before matching, so do the same to the checkword.
    word = word.lower()

    if len(word.split(" ")) <= 1:
        ## normalize the text - remove apostrophe and punctuation, lower case
        normalized_text = re.sub(r'[^\w]', ' ',text.replace("'","")).lower()
        tokenized_text = nltk.tokenize.word_tokenize(normalized_text)

        if stem:
            tokenized_text = [st.stem(t) for t in tokenized_text]
        return tokenized_text.count(word)

    # Multi-word phrase: tokens can never equal it, so count substrings.
    # This return used to sit after the one above, inside the single-word
    # branch, where it was unreachable and phrases yielded None.
    return text.lower().count(word)

In [14]:
# Add one column per checkword holding that word's count in each email body.
for word in checkwords:
    df[word] = df['Body'].apply(count_word, args=(word,))

In [15]:
# Write one CSV per checkword containing every email in which that word was
# counted at least once -- handy for qualitative reading of the matches.

import os

# directory that receives the CSV files
path = '.'

for word in checkwords:
    print("Saving data for checkword "+word+"...")
    matching_emails = df[df[word] > 0]
    out_file = os.path.join(path,word+'.csv')
    matching_emails.to_csv(out_file)

Saving data for checkword internet...
Saving data for checkword right...
Saving data for checkword human...
Saving data for checkword human rights...

In [16]:
# Keep only the emails that carry a date. A boolean mask is used instead of
# df.dropna(subset=['Date']), which failed on this data with
# "AxisError: axis 1 is out of bounds for array of dimension 1".
# .copy() makes the filtered frame independent so the column assignment below
# does not write into a view of the unfiltered data.
df = df[df['Date'].notnull()].copy()
df['Date-ordinal'] = df['Date'].apply(lambda x: x.toordinal())

# Daily totals of each tracked word. Only the checkword columns are summed so
# that text columns such as the email body are never aggregated.
df_sums = df.groupby('Date-ordinal')[checkwords].sum()

In [ ]:
from datetime import date

# Work on a copy so the daily sums used for plotting stay untouched.
for_export = df_sums.copy()

# The index holds proleptic-Gregorian ordinals (from `toordinal`); turn them
# back into calendar dates. The body of this lambda was truncated in the
# source and is restored here; int() guards against the index values being
# numpy integers rather than Python ints.
dates_again = pd.Series(for_export.index,
                  index=for_export.index).apply(lambda x: date.fromordinal(int(x)))

for_export['Date'] = dates_again


In [ ]:
from datetime import date

plt.figure(figsize=(12.5, 7.5))

colors = 'rgbkm'

# width, in rows of df_sums (one row per day with mail), of the moving average
window = 5

# The smoothing does not depend on the checkword, so compute it once.
# DataFrame.rolling(...).mean() replaces pd.rolling_mean, which has been
# removed from pandas.
smooth_sums = df_sums[checkwords].rolling(window).mean()

# The index holds date ordinals; convert them to real dates for the x axis.
plot_dates = [date.fromordinal(int(day)) for day in smooth_sums.index]

# NOTE(review): the plotting call was missing from the loop body, so nothing
# was drawn and the legend was empty; one line per checkword is assumed here.
for i, word in enumerate(checkwords):
    plt.plot(plot_dates,
             smooth_sums[word],
             color=colors[i % len(colors)],  # cycle if there are more words than colors
             label=word)

plt.xlabel('Date')
plt.ylabel('Word count (%d-row moving average)' % window)
plt.legend(bbox_to_anchor=(.2, 1));

In [ ]: