What it does: -it computes and plots word counts over time, on aggregated mailing lists' data. -it exports the emails that contain the selected words
Parameters to set options: -it can track one or more words, according to the words listed in the variable 'checkwords' -it can look in one or more mailing lists, according to how many urls are set; word counts are aggregated across mailing lists -it can look at literal words or at stemmed words, according to the 'stem' parameter
Useful extensions: -export a dictionary with word count trends on individual mailing lists -look at compound words (e.g. 'human rights') -give the option to SUM word counts instead of treating words separately -give the possibility to normalize word counts
In [1]:
%matplotlib inline
In [2]:
from bigbang.archive import load as load_archive
from bigbang.archive import Archive
import bigbang.mailman as mailman
import bigbang.process as process
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint as pp
import pytz
import numpy as np
import math
import nltk
from itertools import repeat
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from nltk.corpus import stopwords
import re
In [3]:
# Apply a preconfigured graph style. pandas' own `display.mpl_style` option
# was deprecated and later removed, so the style is set through matplotlib
# (already imported above as `plt`) instead.
plt.style.use('ggplot')
In [4]:
# Insert a list of the urls of downloaded mailing lists that you want to
# include in the analysis.
# Data will be merged: multiple mailing lists are treated as a unique corpus,
# e.g. urls = ["https://mm.icann.org/pipermail/cc-humanrights/",
#              "https://mm.icann.org/pipermail/euro-board/"]
urls = ["http://mm.icann.org/pipermail/cc-humanrights/",
        "http://mm.icann.org/pipermail/euro-board/"]
        # "http://mm.icann.org/pipermail/wp4/"]
        # "http://mm.icann.org/pipermail/wp1/"]

# Two naming schemes are tried for the CSV files under ../archives/:
# first "http_/host/..." ('://' -> '_/'), then "http:/host/..." ('//' -> '/').
# rstrip('/') drops the trailing slash without eating a real character when
# the url has none.
try:
    arch_paths = ['../archives/' + url.rstrip('/').replace('://', '_/') + '.csv'
                  for url in urls]
    archives = [load_archive(arch_path) for arch_path in arch_paths]
except Exception:
    # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
    arch_paths = ['../archives/' + url.rstrip('/').replace('//', '/') + '.csv'
                  for url in urls]
    archives = [load_archive(arch_path) for arch_path in arch_paths]

# Merge every mailing list into a single DataFrame.
df = pd.concat([arx.data for arx in archives])
In [5]:
# Insert the list of words to be tracked, e.g. checkwords = ['rights', 'economy', 'human'].
# Entries without a space are counted as whole tokens by count_word.
# Entries containing a space (e.g. 'human rights') take count_word's other
# branch: they are counted as substrings of the lower-cased email body.
checkwords = ["internet","right","human","human rights"]
In [6]:
# To stem or not to stem?
# If stem is True, the email tokens are stemmed before counting, so the
# checkwords should themselves be stemmed words (no plurals, no suffixes, etc.).
# If stem is False, checkwords are searched for by their literal spelling.
# count_word only consults this flag for entries without a space.
stem = False
In [7]:
#extension: filter by date?
In [ ]:
def count_word(text, word):
    """Count the occurrences of `word` in `text`, ignoring case.

    Parameters:
        text: the email body. Empty or non-text values (None, NaN from
            missing data) count as zero occurrences.
        word: the word or phrase to look for. It is lower-cased before
            matching, because the text is lower-cased as well.

    Returns:
        The number of occurrences as an int.

    A single word is matched against whole tokens of the text, after removing
    apostrophes and punctuation. When the module-level `stem` flag is set the
    tokens are stemmed first, and `word` is compared to them as given, so it
    should already be a stem.

    A phrase (anything containing whitespace) is matched as a substring of
    the text with runs of whitespace collapsed to one space, so a phrase
    broken over two lines is still found. Being a substring match, it can
    also hit inside longer words (e.g. 'human rights' in 'inhuman rights').
    """
    # hasattr guards against NaN floats, which are truthy but are not text.
    if not text or not hasattr(text, "lower"):
        return 0
    word = word.lower()
    pieces = word.split()
    if len(pieces) <= 1:
        word = word.strip()
        # normalize the text - remove apostrophe and punctuation, lower case
        normalized_text = re.sub(r'[^\w]', ' ', text.replace("'", "")).lower()
        tokenized_text = nltk.tokenize.word_tokenize(normalized_text)
        if stem:
            tokenized_text = [st.stem(t) for t in tokenized_text]
        return tokenized_text.count(word)
    # Phrase: collapse whitespace on both sides so line wraps do not matter.
    phrase = " ".join(pieces)
    flat_text = " ".join(text.lower().split())
    return flat_text.count(phrase)
In [ ]:
# Add one column per tracked word, holding how many times that word occurs
# in the body of each email.
for word in checkwords:
    df[word] = df['Body'].apply(count_word, args=(word,))
In [ ]:
# Save each email in a file based on which checkword it contains: one CSV per
# checkword, named "<checkword>.csv". Good for doing some qualitative analysis.
import os

# Set the path where the data are to be saved.
path = '.'

for word in checkwords:
    # A single parenthesised argument prints the same on Python 2 and Python 3.
    print("Saving data for checkword " + word + "...")
    # Keep only the emails in which the checkword occurs at least once.
    df[df[word] > 0].to_csv(os.path.join(path, word + '.csv'))
In [ ]:
# Emails without a date cannot be placed on the time axis: drop them.
df = df.dropna(subset=['Date'])

# Key every email by the ordinal of its date, then sum the columns per day.
df['Date-ordinal'] = df['Date'].map(lambda day: day.toordinal())
df_sums = df.groupby('Date-ordinal').sum()
In [ ]:
from datetime import date

# Work on a copy so that the frame used for plotting keeps its columns.
for_export = df_sums.copy()

# The index holds date ordinals; add a readable 'Date' column next to it.
dates_again = for_export.index.to_series().apply(date.fromordinal)
for_export['Date'] = dates_again

for_export.to_csv("word_counts_by_date.csv")
In [ ]:
# Plot the smoothed daily count of every checkword on a shared time axis.
plt.figure(figsize=(12.5, 7.5))
colors = 'rgbkm'   # one matplotlib colour code per line, reused cyclically
window = 5         # width of the rolling-mean window, in rows of df_sums

# The smoothing does not depend on the checkword, so compute it once.
# DataFrame.rolling(...).mean() replaces the removed pd.rolling_mean().
smooth_sums = df_sums.rolling(window).mean()

for i, word in enumerate(checkwords):
    plt.plot_date(smooth_sums.index,
                  smooth_sums[word],
                  # wrap around so more than len(colors) checkwords still plot
                  colors[i % len(colors)],
                  label=word)

plt.legend(bbox_to_anchor=(.2, 1))
In [ ]: