In [12]:
from bigbang.archive import Archive
from bigbang.archive import load as load_archive
import os
import pandas as pd
import numpy as np
In [13]:
ietf_path = "../archives/"
ncuc_path = "../archives/http:/lists.ncuc.org/pipermail"
paths = [os.path.join(ietf_path, "ipc-gnso.csv"),
         os.path.join(ietf_path, "wp4.csv"),
         os.path.join(ncuc_path, "ncuc-discuss.csv")]
archives = [load_archive(path) for path in paths]
In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# token pattern: words of two or more characters that do not begin with a digit or underscore
tp = r'(?u)\b[^\W\d_]\w+\b'
In [15]:
def ordered_words(data, authors=None):
    if authors is not None:
        ## Filter to only those emails that include the given authors:
        ## a Series keyed by email ID, valued True iff
        ## one of the author names is in the From field
        selected = data['From'].apply(lambda x: any([(author in x)
                                                     for author in authors]))
        # a Series of Booleans can be used to select
        # only certain rows from a DataFrame
        data = data[selected]

    cv = CountVectorizer(max_df=.16, min_df=5, token_pattern=tp)
    c_dtm = cv.fit_transform(data['Body'].dropna())

    feature_names = cv.get_feature_names()
    feature_counts = np.array(c_dtm.sum(axis=0))[0]
    feature_order = np.argsort(feature_counts)[::-1]
    sorted_features = [feature_names[i] for i in feature_order]

    ## word -> rank (0 is the most frequent word)
    rankings = pd.Series({pair[1]: pair[0]
                          for pair in enumerate(sorted_features)})
    ## word -> total count across the archive
    counts = pd.Series({feature_names[i]: feature_counts[i]
                        for i in feature_order})

    ## Returns a pair (a tuple of length 2)
    return rankings, counts
The line below creates a list of three pairs, each pair containing two pandas.Series objects.
A Series is like a dictionary, except that its items are ordered and its values must share a data type. The ordered keys of a Series are its index. It is easy to compose Series objects into a DataFrame, as in the small example below.
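For instance, two Series that share an index line up by their keys when combined with pandas.concat. This is a minimal sketch with made-up words and numbers, not values computed from the archives:
In [ ]:
# hypothetical example data, not drawn from the mailing list archives
ranks = pd.Series({'protocol': 0, 'draft': 1, 'header': 2})
counts = pd.Series({'protocol': 311, 'draft': 290, 'header': 145})
# axis=1 aligns the two Series on their shared index (the words)
pd.concat([ranks, counts], axis=1)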
In [16]:
series = [ordered_words(archive.data) for archive in archives]
This concatenates those Series column-wise into a single DataFrame. The columns alternate between word rankings and word counts.
In [17]:
rankings = pd.concat([series[0][0],
                      series[0][1],
                      series[1][0],
                      series[1][1],
                      series[2][0],
                      series[2][1]], axis=1)
In [11]:
# display the first 5 rows of the DataFrame
rankings[:5]
We should rename the columns to be more descriptive of the data.
In [8]:
rankings.rename(columns={0: 'ipc-gnso rankings',
                         1: 'ipc-gnso counts',
                         2: 'wp4 rankings',
                         3: 'wp4 counts',
                         4: 'ncuc-discuss rankings',
                         5: 'ncuc-discuss counts'}, inplace=True)
In [9]:
rankings[:5]
Use the to_csv() function on the DataFrame object to export the data to CSV format, which you can open easily in Excel.
In [10]:
rankings.to_csv("rankings_all.csv",encoding="utf-8")
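As a quick sanity check (a minimal sketch, assuming rankings_all.csv was written to the current working directory by the cell above), the exported file can be read back into a DataFrame, with the words as the index:
In [ ]:
# read the exported rankings back in; the first column (the words) becomes the index
pd.read_csv("rankings_all.csv", index_col=0, encoding="utf-8")[:5]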
To filter the data by certain authors before computing the word rankings, provide a list of author names as an argument.
Only emails whose From header includes one of the author names will be included in the calculation.
Note that author detection currently uses simple string inclusion, so you may need to try multiple variations of an author's name in order to catch all emails written by a person of interest (see the example further below).
In [11]:
authors = ["Greg Shatan",
"Niels ten Oever"]
ordered_words(archives[0].data, authors=authors)
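For example, to be more thorough about a single person, you can pass several spellings of the same name; the variants below are hypothetical and have not been checked against these archives:
In [ ]:
# hypothetical variations of one author's name; simple string inclusion
# means an email is selected if its From field contains any of them
authors = ["Greg Shatan",
           "Gregory Shatan",
           "Shatan"]
ordered_words(archives[0].data, authors=authors)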