In [12]:
from bigbang.archive import Archive
from bigbang.archive import load as load_archive
import os
import pandas as pd
import numpy as np

In [13]:
ietf_path = "../archives/"
ncuc_path = "../archives/http:/lists.ncuc.org/pipermail"

paths = [os.path.join(ietf_path,"6lo.csv"),
        os.path.join(ietf_path,"5gangip.csv")]

archives = [load_archive(path) for path in paths]

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

#tp = u'(?u)\x08[^\\W\\d_][^\\W\\d_]+\x08'

tp = '(?u)\\b[^\\W\\d\_]\\w+\\b'

In [15]:
def ordered_words(data,authors=None):
    
    if authors is not None:
        ## Filter to only those emails that include given authors
        
        ## a series of email IDs, valued True iff
        ## one of the author names is in the From field
        selected = data['From'].apply(lambda x: 
                                        any([(author in x)
                                        for author
                                        in authors]))
    
        # a series of Booleans can be used to select
        # only certain rows from a DataFrame
        data = data[selected]
    
    cv = CountVectorizer(max_df=.16,min_df=5,token_pattern=tp)
    
    c_dtm = cv.fit_transform(data['Body'].dropna())
    
    feature_names = cv.get_feature_names()
    feature_counts = np.array(c_dtm.sum(axis=0))[0]
    
    feature_order = np.argsort(feature_counts)[::-1]
    
    sorted_features = [feature_names[i] for i in feature_order]
    
    rankings = pd.Series({pair[1] : pair[0] 
                          for pair 
                          in enumerate(sorted_features)})

    counts = pd.Series({feature_names[i] : feature_counts[i] 
                        for i 
                        in feature_order})
    
    ## Returns a pair (a tuple of length 2)
    return rankings,counts

The line below creates a list of three pairs, each pair containing two pandas.Series objects.

A Series is like a dictionary, only its items are ordered and its values must share a data type. The order keys of the series are its index. It is easy to compose Series objects into a DataFrame.


In [16]:
series = [ordered_words(archive.data) for archive in archives]

This creates a DataFrame from each of the series. The columns alternate between representing word rankings and representing word counts.


In [17]:
rankings = pd.concat([series[0][0],
                      series[0][1],
                      series[1][0],
                      series[1][1],
                      series[2][0],
                      series[2][1]],axis=1)


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-17-4f44f4426481> in <module>()
      3                       series[1][0],
      4                       series[1][1],
----> 5                       series[2][0],
      6                       series[2][1]],axis=1)

IndexError: list index out of range

In [11]:
# display the first 5 rows of the DataFrame
rankings[:5]


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-11-ae24c0150b69> in <module>()
      1 # display the first 5 rows of the DataFrame
----> 2 rankings[:5]

NameError: name 'rankings' is not defined

We should rename the columns to be more descriptive of the data.


In [8]:
rankings.rename(columns={0: 'ipc-gnso rankings',
                         1: 'ipc-gnso counts',
                         2: 'wp4 rankings',
                         3: 'wp4 counts',
                         4: 'ncuc-discuss rankings',
                         5: 'ncuc-discuss counts'},inplace=True)

In [9]:
rankings[:5]


Out[9]:
ipc-gnso rankings ipc-gnso counts wp4 rankings wp4 counts ncuc-discuss rankings ncuc-discuss counts
a0 NaN NaN NaN NaN 6824.0 64.0
a06 NaN NaN NaN NaN 10433.0 28.0
a0f2 NaN NaN 210.0 39.0 8964.0 38.0
a0ff16b3bef68c8657 NaN NaN NaN NaN 21921.0 6.0
a17976 NaN NaN NaN NaN 21918.0 6.0

Use the to_csv() function on the DataFrame object to export the data to CSV format, which you can open easily in Excel.


In [10]:
rankings.to_csv("rankings_all.csv",encoding="utf-8")

To filter the data by certain authors before computing the word rankings, provide a list of author names as an argument.

Only emails whose From header includes on of the author names within it will be included in the calculation.

Note that for detecting the author name, the program for now uses simple string inclusion. You may need to try multiple variations of the authors' names in order to catch all emails written by persons of interest.


In [11]:
authors = ["Greg Shatan",
           "Niels ten Oever"]

ordered_words(archives[0].data, authors=authors)


Out[11]:
(aaikman            43
 abandon          1077
 ability          1078
 able              502
 above             512
 absolve          1080
 abuhamad          334
 ac                724
 acceptable        766
 access            280
 accordance       1236
 account           784
 accountable       417
 accreditation     770
 acct              151
 acctcrosscomm    1090
 accuracy          450
 achieve           835
 across            310
 acs              1409
 act               353
 action            237
 actions           670
 active            492
 actively         1428
 activities        140
 activity          919
 actual            916
 add              1114
 addition          402
                  ... 
 whole            1465
 why               781
 widely           1360
 wiki              556
 willing          1093
 winston            11
 wish             1116
 without            14
 won               938
 wondering        1392
 words             480
 worked           1282
 world             317
 worldclock        514
 worried           438
 worst            1079
 writing          1397
 written           382
 wsis              126
 wsis10            748
 year              101
 years             194
 yes               332
 yesterday         404
 yet               832
 york              467
 zone              292
 zones            1447
 zuck             1469
 zzxya            1122
 dtype: int64, aaikman           83
 abandon            8
 ability            8
 able              19
 above             19
 absolve            8
 abuhamad          25
 ac                14
 acceptable        13
 access            29
 accordance         7
 account           12
 accountable       22
 accreditation     13
 acct              40
 acctcrosscomm      8
 accuracy          21
 achieve           12
 across            26
 acs                6
 act               25
 action            31
 actions           15
 active            19
 actively           5
 activities        43
 activity          10
 actual            10
 add                8
 addition          22
                 ... 
 whole              5
 why               13
 widely             6
 wiki              17
 willing            8
 winston          141
 wish               8
 without          132
 won               10
 wondering          6
 words             20
 worked             7
 world             26
 worldclock        19
 worried           21
 worst              8
 writing            6
 written           23
 wsis              47
 wsis10            13
 year              53
 years             36
 yes               26
 yesterday         22
 yet               12
 york              20
 zone              28
 zones              5
 zuck               5
 zzxya              8
 dtype: int64)

In [ ]: