In [12]:
from bigbang.archive import Archive
from bigbang.archive import load as load_archive
import os
import pandas as pd
import numpy as np
In [13]:
ietf_path = "../archives/"
ncuc_path = "../archives/http:/lists.ncuc.org/pipermail"
paths = [os.path.join(ietf_path, "ipc-gnso.csv"),
         os.path.join(ietf_path, "wp4.csv"),
         os.path.join(ncuc_path, "ncuc-discuss.csv")]
archives = [load_archive(path) for path in paths]
In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# token pattern: words of two or more characters that do not begin with a digit or underscore
tp = r'(?u)\b[^\W\d_]\w+\b'
In [15]:
def ordered_words(data, authors=None):
    if authors is not None:
        ## Filter to only those emails that include the given authors:
        ## a Series keyed by email ID, valued True iff
        ## one of the author names is in the From field
        selected = data['From'].apply(lambda x: any([(author in x)
                                                     for author in authors]))
        # a Series of Booleans can be used to select
        # only certain rows from a DataFrame
        data = data[selected]

    cv = CountVectorizer(max_df=.16, min_df=5, token_pattern=tp)
    c_dtm = cv.fit_transform(data['Body'].dropna())

    feature_names = cv.get_feature_names()
    feature_counts = np.array(c_dtm.sum(axis=0))[0]
    feature_order = np.argsort(feature_counts)[::-1]
    sorted_features = [feature_names[i] for i in feature_order]

    ## word -> rank (0 is the most frequent word)
    rankings = pd.Series({pair[1]: pair[0]
                          for pair in enumerate(sorted_features)})
    ## word -> total count across the archive
    counts = pd.Series({feature_names[i]: feature_counts[i]
                        for i in feature_order})

    ## Returns a pair (a tuple of length 2)
    return rankings, counts
The line below creates a list of three pairs, each pair containing two pandas.Series objects.
A Series is like a dictionary, except that its items are ordered and its values must share a data type. The ordered keys of a Series are its index. It is easy to compose Series objects into a DataFrame, as in the small example below.
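For instance, two Series that share an index line up by their keys when combined with pandas.concat. This is a minimal sketch with made-up words and numbers, not values computed from the archives:
In [ ]:
# hypothetical example data, not drawn from the mailing list archives
ranks = pd.Series({'protocol': 0, 'draft': 1, 'header': 2})
counts = pd.Series({'protocol': 311, 'draft': 290, 'header': 145})
# axis=1 aligns the two Series on their shared index (the words)
pd.concat([ranks, counts], axis=1)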
In [16]:
series = [ordered_words(archive.data) for archive in archives]
This concatenates those Series column-wise into a single DataFrame. The columns alternate between word rankings and word counts.
In [17]:
rankings = pd.concat([series[0][0],
                      series[0][1],
                      series[1][0],
                      series[1][1],
                      series[2][0],
                      series[2][1]], axis=1)
In [11]:
# display the first 5 rows of the DataFrame
rankings[:5]
We should rename the columns to be more descriptive of the data.
In [8]:
rankings.rename(columns={0: 'ipc-gnso rankings',
                         1: 'ipc-gnso counts',
                         2: 'wp4 rankings',
                         3: 'wp4 counts',
                         4: 'ncuc-discuss rankings',
                         5: 'ncuc-discuss counts'}, inplace=True)
In [9]:
rankings[:5]
Use the to_csv() function on the DataFrame object to export the data to CSV format, which you can open easily in Excel.
In [10]:
rankings.to_csv("rankings_all.csv",encoding="utf-8")
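As a quick sanity check (a minimal sketch, assuming rankings_all.csv was written to the current working directory by the cell above), the exported file can be read back into a DataFrame, with the words as the index:
In [ ]:
# read the exported rankings back in; the first column (the words) becomes the index
pd.read_csv("rankings_all.csv", index_col=0, encoding="utf-8")[:5]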
To filter the data by certain authors before computing the word rankings, provide a list of author names as an argument.
Only emails whose From header includes one of the author names will be included in the calculation.
Note that author detection currently uses simple string inclusion, so you may need to try multiple variations of an author's name in order to catch all emails written by a person of interest (see the example further below).
In [11]:
authors = ["Greg Shatan",
"Niels ten Oever"]
ordered_words(archives[0].data, authors=authors)
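For example, to be more thorough about a single person, you can pass several spellings of the same name; the variants below are hypothetical and have not been checked against these archives:
In [ ]:
# hypothetical variations of one author's name; simple string inclusion
# means an email is selected if its From field contains any of them
authors = ["Greg Shatan",
           "Gregory Shatan",
           "Shatan"]
ordered_words(archives[0].data, authors=authors)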