This notebook tries to detect "special words" in a corpus of mailing lists. (For now it works with exactly two mailing lists.)

It computes and exports to .csv files the word counts (words and their occurrences); it computes and exports to .csv files the list of common words that are introduced by different people in different lists; and it computes and prints the 'influential words' (see the definition further below).

Further extensions: generalize from two lists to n lists.


In [3]:
from bigbang.archive import Archive
from bigbang.archive import load as load_archive
import bigbang.parse as parse
import bigbang.graph as graph
import bigbang.mailman as mailman
import bigbang.process as process
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint as pp
import pytz
import numpy as np
import math
import nltk
from itertools import repeat
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from nltk.corpus import stopwords
import re
import os

In [51]:
# Insert TWO names of mailing lists (no more, no less).

cwd = os.getcwd()

archives_names = ["ietf-privacy", "architecture-discuss"]

# Each archive is expected as a CSV file under ../../archives/.
archives_paths = ['../../archives/' + name + '.csv' for name in archives_names]

# Load each archive's message DataFrame, keeping them separate for the
# per-list analyses below.
archives_list = []
for path in archives_paths:
    archives_list.append(load_archive(path).data)

# Also build one combined Archive over both lists.
archives = Archive(pd.concat(archives_list))

archives_data = archives.data

In [52]:
#to stem or not to stem? 
#if stem is set to True, then words are converted into their stem(no plurals, no suffixes, etc.)
#if stem is set to False, then words are processed for their literal spelling
#(this flag is read by the word-count cells below)

stem = False

First, we shall compute the word counts on the lists. The data will also be exported to .csv files.


In [53]:
#Compute word count on the first list.
#For each word we store a list: [count, Message-ID, Date, From, In-Reply-To],
#where the metadata describes the message that first introduced the word.

# Build the stopword set once: membership test is O(1) vs an O(n) list scan
# per token in the original.
stopset = set(stopwords.words('english'))

wordcount1 = {}
for row in archives_list[0].iterrows():
    if row[1]["Body"] is not None:
        # Strip apostrophes, then replace all non-word characters with spaces
        # before tokenizing.
        w = row[1]["Body"].replace("'", "")
        k = re.sub(r'[^\w]', ' ', w)
        t = nltk.tokenize.word_tokenize(k)
        for g in t:
            try:
                word = st.stem(g) if stem else g
            except Exception:
                # Stemming failed for this token: report and skip it.
                # (The original fell through with `word` still bound to the
                # previous token, silently miscounting.)
                print(g)
                continue
            if word in stopset:
                continue
            if word not in wordcount1:
                # First occurrence: count 1 plus the introducing message's metadata.
                wordcount1[word] = [1, row[0], row[1]["Date"],
                                    row[1]["From"], row[1]["In-Reply-To"]]
            else:
                wordcount1[word][0] += 1

# Backup reference in case wordcount1 is clobbered later.
# (The original read the undefined name `wordcount` here — a NameError.)
wd = wordcount1

In [54]:
#Compute word count on the second list (same structure as for the first list:
#word -> [count, Message-ID, Date, From, In-Reply-To]).

# Build the stopword set once for O(1) membership tests.
stopset = set(stopwords.words('english'))

wordcount2 = {}
for row in archives_list[1].iterrows():
    if row[1]["Body"] is not None:
        w = row[1]["Body"].replace("'", "")
        k = re.sub(r'[^\w]', ' ', w)
        t = nltk.tokenize.word_tokenize(k)
        for g in t:
            try:
                word = st.stem(g) if stem else g
            except Exception:
                # Skip tokens that cannot be processed. (The original's bare
                # except/pass left `word` bound to the previous token, which
                # was then counted a second time.)
                continue
            if word in stopset:
                continue
            if word not in wordcount2:
                # First occurrence: count 1 plus the introducing message's metadata.
                wordcount2[word] = [1, row[0], row[1]["Date"],
                                    row[1]["From"], row[1]["In-Reply-To"]]
            else:
                wordcount2[word][0] += 1

In [55]:
#Create and export a wordcount information dataframe per mailing list.

#Files are written next to this notebook (cwd); change `cwd` to store elsewhere.

# First list -> wordcount1. (The original used the undefined name `wordcount`
# here, which raised a NameError.)
asd = pd.DataFrame(wordcount1)
new_dataframe = asd.transpose()
new_dataframe.columns = ["Wordcount", "Message-ID", "Date","From","In-Reply-To"]
new_dataframe.to_csv(cwd+'/wordcount_info_'+archives_names[0]+'.csv')

# Second list -> wordcount2. (The original exported wordcount1 again, so the
# second file would have contained the first list's data.)
asd1 = pd.DataFrame(wordcount2)
new_dataframe1 = asd1.transpose()
new_dataframe1.columns = ["Wordcount", "Message-ID", "Date","From","In-Reply-To"]
new_dataframe1.to_csv(cwd+'/wordcount_info_'+archives_names[1]+'.csv')

print('File exported!')
# Fixed: the original was missing the '/' before the second filename.
print('Check '+cwd+'/wordcount_info_'+archives_names[0]+'.csv and '+cwd+'/wordcount_info_'+archives_names[1]+'.csv')


File exported!
Check /home/berra/bigbang/examples/word_analysis/wordcount_info_ietf-privacy.csv and /home/berra/bigbang/examples/word_analysiswordcount_info_architecture-discuss.csv

Let's print some useful descriptive data:


In [56]:
# Report how many distinct words were seen in the first mailing list.
print('Number of unique words in mailinglist ' + archives_names[0])
print(len(wordcount1))


Number of unique words in mailinglist ietf-privacy
11982

In [57]:
# Report how many distinct words were seen in the second mailing list.
print('Number of unique words in mailinglist ' + archives_names[1])
print(len(wordcount2))


Number of unique words in mailinglist architecture-discuss
14581

In [58]:
# Count the words that appear in both mailing lists.
samewordcount = sum(1 for word in wordcount1 if word in wordcount2)
print('Number of same unique words in two mailing lists')
print(samewordcount)


Number of same unique words in two mailing lists
5906

In [59]:
# Words present in both lists whose introducer (the "From" field, index 3)
# is the same person in each list. For each such word keep
# [count1, from1, date1, count2, from2, date2].
samewords = {}
for word, info1 in wordcount1.items():
    info2 = wordcount2.get(word)
    if info2 is not None and info1[3] == info2[3]:
        samewords[word] = [info1[0], info1[3], info1[2],
                           info2[0], info2[3], info2[2]]
print('Total number of same words that are introduced by same people')
print(len(samewords))


Total number of same words that are introduced by same people
75

In [60]:
#build dataframe of information of those words introduced by same people
#and export to file
samewords_sameauthor_dataframe = pd.DataFrame(samewords).transpose()
samewords_sameauthor_dataframe.columns = ["Wordcount1", "From1", "Date1",
                                          "Wordcount2", "From2", "Date2"]
samewords_sameauthor_dataframe.to_csv(cwd + '/samewords_sameauthor.csv')
print('File exported!')
print('Check ' + cwd + '/samewords_sameauthor.csv')


File exported!
Check /home/berra/bigbang/examples/word_analysis/samewords_sameauthor.csv

In [61]:
# Count words appearing 100-500 times in BOTH lists.
samewordcount = sum(
    1
    for word, info in wordcount1.items()
    if 100 <= info[0] <= 500
    and word in wordcount2
    and 100 <= wordcount2[word][0] <= 500
)
print('Among 100-500 appearance words, the number of common words between two mailing-list')
print(samewordcount)


Among 100-500 appearance words, the number of common words between two mailing-list
119

In [62]:
# Count words appearing 100-500 times in BOTH lists that were first
# introduced by the same person (the "From" field, index 3) in each list.
same_person_count = sum(
    1
    for word, info in wordcount1.items()
    if 100 <= info[0] <= 500
    and word in wordcount2
    and 100 <= wordcount2[word][0] <= 500
    and info[3] == wordcount2[word][3]
)
print('Among 100-500 appearance words, the number of common words between two mailing-list that are first introduced by same people')
print(same_person_count)


Among 100-500 appearance words, the number of common words between two mailing-list that are first introduced by same people
0

We want to compute the list of common words that are introduced by different people in different lists. The results are exported in a .csv file


In [63]:
#compute common word list(introduced by different people in different lists)
#and print the number
# commonwords[word] = [count1, from1, date1, count2, from2, date2]
commonwords = {}
for word, info1 in wordcount1.items():
    if not (100 <= info1[0] <= 500):
        continue
    info2 = wordcount2.get(word)
    if info2 is None or not (100 <= info2[0] <= 500):
        continue
    if info1[3] != info2[3]:  # different introducer in each list
        commonwords[word] = [info1[0], info1[3], info1[2],
                             info2[0], info2[3], info2[2]]
print('Number of common words introduced by different people in different lists')
print(len(commonwords))


Number of common words introduced by different people in different lists
119

In [64]:
#build dataframe of information of those words introduced by different people
#and export to file
commonword_differentauthor_dataframe = pd.DataFrame(commonwords).transpose()
commonword_differentauthor_dataframe.columns = ["Wordcount1", "From1", "Date1",
                                                "Wordcount2", "From2", "Date2"]
commonword_differentauthor_dataframe.to_csv(cwd + '/commonwords_differentauthor.csv')
print('File exported!')
print('Check ' + cwd + '/commonwords_differentauthor.csv')


File exported!
Check /home/berra/bigbang/examples/word_analysis/commonwords_differentauthor.csv

Let's identify "influential words" (see definition below) and print them


In [65]:
#Compute 'influential words', the list of words that have potential of idea flows.

#Definition: word A is introduced by person p in list1 first; later q introduces
#A in list2 (or vice versa). We count A as influential if q had already posted
#in list1 before p first used the word there (i.e. q could have "seen" it).


# Build, per mailing list, a map sender -> date of their first message.
first_participation1 = {}
for row in archives_list[0].iterrows():
    sender = row[1]["From"]
    if sender not in first_participation1:
        first_participation1[sender] = row[1]["Date"]

first_participation2 = {}
for row in archives_list[1].iterrows():
    sender = row[1]["From"]
    if sender not in first_participation2:
        first_participation2[sender] = row[1]["Date"]

# commonwords[word] = [count1, from1, date1, count2, from2, date2]
time_influence = 0
influence_list = {}
for word, info in commonwords.items():
    count1, from1, date1, count2, from2, date2 = info
    if date1 > date2:
        # The word appeared in list2 first: did list1's introducer (from1)
        # already participate in list2 before the word surfaced there?
        if from1 in first_participation2 and first_participation2[from1] < date2:
            influence_list[word] = info
            time_influence += 1
    else:
        # The word appeared in list1 first (or on the same date): symmetric check
        # on list2's introducer (from2) against list1.
        if from2 in first_participation1 and first_participation1[from2] < date1:
            influence_list[word] = info
            time_influence += 1

In [66]:
#print the list of influential words (exclude numbers)
if not influence_list:
    print('No influential words detected')
for word, info in influence_list.items():
    if not word.isdigit():  # skip purely numeric tokens
        print('"' + word + '"')
        print(info)
        print(' ')


No influential words detected

In [ ]: