This notebook detects "special words" in a corpus of mailing lists. (For now, it works with two mailing lists only.)

- It computes and exports to .csv files the word counts (words and their occurrences). - It computes and exports to .csv files the list of common words that are introduced by different people in different lists. - It computes and prints the 'influential words' (see the definition below).

Further extensions: - from two lists to n lists!


In [ ]:
# Project-local imports: BigBang provides mailing-list archive loading/parsing.
from bigbang.archive import Archive
from bigbang.archive import load as load_archive
import bigbang.parse as parse
import bigbang.graph as graph
import bigbang.mailman as mailman
import bigbang.process as process
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint as pp
import pytz
import numpy as np
import math
import nltk
from itertools import repeat
from nltk.stem.lancaster import LancasterStemmer
# Stemmer instance used by the word-count cells when `stem` (set below) is True.
st = LancasterStemmer()
from nltk.corpus import stopwords
import re

In [ ]:
# Insert TWO urls of mailing lists.
urls = ["http://mm.icann.org/pipermail/wp4/",
        "http://mm.icann.org/pipermail/ge/"]

# Archives were exported to CSV with one of two path-mangling schemes; try the
# primary scheme and fall back to the other if loading fails.
# BUG FIX: the original used a bare `except:` and then re-loaded the archives
# unconditionally after the try/except, so on success every archive was loaded
# twice. Each scheme is now loaded exactly once.
try:
    arch_paths = ['../archives/' + url[:-1].replace('://', '_/') + '.csv'
                  for url in urls]
    archives = [load_archive(arch_path) for arch_path in arch_paths]
except Exception:
    arch_paths = ['../archives/' + url[:-1].replace('//', '/') + '.csv'
                  for url in urls]
    archives = [load_archive(arch_path) for arch_path in arch_paths]

In [ ]:
# To stem or not to stem?
# If stem is set to True, words are reduced to their stem (no plurals, no suffixes, etc.).
# If stem is set to False, words are counted by their literal spelling.

stem = False

First, we compute the word counts on the lists. The data will also be exported to .csv files.


In [ ]:
# Compute the word count on the first list.
# wordcount maps word -> [count, Message-ID, Date, From, In-Reply-To], where the
# last four fields describe the message that first introduced the word.
# PERF FIX: the stopword list is hoisted out of the token loop and turned into a
# set (the original re-built and linearly scanned the list for every token).
stop_words = set(stopwords.words('english'))
wordcount = {}
for msg_id, row in archives[0].data.iterrows():
    if row["Body"] is None:
        continue
    # Strip apostrophes, replace all non-word characters with spaces, tokenize.
    cleaned = re.sub(r'[^\w]', ' ', row["Body"].replace("'", ""))
    for token in nltk.tokenize.word_tokenize(cleaned):
        try:
            word = st.stem(token) if stem else token
        except Exception:
            # BUG FIX: the original printed the failing token but then fell
            # through and reused the previous loop's `word`; skip it instead.
            print(token)
            continue
        if word in stop_words:
            continue
        if word in wordcount:
            wordcount[word][0] += 1
        else:
            wordcount[word] = [1, msg_id, row["Date"], row["From"], row["In-Reply-To"]]
wd = wordcount  # In case

In [ ]:
# Compute the word count on the second list.
# Same structure as wordcount: word -> [count, Message-ID, Date, From, In-Reply-To].
# BUG FIX: `iterrows()` returns a generator, which is not subscriptable, so the
# original `archives[1].data.iterrows()[:100]` raised a TypeError. The 100-row
# cap (apparently an intentional sample limit — TODO confirm) is kept via an
# explicit break, and the per-row debug `print(i)` flood is removed.
stop_words = set(stopwords.words('english'))  # set gives O(1) membership tests
wordcount1 = {}
for i, (msg_id, row) in enumerate(archives[1].data.iterrows()):
    if i >= 100:
        break  # only process the first 100 messages, as in the original
    if row["Body"] is None:
        continue
    cleaned = re.sub(r'[^\w]', ' ', row["Body"].replace("'", ""))
    for token in nltk.tokenize.word_tokenize(cleaned):
        try:
            word = st.stem(token) if stem else token
        except Exception:
            # Report the token that failed to stem, then skip it (the original
            # reused the previous iteration's `word` here).
            print(token)
            continue
        if word in stop_words:
            continue
        if word in wordcount1:
            wordcount1[word][0] += 1
        else:
            wordcount1[word] = [1, msg_id, row["Date"], row["From"], row["In-Reply-To"]]

In [ ]:
# Create and export a wordcount information dataframe per mailing list.
# The duplicated export logic is factored into one helper.

# NOTE(review): hardcoded absolute local path — set 'path' to a valid directory
# on your machine before running.
path = 'c:/users/davide/bigbang/'


def export_wordcount_info(counts, url):
    """Build a per-word info DataFrame from `counts` and write it to CSV.

    `counts` maps word -> [count, Message-ID, Date, From, In-Reply-To].
    Returns the path of the written file.
    """
    frame = pd.DataFrame(counts).transpose()
    frame.columns = ["Wordcount", "Message-ID", "Date", "From", "In-Reply-To"]
    out_file = path + 'wordcount_info_' + url.split('/')[-2] + '.csv'
    frame.to_csv(out_file)
    return out_file


file0 = export_wordcount_info(wordcount, urls[0])
file1 = export_wordcount_info(wordcount1, urls[1])

print('File exported!')
print('Check ' + file0 + ' and ' + file1)

Let's print some useful descriptive statistics:


In [ ]:
# Vocabulary size of the first mailing list.
header = 'Number of unique words in mailinglist ' + urls[0]
print(header)
print(len(wordcount))

In [ ]:
# Vocabulary size of the second mailing list.
print('Number of unique words in mailinglist '+urls[1])
# BUG FIX: the original printed len(wordcount) (the FIRST list) under this heading.
print(len(wordcount1))

In [ ]:
# Count the words that occur in both mailing lists.
samewordcount = sum(1 for word in wordcount if word in wordcount1)
print('Number of same unique words in two mailing lists')
print(samewordcount)

In [ ]:
# Collect words present in both lists that were first introduced by the same
# sender. Per-word info layout: index 0 = count, index 2 = Date, index 3 = From.
samewords = {}
for word, info in wordcount.items():
    other = wordcount1.get(word)
    if other is not None and info[3] == other[3]:
        samewords[word] = [info[0], info[3], info[2],
                           other[0], other[3], other[2]]
print('Total number of same words that are introduced by same people')
print(len(samewords))

In [ ]:
#build dataframe of information of those words introduced by same people
#and export to file
df1 = pd.DataFrame(samewords)
samewords_sameauthor_dataframe = df1.transpose()
samewords_sameauthor_dataframe.columns = ["Wordcount1", "From1", "Date1","Wordcount2", "From2", "Date2"]
samewords_sameauthor_dataframe.to_csv(path+'samewords_sameauthor.csv')
print('File exported!')
print('Check '+path+'samewords_sameauthor.csv')

In [ ]:
# Count common words whose occurrence count is in [100, 500] in BOTH lists.
samewordcount = 0
for word, info in wordcount.items():
    if 100 <= info[0] <= 500:
        other = wordcount1.get(word)
        if other is not None and 100 <= other[0] <= 500:
            samewordcount += 1
print('Among 100-500 appearance words, the number of common words between two mailing-list')
print(samewordcount)

In [ ]:
# Among words with 100-500 appearances in both lists, count those first
# introduced by the same person in both (index 3 = the introducing 'From').
same_person_count = 0
for word in wordcount:
    if 100 <= wordcount[word][0] <= 500:
        if word in wordcount1 and 100 <= wordcount1[word][0] <= 500:
            if wordcount[word][3] == wordcount1[word][3]:
                same_person_count += 1
print('Among 100-500 appearance words, the number of common words between two mailing-list that are first introduced by same people')
# BUG FIX: the original printed the undefined name `samecount` (NameError).
print(same_person_count)

We want to compute the list of common words that are introduced by different people in different lists. The results are exported to a .csv file.


In [ ]:
# Compute the common word list (words introduced by DIFFERENT people in the
# two different lists) and print its size.
# Per-word info layout: index 0 = count, index 2 = Date, index 3 = From.
commonwords = {}
for word, info in wordcount.items():
    if not (100 <= info[0] <= 500):
        continue
    other = wordcount1.get(word)
    if other is None or not (100 <= other[0] <= 500):
        continue
    if info[3] != other[3]:  # introduced by different senders in the two lists
        commonwords[word] = [info[0], info[3], info[2],
                             other[0], other[3], other[2]]
print('Number of common words introduced by different people in different lists')
print(len(commonwords))

In [ ]:
#build dataframe of information of those words introduced by different people
#and export to file
df1 = pd.DataFrame(commonwords)
commonword_differentauthor_dataframe = df1.transpose()
commonword_differentauthor_dataframe.columns = ["Wordcount1", "From1", "Date1","Wordcount2", "From2", "Date2"]
commonword_differentauthor_dataframe.to_csv(path+'commonwords_differentauthor.csv')
print('File exported!')
print('Check '+path+'commonwords_differentauthor.csv')

Let's identify "influential words" (see definition below) and print them


In [ ]:
# Compute 'influential words': the list of words with potential idea flow.
#
# Definition: word A is introduced by person p in list 1 first; q, who had
# already participated in list 1 before that, then introduces A in list 2
# (and vice versa). "q saw it" is defined as: q posted something in list 1
# before p first used the word there.

# Map each sender to the date of their first message, per mailing list.
first_participation = {}
for _, row in archives[0].data.iterrows():
    first_participation.setdefault(row["From"], row["Date"])
first_participation1 = {}
for _, row in archives[1].data.iterrows():
    first_participation1.setdefault(row["From"], row["Date"])

time_influence = 0
influence_list = {}
for word, info in commonwords.items():
    # info layout: [count1, author1, date1, count2, author2, date2]
    count1, author1, date1, count2, author2, date2 = info
    if date1 > date2:
        # The word appeared in list 2 first: did author1 participate in
        # list 2 before the word was introduced there?
        if author1 in first_participation1 and first_participation1[author1] < date2:
            influence_list[word] = info
            time_influence += 1
    else:
        # The word appeared in list 1 first: did author2 participate in
        # list 1 before the word was introduced there?
        if author2 in first_participation and first_participation[author2] < date1:
            influence_list[word] = info
            time_influence += 1

In [ ]:
# Print the list of influential words (pure numbers excluded).
if not influence_list:
    print('No influential words detected')
for word, info in influence_list.items():
    if not word.isdigit():
        print('"'+word+'"')
        print(info)
        print(' ')