This notebook detects "special words" in a corpus of mailing lists. (For now it works with two mailing lists only.)
- It computes and exports to .csv files the word counts (words and their occurrences). - It computes and exports to .csv files the list of common words that are introduced by different people in different lists. - It computes and prints the 'influential words' (see definition in the box below).
Further extensions: generalize from two lists to n lists!
In [3]:
# Archive loading/handling utilities from the bigbang project.
from bigbang.archive import Archive
from bigbang.archive import load as load_archive
import bigbang.parse as parse
import bigbang.graph as graph
import bigbang.mailman as mailman
import bigbang.process as process
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
from pprint import pprint as pp
import pytz
import numpy as np
import math
import nltk
from itertools import repeat
# Lancaster stemmer: reduces words to their stems when the `stem` flag
# (set in a later cell) is True.
from nltk.stem.lancaster import LancasterStemmer
st = LancasterStemmer()
from nltk.corpus import stopwords
import re
import os
In [51]:
# Insert exactly TWO mailing-list names (the analysis below assumes two lists).
cwd = os.getcwd()
archives_names = ["ietf-privacy", "architecture-discuss"]
# One CSV path per list, relative to this notebook's location.
archives_paths = ['../../archives/' + name + '.csv' for name in archives_names]
# Load each archive's DataFrame, then combine them into a single Archive.
archives_list = []
for path in archives_paths:
    archives_list.append(load_archive(path).data)
archives = Archive(pd.concat(archives_list))
archives_data = archives.data
In [52]:
# To stem or not to stem?
# If stem is True, words are reduced to their stems (no plurals, no suffixes,
# etc.) before counting; if False, words are counted by their literal spelling.
stem = False
First, we shall compute the word counts on the lists. Data will also be exported to .csv files.
In [53]:
# Compute word counts on the first list.
# wordcount1 maps each word to a list:
#   [count, Message-ID of first occurrence, Date, From, In-Reply-To]
# (the last four fields record who introduced the word and when).
wordcount1 = {}
# Hoisted: stopwords.words() re-reads the corpus file; build the set once
# instead of once per token.
english_stopwords = set(stopwords.words('english'))
for row in archives_list[0].iterrows():
    if row[1]["Body"] is not None:
        body = row[1]["Body"].replace("'", "")
        cleaned = re.sub(r'[^\w]', ' ', body)
        tokens = nltk.tokenize.word_tokenize(cleaned)
        for token in tokens:
            try:
                word = st.stem(token) if stem else token
            except Exception:
                # Bug fix: the bare except previously fell through and reused
                # the PREVIOUS token's `word` (NameError on the very first
                # token). Report the offending token and skip it instead.
                print(token)
                continue
            if word in english_stopwords:
                continue
            if word not in wordcount1:
                # First sighting: record count plus provenance of introduction.
                wordcount1[word] = [1, row[0], row[1]["Date"],
                                    row[1]["From"], row[1]["In-Reply-To"]]
            else:
                wordcount1[word][0] += 1
# Bug fix: was `wd = wordcount`, an undefined name (leftover from a rename).
wd = wordcount1  # In case
In [54]:
# Compute word counts on the second list (same structure as wordcount1):
#   word -> [count, Message-ID of first occurrence, Date, From, In-Reply-To]
wordcount2 = {}
# Hoisted: build the stopword set once rather than per token.
english_stopwords = set(stopwords.words('english'))
for row in archives_list[1].iterrows():
    if row[1]["Body"] is not None:
        body = row[1]["Body"].replace("'", "")
        cleaned = re.sub(r'[^\w]', ' ', body)
        tokens = nltk.tokenize.word_tokenize(cleaned)
        for token in tokens:
            try:
                word = st.stem(token) if stem else token
            except Exception:
                # Bug fix: the bare except previously fell through and reused
                # the PREVIOUS token's `word` (NameError on the very first
                # token). Skip tokens the stemmer cannot handle.
                continue
            if word in english_stopwords:
                continue
            if word not in wordcount2:
                # First sighting: record count plus provenance of introduction.
                wordcount2[word] = [1, row[0], row[1]["Date"],
                                    row[1]["From"], row[1]["In-Reply-To"]]
            else:
                wordcount2[word][0] += 1
In [55]:
# Create and export a wordcount information dataframe per mailing list.
# Set the variable 'path' as a valid directory path where to store the files.
# Bug fixes: the first export referenced an undefined name `wordcount`
# (should be `wordcount1`), and the second export re-used the FIRST list's
# counts (`wordcount1`) instead of `wordcount2`.
asd = pd.DataFrame(wordcount1)
new_dataframe = asd.transpose()
new_dataframe.columns = ["Wordcount", "Message-ID", "Date", "From", "In-Reply-To"]
new_dataframe.to_csv(cwd+'/wordcount_info_'+archives_names[0]+'.csv')
asd1 = pd.DataFrame(wordcount2)
new_dataframe1 = asd1.transpose()
new_dataframe1.columns = ["Wordcount", "Message-ID", "Date", "From", "In-Reply-To"]
new_dataframe1.to_csv(cwd+'/wordcount_info_'+archives_names[1]+'.csv')
print('File exported!')
# Bug fix: the second printed path was missing its '/' separator.
print('Check '+cwd+'/wordcount_info_'+archives_names[0]+'.csv and '+cwd+'/wordcount_info_'+archives_names[1]+'.csv')
Let's print some useful descriptive data:
In [56]:
# Report the vocabulary size of the first mailing list.
unique_count_1 = len(wordcount1)
print('Number of unique words in mailinglist ' + archives_names[0])
print(unique_count_1)
In [57]:
# Report the vocabulary size of the second mailing list.
unique_count_2 = len(wordcount2)
print('Number of unique words in mailinglist ' + archives_names[1])
print(unique_count_2)
In [58]:
# Count the vocabulary shared by both mailing lists via set intersection.
shared_vocabulary = set(wordcount1) & set(wordcount2)
samewordcount = len(shared_vocabulary)
print('Number of same unique words in two mailing lists')
print(samewordcount)
In [59]:
# Collect words present in both lists whose first sender (index 3, "From")
# is the same person in each list; keep [count, From, Date] from both lists.
samewords = {}
for word, info1 in wordcount1.items():
    info2 = wordcount2.get(word)
    if info2 is not None and info1[3] == info2[3]:
        samewords[word] = [info1[0], info1[3], info1[2],
                           info2[0], info2[3], info2[2]]
print('Total number of same words that are introduced by same people')
print(len(samewords))
In [60]:
# Build a dataframe of the words introduced by the same person in both
# lists and export it to a CSV file.
samewords_sameauthor_dataframe = pd.DataFrame(samewords).transpose()
samewords_sameauthor_dataframe.columns = ["Wordcount1", "From1", "Date1",
                                          "Wordcount2", "From2", "Date2"]
samewords_path = cwd + '/samewords_sameauthor.csv'
samewords_sameauthor_dataframe.to_csv(samewords_path)
print('File exported!')
print('Check ' + samewords_path)
In [61]:
# Count words appearing 100-500 times in BOTH mailing lists.
samewordcount = 0
for word, info1 in wordcount1.items():
    if not (100 <= info1[0] <= 500):
        continue
    info2 = wordcount2.get(word)
    if info2 is not None and 100 <= info2[0] <= 500:
        samewordcount += 1
print('Among 100-500 appearance words, the number of common words between two mailing-list')
print(samewordcount)
In [62]:
# Among mid-frequency (100-500 occurrences) words common to both lists,
# count those first introduced by the same sender (index 3, "From") in each.
same_person_count = 0
for word, info1 in wordcount1.items():
    info2 = wordcount2.get(word)
    if info2 is None:
        continue
    both_in_range = (100 <= info1[0] <= 500) and (100 <= info2[0] <= 500)
    if both_in_range and info1[3] == info2[3]:
        same_person_count += 1
print('Among 100-500 appearance words, the number of common words between two mailing-list that are first introduced by same people')
print(same_person_count)
We want to compute the list of common words that are introduced by different people in different lists. The results are exported in a .csv file
In [63]:
# Compute the common word list: mid-frequency (100-500) words present in
# both lists but introduced by DIFFERENT people. For each word keep
# [count1, From1, Date1, count2, From2, Date2].
commonwords = {}
for word, info1 in wordcount1.items():
    info2 = wordcount2.get(word)
    if info2 is None:
        continue
    both_in_range = (100 <= info1[0] <= 500) and (100 <= info2[0] <= 500)
    if both_in_range and info1[3] != info2[3]:
        commonwords[word] = [info1[0], info1[3], info1[2],
                             info2[0], info2[3], info2[2]]
print('Number of common words introduced by different people in different lists')
print(len(commonwords))
In [64]:
# Build a dataframe of the common words introduced by different people
# and export it to a CSV file.
df1 = pd.DataFrame(commonwords)
commonword_differentauthor_dataframe = df1.transpose()
commonword_differentauthor_dataframe.columns = ["Wordcount1", "From1", "Date1",
                                                "Wordcount2", "From2", "Date2"]
commonwords_path = cwd + '/commonwords_differentauthor.csv'
commonword_differentauthor_dataframe.to_csv(commonwords_path)
print('File exported!')
print('Check ' + commonwords_path)
Let's identify "influential words" (see definition below) and print them
In [65]:
# Compute 'influential words', the list of words that have potential of idea flows.
# Definition: word A is introduced by person p in list1 first, then q saw it and
# introduced A to list2, or vice versa. "q saw it" is approximated as: q posted
# something in list1 before p first used the word there.
#
# commonwords[word] layout (from the previous cell):
#   [0]=count1, [1]=From1, [2]=Date1, [3]=count2, [4]=From2, [5]=Date2
#
# Build a dictionary of sender -> date of first participation, per list.
# Relies on iterrows() yielding rows in order, so the first hit is the
# earliest message for that sender — assumes the archive is date-sorted
# (TODO confirm against bigbang's Archive ordering).
first_participation1 = {}
for row in archives_list[0].iterrows():
    if row[1]["From"] not in first_participation1:
        first_participation1[row[1]["From"]] = row[1]["Date"]
first_participation2 = {}
for row in archives_list[1].iterrows():
    if row[1]["From"] not in first_participation2:
        first_participation2[row[1]["From"]] = row[1]["Date"]
time_influence = 0
influence_list = {}
for word in commonwords:
    if commonwords[word][2] > commonwords[word][5]: #Author2 comes first
        # Word surfaced in list2 first: check whether author1 was active in
        # list2 before the word appeared there (so author1 could have seen it).
        if commonwords[word][1] in first_participation2: #Check if author1 in list2
            if first_participation2[commonwords[word][1]] < commonwords[word][5]: #Check if author1\
            #in list2 and exists before the word first introduced in list2
                influence_list[word] = commonwords[word]
                time_influence += 1
    else: #Author1 comes first
        # Symmetric case: word surfaced in list1 first; check whether author2
        # was active in list1 before the word appeared there.
        if commonwords[word][4] in first_participation1:
            if first_participation1[commonwords[word][4]] < commonwords[word][2]:
                influence_list[word] = commonwords[word]
                time_influence += 1
In [66]:
# Print every influential word (excluding pure digit strings) with its info.
if not influence_list:
    print('No influential words detected')
for word, info in influence_list.items():
    if word.isdigit():
        continue
    print('"' + word + '"')
    print(info)
    print(' ')
In [ ]: