In [3]:
#import the libraries
import pandas as pd
import glob
import os
from bokeh.charts import Histogram, output_file, show
from bokeh.io import output_notebook
from bokeh.layouts import gridplot
import numpy as np
#function call to display the histogram in the ipython notebook
output_notebook()


Loading BokehJS ...

1. Introduction

We are interested in observing the word frequency across e-mail replies of the Full Disclosure (FD) mailing list, which is a "public, vendor-neutral forum for detailed discussion of vulnerabilities and exploitation techniques, as well as tools, papers, news, and events of interest to the community."

The analysis and the plots shared in this notebook involve examining the corpus of the Full Disclosure mailing list, which was extracted using crawlers, and deriving insights from this data.

In this notebook, 2012 will be used for single-year analysis.

2. Total Word Count per Document Table

We create a dataframe from the input file where every row is a document from the corpus, the total word count and the number of unique words for every document. A snippet of the dataframe is shared below for better understanding.


In [4]:
# Input year for which the word count histogram is being plotted
year = '2012'
# Directory holding the corpus documents for the chosen year
path = 'data/input/bodymessage_corpus/' + year

# Parallel accumulators (kept for backward compatibility with later cells)
file_name_list = []
file_wc_list = []
file_uq_wc_list = []


def doc_wordcnt(path_file_name, file_name):
    """Count total and unique whitespace-separated words in one document.

    Appends (file_name, total, unique) to the module-level lists and also
    returns them as a tuple, so the function is usable without the globals.

    Parameters
    ----------
    path_file_name : str  -- full path to the document
    file_name : str       -- bare file name recorded in the table
    """
    # Explicit encoding + errors='replace' keeps a stray non-UTF-8 byte in
    # the crawled corpus from aborting the whole run.
    with open(path_file_name, encoding='utf-8', errors='replace') as f:
        words = [word for line in f for word in line.split()]

    total = len(words)
    unique = len(set(words))

    file_name_list.append(file_name)
    file_wc_list.append(total)
    file_uq_wc_list.append(unique)
    return file_name, total, unique


# sorted() makes row order deterministic — os.listdir returns entries in
# arbitrary order, which made the table order vary between runs.
for filename in sorted(os.listdir(path)):
    doc_wordcnt(os.path.join(path, filename), filename)

# Build the dataframe in one shot instead of mutating an empty frame
file_wc_df = pd.DataFrame({
    "file_name": file_name_list,
    "word_count": file_wc_list,
    "unique_word_count": file_uq_wc_list,
})

file_wc_df.head()


Out[4]:
file_name word_count unique_word_count
0 2012_Apr_0.txt 330 214
1 2012_Apr_1.txt 51 49
2 2012_Apr_10.txt 69 64
3 2012_Apr_100.txt 72 61
4 2012_Apr_101.txt 403 264

In [5]:
# Sort in DESCENDING order of word count (ascending=False): downstream cells
# rely on .iloc[0] being the maximum and .iloc[-1] the minimum of each range.
# NOTE(review): the "_asc_" in the variable names is misleading — the frames
# are actually sorted largest-first.
file_wc_asc_df = file_wc_df.sort_values("word_count", ascending=False);
file_uq_wc_asc_df = file_wc_df.sort_values("unique_word_count", ascending=False);

3. Word Count per Document Histogram

The grid displayed below consists of histograms that show the number of words per Document for '2012'.

To achieve this, the dataframe is split into five subsets and the histogram is plotted for each of the subset to get a better picture of the number of words.


In [6]:
# Accumulator for the per-range histograms that will be laid out in a grid
plot_lst = []


# Render the full-range word-count histogram and display it immediately
def plot_main_hist(plot_index, plot_df, plot_year):
    """Show the word-count histogram for `plot_df` inline.

    `plot_df` must be sorted by word count in descending order: the first
    row supplies the range maximum and the last row the minimum.
    """
    hi = plot_df["word_count"].iloc[0]
    lo = plot_df["word_count"].iloc[-1]
    title = ("Plot " + str(plot_index) + ". " + plot_year
             + " Histogram; Range: " + str(lo) + " to " + str(hi))
    p = Histogram(plot_df, bins=20, values='word_count', title=title)

    # Same label/font settings on both axes, so configure them in one loop
    for axis, label in ((p.xaxis, 'Word Count per Document'),
                        (p.yaxis, 'Frequency')):
        axis.axis_label = label
        axis.axis_label_text_font_size = '10pt'
        axis.major_label_text_font_size = '10pt'
    show(p)

# Build a word-count histogram and stash it for the grid instead of showing it
def plot_hist(plot_index, plot_df, plot_year):
    """Create the word-count histogram for `plot_df` and append it to plot_lst.

    Identical to plot_main_hist except the figure is collected for the grid
    rather than rendered immediately. Expects `plot_df` sorted descending.
    """
    lo = plot_df["word_count"].iloc[-1]
    hi = plot_df["word_count"].iloc[0]
    p = Histogram(
        plot_df, bins=20, values='word_count',
        title=("Plot " + str(plot_index) + ". " + plot_year
               + " Histogram; Range: " + str(lo) + " to " + str(hi)))

    p.xaxis.axis_label = 'Word Count per Document'
    p.yaxis.axis_label = 'Frequency'
    # Apply the shared font sizing to both axes in one pass
    for axis in (p.xaxis, p.yaxis):
        axis.axis_label_text_font_size = '10pt'
        axis.major_label_text_font_size = '10pt'

    plot_lst.append(p)

# Split the sorted dataframe into word-count bands. (The previous version
# first created five empty DataFrames that were immediately overwritten —
# dead code, removed.)
file_wc_asc_df1 = file_wc_asc_df.loc[(file_wc_asc_df['word_count'] >= 0) & (file_wc_asc_df['word_count'] <= 2000)]
file_wc_asc_df2 = file_wc_asc_df.loc[(file_wc_asc_df['word_count'] >= 2001) & (file_wc_asc_df['word_count'] <= 4000)]
file_wc_asc_df3 = file_wc_asc_df.loc[(file_wc_asc_df['word_count'] >= 4001) & (file_wc_asc_df['word_count'] <= 6000)]
file_wc_asc_df4 = file_wc_asc_df.loc[(file_wc_asc_df['word_count'] >= 6001) & (file_wc_asc_df['word_count'] <= 8000)]
file_wc_asc_df5 = file_wc_asc_df.loc[file_wc_asc_df['word_count'] >= 8001]

# Derive the year once from the first file name. Positional .iloc[0] is used
# instead of label-based .file_name[0], which only worked because the row
# originally labelled 0 happened to survive into the subset.
year = file_wc_asc_df["file_name"].iloc[0][:4]

# Full-range histogram, shown on its own (band 1, 0-2000, is the bulk of the
# data and is already visible there, so it is not re-plotted separately)
plot_main_hist(1, file_wc_asc_df, year)

# One histogram per large-word-count band. Indices are now unique — the
# original used 2.2 twice and never emitted 2.4.
plot_hist(2.1, file_wc_asc_df2, year)
plot_hist(2.2, file_wc_asc_df3, year)
plot_hist(2.3, file_wc_asc_df4, year)
plot_hist(2.4, file_wc_asc_df5, year)

# Arrange the collected plots in a two-column grid
grid = gridplot(plot_lst, ncols=2, plot_width=450, plot_height=350)

# Show the results
show(grid)


3.1 Large e-mail discussions

All the large emails in the corpus for 2012 have been listed in the table below with the respective word count range.


In [7]:
# Summary table mapping each word-count range to the files inside it
fileRange_df = pd.DataFrame(columns=['Range', 'File Names'])

# Parallel accumulators, one entry per word-count band
range_lst = []
file_names_lst = []


def append_dataframe(dframe):
    """Record one band's word-count range and its file names.

    `dframe` must be sorted by word count in descending order: the first
    row supplies the range maximum and the last row the minimum.
    """
    lo = dframe["word_count"].iloc[-1]
    hi = dframe["word_count"].iloc[0]
    range_lst.append(str(lo) + " to " + str(hi))
    file_names_lst.append(', '.join(dframe["file_name"]))

# Record the range and member files for each large-word-count band.
# Band 1 (0-2000 words) is the ordinary bulk and is deliberately excluded.
# (Stray trailing semicolons removed — they are not idiomatic Python and
# suppressed nothing here, since append_dataframe returns None.)
append_dataframe(file_wc_asc_df2)
append_dataframe(file_wc_asc_df3)
append_dataframe(file_wc_asc_df4)
append_dataframe(file_wc_asc_df5)

# Populate the summary table from the accumulated parallel lists
fileRange_df["Range"] = range_lst
fileRange_df["File Names"] = file_names_lst

fileRange_df.head()


Out[7]:
Range File Names
0 2047 to 3285 2012_Mar_340.txt, 2012_Oct_255.txt, 2012_Dec_2...
1 4539 to 5343 2012_Dec_222.txt, 2012_Dec_220.txt, 2012_Oct_3...
2 6077 to 7678 2012_Mar_338.txt, 2012_Mar_336.txt, 2012_Apr_3...
3 8055 to 8279 2012_Mar_341.txt, 2012_Nov_23.txt, 2012_Mar_33...

4. Vocabulary per Document Histogram

The grid displayed below consists of histograms that show the number of unique words per Document for '2012'.

As shown below, there is only a single document in the entire dataset with more than 2000 unique words. The majority of the documents in the dataset have a unique word count between 100 and 200.


In [21]:
# Reset the plot accumulator for this section.
# NOTE(review): plot_lst is never consumed below — this cell's plot_hist
# shows each figure directly instead of collecting it for a grid.
plot_lst = []


# NOTE(review): this redefines plot_hist from the word-count section with
# different behavior (unique-word column, show() instead of appending).
# A distinct name would avoid the silent shadowing.
def plot_hist(plot_index, plot_df, plot_year):
    """Show the unique-word-count histogram for `plot_df` inline.

    `plot_df` must be sorted by unique word count in descending order: the
    first row supplies the range maximum and the last row the minimum.
    """
    hi = plot_df["unique_word_count"].iloc[0]
    lo = plot_df["unique_word_count"].iloc[-1]
    title = ("Plot " + str(plot_index) + ". " + plot_year
             + " Histogram; Range: " + str(lo) + " to " + str(hi))
    p = Histogram(plot_df, bins=20, values='unique_word_count', title=title)

    # Same label/font settings on both axes, so configure them in one loop
    for axis, label in ((p.xaxis, 'Unique Word Count per Document'),
                        (p.yaxis, 'Frequency')):
        axis.axis_label = label
        axis.axis_label_text_font_size = '10pt'
        axis.major_label_text_font_size = '10pt'
    show(p)

# Subset of documents with an unusually large vocabulary (700+ unique words).
# (The previous version first created an empty DataFrame that was immediately
# overwritten — dead code, removed.)
file_uq_wc_asc_df2 = file_uq_wc_asc_df.loc[
    (file_uq_wc_asc_df['unique_word_count'] >= 700)
    & (file_uq_wc_asc_df['unique_word_count'] <= 4000)
]

# Derive the year once from the first file name. Positional .iloc[0] is safer
# than label-based .file_name[0], which depends on which original index label
# ended up first after sorting.
year = file_uq_wc_asc_df["file_name"].iloc[0][:4]

# Full-range histogram, then the high-vocabulary tail
plot_hist(1, file_uq_wc_asc_df, year)
plot_hist(2.1, file_uq_wc_asc_df2, year)