In [3]:
#import the libraries
import pandas as pd
import glob
import os
from bokeh.charts import Histogram, output_file, show
from bokeh.io import output_notebook
from bokeh.layouts import gridplot
import numpy as np
# Render Bokeh output inline in this notebook (instead of writing to an HTML file)
output_notebook()
We are interested in observing the word frequency across e-mail replies of the Full Disclosure (FD) mailing list, which is a "public, vendor-neutral forum for detailed discussion of vulnerabilities and exploitation techniques, as well as tools, papers, news, and events of interest to the community."
The analysis and the plots shared in this notebook involve the corpus of the Full Disclosure mailing list, which was extracted using crawlers, and derive insights from this data.
In this notebook, 2012 will be used for single-year analysis.
In [4]:
# Year under analysis: the word-count histograms below are plotted for this year only
year = '2012'
# Directory holding the per-message corpus files for that year
path = 'data/input/bodymessage_corpus/' + year
# Accumulators populated by doc_wordcnt(): one entry per processed file.
file_name_list = []
file_wc_list = []
file_uq_wc_list = []
# Per-document summary frame; its columns are filled from the lists after the scan.
file_wc_df = pd.DataFrame(columns=["file_name", "word_count", "unique_word_count"])

def doc_wordcnt(path_file_name, file_name):
    """Count total and unique whitespace-separated words in one file.

    Side effect: appends ``file_name`` and both counts to the module-level
    accumulator lists (file_name_list, file_wc_list, file_uq_wc_list).
    Also returns the counts so callers can use them directly.

    Parameters
    ----------
    path_file_name : str
        Full path of the file to read.
    file_name : str
        Display name recorded alongside the counts.

    Returns
    -------
    tuple of (int, int)
        (total word count, unique word count).
    """
    with open(path_file_name) as f:
        # Whitespace tokenisation, streamed line by line.
        words = [word for line in f for word in line.split()]
    unique_words = set(words)
    file_name_list.append(file_name)
    file_wc_list.append(len(words))
    file_uq_wc_list.append(len(unique_words))
    return len(words), len(unique_words)
# Scan every file in the year's directory and record its word counts.
# sorted() makes the scan order deterministic across platforms/filesystems.
for filename in sorted(os.listdir(path)):
    # os.path.join is portable, unlike manual '/' concatenation
    doc_wordcnt(os.path.join(path, filename), filename)
# Populate the dataframe columns from the accumulator lists in one shot.
file_wc_df["file_name"] = file_name_list
file_wc_df["word_count"] = file_wc_list
file_wc_df["unique_word_count"] = file_uq_wc_list
file_wc_df.head()
Out[4]:
In [5]:
#Sort in DESCENDING order of word count (despite the *_asc_* names): the
#plotting code below relies on .iloc[0] being the maximum and .iloc[-1] the minimum
file_wc_asc_df = file_wc_df.sort_values("word_count", ascending=False);
file_uq_wc_asc_df = file_wc_df.sort_values("unique_word_count", ascending=False);
In [6]:
# Histogram figures are collected here and laid out in a grid at the end
plot_lst = []
# Plot a single standalone histogram of per-document word counts.
def plot_main_hist(plot_index, plot_df, plot_year):
    """Render one word-count histogram immediately via show().

    The title embeds the plot index, the year, and the min/max word count
    of ``plot_df``. The frame is sorted descending on word_count, so the
    first row holds the maximum and the last row the minimum.
    """
    hi = plot_df["word_count"].iloc[0]
    lo = plot_df["word_count"].iloc[-1]
    p = Histogram(plot_df, bins=20, values='word_count',
                  title="Plot " + str(plot_index) + ". " + plot_year +
                        " Histogram; Range: " + str(lo) + " to " + str(hi))
    # Identical label/font styling on both axes.
    for axis, label in ((p.xaxis, 'Word Count per Document'),
                        (p.yaxis, 'Frequency')):
        axis.axis_label = label
        axis.axis_label_text_font_size = '10pt'
        axis.major_label_text_font_size = '10pt'
    show(p)
# Build a word-count histogram and queue it for the grid layout.
def plot_hist(plot_index, plot_df, plot_year):
    """Create one word-count histogram and append it to plot_lst.

    Same styling as plot_main_hist, but instead of showing the figure
    immediately it is collected for later display in a shared grid.
    """
    hi = plot_df["word_count"].iloc[0]
    lo = plot_df["word_count"].iloc[-1]
    p = Histogram(plot_df, bins=20, values='word_count',
                  title="Plot " + str(plot_index) + ". " + plot_year +
                        " Histogram; Range: " + str(lo) + " to " + str(hi))
    for axis, label in ((p.xaxis, 'Word Count per Document'),
                        (p.yaxis, 'Frequency')):
        axis.axis_label = label
        axis.axis_label_text_font_size = '10pt'
        axis.major_label_text_font_size = '10pt'
    plot_lst.append(p)
# Split the (descending-sorted) dataframe into word-count bands of 2000.
# (The previous versions pre-created empty frames that were immediately
# overwritten — dead code, removed.)
file_wc_asc_df1 = file_wc_asc_df.loc[(file_wc_asc_df['word_count'] >= 0) & (file_wc_asc_df['word_count'] <= 2000)]
file_wc_asc_df2 = file_wc_asc_df.loc[(file_wc_asc_df['word_count'] >= 2001) & (file_wc_asc_df['word_count'] <= 4000)]
file_wc_asc_df3 = file_wc_asc_df.loc[(file_wc_asc_df['word_count'] >= 4001) & (file_wc_asc_df['word_count'] <= 6000)]
file_wc_asc_df4 = file_wc_asc_df.loc[(file_wc_asc_df['word_count'] >= 6001) & (file_wc_asc_df['word_count'] <= 8000)]
file_wc_asc_df5 = file_wc_asc_df.loc[file_wc_asc_df['word_count'] >= 8001]
# Year prefix from the first record's filename (filenames start with YYYY).
# .iloc[0] (positional) replaces [0] (label-based), which raises KeyError
# whenever row label 0 does not survive the sort/filter; the value is the
# same for every band, so it is computed once instead of before each plot.
year = file_wc_asc_df["file_name"].iloc[0][:4]
# Overview histogram of the full distribution, then one per band >2000 words.
plot_main_hist(1, file_wc_asc_df, year)
plot_hist(2.1, file_wc_asc_df2, year)
plot_hist(2.2, file_wc_asc_df3, year)
plot_hist(2.3, file_wc_asc_df4, year)  # was mislabelled 2.2 (duplicate index)
plot_hist(2.4, file_wc_asc_df5, year)  # was 2.3; renumbered for consistency
# Lay out the queued band histograms in a 2-column grid and display it.
grid = gridplot(plot_lst, ncols=2, plot_width=450, plot_height=350)
show(grid)
In [7]:
# Summary table: word-count range -> comma-separated list of files in that range.
fileRange_df = pd.DataFrame(columns=['Range', 'File Names'])
range_lst = []
file_names_lst = []

def append_dataframe(dframe):
    """Record the word-count range and file names of one dataframe band.

    ``dframe`` must be sorted descending by word_count, so its first row
    holds the band maximum and its last row the minimum. Appends a
    "min to max" string to range_lst and the joined file names to
    file_names_lst, and returns the range string for convenience.
    """
    lo = dframe["word_count"].iloc[-1]
    hi = dframe["word_count"].iloc[0]
    range_str = str(lo) + " to " + str(hi)
    range_lst.append(range_str)
    file_names_lst.append(', '.join(dframe["file_name"]))
    return range_str
# Record the range and member files for every band above 2000 words
for band_df in (file_wc_asc_df2, file_wc_asc_df3, file_wc_asc_df4, file_wc_asc_df5):
    append_dataframe(band_df)
fileRange_df["Range"] = range_lst
fileRange_df["File Names"] = file_names_lst
fileRange_df.head()
Out[7]:
The grid displayed below consists of histograms that show the number of unique words per Document for '2012'.
As shown below, there is only a single document in the entire dataset that has a count of unique words greater than 2000. The majority of the documents in the dataset have a unique word count between 100 and 200.
In [21]:
#Create a list that consists of plots to be plotted in a grid
#NOTE(review): unused in this cell — this version of plot_hist shows figures
#directly and never appends to plot_lst
plot_lst=[]
#Function to plot unique-word-count histograms.
#WARNING: this redefines plot_hist from the word-count cell above with
#different behavior (show() immediately instead of queueing for a grid).
def plot_hist(plot_index, plot_df, plot_year):
    #plot the histogram based on values in the dataframe; plot_df is sorted
    #descending on unique_word_count, so iloc[0] is the max and iloc[-1] the min
    range_max=plot_df["unique_word_count"].iloc[0]
    range_min=plot_df["unique_word_count"].iloc[-1]
    p = Histogram(plot_df,bins=20, values='unique_word_count',title="Plot "+str(plot_index)+". "+plot_year+" Histogram; Range: "+str(range_min)+" to "+str(range_max))
    p.xaxis.axis_label = 'Unique Word Count per Document'
    p.xaxis.axis_label_text_font_size = '10pt'
    p.xaxis.major_label_text_font_size = '10pt'
    p.yaxis.axis_label = 'Frequency'
    p.yaxis.axis_label_text_font_size = '10pt'
    p.yaxis.major_label_text_font_size = '10pt'
    show(p)
# Subset of documents with an unusually high unique-word count (700-4000).
# (The pre-created empty frame that was immediately overwritten is removed.)
file_uq_wc_asc_df2 = file_uq_wc_asc_df.loc[
    (file_uq_wc_asc_df['unique_word_count'] >= 700) &
    (file_uq_wc_asc_df['unique_word_count'] <= 4000)]
# Year prefix from the first record's filename. .iloc[0] is positional and
# safe regardless of which row labels survived the sort above; the value is
# identical for both plots, so it is computed once.
year = file_uq_wc_asc_df["file_name"].iloc[0][:4]
# Overview histogram, then a zoom on the high-unique-count tail.
plot_hist(1, file_uq_wc_asc_df, year)
plot_hist(2.1, file_uq_wc_asc_df2, year)