Comparing Summaries/Description across CVE Mitre, NVD and CVE Details

Introduction

In the 3 data sources: CVE Mitre, NVD and CVE Details, we have a summary or description field that talks about the vulnerability and its attributes. This holds neccesary information for further analysis. In this notebook we are trying to analyse the difference in words between set pairs of data sources. This will help us identify how the description/summary gentrifies as it processes through each source


In [1]:
#imports
import pandas as pd
import csv
import numpy as np
from xml.etree.ElementTree import ElementTree 
import csv
import re
import glob

In [2]:
#read all csv files
cve_details_file = pd.read_csv("cve_details.csv")
cve_details_description = cve_details_file['description']
#print(cve_details_description)
nvd_file = pd.read_csv("nvd.csv",error_bad_lines=False)
nvd_description = nvd_file['Summary']
#print(nvd_description)

In [3]:
#read xml files
cve_mitre_description=[]
CVE_tree = ElementTree()
CVE_tree.parse("cve_mitre.xml")
CVE_root= CVE_tree.getroot()
cell=0
for entry in CVE_root:       
    for child in entry:
        #print (child.tag)
        if (child.tag == '{http://www.icasi.org/CVRF/schema/vuln/1.1}Notes'):
            for note in child:
                if note.attrib['Type'] =="Description":
                    cve_mitre_description.append(note.text); 
           
                    cell+=1

In [4]:
#converting lists into sets
nvd_description = set(nvd_description)
cve_details_description = set(cve_details_description)
cve_mitre_description = set(cve_mitre_description)

In [5]:
#performing the text comparision for cve mitre and nvd
nvd_added = (nvd_description -  cve_mitre_description)
nvd_removed =  (cve_mitre_description - nvd_description)
nvd_added_count = len(nvd_added)
print (nvd_added_count)
nvd_removed_count = len(nvd_removed)
print (nvd_removed_count)


71738
84057

In [6]:
#performing the text comparision for nvd X cve details
details_added = (cve_details_description -  nvd_description)
details_removed =  (nvd_description - cve_details_description)
details_added_count = len(details_added)
print (details_added_count)
details_removed_count = len(details_removed)
print (details_removed_count)


43654
47094

In [13]:
#performing the text comparision for cve mitre X cve details
mitre_added = (cve_mitre_description -  cve_details_description)
mitre_removed =  (cve_details_description - cve_mitre_description)
mitre_added_count = len(nvd_added)
print (mitre_added_count)
mitre_removed_count = len(mitre_removed)
print (mitre_removed_count)


71738
68298

In [8]:
#creating histograms for visualization
mitre_nvd_data = {}
mitre_nvd_data["mitre_added_count"] = 71738
mitre_nvd_data["mitre_removed_count"] = 68298
mitre_details_data = {}
mitre_details_data["details_added_count"] = 43654
mitre_details_data["details_removed_count"] = 47094
details_nvd_data = {}
details_nvd_data["nvd_added_count"] = 71738
details_nvd_data["nvd_removed_count"] = 84057

Visualizations


In [9]:
#imports for histogram
import numpy as np
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.models import Range1d
from bokeh.io import output_notebook
from bokeh.charts import Bar
import matplotlib.pyplot as plot
from datetime import datetime
output_notebook()


Loading BokehJS ...

In [10]:
#creating a histogram for mitre X nvd comparision
data = {}
data['Entries'] = mitre_nvd_data
#saving in dictionary for sorting and visualising
df_data = pd.DataFrame(data).sort_values(by='Entries', ascending=True)
series = df_data.loc[:,'Entries']

p = figure(width=800, y_range=series.index.tolist(), title="Histogram of word count difference between CVE Mitre and NVD summaries")

p.xaxis.axis_label = 'Difference in word count'
p.xaxis.axis_label_text_font_size = '10pt'
p.xaxis.major_label_text_font_size = '9pt'

p.yaxis.axis_label = 'Difference in word count in CVE Mitre as compared to NVD'
p.yaxis.axis_label_text_font_size = '10pt'
p.yaxis.major_label_text_font_size = '9pt'

j = 1
for k,v in series.iteritems():
  
  #Print fields, values, orders
  #print (k,v,j) 
  p.rect(x=v/2, y=j, width=abs(v), height=0.4,
    width_units="data", height_units="data")
  j += 1
show(p)



In [11]:
#creating a histogram for mitre X details comparision
data = {}
data['Entries'] = mitre_details_data
#saving in dictionary for sorting and visualising
df_data = pd.DataFrame(data).sort_values(by='Entries', ascending=True)
series = df_data.loc[:,'Entries']

p = figure(width=800, y_range=series.index.tolist(), title="Histogram of word count difference between CVE Mitre and CVE Details summaries")

p.xaxis.axis_label = 'Difference in word count'
p.xaxis.axis_label_text_font_size = '10pt'
p.xaxis.major_label_text_font_size = '9pt'

p.yaxis.axis_label = 'Difference in word count in CVE Mitre as compared to CVE Details'
p.yaxis.axis_label_text_font_size = '10pt'
p.yaxis.major_label_text_font_size = '9pt'

j = 1
for k,v in series.iteritems():
  
  #Print fields, values, orders
  #print (k,v,j) 
  p.rect(x=v/2, y=j, width=abs(v), height=0.4,
    width_units="data", height_units="data")
  j += 1
show(p)



In [12]:
#creating a histogram for details X nvd comparision
data = {}
data['Entries'] = details_nvd_data
#saving in dictionary for sorting and visualising
df_data = pd.DataFrame(data).sort_values(by='Entries', ascending=True)
series = df_data.loc[:,'Entries']

p = figure(width=800, y_range=series.index.tolist(), title="Histogram of word count difference between CVE Details and NVD summaries")

p.xaxis.axis_label = 'Difference in word count'
p.xaxis.axis_label_text_font_size = '10pt'
p.xaxis.major_label_text_font_size = '9pt'

p.yaxis.axis_label = 'Difference in word count in CVE Details as compared to NVD'
p.yaxis.axis_label_text_font_size = '10pt'
p.yaxis.major_label_text_font_size = '9pt'

j = 1
for k,v in series.iteritems():
  
  #Print fields, values, orders
  #print (k,v,j) 
  p.rect(x=v/2, y=j, width=abs(v), height=0.4,
    width_units="data", height_units="data")
  j += 1
show(p)