In [ ]:
# Activate pylab mode with the inline figure backend. Per IPython's %pylab
# documentation, --no-import-all skips the `from pylab import *` and
# `from numpy import *` star imports.
%pylab --no-import-all inline
# NOTE(review): probably redundant, since the %pylab line above already requests
# the inline backend -- confirm before removing.
%matplotlib inline
In [16]:
from __future__ import division

import csv
import re
import sys
import traceback

import numpy as np
import pandas as pd
def _open_compat(path, mode):
    """Open ``path`` the way the ``csv`` module expects on this interpreter.

    ``mode`` is ``"r"`` or ``"w"``. Python 2 gets a binary stream
    (``"rb"``/``"wb"``); Python 3 gets a UTF-8 text stream with newline
    translation disabled.
    """
    if sys.version_info[0] < 3:
        return open(path, mode + "b")
    return open(path, mode, newline="", encoding="utf-8")


def _read_incident_pairs(markdown_file):
    """Return one ``(first_id, second_id)`` tuple per markdown line that has a pair.

    Only the first two UUID-shaped tokens on a line are used; lines with
    fewer than two are ignored.
    """
    uuid_pattern = re.compile(r'\w{8}\-\w{4}\-\w{4}\-\w{4}\-\w{12}')
    pairs = []
    with _open_compat(markdown_file, "r") as md_file:
        for line in md_file:
            found = uuid_pattern.findall(line)
            # Require two matches: indexing found[1] on a line with a single
            # id would raise IndexError.
            if len(found) >= 2:
                pairs.append((found[0], found[1]))
    return pairs


def _load_incident_attributes(preprocessed_file, relevant_attributes):
    """Read the preprocessed CSV and index the relevant attributes by incident id.

    Returns ``(column_names, attributes_by_id)``. ``column_names`` starts with
    ``"incident_id"`` followed by the other relevant column names in file
    order. ``attributes_by_id`` maps each incident id to the list of values
    for those other columns; when an id occurs on several rows the last row
    wins.

    Raises ``ValueError`` if the file has no header row or no ``incident_id``
    column.
    """
    with _open_compat(preprocessed_file, "r") as csv_file:
        reader = csv.reader(csv_file)
        headers = next(reader, None)
        if headers is None:
            raise ValueError("%s is empty: no header row" % preprocessed_file)
        if "incident_id" not in headers:
            raise ValueError("%s has no incident_id column" % preprocessed_file)

        # Locate the key column by name instead of assuming it is the first
        # relevant column in the file.
        id_column = headers.index("incident_id")
        attribute_columns = [
            position for position, name in enumerate(headers)
            if name in relevant_attributes and position != id_column
        ]

        attributes_by_id = {}
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; there is nothing to index.
                continue
            attributes_by_id[row[id_column]] = [
                row[position] for position in attribute_columns
            ]

    column_names = [headers[id_column]]
    column_names.extend(headers[position] for position in attribute_columns)
    return column_names, attributes_by_id


def main(markdown_file="../similar_attacks.md",
         preprocessed_file="../data/vcdb_fully_processed.csv",
         output_file="../data/vcdb_similarity_comparison.csv"):
    """Write a side-by-side attribute comparison for each listed incident pair.

    Each line of ``markdown_file`` that contains at least two UUID-shaped
    tokens names a pair of incidents. For every pair whose two ids both occur
    in ``preprocessed_file``, one row is written to ``output_file`` with the
    columns interleaved as ``victim1_<attr>, victim2_<attr>``, starting with
    the two incident ids.

    Parameters
    ----------
    markdown_file : str
        Path of the markdown file listing the incident pairs.
    preprocessed_file : str
        Path of the CSV holding one row of attributes per incident.
    output_file : str
        Path of the CSV to create; an existing file is overwritten.

    Raises
    ------
    ValueError
        If ``preprocessed_file`` has no header row or no ``incident_id`` column.
    IOError / OSError
        If any of the files cannot be opened.
    """
    relevant_attributes = ["incident_id", "industry.categories", "victim.victim_id", "timeline.incident.month", "timeline.incident.year", "timeline.discovery.day_count", "attribute.confidentiality.data_total", "actor.internal", "actor.external"]

    # Read both inputs completely before opening the output, so a missing or
    # malformed input no longer leaves behind a truncated output file.
    pairs = _read_incident_pairs(markdown_file)
    column_names, attributes_by_id = _load_incident_attributes(
        preprocessed_file, relevant_attributes)

    new_header = []
    for name in column_names:
        new_header.append("victim1_" + name)
        new_header.append("victim2_" + name)

    with _open_compat(output_file, "w") as new_csv_file:
        wrtr = csv.writer(new_csv_file)
        wrtr.writerow(new_header)
        for first_id, second_id in pairs:
            # Pairs that reference an incident missing from the CSV are skipped.
            if first_id not in attributes_by_id or second_id not in attributes_by_id:
                continue
            new_row = [first_id, second_id]
            # Both attribute lists come from the same column selection, so they
            # have equal length and zip() pairs them up exactly.
            for first_value, second_value in zip(attributes_by_id[first_id],
                                                 attributes_by_id[second_id]):
                new_row.append(first_value)
                new_row.append(second_value)
            wrtr.writerow(new_row)
if __name__ == "__main__":
    # Best-effort run: a failure is reported instead of propagating.
    try:
        main()
    except Exception as e:
        # Same one-line message as before, written so it prints identically
        # on Python 2 and Python 3.
        print('%s %s' % ('Something went wrong ', e))
        # The message alone does not say where the failure happened, so the
        # full traceback is printed as well (to stderr).
        traceback.print_exc()
In [25]:
from pandas import read_csv
from urllib import urlopen
import csv
# Load the pair-comparison CSV. read_csv is given the path directly:
# routing a local file through urlopen left an open handle that was never
# closed and tied the cell to Python 2's urllib.
comparison_path = "../data/vcdb_similarity_comparison.csv"
df = read_csv(comparison_path)

# Describe the victim1_victim.victim_id column within each
# victim1_incident_id group and save that summary as a second CSV.
grouped_object = df.groupby("victim1_incident_id")
grouped_object["victim1_victim.victim_id"].describe().to_csv("../data/vcdb_similarity_comparison_2.csv")
Out[25]:
In [ ]: