Training labels will be generated from ClamAV, Windows Defender and VirusTotal.com reports.
- vs00251.txt (clamav)
- vs00252.txt (clamav)
- vs00263.txt (clamav)
- vs00264.txt (clamav)
- MPDetection2.log (Windows Defender)
- MPDetection3.log (Windows Defender)
In [1]:
from multiprocessing import Pool
import os
from csv import writer
import numpy as np
import pandas as pd
import math
import scipy.misc
import array
import time as tm
import io # this is required as a compatability layer between 2.x and 3.x because 2.x cannot read utf-16 text files.
import re
import matplotlib.pyplot as plt
import seaborn # make it look pretty.
In [2]:
# Base directory that holds the sample directories.
ext_drive = '/opt/vs/'
# Directory listings of the "train" and "train2" sample directories.
tfiles1 = os.listdir(os.path.join(ext_drive, "train"))
tfiles2 = os.listdir(os.path.join(ext_drive, "train2"))
In [ ]:
# Convert the ClamAV reports vs00263.txt, vs00264.txt and vs00apt.txt into
# one CSV file with the columns (file_name, malware_type).
report_files = ['data/vs00263.txt', 'data/vs00264.txt', 'data/vs00apt.txt']
# Read every report before the output file is created, so that a missing
# report does not leave a truncated CSV file behind.
reports = []
for file_name in report_files:
    with open(file_name, 'r') as vfr:  # closed even if reading fails
        reports.append(vfr.readlines())
cols = ['file_name', 'malware_type']  # the column names.
with open('data/clamav-vs263-264.csv', 'w') as fop:
    csv_wouter = writer(fop)
    csv_wouter.writerow(cols)
    for vlines in reports:
        process_clamav_report(vlines, csv_wouter)
In [15]:
# Convert the ClamAV reports vs00251.txt and vs00252.txt into one CSV file
# with the columns (file_name, malware_type).
report_files = ['data/vs00251.txt', 'data/vs00252.txt']
# Read every report before the output file is created, so that a missing
# report does not leave a truncated CSV file behind.
reports = []
for file_name in report_files:
    with open(file_name, 'r') as vfr:  # closed even if reading fails
        reports.append(vfr.readlines())
cols = ['file_name', 'malware_type']  # the column names.
with open('data/clamav001.csv', 'w') as fop:
    csv_wouter = writer(fop)
    csv_wouter.writerow(cols)
    for vlines in reports:
        process_clamav_report(vlines, csv_wouter)
In [14]:
def process_clamav_report(vlines, outfile):
    """Extract [file name, malware type] rows from ClamAV report lines.

    A line is split on whitespace after underscores and colons in its path
    part have been turned into spaces; the second token is taken as the file
    name and the third as the malware type.  Lines are presumably of the form
    ``<dir>/<prefix>_<name>: <result> ...`` -- confirm against the reports.
    Processing stops at the first line starting with ``---`` (the scan
    summary).  Lines with fewer than three tokens are ignored.

    Args:
        vlines: iterable of report lines (strings).
        outfile: object with a ``writerows`` method, e.g. a ``csv.writer``.

    Returns:
        None.  Rows are written to ``outfile``; progress is printed.
    """
    counter = 0
    outlines = []
    for idx, line in enumerate(vlines):
        if line.startswith('---'):  # we hit the scan summary at end of file.
            break
        # Underscores are only split in the part before the first ": ", so a
        # result name such as "Win.Exploit.CVE_2012_0158-1" is kept whole
        # instead of being cut at its first underscore.  A line without ": "
        # is tokenised exactly as a whole.
        path_part, _, result_part = line.rstrip().partition(': ')
        tokens = path_part.replace('_', ' ').replace(':', ' ').split()
        tokens += result_part.replace(':', ' ').split()
        if len(tokens) <= 2:
            continue
        malware_file_name, malware_type = tokens[1], tokens[2]
        outlines.append([malware_file_name, malware_type])
        counter += 1
        if (idx % 1000) == 0:  # write out some lines
            outfile.writerows(outlines)
            outlines = []
            print("Processed line number {:d} : {:s} -> {:s}.".format(idx, malware_file_name, malware_type))
    # Finish off.
    if outlines:
        outfile.writerows(outlines)
    print("Completed processing {:d} lines.".format(counter))
In [10]:
help(writer)
Now generate integer values for the labels based on the malware type. Since ClamAV does not
recognise all types of malware, find the unclassified files and send them to VirusTotal.com for a
second opinion. As there is no standard method of defining malware type strings, we will
have to do some munging on the VirusTotal results and convert them to ClamAV-style malware
classification strings. Also scan with Windows Defender and MalwareBytes Anti-Malware and
compare the results.
In [2]:
# now get the clamav data
clammals = pd.read_csv('data/clamav001.csv')
In [3]:
clammals.head()
Out[3]:
In [4]:
clammals.shape
Out[4]:
In [5]:
# Samples that ClamAV classifies as OK are saved to their own file so that
# they can be sent to VirusTotal.com for a second opinion.
is_ok = clammals['malware_type'].eq('OK')
moks = clammals[is_ok]
moks.to_csv('data/malok.csv', index=False)
# Now sort and write out the labels.
In [6]:
moks.head()
Out[6]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [7]:
moks.shape
Out[7]:
In [4]:
# Convert the Windows Defender detection log into a CSV file with the
# columns (file_name, malware_type).
# NOTE: windows defender logs are UTF-16, so have to use io module to open in Python 2.x
# Two scans were conducted on vs00251 and vs00252.  MPDetection2.log contains
# all the detections from both scans after the second scan, so the earlier
# MPDetection1.log is not processed.
file_name = 'data/MPDetection2.log'
with io.open(file_name, mode='r', encoding='utf-16') as vfr2:  # closed even if reading fails
    vlines2 = vfr2.readlines()
cols = ['file_name', 'malware_type']  # the column names.
with open('data/defender-vs251-252.csv', 'w') as fop:
    csv_wouter = writer(fop)
    csv_wouter.writerow(cols)
    process_defender_report(vlines2, csv_wouter)
In [2]:
def process_defender_report(vlines, outfile):
    """Extract [file name, malware type] rows from Windows Defender log lines.

    Only lines that contain ``DETECTION`` after their first character are
    used.  Such a line is split on whitespace: the third token is the malware
    type and the fourth token is the file path.  In the path, ``_`` and
    ``->`` are treated as separators and the second piece is taken as the
    file name (presumably paths look like ``...<prefix>_<name>->...`` --
    confirm against the log files).  Lines that do not provide all of these
    pieces are skipped instead of raising IndexError.

    Args:
        vlines: iterable of log lines (strings).
        outfile: object with a ``writerows`` method, e.g. a ``csv.writer``.

    Returns:
        None.  Rows are written to ``outfile``; progress is printed.
    """
    counter = 0
    outlines = []
    for idx, line in enumerate(vlines):
        if line.find('DETECTION') <= 0:  # not a detection record
            continue
        tokens = line.rstrip().split()
        if len(tokens) <= 3:  # both tokens[2] and tokens[3] are needed
            continue
        malware_type = tokens[2]
        path_tokens = tokens[3].replace('_', ' ').replace('->', ' ').split()
        if len(path_tokens) < 2:  # no file name part after the separator
            continue
        malware_file_name = path_tokens[1]
        outlines.append([malware_file_name, malware_type])
        counter += 1
        # Progress is reported here, where both names are guaranteed bound.
        if (idx % 1000) == 0:  # write out some lines
            outfile.writerows(outlines)
            outlines = []
            print("Processed line number {:d} : {:s} -> {:s}.".format(idx, malware_file_name, malware_type))
    # Finish off.
    if outlines:
        outfile.writerows(outlines)
    print("Completed processing {:d} lines.".format(counter))
In [4]:
help(pd.DataFrame.drop_duplicates)
In [ ]:
In [8]:
windefmals = pd.read_csv('data/defender001.csv')
windefmals.head()
Out[8]:
In [9]:
windefmals.shape
Out[9]:
In [10]:
clammals.head()
Out[10]:
In [11]:
clammals.shape
Out[11]:
In [12]:
131073 - 97347
Out[12]:
In [13]:
moks.head()
Out[13]:
In [14]:
moks.shape
Out[14]:
In [21]:
allmals = clammals.merge(windefmals, on='file_name', how='outer', indicator=True, sort=True)
In [22]:
allmals.head(20)
Out[22]:
In [27]:
uniq_allmals = allmals.drop_duplicates(subset='file_name', keep='first')
In [28]:
uniq_allmals.head(20)
Out[28]:
In [29]:
uniq_allmals.shape
Out[29]:
In [33]:
# Replace missing values (NaN) with 'OK'.  np.nan is used because the
# np.NaN alias was removed in NumPy 2.0.
filled_uniq_allmals = uniq_allmals.replace(np.nan, 'OK')
In [34]:
filled_uniq_allmals.head(20)
Out[34]:
In [35]:
filled_uniq_allmals.shape
Out[35]:
In [36]:
# Now we have our combined AV results, write to file.
filled_uniq_allmals.to_csv('data/sorted-av-report.csv', index=False)
In [38]:
# Samples classified as OK by both ClamAV (malware_type_x) and Windows
# Defender (malware_type_y) have to be sent to VirusTotal.com for a second
# opinion, so they are saved to their own file.
clamav_ok = filled_uniq_allmals['malware_type_x'] == 'OK'
defender_ok = filled_uniq_allmals['malware_type_y'] == 'OK'
moks = filled_uniq_allmals[clamav_ok & defender_ok]
moks.to_csv('data/malok.csv', index=False)
In [39]:
moks.head(20)
Out[39]:
In [40]:
moks.shape
Out[40]:
In [2]:
mals = pd.read_csv('data/sorted-av-report.csv')
mals.head()
Out[2]:
In [3]:
mals.shape
Out[3]:
In [5]:
scalar_labels = [0] * mals.shape[0]
len(scalar_labels)
Out[5]:
In [3]:
type_x = mals['malware_type_x']
type_y = mals['malware_type_y']
x_ok = type_x[type_x == 'OK']
y_ok = type_y[type_y == 'OK']
len(x_ok)
Out[3]:
In [5]:
len(y_ok)
Out[5]:
In [6]:
# Build the scalar class labels.  ClamAV is the default classification; when
# ClamAV says OK but Defender does not, the Defender classification is used.
# Samples that both report as OK keep the label value 0 for now.
# NOTE(review): columns 1 and 2 of mals are presumably malware_type_x and
# malware_type_y -- confirm against the CSV layout.
scalar_labels = [0] * mals.shape[0]
label_map = {}
counter = 0
for idx, x_val in enumerate(type_x):
    if x_val == 'OK':
        if type_y.iloc[idx] == 'OK':
            continue  # leave the scalar label == 0
        # copy the defender classification to ClamAV classification
        mals.iloc[idx, 1] = mals.iloc[idx, 2]
    class_name = mals.iloc[idx, 1]
    # a classification seen for the first time gets the next scalar value
    if class_name not in label_map:
        counter += 1
        label_map[class_name] = counter
    scalar_labels[idx] = label_map[class_name]
mals['label'] = scalar_labels
mals.head(20)
Out[6]:
In [7]:
mals.to_csv('data/sorted_train_labels.csv', index=False)
In [14]:
# Output the malware scalar classifications, one row per malware type,
# sorted by malware type.
cols = ['malware_type', 'class']  # the column names.
# sorted() works on both Python 2 and 3 (dict.keys() has no sort() on Python 3).
sorted_keys = sorted(label_map)
outlines = [[key, label_map[key]] for key in sorted_keys]
with open('data/malware-class-labels.csv', 'w') as fop:  # closed even on error
    csv_wouter = writer(fop)
    csv_wouter.writerow(cols)
    csv_wouter.writerows(outlines)
# Report the number of rows actually written.
print("Completed processing {:d} labels.".format(len(sorted_keys)))
In [8]:
131074 - 16918
Out[8]:
In [ ]:
help(allmals.replace)
In [4]:
mals = pd.read_csv('data/sorted_train_labels.csv')
mals.head(20)
Out[4]:
In [3]:
mals.shape
Out[3]:
In [7]:
# Now generate unique scalar label map for malware families, converting Windows Defender format to ClamAV as necessary.
type_x = mals['malware_type_x']
scalar_labels = [0] * mals.shape[0]
family_labels = [' '] * mals.shape[0]
family_label_map = {}    # family name -> scalar label
sample_counter_map = {}  # malware type -> number of samples
family_counter_map = {}  # family name -> number of samples
counter = 0
# Windows Defender malware definition patterns, e.g. "Backdoor:MSIL/Bladabindi!rfn".
# Raw strings keep "\w" from being read as a string escape; '-' is listed
# last in the character class so that it is a literal hyphen, not a range.
p1 = re.compile(r'(\w+):(\w+)/(\w+)[!./-]+(\w+)')
p2 = re.compile(r'(\w+):(\w+)/(\w+)')
pcav = re.compile(r'(\w+)\.(\w+)\.(\w+)[!./-](\w+)')  # ClamAV malware definition pattern (not used in this cell)
malware_family = 'unknown'
for idx, x_val in enumerate(type_x):
    # first count the sample type
    sample_counter_map[x_val] = sample_counter_map.get(x_val, 0) + 1
    if x_val == 'OK':
        continue  # leave the scalar label == 0, the malware sample has not been classified.
    # if it is a defender classification then convert to ClamAV classification.
    m = p1.match(x_val) or p2.match(x_val)
    if m is not None:
        malware_family = m.group(2) + '.' + m.group(1) + '.' + m.group(3)
    else:
        # otherwise treat it as a ClamAV definition and cut it at the first hyphen.
        pos = x_val.find('-')
        malware_family = x_val[0:pos] if pos > 0 else x_val
    # now add the classification to the label map with a new scalar value
    if malware_family not in family_label_map:
        counter += 1
        family_label_map[malware_family] = counter
    # Count the malware family occurrences.
    family_counter_map[malware_family] = family_counter_map.get(malware_family, 0) + 1
    # now get the scalar label for this malware sample
    scalar_labels[idx] = family_label_map[malware_family]
    family_labels[idx] = malware_family
    if (idx % 1000) == 0:  # report progress
        print("Processed family label {:s} -> {:d}.".format(malware_family, family_label_map[malware_family]))
# Finish off by adding malware family label to training label set.
mals['family_label'] = scalar_labels
mals['family_label_str'] = family_labels
mals.head(20)
Out[7]:
In [8]:
mals.to_csv('data/sorted-family-train-labels.csv', index=False)
In [11]:
# Output the malware family scalar classifications, one row per family,
# sorted by family name.
cols = ['malware_type', 'class']  # the column names.
# sorted() works on both Python 2 and 3 (dict.keys() has no sort() on Python 3).
sorted_keys = sorted(family_label_map)
outlines = [[key, family_label_map[key]] for key in sorted_keys]
with open('data/malware-family-labels.csv', 'w') as fop:  # closed even on error
    csv_wouter = writer(fop)
    csv_wouter.writerow(cols)
    csv_wouter.writerows(outlines)
print("Completed processing {:d} family labels.".format(len(sorted_keys)))
In [12]:
# Output the malware classification counts, one row per malware type,
# sorted by malware type.
cols = ['malware_type', 'count']  # the column names.
# sorted() works on both Python 2 and 3 (dict.keys() has no sort() on Python 3).
sorted_keys = sorted(sample_counter_map)
outlines = [[key, sample_counter_map[key]] for key in sorted_keys]
with open('data/malware-class-counts.csv', 'w') as fop:  # closed even on error
    csv_wouter = writer(fop)
    csv_wouter.writerow(cols)
    csv_wouter.writerows(outlines)
print("Completed processing {:d} samples.".format(len(sorted_keys)))
In [13]:
# Output the malware family counts, one row per family, sorted by family name.
cols = ['malware_type', 'count']  # the column names.
# sorted() works on both Python 2 and 3 (dict.keys() has no sort() on Python 3).
sorted_keys = sorted(family_counter_map)
outlines = [[key, family_counter_map[key]] for key in sorted_keys]
with open('data/malware-family-counts.csv', 'w') as fop:  # closed even on error
    csv_wouter = writer(fop)
    csv_wouter.writerow(cols)
    csv_wouter.writerows(outlines)
print("Completed processing {:d} families.".format(len(sorted_keys)))
Experiment 2: use truncated ClamAV or WinDefender definitions to generate malware families and
assign a scalar training label to each family. Use the WinDefender definitions by default, or the ClamAV
definitions if WinDefender classifies a sample as OK. Start fresh with sorted-av-report.csv and generate new malware classification
labels and family labels.
- Script: generate-train-labels.py
In [25]:
mals = pd.read_csv('data/sorted-av-report.csv')
mals.head(20)
Out[25]:
In [26]:
# Build the scalar class labels with WinDefender as the default
# classification: when WinDefender does not say OK its classification
# replaces the ClamAV one, otherwise the ClamAV classification is kept.
# 'OK' itself maps to the label value 0.
# NOTE(review): columns 1 and 2 of mals are presumably malware_type_x and
# malware_type_y -- confirm against the CSV layout.
type_x = np.array(mals['malware_type_x'])
type_y = np.array(mals['malware_type_y'])
scalar_labels = [0] * mals.shape[0]
scalar_label_map = {'OK': 0}
counter = 0
for idx, y_val in enumerate(type_y):
    if y_val != 'OK':
        # copy the defender classification to ClamAV classification
        mals.iloc[idx, 1] = mals.iloc[idx, 2]
    class_name = mals.iloc[idx, 1]
    # a classification seen for the first time gets the next scalar value
    if class_name not in scalar_label_map:
        counter += 1
        scalar_label_map[class_name] = counter
    scalar_labels[idx] = scalar_label_map[class_name]
mals['sample_label'] = scalar_labels
mals.head(20)
Out[26]:
In [27]:
mals.to_csv('data/sorted-av-report-labels-wd.csv', index=False)
In [28]:
# Output the malware sample scalar classifications, one row per malware type,
# sorted by malware type.
cols = ['malware_type', 'class']  # the column names.
# sorted() works on both Python 2 and 3 (dict.keys() has no sort() on Python 3).
sorted_keys = sorted(scalar_label_map)
outlines = [[key, scalar_label_map[key]] for key in sorted_keys]
with open('data/malware-class-labels-wd.csv', 'w') as fop:  # closed even on error
    csv_wouter = writer(fop)
    csv_wouter.writerow(cols)
    csv_wouter.writerows(outlines)
# Report the number of rows actually written.
print("Completed processing {:d} labels.".format(len(sorted_keys)))
In [ ]:
In [ ]:
In [29]:
# Now generate unique scalar label map for malware families.
type_x = np.array(mals['malware_type_x'])
family_scalar_labels = [0] * mals.shape[0]
family_labels = [' '] * mals.shape[0]
family_label_map = {}    # family name -> scalar label
sample_counter_map = {}  # malware type -> number of samples
family_counter_map = {}  # family name -> number of samples
counter = 0
# Windows Defender malware definition patterns.  Raw strings keep "\w" from
# being read as a string escape; '-' is listed last in the character class
# so that it is a literal hyphen, not a range.
pwd1 = re.compile(r'(\w+):(\w+)/(\w+)[!./-]+(\w+)')
pwd2 = re.compile(r'(\w+):(\w+)/(\w+)')
pcav = re.compile(r'(\w+)\.(\w+)\.(\w+)[!./-](\w+)')  # ClamAV malware definition pattern.
malware_family = 'unknown'
family_label_map['unknown'] = 0  # The default family scalar label.
for idx, x_val in enumerate(type_x):
    # first count the sample type
    sample_counter_map[x_val] = sample_counter_map.get(x_val, 0) + 1
    if x_val == 'OK':
        # leave the scalar label == 0, the malware sample has not been classified.
        malware_family = 'unknown'
    else:
        # if it is a defender classification then convert to ClamAV definition
        # style by rearranging the first three matched components.
        m = pwd1.match(x_val) or pwd2.match(x_val)
        if m is not None:
            malware_family = m.group(2) + '.' + m.group(1) + '.' + m.group(3)
        else:
            # then check if it is a ClamAV definition.
            m = pcav.match(x_val)
            if m is not None:  # just truncate the end bit off.
                malware_family = m.group(1) + '.' + m.group(2) + '.' + m.group(3)
            else:
                malware_family = x_val  # catch the corner cases and default to original name/definition.
    # now add the classification to the label map with a new scalar value
    if malware_family not in family_label_map:
        counter += 1
        family_label_map[malware_family] = counter
    # Count the malware family occurrences.
    family_counter_map[malware_family] = family_counter_map.get(malware_family, 0) + 1
    # now get the scalar label for this malware sample
    family_scalar_labels[idx] = family_label_map[malware_family]
    family_labels[idx] = malware_family
    if (idx % 1000) == 0:  # report progress
        print("Processed family label {:s} -> {:d}.".format(malware_family, family_label_map[malware_family]))
# Finish off by adding malware family label to training label set.
mals['family_label'] = family_scalar_labels
mals['family_label_str'] = family_labels
mals.head(20)
Out[29]:
In [30]:
mals.to_csv('data/sorted-family-train-labels-wd.csv', index=False)
In [31]:
# Output the malware family scalar classifications, one row per family,
# sorted by family name.
cols = ['malware_type', 'class']  # the column names.
# sorted() works on both Python 2 and 3 (dict.keys() has no sort() on Python 3).
sorted_keys = sorted(family_label_map)
outlines = [[key, family_label_map[key]] for key in sorted_keys]
with open('data/malware-family-labels-wd.csv', 'w') as fop:  # closed even on error
    csv_wouter = writer(fop)
    csv_wouter.writerow(cols)
    csv_wouter.writerows(outlines)
print("Completed processing {:d} family labels.".format(len(sorted_keys)))
In [32]:
# Output the malware sample classification counts, one row per malware type,
# sorted by malware type.
cols = ['malware_type', 'count']  # the column names.
# sorted() works on both Python 2 and 3 (dict.keys() has no sort() on Python 3).
sorted_keys = sorted(sample_counter_map)
outlines = [[key, sample_counter_map[key]] for key in sorted_keys]
with open('data/malware-class-counts-wd.csv', 'w') as fop:  # closed even on error
    csv_wouter = writer(fop)
    csv_wouter.writerow(cols)
    csv_wouter.writerows(outlines)
print("Completed processing {:d} samples.".format(len(sorted_keys)))
In [33]:
# Output the malware family counts, one row per family, sorted by family name.
cols = ['malware_type', 'count']  # the column names.
# sorted() works on both Python 2 and 3 (dict.keys() has no sort() on Python 3).
sorted_keys = sorted(family_counter_map)
outlines = [[key, family_counter_map[key]] for key in sorted_keys]
with open('data/malware-family-counts-wd.csv', 'w') as fop:  # closed even on error
    csv_wouter = writer(fop)
    csv_wouter.writerow(cols)
    csv_wouter.writerows(outlines)
print("Completed processing {:d} families.".format(len(sorted_keys)))
In [4]:
# Join each scalar label table with its count table on malware_type and
# save the result: first the malware families, then the malware samples.
for labels_csv, counts_csv, joined_csv in (
    ('data/malware-family-labels-wd.csv', 'data/malware-family-counts-wd.csv', 'data/malware-family-wd.csv'),
    ('data/malware-class-labels-wd.csv', 'data/malware-class-counts-wd.csv', 'data/malware-class-wd.csv'),
):
    cldf = pd.read_csv(labels_csv)
    ccdf = pd.read_csv(counts_csv)
    cjdf = cldf.merge(ccdf, on='malware_type')
    cjdf.to_csv(joined_csv, index=False)
In [ ]:
help(pd.merge)
In [2]:
mals = pd.read_csv('data/sorted-family-train-labels-wd.csv')
mals.head()
In [5]:
mals.drop(['malware_type_y', '_merge'], axis=1, inplace=True)
In [6]:
mals.head(20)
Out[6]:
In [8]:
mals.to_csv('data/sorted-train-labels.csv', index=False)
In [ ]:
In [15]:
# Load in the train labels for each sample set run then compare the
# training label value for each malware family and class to ensure
# each one has a unique scalar training label and the same malware
# types and families have the same label.
# [filename,malware_type_x,malware_type_y,sample_label,family_name,family_label]
def validate_label_generation():
    """Cross-check the labels of the two sorted train label files.

    Every row of 'data/sorted-train-labels-vs251-252.csv' is compared with
    every row of 'data/sorted-train-labels-vs263-264-apt.csv'.  For each pair
    with the same malware_type_x, a differing sample_label and a differing
    family_label are each printed and counted.  Progress is printed every
    1000 rows of the first file and the total count at the end.

    Returns:
        None.
    """
    first_df = pd.read_csv('data/sorted-train-labels-vs251-252.csv')
    second_df = pd.read_csv('data/sorted-train-labels-vs263-264-apt.csv')
    mismatches = 0
    names_1 = np.array(first_df['malware_type_x'])
    families_1 = np.array(first_df['family_name'])  # loaded but not compared
    sample_labels_1 = np.array(first_df['sample_label'])
    family_labels_1 = np.array(first_df['family_label'])
    names_2 = np.array(second_df['malware_type_x'])
    families_2 = np.array(second_df['family_name'])  # loaded but not compared
    sample_labels_2 = np.array(second_df['sample_label'])
    family_labels_2 = np.array(second_df['family_label'])
    for row_1, name_1 in enumerate(names_1):
        for row_2, name_2 in enumerate(names_2):
            if name_1 != name_2:
                continue
            sample_1, sample_2 = sample_labels_1[row_1], sample_labels_2[row_2]
            if sample_1 != sample_2:
                print("Sample label incongruence: {:d} {:d}".format(sample_1, sample_2))
                mismatches += 1
            family_1, family_2 = family_labels_1[row_1], family_labels_2[row_2]
            if family_1 != family_2:
                print("Family label incongruence: {:d} {:d}".format(family_1, family_2))
                mismatches += 1
        if row_1 % 1000 == 0:
            print("Processed {:d} malware names.".format(row_1))
    print("Total Incongruence Errors: {:d}".format(mismatches))
    return
In [16]:
validate_label_generation()
In [10]:
# Split out the training sample sets.
def split_training_sets(training_set_directory, train_label_file, output_file):
    """Select the label rows that belong to the files of one directory.

    Each directory entry name is cut after its first underscore (a name
    without an underscore is used whole).  Rows of the label table whose
    'file_name' matches one of these names are kept, written to
    ``output_file`` as CSV without the index, and returned.

    Args:
        training_set_directory: directory whose entries are listed.
        train_label_file: CSV file with a 'file_name' column.
        output_file: path of the CSV file to write.

    Returns:
        The selected rows as a pandas DataFrame.
    """
    labels_df = pd.read_csv(train_label_file)
    sample_names = [
        entry[entry.find('_') + 1:]
        for entry in os.listdir(training_set_directory)
    ]
    in_directory = labels_df['file_name'].isin(sample_names)
    subset_df = labels_df[in_directory]
    subset_df.to_csv(output_file, index=False)
    return subset_df
In [11]:
s1_df = split_training_sets('/opt/vs/train1/', 'data/sorted-train-labels-vs251-252.csv', 'data/sorted-train-labels-vs251.csv')
s1_df.head()
Out[11]:
In [12]:
s1_df.shape
Out[12]:
In [13]:
s2_df = split_training_sets('/opt/vs/train2/', 'data/sorted-train-labels-vs251-252.csv', 'data/sorted-train-labels-vs252.csv')
s2_df.head()
Out[13]:
In [14]:
s3_df = split_training_sets('/opt/vs/train3/', 'data/sorted-train-labels-vs263-264-apt.csv', 'data/sorted-train-labels-vs263.csv')
s3_df.head()
Out[14]:
In [15]:
s4_df = split_training_sets('/opt/vs/train4/', 'data/sorted-train-labels-vs263-264-apt.csv', 'data/sorted-train-labels-vs264.csv')
s4_df.head()
Out[15]:
In [16]:
sa_df = split_training_sets('/opt/vs/apt/', 'data/sorted-train-labels-vs263-264-apt.csv', 'data/sorted-train-labels-apt.csv')
sa_df.head()
Out[16]:
In [17]:
s1_df = split_training_sets('/opt/vs/train1/', 'data/sorted-entropy-features-vs251-252.csv', 'data/sorted-entropy-features-vs251.csv')
s1_df.head()
Out[17]:
In [18]:
s1_df = split_training_sets('/opt/vs/train2/', 'data/sorted-entropy-features-vs251-252.csv', 'data/sorted-entropy-features-vs252.csv')
s1_df.head()
Out[18]:
In [19]:
s1_df.shape
Out[19]:
In [ ]:
In [5]:
mals1_df = pd.read_csv('data/sorted-av-report-vs251-252.csv')
mals1_df.head()
Out[5]:
In [6]:
mals1_df.drop('_merge', axis=1, inplace=True)
mals1_df.head()
Out[6]:
In [9]:
mals1_df.to_csv('data/sorted-av-report-vs251-252.csv', index=False)
In [4]:
malcounts = mals['malware_type_x'].value_counts()
malcounts
Out[4]:
In [5]:
malcounts[:10].plot(kind='barh', rot=0)
plt.show()
In [8]:
# Windows Defender malware class matching patterns (raw strings, so that
# "\w" is not read as a string escape).
p1 = re.compile(r'(\w+):(\w+)/(\w+)[!.-]+(\w+)')
p2 = re.compile(r'(\w+):(\w+)/(\w+)')
m = p1.match('Backdoor:MSIL/Bladabindi!rfn')
# m.group(1) == 'Backdoor'
# m.group(2) == 'MSIL'
# m.group(3) == 'Bladabindi'
# m.group(4) == 'rfn'
# Convert to ClamAV style malware family
malware_family = 'unknown'
if m is not None:
    malware_family = m.group(2) + '.' + m.group(1) + '.' + m.group(3)
else:
    # fall back to the pattern without a trailing suffix
    m = p2.match('Backdoor:MSIL/Bladabindi')
    if m is not None:
        malware_family = m.group(2) + '.' + m.group(1) + '.' + m.group(3)
print(malware_family)
# Convert ClamAV malware class to malware family by removing the number and hyphen from the end of the malware string.
malware_str = 'Andr.Adware.Kuguo-2'
pos = malware_str.find('-')
if pos > 0:
    malware_family = malware_str[0:pos]
else:
    malware_family = malware_str
print(malware_family)
In [3]:
class_labels_df = pd.read_csv('data/av-malware-class-labels.csv')
family_labels_df = pd.read_csv('data/av-malware-family-labels.csv')
vs1_df = pd.read_csv('data/sorted-av-report-vs251-252.csv')
vs2_df = pd.read_csv('data/sorted-av-report-vs263-264-apt.csv')
print("Class Labels = {:d}, Family Labels = {:d}".format(class_labels_df.shape[0], family_labels_df.shape[0]))
In [5]:
# Assign a scalar class label to every row of vs2_df and report how many
# distinct class names there are.  The malware_type_y value is used unless
# it is 'OK', in which case the value in column 1 of vs2_df is used
# (presumably malware_type_x -- confirm against the CSV layout).
type_x = np.array(vs2_df['malware_type_x'])
type_y = np.array(vs2_df['malware_type_y'])
scalar_labels = [0] * vs2_df.shape[0]
counter = 0
scalar_label_map = {}
for idx, y_val in enumerate(type_y):
    malware_name = y_val if y_val != 'OK' else vs2_df.iloc[idx, 1]
    # a name seen for the first time gets the next scalar value
    if malware_name not in scalar_label_map:
        counter += 1
        scalar_label_map[malware_name] = counter
    # now get the scalar label for this malware sample
    scalar_labels[idx] = scalar_label_map[malware_name]
print("Class Labels: {:d}".format(len(scalar_label_map)))
In [7]:
# Count the class names in scalar_label_map that do not occur in the
# malware_type_x column of the vs251-252 train labels.
sorted_train_labels_df = pd.read_csv('data/sorted-train-labels-vs251-252.csv')
type_x = np.array(sorted_train_labels_df['malware_type_x'])
counter = 0
for malware_name in scalar_label_map:
    is_known = malware_name in type_x
    if not is_known:
        counter = counter + 1
print("New Malware Types: {:d}".format(counter))
In [8]:
5987 + 2347
Out[8]:
In [2]:
class_labels_df = pd.read_csv('data/av-malware-class-labels.csv')
family_labels_df = pd.read_csv('data/av-malware-family-labels.csv')
vs1_df = pd.read_csv('data/sorted-train-labels-vs251-252.csv')
vs2_df = pd.read_csv('data/sorted-train-labels-vs263-264-apt.csv')
print("Class Labels = {:d}, Family Labels = {:d}".format(class_labels_df.shape[0], family_labels_df.shape[0]))
In [3]:
newclass_labels_df = pd.read_csv('data/av-malware-class-labels-wd.csv')
newfamily_labels_df = pd.read_csv('data/av-malware-family-labels-wd.csv')
print("Class Labels = {:d}, Family Labels = {:d}".format(newclass_labels_df.shape[0], newfamily_labels_df.shape[0]))
In [10]:
newclass_labels_df = pd.read_csv('data/av-malware-class-labels-wd.csv')
newfamily_labels_df = pd.read_csv('data/av-malware-family-labels-wd.csv')
print("Class Labels = {:d}, Family Labels = {:d}".format(newclass_labels_df.shape[0], newfamily_labels_df.shape[0]))
In [4]:
class_labels_df = pd.read_csv('data/av-malware-class-labels.csv')
family_labels_df = pd.read_csv('data/av-malware-family-labels.csv')
vs1_df = pd.read_csv('data/sorted-train-labels-vs251.csv')
vs2_df = pd.read_csv('data/sorted-train-labels-vs252.csv')
vs3_df = pd.read_csv('data/sorted-train-labels-vs263.csv')
vs4_df = pd.read_csv('data/sorted-train-labels-vs264.csv')
vs5_df = pd.read_csv('data/sorted-train-labels-apt.csv')
print("Class Labels = {:d}, Family Labels = {:d}".format(class_labels_df.shape[0], family_labels_df.shape[0]))
ok_count = vs1_df["malware_type_x"].value_counts()
ok_count
Out[4]:
In [6]:
vs1_df.shape[0] - 8007
Out[6]:
In [7]:
ok_count = vs2_df["malware_type_x"].value_counts()
ok_count
Out[7]:
In [8]:
vs2_df.shape[0] - 8911
Out[8]:
In [9]:
ok_count = vs3_df["malware_type_x"].value_counts()
ok_count
Out[9]:
In [10]:
vs3_df.shape[0] - 13924
Out[10]:
In [11]:
ok_count = vs4_df["malware_type_x"].value_counts()
ok_count
Out[11]:
In [12]:
vs4_df.shape[0] - 23262
Out[12]:
In [13]:
ok_count = vs5_df["malware_type_x"].value_counts()
ok_count
Out[13]:
In [14]:
vs5_df.shape[0] - 1
Out[14]:
In [17]:
(vs1_df.shape[0] * 4) #+ 293
Out[17]:
In [18]:
262144 + 293
Out[18]:
In [ ]:
# For every .asm file in /opt/vs/asm/ check that a .txt (header) file with
# the same base name exists; report each match and count the misses.
counter = 0
errors = 0
with open('/opt/vs/unpacked_file_list-vs251-252.txt', 'r') as fip:
    unpacked_list = fip.readlines()  # read but not used by the check below
file_list = sorted(os.listdir('/opt/vs/asm/'))
asm_list = [fname for fname in file_list if fname.endswith('.asm')]
hdr_list = [fname for fname in file_list if fname.endswith('.txt')]
print("Header list size: {:d}".format(len(hdr_list)))
print("ASM list size: {:d}".format(len(asm_list)))
# Base names of the header files, as a set, so that each lookup is O(1)
# instead of a scan over the whole header list.
hdr_names = set(hname[0:hname.find(".txt")] for hname in hdr_list)
for fname in asm_list:
    asm_name = fname[0:fname.find(".asm")]
    if asm_name in hdr_names:
        print("Successful Disassembly for: {:s}".format(asm_name))
        counter += 1
    else:
        errors += 1
print("Total Successful Disassemblies: {:d} Total Disassembly Errors: {:d}".format(counter, errors))
In [13]:
# For every .txt (header) file in /opt/vs/asm/ check that an .asm file with
# the same base name exists; report each header without one and count both.
counter = 0
errors = 0
with open('/opt/vs/unpacked_file_list-vs251-252.txt', 'r') as fip:
    unpacked_list = fip.readlines()  # read but not used by the check below
file_list = sorted(os.listdir('/opt/vs/asm/'))
asm_list = [fname for fname in file_list if fname.endswith('.asm')]
hdr_list = [fname for fname in file_list if fname.endswith('.txt')]
print("Header list size: {:d}".format(len(hdr_list)))
print("ASM list size: {:d}".format(len(asm_list)))
# Base names of the .asm files, as a set, so that each lookup is O(1)
# instead of a scan over the whole .asm list.
asm_names = set(aname[0:aname.find(".asm")] for aname in asm_list)
for fname in hdr_list:
    hdr_name = fname[0:fname.find(".txt")]
    if hdr_name in asm_names:
        counter += 1
    else:
        errors += 1
        print("Failed Disassembly for: {:s}".format(hdr_name))
print("Total Successful Disassemblies: {:d} Total Disassembly Errors: {:d}".format(counter, errors))
In [ ]:
VirusShare_0003887ab64b8ae19ffa988638decac2
VirusShare_0025cc13683331a61986b6433e768f3f
VirusShare_006b4c72e79e60d10515a64ec6a4e021
VirusShare_00d574c8f6fe8453e0c57a8a731f15b4
VirusShare_01561d7971d10d2192e87b75a74980a4
Failed Disassembly for: VirusShare_018c4ec104af60efebd868c6c96c4015
Failed Disassembly for: VirusShare_027aceafdea60810bd493b91fad6d83b
Failed Disassembly for: VirusShare_028a2651d8a23f8a86c6a0440b817826
Failed Disassembly for: VirusShare_02acf1da2758c291fc377d4ea18efcce
Failed Disassembly for: VirusShare_02b88fab6d6a76e3f00e99d88b42e29e
Failed Disassembly for: VirusShare_02d15c11abb5ef375e9ac3e9f05a1a52
Failed Disassembly for: VirusShare_02e6357bc2e276c4113e6de1a5b1c69c
Failed Disassembly for: VirusShare_038ae293c2dd804f41f7f7305f37ebe2
Failed Disassembly for: VirusShare_03acebfbcabb20a76e707d585aaf8c49
Failed Disassembly for: VirusShare_6a4fbcfb44717eae2145c761c1c99b6a
Failed Disassembly for: VirusShare_af719814507fdca4b96184f33b6b92ea
Failed Disassembly for: VirusShare_d4ba6430996fb4021241efc97c607504
Failed Disassembly for: VirusShare_d8b7b276710127d233abcdb7313aac36
In [ ]: