In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
In [2]:
# Load the ASM malware-feature tables and the ASM image-feature tables.
# (The image-feature file names indicate they were sorted by an earlier step.)
train_data_asm = pd.read_csv('data/train-malware-features-asm.csv')
test_data_asm = pd.read_csv('data/test-malware-features-asm.csv')
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm.csv')

# Order the malware-feature rows by filename.
# DataFrame.sort(columns=...) has been removed from pandas and raises
# AttributeError on current releases; sort_values(by=...) is its replacement.
# It returns a new frame (the inputs are left untouched) whose rows keep their
# original index labels, so use positional access (.iloc / .values) downstream.
sorted_train_data_asm = train_data_asm.sort_values(by='filename', axis=0, ascending=True)
sorted_test_data_asm = test_data_asm.sort_values(by='filename', axis=0, ascending=True)
In [3]:
# Preview the first rows of the filename-sorted training ASM feature table.
sorted_train_data_asm.head()
Out[3]:
In [4]:
# Preview the first rows of the filename-sorted test ASM feature table.
sorted_test_data_asm.head()
Out[4]:
In [5]:
# Preview the first rows of the training ASM image-feature table as loaded from CSV.
sorted_train_image_asm.head()
Out[5]:
In [7]:
# Preview the first rows of the test ASM image-feature table as loaded from CSV.
sorted_test_image_asm.head()
Out[7]:
In [8]:
# Save the filename-sorted feature tables as CSV, omitting the row index.
for _sorted_frame, _csv_path in (
    (sorted_train_data_asm, 'data/sorted-train-malware-features-asm.csv'),
    (sorted_test_data_asm, 'data/sorted-test-malware-features-asm.csv'),
):
    _sorted_frame.to_csv(_csv_path, index=False)
In [9]:
# Build the feature matrices by dropping the first column of each table
# (every row, columns 1 onward), in the order train, test, train-image, test-image.
X_train, X_test, X_train_image_asm, X_test_image_asm = (
    table.iloc[:, 1:]
    for table in (
        sorted_train_data_asm,
        sorted_test_data_asm,
        sorted_train_image_asm,
        sorted_test_image_asm,
    )
)
In [10]:
# (rows, columns) of the training feature matrix.
X_train.shape
Out[10]:
In [11]:
# (rows, columns) of the test feature matrix.
X_test.shape
Out[11]:
In [12]:
# (rows, columns) of the training image-feature matrix.
X_train_image_asm.shape
Out[12]:
In [13]:
# (rows, columns) of the test image-feature matrix.
X_test_image_asm.shape
Out[13]:
In [14]:
# Train feature stats
def _column_stats(features):
    """Return (mean, std, corr, cov) of ``features`` using pandas' defaults.

    ``mean`` and ``std`` are per-column Series; ``corr`` and ``cov`` are the
    column-by-column correlation and covariance DataFrames.
    """
    return features.mean(), features.std(), features.corr(), features.cov()


# Train feature stats
X_train_means, X_train_std, X_train_cor, X_train_cov = _column_stats(X_train)

# Test feature stats
X_test_means, X_test_std, X_test_cor, X_test_cov = _column_stats(X_test)

# Train image feature stats
(
    X_train_image_asm_means,
    X_train_image_asm_std,
    X_train_image_asm_cor,
    X_train_image_asm_cov,
) = _column_stats(X_train_image_asm)

# Test image feature stats
(
    X_test_image_asm_means,
    X_test_image_asm_std,
    X_test_image_asm_cor,
    X_test_image_asm_cov,
) = _column_stats(X_test_image_asm)
In [19]:
# Row stats for train and test data
# Row stats for train and test data.
#
# Each statistic is reduced across the feature columns of a row (axis=1) in a
# single vectorised call. This replaces a per-row Python loop that wrote results
# with chained indexing (frame[col][i] = value); that form is not guaranteed to
# write into the frame and is a silent no-op under pandas Copy-on-Write.
#
# Filenames are attached positionally (.values) rather than by index label:
# the sorted frame keeps its pre-sort index labels, so label alignment against
# a fresh 0..n-1 index could pair a filename with another row's statistics.
#
# Statistic columns are stored as float64, matching the original table's dtype.
X_train_rowstats = pd.DataFrame(
    {
        'filename': sorted_train_data_asm['filename'].values,
        'trainmean': np.asarray(X_train.mean(axis=1), dtype=np.float64),
        'trainstd': np.asarray(X_train.std(axis=1), dtype=np.float64),
        'trainmin': np.asarray(X_train.min(axis=1), dtype=np.float64),
        'trainmax': np.asarray(X_train.max(axis=1), dtype=np.float64),
    },
    columns=['filename', 'trainmean', 'trainstd', 'trainmin', 'trainmax'],
)
# Composite score: max * mean * std of the row.
X_train_rowstats['traintotal'] = (
    X_train_rowstats['trainmax'] * X_train_rowstats['trainmean'] * X_train_rowstats['trainstd']
)
# Natural logarithm; a total of 0 yields -inf and a negative total yields NaN.
X_train_rowstats['trainlogtotal'] = np.log(X_train_rowstats['traintotal'])

X_test_rowstats = pd.DataFrame(
    {
        'filename': sorted_test_data_asm['filename'].values,
        'testmean': np.asarray(X_test.mean(axis=1), dtype=np.float64),
        'teststd': np.asarray(X_test.std(axis=1), dtype=np.float64),
        'testmin': np.asarray(X_test.min(axis=1), dtype=np.float64),
        'testmax': np.asarray(X_test.max(axis=1), dtype=np.float64),
    },
    columns=['filename', 'testmean', 'teststd', 'testmin', 'testmax'],
)
# Composite score: max * mean * std of the row.
X_test_rowstats['testtotal'] = (
    X_test_rowstats['testmax'] * X_test_rowstats['testmean'] * X_test_rowstats['teststd']
)
# Natural logarithm; a total of 0 yields -inf and a negative total yields NaN.
X_test_rowstats['testlogtotal'] = np.log(X_test_rowstats['testtotal'])

X_train_rowstats.head()
Out[19]:
In [20]:
# Preview the first rows of the per-row statistics table for the test features.
X_test_rowstats.head()
Out[20]:
In [22]:
# Image row stats
# Image row stats.
#
# Each statistic is reduced across the feature columns of a row (axis=1) in a
# single vectorised call. This replaces a per-row Python loop that wrote results
# with chained indexing (frame[col][i] = value); that form is not guaranteed to
# write into the frame and is a silent no-op under pandas Copy-on-Write.
#
# Filenames are attached positionally (.values), so row i of the statistics
# always carries the filename of row i of the source table regardless of index.
#
# Statistic columns are stored as float64, matching the original table's dtype.
X_train_image_asm_rowstats = pd.DataFrame(
    {
        'filename': sorted_train_image_asm['filename'].values,
        'trainimagemean': np.asarray(X_train_image_asm.mean(axis=1), dtype=np.float64),
        'trainimagestd': np.asarray(X_train_image_asm.std(axis=1), dtype=np.float64),
        'trainimagemin': np.asarray(X_train_image_asm.min(axis=1), dtype=np.float64),
        'trainimagemax': np.asarray(X_train_image_asm.max(axis=1), dtype=np.float64),
    },
    columns=['filename', 'trainimagemean', 'trainimagestd', 'trainimagemin', 'trainimagemax'],
)
# Composite score: max * mean * std of the row.
X_train_image_asm_rowstats['trainimagetotal'] = (
    X_train_image_asm_rowstats['trainimagemax']
    * X_train_image_asm_rowstats['trainimagemean']
    * X_train_image_asm_rowstats['trainimagestd']
)
# Natural logarithm; a total of 0 yields -inf and a negative total yields NaN.
X_train_image_asm_rowstats['trainimagelogtotal'] = np.log(X_train_image_asm_rowstats['trainimagetotal'])

X_test_image_asm_rowstats = pd.DataFrame(
    {
        'filename': sorted_test_image_asm['filename'].values,
        'testimagemean': np.asarray(X_test_image_asm.mean(axis=1), dtype=np.float64),
        'testimagestd': np.asarray(X_test_image_asm.std(axis=1), dtype=np.float64),
        'testimagemin': np.asarray(X_test_image_asm.min(axis=1), dtype=np.float64),
        'testimagemax': np.asarray(X_test_image_asm.max(axis=1), dtype=np.float64),
    },
    columns=['filename', 'testimagemean', 'testimagestd', 'testimagemin', 'testimagemax'],
)
# Composite score: max * mean * std of the row.
X_test_image_asm_rowstats['testimagetotal'] = (
    X_test_image_asm_rowstats['testimagemax']
    * X_test_image_asm_rowstats['testimagemean']
    * X_test_image_asm_rowstats['testimagestd']
)
# Natural logarithm; a total of 0 yields -inf and a negative total yields NaN.
X_test_image_asm_rowstats['testimagelogtotal'] = np.log(X_test_image_asm_rowstats['testimagetotal'])

X_train_image_asm_rowstats.head()
Out[22]:
In [23]:
# Preview the first rows of the per-row statistics table for the test image features.
X_test_image_asm_rowstats.head()
Out[23]:
In [24]:
# Write column stats and row stats to file
# Save the four row-statistics tables as CSV, omitting the row index.
# Only row stats are written here; the column stats are not saved by this cell.
for _rowstats_table, _csv_path in (
    (X_train_rowstats, 'data/all-train-asm-rowstats.csv'),
    (X_test_rowstats, 'data/all-test-asm-rowstats.csv'),
    (X_train_image_asm_rowstats, 'data/all-train-image-asm-rowstats.csv'),
    (X_test_image_asm_rowstats, 'data/all-test-image-asm-rowstats.csv'),
):
    _rowstats_table.to_csv(_csv_path, index=False)
In [ ]: