In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

1. Load the Train and Test ASM Feature Files and Do Some Basic Statistics


In [2]:
train_data_asm = pd.read_csv('data/train-malware-features-asm.csv')
test_data_asm = pd.read_csv('data/test-malware-features-asm.csv')
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm.csv')

sorted_train_data_asm = train_data_asm.sort(columns='filename', axis=0, ascending=True, inplace=False)
sorted_test_data_asm = test_data_asm.sort(columns='filename', axis=0, ascending=True, inplace=False)

In [3]:
sorted_train_data_asm.head()


Out[3]:
filename edx esi es fs ds ss gs cs ah al ax bh bl bx ch cl cx dh dl
2277 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 0 0 8 224 49 34 25 0 41 191 52 38 163 ...
2053 01SuzwMJEIXsK7A8dQbl 1121 24 3 0 1 4 0 2 6 22 7 1 4 0 3 37 2 4 9 ...
2144 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 0 0 1 398 0 0 47 0 1 77 4 1 56 ...
1236 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 ...
339 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 0 0 3 0 0 1 0 0 1 0 0 0 ...

5 rows × 1007 columns


In [4]:
sorted_test_data_asm.head()


Out[4]:
filename edx esi es fs ds ss gs cs ah al ax bh bl bx ch cl cx dh dl
7297 ITSUPtCmh7WdJcsYDwQ5 245 434 0 0 1 0 0 0 9 51 1 0 4 1 3 1 3 0 1 ...
3257 Ig2DB5tSiEy1cJvV0zdw 258 437 0 0 0 0 0 0 11 60 1 1 2 2 0 1 2 2 1 ...
4183 Jmo6eIhLZ4t9r8QsxEg5 238 365 0 0 0 0 0 0 8 51 0 0 4 1 0 1 2 1 0 ...
8084 JtPFl4ewgdD78OzCMa3o 241 556 1 0 1 1 0 0 4 52 0 1 0 1 2 2 2 0 1 ...
9774 K3ZtByPHGSFYNljDUEXp 92 75 0 0 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0 ...

5 rows × 1007 columns


In [5]:
sorted_train_image_asm.head()


Out[5]:
filename ASM_0 ASM_1 ASM_2 ASM_3 ASM_4 ASM_5 ASM_6 ASM_7 ASM_8 ASM_9 ASM_10 ASM_11 ASM_12 ASM_13 ASM_14 ASM_15 ASM_16 ASM_17 ASM_18
0 01IsoiSMh5gxyDYTl4CB 46 116 101 120 116 58 48 48 52 48 49 48 48 48 9 9 9 9 9 ...
1 01SuzwMJEIXsK7A8dQbl 72 69 65 68 69 82 58 48 48 52 48 48 48 48 48 9 9 9 9 ...
2 01azqd4InC7m9JpocGv5 72 69 65 68 69 82 58 48 48 52 48 48 48 48 48 9 9 9 9 ...
3 01jsnpXSAlgw6aPeDxrU 72 69 65 68 69 82 58 48 48 52 48 48 48 48 48 9 9 9 9 ...
4 01kcPWA9K2BOxQeS5Rju 72 69 65 68 69 82 58 49 48 48 48 48 48 48 48 9 9 9 9 ...

5 rows × 1001 columns


In [7]:
sorted_test_image_asm.head()


Out[7]:
filename ASM_0 ASM_1 ASM_2 ASM_3 ASM_4 ASM_5 ASM_6 ASM_7 ASM_8 ASM_9 ASM_10 ASM_11 ASM_12 ASM_13 ASM_14 ASM_15 ASM_16 ASM_17 ASM_18
0 ITSUPtCmh7WdJcsYDwQ5 72 69 65 68 69 82 58 48 48 52 48 48 48 48 48 9 9 9 9 ...
1 Ig2DB5tSiEy1cJvV0zdw 72 69 65 68 69 82 58 48 48 52 48 48 48 48 48 9 9 9 9 ...
2 Jmo6eIhLZ4t9r8QsxEg5 72 69 65 68 69 82 58 48 48 52 48 48 48 48 48 9 9 9 9 ...
3 JtPFl4ewgdD78OzCMa3o 72 69 65 68 69 82 58 48 48 52 48 48 48 48 48 9 9 9 9 ...
4 K3ZtByPHGSFYNljDUEXp 72 69 65 68 69 82 58 48 48 52 48 48 48 48 48 9 9 9 9 ...

5 rows × 1001 columns


In [8]:
sorted_train_data_asm.to_csv('data/sorted-train-malware-features-asm.csv', index=False)
sorted_test_data_asm.to_csv('data/sorted-test-malware-features-asm.csv', index=False)

2. Do Some Basic Row and Column Statistics


In [9]:
X_train = sorted_train_data_asm.iloc[:,1:]
X_test = sorted_test_data_asm.iloc[:,1:]
X_train_image_asm = sorted_train_image_asm.iloc[:,1:]
X_test_image_asm = sorted_test_image_asm.iloc[:,1:]

In [10]:
X_train.shape


Out[10]:
(10868, 1006)

In [11]:
X_test.shape


Out[11]:
(10873, 1006)

In [12]:
X_train_image_asm.shape


Out[12]:
(10868, 1000)

In [13]:
X_test_image_asm.shape


Out[13]:
(10873, 1000)

In [14]:
# Train feature stats
X_train_means = X_train.mean()
X_train_std = X_train.std()
X_train_cor = X_train.corr()
X_train_cov = X_train.cov()

# Test feature stats
X_test_means = X_test.mean()
X_test_std = X_test.std()
X_test_cor = X_test.corr()
X_test_cov = X_test.cov()

# Train image feature stats
X_train_image_asm_means = X_train_image_asm.mean()
X_train_image_asm_std = X_train_image_asm.std()
X_train_image_asm_cor = X_train_image_asm.corr()
X_train_image_asm_cov = X_train_image_asm.cov()

# Test image feature stats
X_test_image_asm_means = X_test_image_asm.mean()
X_test_image_asm_std = X_test_image_asm.std()
X_test_image_asm_cor = X_test_image_asm.corr()
X_test_image_asm_cov = X_test_image_asm.cov()

In [19]:
# Row stats for train and test data
X_train_rowstats = pd.DataFrame(index=np.arange(X_train.shape[0]), columns=['filename','trainmean','trainstd','trainmin','trainmax','traintotal','trainlogtotal'], dtype=np.float64)
X_test_rowstats = pd.DataFrame(index=np.arange(X_test.shape[0]), columns=['filename','testmean','teststd','testmin','testmax','testtotal','testlogtotal'], dtype=np.float64)

X_train_rowstats['filename'] = sorted_train_data_asm['filename']
for i in range(0,X_train.shape[0]):
    X_train_rowstats['trainmean'][i] = X_train.iloc[i,:].mean()
    X_train_rowstats['trainstd'][i] = X_train.iloc[i,:].std()
    X_train_rowstats['trainmin'][i] = X_train.iloc[i,:].min()
    X_train_rowstats['trainmax'][i] = X_train.iloc[i,:].max()
    

X_train_rowstats['traintotal'] = X_train_rowstats['trainmax'] * X_train_rowstats['trainmean'] * X_train_rowstats['trainstd']
X_train_rowstats['trainlogtotal'] = np.log(X_train_rowstats['traintotal']) # natural logarithm

X_test_rowstats['filename'] = sorted_test_data_asm['filename']
for i in range(0,X_test.shape[0]):
    X_test_rowstats['testmean'][i] = X_test.iloc[i,:].mean()
    X_test_rowstats['teststd'][i] = X_test.iloc[i,:].std()
    X_test_rowstats['testmin'][i] = X_test.iloc[i,:].min()
    X_test_rowstats['testmax'][i] = X_test.iloc[i,:].max()
    

X_test_rowstats['testtotal'] = X_test_rowstats['testmax'] * X_test_rowstats['testmean'] * X_test_rowstats['teststd']
X_test_rowstats['testlogtotal'] = np.log(X_test_rowstats['testtotal']) # natural logarithm

X_train_rowstats.head()


Out[19]:
filename trainmean trainstd trainmin trainmax traintotal trainlogtotal
0 4jKA1GUDv6TMNpPuIxER 324.461233 4825.009458 0 87555 1.370698e+11 25.643756
1 4ZBJzEqnW52fFUw0PG3v 34.154076 328.475597 0 5817 6.525965e+07 17.993884
2 6m8NxLfg2MR0nwXFuEq5 2775.414513 60923.597617 0 1367070 2.311555e+14 33.074112
3 28U1hRkQ6Yl57493ZdXD 142.093439 2939.097148 0 65928 2.753327e+10 24.038661
4 45Wy3TxE98HfiXreOCSu 2.474155 18.585792 0 445 2.046294e+04 9.926371

5 rows × 7 columns


In [20]:
X_test_rowstats.head()


Out[20]:
filename testmean teststd testmin testmax testtotal testlogtotal
0 N0DQkgaq9wjO1fJLn2ME 72.198807 1399.472707 0 31361 3.168724e+09 21.876595
1 VuvdGbYitmxa05lHrPnT 480.988072 10558.432438 0 236923 1.203209e+12 27.816013
2 NRUDJPSHu4dAyFjzLIg9 519.792247 11465.881968 0 257289 1.533411e+12 28.058516
3 ZmdXvIh5qHCOyJPwiE6g 42.637177 732.731172 0 16341 5.105188e+08 20.050938
4 rcb7LxNP6itSDnwgh3Km 1566.855865 34955.326470 0 784363 4.295953e+13 31.391280

5 rows × 7 columns


In [22]:
# Image row stats
X_train_image_asm_rowstats = pd.DataFrame(index=np.arange(X_train_image_asm.shape[0]), columns=['filename','trainimagemean','trainimagestd','trainimagemin','trainimagemax','trainimagetotal','trainimagelogtotal'], dtype=np.float64)
X_test_image_asm_rowstats = pd.DataFrame(index=np.arange(X_test_image_asm.shape[0]), columns=['filename','testimagemean','testimagestd','testimagemin','testimagemax','testimagetotal','testimagelogtotal'], dtype=np.float64)

X_train_image_asm_rowstats['filename'] = sorted_train_image_asm['filename']
for i in range(0,X_train_image_asm.shape[0]):
    X_train_image_asm_rowstats['trainimagemean'][i] = X_train_image_asm.iloc[i,:].mean()
    X_train_image_asm_rowstats['trainimagestd'][i] = X_train_image_asm.iloc[i,:].std()
    X_train_image_asm_rowstats['trainimagemin'][i] = X_train_image_asm.iloc[i,:].min()
    X_train_image_asm_rowstats['trainimagemax'][i] = X_train_image_asm.iloc[i,:].max()
    

X_train_image_asm_rowstats['trainimagetotal'] = X_train_image_asm_rowstats['trainimagemax'] * X_train_image_asm_rowstats['trainimagemean'] * X_train_image_asm_rowstats['trainimagestd']
X_train_image_asm_rowstats['trainimagelogtotal'] = np.log(X_train_image_asm_rowstats['trainimagetotal']) # natural logarithm

X_test_image_asm_rowstats['filename'] = sorted_test_image_asm['filename']
for i in range(0,X_test_image_asm.shape[0]):
    X_test_image_asm_rowstats['testimagemean'][i] = X_test_image_asm.iloc[i,:].mean()
    X_test_image_asm_rowstats['testimagestd'][i] = X_test_image_asm.iloc[i,:].std()
    X_test_image_asm_rowstats['testimagemin'][i] = X_test_image_asm.iloc[i,:].min()
    X_test_image_asm_rowstats['testimagemax'][i] = X_test_image_asm.iloc[i,:].max()
    

X_test_image_asm_rowstats['testimagetotal'] = X_test_image_asm_rowstats['testimagemax'] * X_test_image_asm_rowstats['testimagemean'] * X_test_image_asm_rowstats['testimagestd']
X_test_image_asm_rowstats['testimagelogtotal'] = np.log(X_test_image_asm_rowstats['testimagetotal']) # natural logarithm

X_train_image_asm_rowstats.head()


Out[22]:
filename trainimagemean trainimagestd trainimagemin trainimagemax trainimagetotal trainimagelogtotal
0 01IsoiSMh5gxyDYTl4CB 54.484 33.175508 9 124 224134.265636 12.320001
1 01SuzwMJEIXsK7A8dQbl 53.146 29.181162 9 124 192306.893387 12.166848
2 01azqd4InC7m9JpocGv5 53.146 29.181162 9 124 192306.893387 12.166848
3 01jsnpXSAlgw6aPeDxrU 53.146 29.181162 9 124 192306.893387 12.166848
4 01kcPWA9K2BOxQeS5Rju 55.049 30.704289 9 124 209589.809057 12.252908

5 rows × 7 columns


In [23]:
X_test_image_asm_rowstats.head()


Out[23]:
filename testimagemean testimagestd testimagemin testimagemax testimagetotal testimagelogtotal
0 ITSUPtCmh7WdJcsYDwQ5 53.604 29.549050 9 124 196409.459373 12.187957
1 Ig2DB5tSiEy1cJvV0zdw 53.269 29.424283 9 124 194357.863964 12.177456
2 Jmo6eIhLZ4t9r8QsxEg5 55.106 30.695781 9 124 209748.694414 12.253665
3 JtPFl4ewgdD78OzCMa3o 55.106 30.695781 9 124 209748.694414 12.253665
4 K3ZtByPHGSFYNljDUEXp 53.269 29.424283 9 124 194357.863964 12.177456

5 rows × 7 columns


In [24]:
# Write column stats and row stats to file

X_train_rowstats.to_csv('data/all-train-asm-rowstats.csv', index=False)
X_test_rowstats.to_csv('data/all-test-asm-rowstats.csv', index=False)
X_train_image_asm_rowstats.to_csv('data/all-train-image-asm-rowstats.csv', index=False)
X_test_image_asm_rowstats.to_csv('data/all-test-image-asm-rowstats.csv', index=False)

In [ ]: