In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import chi2
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import cross_val_score, KFold
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

1. Feature Reduction For Training Data

   Select the top 10%, 20%, 30%, 40% and 50% of features with the most variance. Starting with 1006 ASM features 
   the process will select about 100, 200... features. Then write the reduced feature sets to files.

In [2]:
train_data = pd.read_csv('data/train-malware-features-asm.csv')
labels = pd.read_csv('data/trainLabels.csv')
sorted_train_data = train_data.sort(columns='filename', axis=0, ascending=True, inplace=False)
sorted_train_labels = labels.sort(columns='Id', axis=0, ascending=True, inplace=False)
X = sorted_train_data.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])

In [3]:
print(X.shape)


(10868, 1006)

In [4]:
print(y.shape)


(10868,)

In [5]:
sorted_train_data.head()


Out[5]:
filename edx esi es fs ds ss gs cs ah al ax bh bl bx ch cl cx dh dl
2277 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 0 0 8 224 49 34 25 0 41 191 52 38 163 ...
2053 01SuzwMJEIXsK7A8dQbl 1121 24 3 0 1 4 0 2 6 22 7 1 4 0 3 37 2 4 9 ...
2144 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 0 0 1 398 0 0 47 0 1 77 4 1 56 ...
1236 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 ...
339 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 0 0 3 0 0 1 0 0 1 0 0 0 ...

5 rows × 1007 columns


In [20]:
sorted_train_labels.head()


Out[20]:
Id Class
1541 01IsoiSMh5gxyDYTl4CB 2
8627 01SuzwMJEIXsK7A8dQbl 8
9855 01azqd4InC7m9JpocGv5 9
9856 01jsnpXSAlgw6aPeDxrU 9
0 01kcPWA9K2BOxQeS5Rju 1

5 rows × 2 columns


In [21]:
train_data.head()


Out[21]:
filename edx esi es fs ds ss gs cs ah al ax bh bl bx ch cl cx dh dl
0 4jKA1GUDv6TMNpPuIxER 4049 9957 11 3 3 3 3 3 60 791 242 9 393 31 26 477 135 8 381 ...
1 4ZBJzEqnW52fFUw0PG3v 539 513 10 0 12 16 0 12 29 152 8 53 24 2 25 53 5 52 30 ...
2 6m8NxLfg2MR0nwXFuEq5 427 482 1 0 0 0 0 0 18 163 16 0 84 0 0 84 0 1 130 ...
3 28U1hRkQ6Yl57493ZdXD 51 91 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4 45Wy3TxE98HfiXreOCSu 644 726 8 0 0 0 0 0 2 98 89 0 12 1 13 157 80 0 70 ...

5 rows × 1007 columns


In [23]:
y


Out[23]:
array([2, 8, 9, ..., 4, 4, 4])

1.1 Feature Reduction to 10%


In [7]:
# find the top 10 percent variance features, from 1006 -> 101 features
fsp = SelectPercentile(chi2, 10)
X_new_10 = fsp.fit_transform(X,y)
X_new_10.shape


Out[7]:
(10868, 101)

In [8]:
X_new_10


Out[8]:
array([[  750,   496,     8, ...,    97,    70,  4572],
       [ 1121,    24,     6, ...,     0,    41,  2066],
       [ 1493,  1900,     1, ...,    48,    93, 13163],
       ..., 
       [  173,   245,    44, ...,     9,    50,   134],
       [  189,   153,    10, ...,     0,    11,  1527],
       [  153,   183,    37, ...,     9,    12,   281]])

In [9]:
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names


Out[9]:
array([  1,   2,   9,  10,  11,  13,  14,  16,  17,  19,  20,  21,  22,
        23,  24,  25,  26,  27,  28,  30,  31,  32,  35,  36,  39,  40,
        41,  42,  43,  52,  55,  56,  57,  58,  60,  62,  63,  65,  66,
        67,  68,  69,  71,  73,  74,  76,  77,  78,  79,  80,  81,  82,
        85,  86,  88,  89,  93,  95,  99, 100, 107, 109, 112, 115, 116,
       118, 119, 140, 398, 400, 712, 913, 914, 917, 923, 924, 925, 926,
       927, 928, 929, 930, 932, 934, 935, 937, 939, 940, 943, 944, 945,
       946, 947, 957, 962, 964, 971, 978, 985, 986, 987])

In [10]:
data_trimmed = sorted_train_data.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_data['filename'])
data_reduced = data_fnames.join(data_trimmed)
data_reduced.head()


Out[10]:
filename edx esi ah al ax bl bx cl cx dl dx eax ebp ebx ecx edi esp add al.1
2277 01IsoiSMh5gxyDYTl4CB 750 496 8 224 49 25 0 191 52 163 63 1447 905 260 1093 393 420 323 79 ...
2053 01SuzwMJEIXsK7A8dQbl 1121 24 6 22 7 4 0 37 2 9 3 1220 1544 18 1228 24 107 427 8 ...
2144 01azqd4InC7m9JpocGv5 1493 1900 1 398 0 47 0 77 4 56 2 4438 591 810 2317 1284 701 622 262 ...
1236 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 1 0 1 2 0 0 942 451 5 547 5 56 32 0 ...
339 01kcPWA9K2BOxQeS5Rju 23 35 0 3 0 1 0 1 0 0 0 137 43 19 66 15 81 11 1 ...

5 rows × 102 columns


In [11]:
data_reduced.to_csv('data/sorted-train-malware-features-asm-10percent.csv', index=False)
sorted_train_labels.to_csv('data/sorted-train-labels.csv', index=False)

1.2 Feature Reduction to 20%


In [17]:
# find the top 20 percent variance features, from 1006 -> 201 features
fsp = SelectPercentile(chi2, 20)
X_new_20 = fsp.fit_transform(X,y)
X_new_20.shape


Out[17]:
(10868, 201)

In [18]:
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names


Out[18]:
array([  1,   2,   3,   5,   6,   8,   9,  10,  11,  12,  13,  14,  15,
        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,
        30,  31,  32,  33,  34,  35,  36,  38,  39,  40,  41,  42,  43,
        45,  46,  47,  50,  52,  55,  56,  57,  58,  60,  62,  63,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  76,  77,  78,  79,
        80,  81,  82,  83,  85,  86,  87,  88,  89,  93,  95,  96,  97,
        98,  99, 100, 104, 105, 106, 107, 109, 111, 112, 113, 115, 116,
       117, 118, 119, 120, 122, 140, 180, 183, 199, 229, 261, 262, 264,
       265, 269, 271, 276, 286, 287, 288, 289, 290, 292, 293, 297, 298,
       299, 314, 329, 330, 333, 334, 337, 339, 346, 357, 398, 400, 420,
       424, 436, 459, 460, 492, 497, 510, 520, 523, 537, 559, 570, 581,
       624, 634, 658, 692, 712, 761, 806, 860, 868, 875, 897, 912, 913,
       914, 915, 917, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928,
       929, 930, 931, 932, 934, 935, 937, 939, 940, 942, 943, 944, 945,
       946, 947, 948, 949, 951, 954, 955, 957, 959, 962, 963, 964, 971,
       976, 978, 985, 986, 987, 999])

In [19]:
data_trimmed = sorted_train_data.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_data['filename'])
data_reduced = data_fnames.join(data_trimmed)
data_reduced.head()


Out[19]:
filename edx esi es ds ss cs ah al ax bh bl bx ch cl cx dh dl dx eax
2277 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 34 25 0 41 191 52 38 163 63 1447 ...
2053 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 1 4 0 3 37 2 4 9 3 1220 ...
2144 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 0 47 0 1 77 4 1 56 2 4438 ...
1236 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 0 1 0 0 1 2 0 0 0 942 ...
339 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 0 1 0 0 1 0 0 0 0 137 ...

5 rows × 202 columns


In [20]:
data_reduced.to_csv('data/sorted-train-malware-features-asm-20percent.csv', index=False)

1.3 Feature Reduction to 30%


In [16]:
# find the top 30 percent variance features, from 1006 -> 301 features
fsp = SelectPercentile(chi2, 30)
X_new_30 = fsp.fit_transform(X,y)
X_new_30.shape


Out[16]:
(10868, 302)

In [17]:
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names


Out[17]:
array([  1,   2,   3,   5,   6,   8,   9,  10,  11,  12,  13,  14,  15,
        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,
        30,  31,  32,  33,  34,  35,  36,  38,  39,  40,  41,  42,  43,
        45,  46,  47,  48,  50,  51,  52,  55,  56,  57,  58,  60,  62,
        63,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  76,  77,
        78,  79,  80,  81,  82,  83,  85,  86,  87,  88,  89,  90,  93,
        95,  96,  97,  98,  99, 100, 104, 105, 106, 107, 108, 109, 111,
       112, 113, 115, 116, 117, 118, 119, 120, 121, 122, 138, 139, 140,
       153, 174, 180, 183, 199, 200, 204, 208, 210, 229, 232, 261, 262,
       264, 265, 266, 267, 268, 269, 270, 271, 274, 276, 279, 281, 284,
       285, 286, 287, 288, 289, 290, 292, 293, 296, 297, 298, 299, 301,
       309, 310, 311, 312, 313, 314, 315, 316, 317, 319, 321, 322, 325,
       327, 329, 330, 331, 332, 333, 334, 336, 337, 338, 339, 340, 341,
       345, 346, 348, 349, 355, 357, 358, 361, 363, 369, 398, 400, 407,
       419, 420, 424, 436, 437, 438, 440, 441, 459, 460, 492, 494, 497,
       501, 505, 506, 510, 511, 512, 518, 519, 520, 521, 523, 525, 526,
       527, 531, 532, 533, 536, 537, 543, 544, 545, 547, 549, 552, 559,
       566, 570, 581, 587, 597, 618, 623, 624, 634, 645, 649, 651, 658,
       667, 668, 674, 692, 705, 712, 731, 749, 761, 806, 807, 831, 835,
       856, 860, 868, 874, 875, 889, 897, 912, 913, 914, 915, 917, 919,
       920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932,
       934, 935, 937, 939, 940, 942, 943, 944, 945, 946, 947, 948, 949,
       951, 954, 955, 957, 959, 962, 963, 964, 971, 976, 978, 985, 986,
       987, 988, 999])

In [5]:
data_trimmed = sorted_train_data.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_data['filename'])
data_reduced = data_fnames.join(data_trimmed)
data_reduced.head()


Out[5]:
filename edx esi es ds ss cs ah al ax ... LPSTR int.1 entry Software __imp_ .rdata: .data: .text: case assume
2277 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 ... 1 119 0 0 0 97 70 4572 0 0
2053 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 ... 0 5 0 0 0 0 41 2066 0 1
2144 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 ... 1 62 0 0 0 48 93 13163 0 0
1236 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 ... 0 2 0 0 0 28 15 2030 0 0
339 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 ... 0 0 1 0 0 33 15 445 0 0

5 rows × 303 columns


In [6]:
data_reduced.to_csv('data/sorted-train-malware-features-asm-30percent.csv', index=False)

1.4 Feature Reduction to 40%


In [15]:
# find the top 40 percent variance features, from 1006 -> 401 features
fsp = SelectPercentile(chi2, 40)
X_new_40 = fsp.fit_transform(X,y)
X_new_40.shape


Out[15]:
(10868, 402)

In [16]:
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names


Out[16]:
array([  1,   2,   3,   5,   6,   8,   9,  10,  11,  12,  13,  14,  15,
        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,
        29,  30,  31,  32,  33,  34,  35,  36,  38,  39,  40,  41,  42,
        43,  45,  46,  47,  48,  49,  50,  51,  52,  55,  56,  57,  58,
        59,  60,  62,  63,  65,  66,  67,  68,  69,  70,  71,  72,  73,
        74,  76,  77,  78,  79,  80,  81,  82,  83,  85,  86,  87,  88,
        89,  90,  91,  93,  95,  96,  97,  98,  99, 100, 104, 105, 106,
       107, 108, 109, 111, 112, 113, 115, 116, 117, 118, 119, 120, 121,
       122, 131, 138, 139, 140, 153, 155, 162, 163, 171, 174, 178, 180,
       183, 184, 195, 196, 199, 200, 204, 208, 210, 215, 229, 232, 236,
       251, 258, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271,
       274, 275, 276, 277, 278, 279, 280, 281, 283, 284, 285, 286, 287,
       288, 289, 290, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301,
       304, 306, 307, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318,
       319, 321, 322, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333,
       334, 336, 337, 338, 339, 340, 341, 344, 345, 346, 348, 349, 351,
       355, 357, 358, 361, 363, 364, 367, 369, 371, 378, 380, 384, 389,
       393, 398, 400, 405, 407, 409, 419, 420, 424, 436, 437, 438, 440,
       441, 442, 444, 445, 449, 450, 456, 459, 460, 471, 473, 492, 494,
       496, 497, 499, 501, 502, 505, 506, 510, 511, 512, 514, 517, 518,
       519, 520, 521, 523, 525, 526, 527, 530, 531, 532, 533, 535, 536,
       537, 539, 540, 543, 544, 545, 547, 548, 549, 552, 555, 556, 557,
       559, 566, 567, 569, 570, 574, 578, 581, 583, 587, 595, 597, 606,
       618, 623, 624, 634, 645, 649, 651, 658, 667, 668, 674, 677, 687,
       692, 699, 702, 705, 712, 731, 737, 749, 761, 784, 786, 796, 800,
       806, 807, 813, 826, 830, 831, 832, 834, 835, 837, 840, 842, 843,
       855, 856, 860, 865, 868, 869, 870, 873, 874, 875, 879, 881, 883,
       889, 897, 912, 913, 914, 915, 917, 919, 920, 921, 922, 923, 924,
       925, 926, 927, 928, 929, 930, 931, 932, 934, 935, 937, 939, 940,
       942, 943, 944, 945, 946, 947, 948, 949, 951, 954, 955, 957, 959,
       961, 962, 963, 964, 971, 976, 978, 985, 986, 987, 988, 999])

In [18]:
data_trimmed = sorted_train_data.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_data['filename'])
data_reduced = data_fnames.join(data_trimmed)
data_reduced.head()


Out[18]:
filename edx esi es ds ss cs ah al ax ... LPSTR int.1 entry Software __imp_ .rdata: .data: .text: case assume
2277 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 8 224 49 ... 1 119 0 0 0 97 70 4572 0 0
2053 01SuzwMJEIXsK7A8dQbl 1121 24 3 1 4 2 6 22 7 ... 0 5 0 0 0 0 41 2066 0 1
2144 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 1 398 0 ... 1 62 0 0 0 48 93 13163 0 0
1236 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 ... 0 2 0 0 0 28 15 2030 0 0
339 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 3 0 ... 0 0 1 0 0 33 15 445 0 0

5 rows × 403 columns


In [6]:
data_reduced.to_csv('data/sorted-train-malware-features-asm-40percent.csv', index=False)

1.5 Feature Reduction to 50%


In [7]:
# find the top 50 percent variance features, from 1006 -> 503 features
fsp = SelectPercentile(chi2, 50)
X_new_50 = fsp.fit_transform(X,y)
X_new_50.shape


Out[7]:
(10868, 503)

In [8]:
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names


Out[8]:
array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  38,  39,  40,
        41,  42,  43,  45,  46,  47,  48,  49,  50,  51,  52,  53,  55,
        56,  57,  58,  59,  60,  62,  63,  65,  66,  67,  68,  69,  70,
        71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
        85,  86,  87,  88,  89,  90,  91,  93,  95,  96,  97,  98,  99,
       100, 104, 105, 106, 107, 108, 109, 111, 112, 113, 115, 116, 117,
       118, 119, 120, 121, 122, 129, 131, 134, 138, 139, 140, 143, 145,
       149, 153, 155, 158, 159, 160, 162, 163, 166, 171, 174, 178, 180,
       183, 184, 195, 196, 199, 200, 201, 204, 208, 210, 215, 217, 227,
       229, 231, 232, 236, 237, 251, 253, 258, 261, 262, 263, 264, 265,
       266, 267, 268, 269, 270, 271, 273, 274, 275, 276, 277, 278, 279,
       280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292,
       293, 294, 295, 296, 297, 298, 299, 300, 301, 303, 304, 305, 306,
       307, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320,
       321, 322, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334,
       336, 337, 338, 339, 340, 341, 344, 345, 346, 348, 349, 350, 351,
       353, 355, 356, 357, 358, 360, 361, 363, 364, 367, 369, 371, 378,
       380, 381, 383, 384, 387, 389, 393, 398, 400, 405, 407, 409, 418,
       419, 420, 424, 430, 436, 437, 438, 440, 441, 442, 443, 444, 445,
       449, 450, 454, 455, 456, 458, 459, 460, 461, 467, 471, 473, 475,
       476, 479, 480, 481, 486, 488, 489, 490, 491, 492, 494, 495, 496,
       497, 499, 500, 501, 502, 503, 504, 505, 506, 510, 511, 512, 514,
       517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 530, 531,
       532, 533, 535, 536, 537, 539, 540, 543, 544, 545, 547, 548, 549,
       552, 554, 555, 556, 557, 558, 559, 562, 563, 564, 566, 567, 569,
       570, 571, 572, 574, 575, 577, 578, 579, 581, 583, 584, 587, 591,
       593, 595, 597, 606, 610, 614, 618, 619, 623, 624, 627, 629, 630,
       634, 645, 649, 651, 658, 663, 667, 668, 672, 673, 674, 676, 677,
       682, 687, 692, 699, 702, 705, 709, 712, 717, 718, 730, 731, 734,
       736, 737, 740, 743, 744, 747, 749, 755, 761, 784, 786, 795, 796,
       798, 800, 806, 807, 808, 813, 825, 826, 830, 831, 832, 834, 835,
       837, 840, 841, 842, 843, 850, 855, 856, 860, 865, 868, 869, 870,
       873, 874, 875, 879, 881, 883, 889, 897, 912, 913, 914, 915, 917,
       918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930,
       931, 932, 934, 935, 937, 939, 940, 942, 943, 944, 945, 946, 947,
       948, 949, 951, 954, 955, 956, 957, 959, 961, 962, 963, 964, 971,
       972, 975, 976, 978, 985, 986, 987, 988, 999])

In [9]:
data_trimmed = sorted_train_data.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_data['filename'])
data_reduced = data_fnames.join(data_trimmed)
data_reduced.head()


Out[9]:
filename edx esi es fs ds ss gs cs ah ... entry rva exe Software __imp_ .rdata: .data: .text: case assume
2277 01IsoiSMh5gxyDYTl4CB 750 496 3 0 0 0 0 0 8 ... 0 0 1 0 0 97 70 4572 0 0
2053 01SuzwMJEIXsK7A8dQbl 1121 24 3 0 1 4 0 2 6 ... 0 0 1 0 0 0 41 2066 0 1
2144 01azqd4InC7m9JpocGv5 1493 1900 0 0 0 0 0 0 1 ... 0 0 2 0 0 48 93 13163 0 0
1236 01jsnpXSAlgw6aPeDxrU 525 4 0 0 0 0 0 0 0 ... 0 0 1 0 0 28 15 2030 0 0
339 01kcPWA9K2BOxQeS5Rju 23 35 0 0 0 0 0 0 0 ... 1 0 1 0 0 33 15 445 0 0

5 rows × 504 columns


In [10]:
data_reduced.to_csv('data/sorted-train-malware-features-asm-50percent.csv', index=False)

2. Feature Reduction Of Test Data

Use columns names from reduced ASM train data feature set to select best ASM features from test data
and write to a file.

In [11]:
test_data = pd.read_csv('data/test-malware-features-asm.csv')
sorted_test_data = test_data.sort(columns='filename', axis=0, ascending=True, inplace=False)
sorted_test_data.shape


Out[11]:
(10873, 1007)

In [19]:
sorted_test_data.head()


Out[19]:
filename edx esi es fs ds ss gs cs ah ... visualc ___security_cookie assume callvirtualalloc exportedentry hardware hkey_current_user hkey_local_machine sp-analysisfailed unableto
7297 ITSUPtCmh7WdJcsYDwQ5 245 434 0 0 1 0 0 0 9 ... 0 0 7 0 0 0 0 0 0 0
3257 Ig2DB5tSiEy1cJvV0zdw 258 437 0 0 0 0 0 0 11 ... 0 0 6 0 0 0 0 0 0 0
4183 Jmo6eIhLZ4t9r8QsxEg5 238 365 0 0 0 0 0 0 8 ... 0 0 12 0 0 0 0 0 0 0
8084 JtPFl4ewgdD78OzCMa3o 241 556 1 0 1 1 0 0 4 ... 0 0 0 0 0 0 0 0 0 0
9774 K3ZtByPHGSFYNljDUEXp 92 75 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 1007 columns


In [19]:
# Get the feature names from the reduced train dataframe
column_names = data_reduced.columns
print(column_names)


Index(['filename', 'edx', 'esi', 'es', 'ds', 'ss', 'cs', 'ah', 'al', 'ax',
       ...
       'LPSTR', 'int.1', 'entry', 'Software', '__imp_', '.rdata:', '.data:',
       '.text:', 'case', 'assume'],
      dtype='object', length=403)

In [20]:
# Extract the reduced feature set from the full test feature set
sorted_test_data_reduced = sorted_test_data.loc[:,column_names]
sorted_test_data_reduced.head()


Out[20]:
filename edx esi es ds ss cs ah al ax ... LPSTR int.1 entry Software __imp_ .rdata: .data: .text: case assume
7297 ITSUPtCmh7WdJcsYDwQ5 245 434 0 1 0 0 9 51 1 ... 0 0 0 0 0 20 117 1703 0 7
3257 Ig2DB5tSiEy1cJvV0zdw 258 437 0 0 0 0 11 60 1 ... 0 2 0 0 0 16 97 2012 0 6
4183 Jmo6eIhLZ4t9r8QsxEg5 238 365 0 0 0 0 8 51 0 ... 1 0 0 0 0 0 0 0 0 12
8084 JtPFl4ewgdD78OzCMa3o 241 556 1 1 1 0 4 52 0 ... 0 0 0 0 0 40 115 1947 0 0
9774 K3ZtByPHGSFYNljDUEXp 92 75 0 0 0 0 0 1 0 ... 0 1 0 0 0 16 32 2636 0 0

5 rows × 403 columns


In [16]:
sorted_test_data_reduced.to_csv('data/sorted-test-malware-features-asm-10percent.csv', index=False)

In [25]:
sorted_test_data_reduced.to_csv('data/sorted-test-malware-features-asm-20percent.csv', index=False)

In [22]:
sorted_test_data_reduced.to_csv('data/sorted-test-malware-features-asm-30percent.csv', index=False)

In [21]:
sorted_test_data_reduced.to_csv('data/sorted-test-malware-features-asm-40percent.csv', index=False)

In [14]:
sorted_test_data_reduced.to_csv('data/sorted-test-malware-features-asm-50percent.csv', index=False)

3. Sort and Write Byte Feature Sets

Sort the test file feature set data frames on the filename column and write to sorted data to file.

In [37]:
# First load the .asm training features and training labels
#sorted_train_data_asm = pd.read_csv('data/sorted-train-malware-features-asm-reduced.csv')
#sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv','r')

# Next load the .byte training features and sort
train_data_byte = pd.read_csv('data/train-malware-features-byte.csv')
sorted_train_data_byte = train_data_byte.sort(columns='filename', axis=0, ascending=True, inplace=False)

# Next load the .byte test features and sort
test_data_byte = pd.read_csv('data/test-malware-features-byte.csv')
sorted_test_data_byte = test_data_byte.sort(columns='filename', axis=0, ascending=True, inplace=False)

#combined_train_data = pd.DataFrame.merge(sorted_train_data_asm, sorted_train_data_byte, on='filename', how='inner', sort=False)

# Now write all the sorted feature sets to file
#f = open('data/sorted-train-features-combined.csv', 'w')
#combined_train_data.to_csv(f, index=False)
#f.close()
f = open('data/sorted-train-malware-features-byte.csv', 'w')
sorted_train_data_byte.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-malware-features-byte.csv', 'w')
sorted_test_data_byte.to_csv(f, index=False)
f.close()

4. Sort and Reduce Image Data for Test and Train Files


In [ ]:
# Load and sort asm image data for test and train files
train_image_asm = pd.read_csv('data/train-image-features-asm.csv')
sorted_train_image_asm = train_image_asm.sort(columns='filename', axis=0, ascending=True, inplace=False)

test_image_asm = pd.read_csv('data/test-image-features-asm.csv')
sorted_test_image_asm = test_image_asm.sort(columns='filename', axis=0, ascending=True, inplace=False)

# NOTE: byte file images have low standard deviation and mean variance, not very useful for learning.

# Load and sort byte image data for test and train files
# train_image_byte = pd.read_csv('data/train-image-features-byte.csv')
# sorted_train_image_byte = train_image_byte.sort(columns='filename', axis=0, ascending=True, inplace=False)

# test_image_byte = pd.read_csv('data/test-image-features-byte.csv')
#sorted_test_image_byte = test_image_byte.sort(columns='filename', axis=0, ascending=True, inplace=False)

In [4]:
# Now write all the sorted image feature sets to file
f = open('data/sorted-train-image-features-asm.csv', 'w')
sorted_train_image_asm.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-image-features-asm.csv', 'w')
sorted_test_image_asm.to_csv(f, index=False)
f.close()
#f = open('data/sorted-train-image-features-byte.csv', 'w')
#sorted_train_image_byte.to_csv(f, index=False)
#f.close()
#f = open('data/sorted-test-image-features-byte.csv', 'w')
#sorted_test_image_byte.to_csv(f, index=False)
#f.close()

In [29]:
sorted_train_image_asm.head()


Out[29]:
filename ASM_0 ASM_1 ASM_2 ASM_3 ASM_4 ASM_5 ASM_6 ASM_7 ASM_8 ASM_9 ASM_10 ASM_11 ASM_12 ASM_13 ASM_14 ASM_15 ASM_16 ASM_17 ASM_18
0 01IsoiSMh5gxyDYTl4CB 46 116 101 120 116 58 48 48 52 48 49 48 48 48 9 9 9 9 9 ...
1 01SuzwMJEIXsK7A8dQbl 72 69 65 68 69 82 58 48 48 52 48 48 48 48 48 9 9 9 9 ...
2 01azqd4InC7m9JpocGv5 72 69 65 68 69 82 58 48 48 52 48 48 48 48 48 9 9 9 9 ...
3 01jsnpXSAlgw6aPeDxrU 72 69 65 68 69 82 58 48 48 52 48 48 48 48 48 9 9 9 9 ...
4 01kcPWA9K2BOxQeS5Rju 72 69 65 68 69 82 58 49 48 48 48 48 48 48 48 9 9 9 9 ...

5 rows × 1001 columns

4.1 Feature Reduction to 10%


In [30]:
# Now select 10% best train image asm features by variance
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = sorted_train_image_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
fsp = SelectPercentile(chi2, 10)
X_new = fsp.fit_transform(X,y)
X_new.shape


Out[30]:
(10868, 100)

In [31]:
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names


Out[31]:
array([142, 145, 151, 156, 164, 173, 186, 188, 190, 215, 222, 223, 224,
       226, 227, 244, 245, 246, 247, 248, 261, 262, 272, 311, 312, 313,
       314, 315, 317, 318, 334, 338, 339, 340, 345, 351, 352, 353, 354,
       356, 366, 371, 373, 374, 375, 376, 378, 379, 380, 381, 405, 410,
       412, 413, 422, 423, 424, 425, 426, 427, 437, 460, 583, 584, 585,
       586, 614, 616, 617, 618, 619, 620, 629, 631, 641, 645, 646, 647,
       653, 655, 678, 681, 682, 683, 685, 712, 720, 724, 761, 852, 853,
       920, 950, 951, 952, 954, 956, 957, 958, 963])

In [32]:
data_trimmed = sorted_train_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_image_asm['filename'])
sorted_train_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_train_image_asm_reduced.head()


Out[32]:
filename ASM_141 ASM_144 ASM_150 ASM_155 ASM_163 ASM_172 ASM_185 ASM_187 ASM_189 ASM_214 ASM_221 ASM_222 ASM_223 ASM_225 ASM_226 ASM_243 ASM_244 ASM_245 ASM_246
0 01IsoiSMh5gxyDYTl4CB 58 52 9 9 59 115 101 9 101 118 115 115 101 98 108 116 101 120 116 ...
1 01SuzwMJEIXsK7A8dQbl 9 9 124 104 9 32 32 104 32 32 32 32 32 13 10 9 9 9 9 ...
2 01azqd4InC7m9JpocGv5 9 9 124 104 9 32 32 104 32 32 32 32 32 13 10 9 9 9 9 ...
3 01jsnpXSAlgw6aPeDxrU 9 9 124 104 9 32 32 104 32 32 32 32 32 13 10 9 9 9 9 ...
4 01kcPWA9K2BOxQeS5Rju 9 9 124 104 9 32 32 104 32 32 32 32 32 13 10 9 9 9 9 ...

5 rows × 101 columns


In [33]:
data_trimmed = sorted_test_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_test_image_asm['filename'])
sorted_test_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_test_image_asm_reduced.head()


Out[33]:
filename ASM_141 ASM_144 ASM_150 ASM_155 ASM_163 ASM_172 ASM_185 ASM_187 ASM_189 ASM_214 ASM_221 ASM_222 ASM_223 ASM_225 ASM_226 ASM_243 ASM_244 ASM_245 ASM_246
0 ITSUPtCmh7WdJcsYDwQ5 9 9 124 104 9 32 32 104 32 32 32 32 32 13 10 9 9 9 9 ...
1 Ig2DB5tSiEy1cJvV0zdw 9 9 124 104 9 32 32 104 32 32 32 32 32 13 10 9 9 9 9 ...
2 Jmo6eIhLZ4t9r8QsxEg5 9 9 124 104 9 32 32 104 32 32 32 32 32 13 10 9 9 9 9 ...
3 JtPFl4ewgdD78OzCMa3o 9 9 124 104 9 32 32 104 32 32 32 32 32 13 10 9 9 9 9 ...
4 K3ZtByPHGSFYNljDUEXp 9 9 124 104 9 32 32 104 32 32 32 32 32 13 10 9 9 9 9 ...

5 rows × 101 columns


In [8]:


In [34]:
# Now write all the sorted and reduced image feature sets to file
f = open('data/sorted-train-image-features-asm-10percent.csv', 'w')
sorted_train_image_asm_reduced.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-image-features-asm-10percent.csv', 'w')
sorted_test_image_asm_reduced.to_csv(f, index=False)
f.close()
#f = open('data/sorted-train-image-features-byte-reduced.csv', 'w')
#sorted_train_image_byte_reduced.to_csv(f, index=False)
#f.close()
#f = open('data/sorted-test-image-features-byte-reduced.csv', 'w')
#sorted_test_image_byte_reduced.to_csv(f, index=False)
#f.close()

4.2 Feature Reduction to 20%


In [35]:
# Now select 20% best train image asm features by variance
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = sorted_train_image_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
fsp = SelectPercentile(chi2, 20)
X_new = fsp.fit_transform(X,y)
X_new.shape


Out[35]:
(10868, 200)

In [36]:
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names


Out[36]:
array([ 34,  41,  42,  43,  44, 126, 138, 140, 141, 142, 143, 144, 145,
       146, 147, 148, 151, 155, 156, 158, 164, 167, 173, 186, 188, 190,
       198, 205, 215, 216, 220, 221, 222, 223, 224, 226, 227, 240, 243,
       244, 245, 246, 247, 248, 249, 253, 261, 262, 263, 264, 268, 272,
       282, 287, 292, 311, 312, 313, 314, 315, 317, 318, 334, 337, 338,
       339, 340, 344, 345, 346, 351, 352, 353, 354, 356, 359, 366, 371,
       372, 373, 374, 375, 376, 378, 379, 380, 381, 384, 405, 408, 410,
       412, 413, 414, 415, 422, 423, 424, 425, 426, 427, 436, 437, 439,
       449, 452, 460, 464, 466, 467, 539, 555, 556, 557, 558, 559, 560,
       561, 564, 572, 573, 581, 583, 584, 585, 586, 587, 597, 598, 614,
       615, 616, 617, 618, 619, 620, 621, 622, 624, 629, 630, 631, 632,
       633, 641, 642, 643, 645, 646, 647, 648, 653, 655, 659, 675, 676,
       677, 678, 679, 681, 682, 683, 685, 712, 713, 720, 722, 724, 725,
       734, 735, 755, 761, 763, 786, 852, 853, 920, 923, 932, 935, 936,
       937, 947, 948, 949, 950, 951, 952, 953, 954, 956, 957, 958, 959,
       961, 962, 963, 965, 991])

In [37]:
data_trimmed = sorted_train_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_image_asm['filename'])
sorted_train_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_train_image_asm_reduced.head()


Out[37]:
filename ASM_33 ASM_40 ASM_41 ASM_42 ASM_43 ASM_125 ASM_137 ASM_139 ASM_140 ASM_141 ASM_142 ASM_143 ASM_144 ASM_145 ASM_146 ASM_147 ASM_150 ASM_154 ASM_155
0 01IsoiSMh5gxyDYTl4CB 120 49 48 48 48 45 116 120 116 58 48 48 52 48 49 48 9 9 9 ...
1 01SuzwMJEIXsK7A8dQbl 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
2 01azqd4InC7m9JpocGv5 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
3 01jsnpXSAlgw6aPeDxrU 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
4 01kcPWA9K2BOxQeS5Rju 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...

5 rows × 201 columns


In [39]:
data_trimmed = sorted_test_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_test_image_asm['filename'])
sorted_test_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_test_image_asm_reduced.head()


Out[39]:
filename ASM_33 ASM_40 ASM_41 ASM_42 ASM_43 ASM_125 ASM_137 ASM_139 ASM_140 ASM_141 ASM_142 ASM_143 ASM_144 ASM_145 ASM_146 ASM_147 ASM_150 ASM_154 ASM_155
0 ITSUPtCmh7WdJcsYDwQ5 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
1 Ig2DB5tSiEy1cJvV0zdw 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
2 Jmo6eIhLZ4t9r8QsxEg5 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
3 JtPFl4ewgdD78OzCMa3o 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...
4 K3ZtByPHGSFYNljDUEXp 48 9 9 9 9 10 48 48 48 9 9 9 9 9 9 9 124 84 104 ...

5 rows × 201 columns


In [40]:
# Now write all the sorted and reduced image feature sets to file
f = open('data/sorted-train-image-features-asm-20percent.csv', 'w')
sorted_train_image_asm_reduced.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-image-features-asm-20percent.csv', 'w')
sorted_test_image_asm_reduced.to_csv(f, index=False)
f.close()

4.3 Feature Reduction to 30%


In [8]:
# Now select 30% best train image asm features by variance
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = sorted_train_image_asm.iloc[:,1:]
y = sorted_train_labels['Class'].values.tolist()
fsp = SelectPercentile(chi2, 30)
X_new = fsp.fit_transform(X,y)
X_new.shape


Out[8]:
(10868, 300)

In [9]:
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names


Out[9]:
array([ 29,  30,  32,  33,  34,  35,  41,  42,  43,  44,  48, 125, 126,
       138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 151, 155,
       156, 157, 158, 160, 161, 162, 163, 164, 165, 167, 169, 173, 174,
       179, 186, 188, 190, 198, 201, 202, 205, 215, 216, 220, 221, 222,
       223, 224, 226, 227, 240, 243, 244, 245, 246, 247, 248, 249, 252,
       253, 260, 261, 262, 263, 264, 265, 266, 267, 268, 271, 272, 282,
       287, 291, 292, 297, 307, 311, 312, 313, 314, 315, 317, 318, 327,
       328, 334, 337, 338, 339, 340, 343, 344, 345, 346, 351, 352, 353,
       354, 356, 357, 358, 359, 366, 370, 371, 372, 373, 374, 375, 376,
       378, 379, 380, 381, 384, 405, 408, 409, 410, 412, 413, 414, 415,
       422, 423, 424, 425, 426, 427, 436, 437, 439, 440, 441, 447, 448,
       449, 450, 451, 452, 460, 464, 465, 466, 467, 538, 539, 555, 556,
       557, 558, 559, 560, 561, 563, 564, 567, 568, 572, 573, 580, 581,
       582, 583, 584, 585, 586, 587, 588, 597, 598, 602, 613, 614, 615,
       616, 617, 618, 619, 620, 621, 622, 623, 624, 627, 629, 630, 631,
       632, 633, 634, 635, 641, 642, 643, 645, 646, 647, 648, 649, 650,
       651, 652, 653, 654, 655, 657, 658, 659, 674, 675, 676, 677, 678,
       679, 680, 681, 682, 683, 685, 692, 693, 695, 696, 711, 712, 713,
       719, 720, 722, 723, 724, 725, 728, 729, 730, 731, 733, 734, 735,
       738, 739, 752, 753, 755, 760, 761, 763, 777, 786, 830, 831, 847,
       848, 851, 852, 853, 856, 867, 873, 874, 875, 876, 878, 879, 919,
       920, 923, 931, 932, 935, 936, 937, 947, 948, 949, 950, 951, 952,
       953, 954, 956, 957, 958, 959, 960, 961, 962, 963, 965, 973, 978, 991])

In [10]:
data_trimmed = sorted_train_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_image_asm['filename'])
sorted_train_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_train_image_asm_reduced.head()


Out[10]:
filename ASM_28 ASM_29 ASM_31 ASM_32 ASM_33 ASM_34 ASM_40 ASM_41 ASM_42 ... ASM_957 ASM_958 ASM_959 ASM_960 ASM_961 ASM_962 ASM_964 ASM_972 ASM_977 ASM_990
0 01IsoiSMh5gxyDYTl4CB 13 10 116 101 120 116 49 48 48 ... 122 101 9 9 32 32 32 49 53 10
1 01SuzwMJEIXsK7A8dQbl 68 69 58 48 48 52 9 9 9 ... 10 46 116 101 120 116 48 9 9 116
2 01azqd4InC7m9JpocGv5 68 69 58 48 48 52 9 9 9 ... 10 46 116 101 120 116 48 9 9 116
3 01jsnpXSAlgw6aPeDxrU 68 69 58 48 48 52 9 9 9 ... 10 46 116 101 120 116 48 9 9 116
4 01kcPWA9K2BOxQeS5Rju 68 69 58 49 48 48 9 9 9 ... 9 9 59 32 91 48 48 89 32 71

5 rows × 301 columns


In [11]:
data_trimmed = sorted_test_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_test_image_asm['filename'])
sorted_test_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_test_image_asm_reduced.head()


Out[11]:
filename ASM_28 ASM_29 ASM_31 ASM_32 ASM_33 ASM_34 ASM_40 ASM_41 ASM_42 ... ASM_957 ASM_958 ASM_959 ASM_960 ASM_961 ASM_962 ASM_964 ASM_972 ASM_977 ASM_990
0 ITSUPtCmh7WdJcsYDwQ5 68 69 58 48 48 52 9 9 9 ... 10 46 105 99 111 100 58 48 9 32
1 Ig2DB5tSiEy1cJvV0zdw 68 69 58 48 48 52 9 9 9 ... 32 32 59 32 13 10 116 48 9 59
2 Jmo6eIhLZ4t9r8QsxEg5 68 69 58 48 48 52 9 9 9 ... 9 9 59 32 91 48 48 89 32 71
3 JtPFl4ewgdD78OzCMa3o 68 69 58 48 48 52 9 9 9 ... 9 9 59 32 91 48 48 89 32 71
4 K3ZtByPHGSFYNljDUEXp 68 69 58 48 48 52 9 9 9 ... 32 32 59 32 13 10 116 48 9 59

5 rows × 301 columns


In [12]:
# Now write all the sorted and reduced image feature sets to file
f = open('data/sorted-train-image-features-asm-30percent.csv', 'w')
sorted_train_image_asm_reduced.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-image-features-asm-30percent.csv', 'w')
sorted_test_image_asm_reduced.to_csv(f, index=False)
f.close()

4.4 Feature Reduction to 40%


In [22]:
# Now select 40% best train image asm features by variance
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = sorted_train_image_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
fsp = SelectPercentile(chi2, 40)
X_new = fsp.fit_transform(X,y)
X_new.shape


Out[22]:
(10868, 400)

In [23]:
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names


Out[23]:
array([ 15,  21,  22,  29,  30,  32,  33,  34,  35,  41,  42,  43,  44,
        48, 125, 126, 136, 138, 139, 140, 141, 142, 143, 144, 145, 146,
       147, 148, 151, 155, 156, 157, 158, 160, 161, 162, 163, 164, 165,
       167, 169, 173, 174, 179, 186, 188, 190, 198, 201, 202, 205, 215,
       216, 220, 221, 222, 223, 224, 226, 227, 236, 240, 242, 243, 244,
       245, 246, 247, 248, 249, 252, 253, 260, 261, 262, 263, 264, 265,
       266, 267, 268, 269, 271, 272, 282, 287, 291, 292, 293, 294, 295,
       296, 297, 307, 310, 311, 312, 313, 314, 315, 317, 318, 323, 326,
       327, 328, 330, 334, 337, 338, 339, 340, 341, 343, 344, 345, 346,
       349, 351, 352, 353, 354, 356, 357, 358, 359, 366, 370, 371, 372,
       373, 374, 375, 376, 378, 379, 380, 381, 384, 392, 399, 400, 401,
       402, 403, 404, 405, 408, 409, 410, 412, 413, 414, 415, 420, 422,
       423, 424, 425, 426, 427, 428, 429, 430, 431, 436, 437, 439, 440,
       441, 446, 447, 448, 449, 450, 451, 452, 453, 457, 458, 459, 460,
       461, 464, 465, 466, 467, 477, 478, 479, 480, 481, 482, 538, 539,
       555, 556, 557, 558, 559, 560, 561, 563, 564, 567, 568, 572, 573,
       580, 581, 582, 583, 584, 585, 586, 587, 588, 590, 597, 598, 600,
       602, 603, 607, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622,
       623, 624, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 641,
       642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654,
       655, 656, 657, 658, 659, 671, 672, 673, 674, 675, 676, 677, 678,
       679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 692, 693, 694,
       695, 696, 701, 702, 704, 711, 712, 713, 715, 718, 719, 720, 722,
       723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735,
       736, 738, 739, 740, 751, 752, 753, 754, 755, 756, 760, 761, 763,
       765, 777, 778, 781, 782, 785, 786, 801, 802, 814, 818, 819, 830,
       831, 836, 847, 848, 849, 850, 851, 852, 853, 856, 867, 868, 870,
       873, 874, 875, 876, 877, 878, 879, 882, 898, 919, 920, 923, 930,
       931, 932, 934, 935, 936, 937, 938, 947, 948, 949, 950, 951, 952,
       953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 965, 966,
       973, 974, 975, 976, 978, 989, 991, 992, 996, 999])

In [24]:
data_trimmed = sorted_train_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_image_asm['filename'])
sorted_train_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_train_image_asm_reduced.head()


Out[24]:
filename ASM_14 ASM_20 ASM_21 ASM_28 ASM_29 ASM_31 ASM_32 ASM_33 ASM_34 ... ASM_972 ASM_973 ASM_974 ASM_975 ASM_977 ASM_988 ASM_990 ASM_991 ASM_995 ASM_998
0 01IsoiSMh5gxyDYTl4CB 9 32 32 13 10 116 101 120 116 ... 49 66 69 51 53 41 10 46 116 48
1 01SuzwMJEIXsK7A8dQbl 48 9 9 68 69 58 48 48 52 ... 9 9 9 9 9 10 116 101 48 48
2 01azqd4InC7m9JpocGv5 48 9 9 68 69 58 48 48 52 ... 9 9 9 9 9 10 116 101 48 48
3 01jsnpXSAlgw6aPeDxrU 48 9 9 68 69 58 48 48 52 ... 9 9 9 9 9 10 116 101 48 48
4 01kcPWA9K2BOxQeS5Rju 48 9 9 68 69 58 49 48 48 ... 89 84 69 83 32 83 71 77 32 65

5 rows × 401 columns


In [25]:
data_trimmed = sorted_test_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_test_image_asm['filename'])
sorted_test_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_test_image_asm_reduced.head()


Out[25]:
filename ASM_14 ASM_20 ASM_21 ASM_28 ASM_29 ASM_31 ASM_32 ASM_33 ASM_34 ... ASM_972 ASM_973 ASM_974 ASM_975 ASM_977 ASM_988 ASM_990 ASM_991 ASM_995 ASM_998
0 ITSUPtCmh7WdJcsYDwQ5 48 9 9 68 69 58 48 48 52 ... 48 9 9 9 9 32 32 32 9 114
1 Ig2DB5tSiEy1cJvV0zdw 48 9 9 68 69 58 48 48 52 ... 48 49 48 48 9 32 59 32 109 9
2 Jmo6eIhLZ4t9r8QsxEg5 48 9 9 68 69 58 48 48 52 ... 89 84 69 83 32 83 71 77 32 65
3 JtPFl4ewgdD78OzCMa3o 48 9 9 68 69 58 48 48 52 ... 89 84 69 83 32 83 71 77 32 65
4 K3ZtByPHGSFYNljDUEXp 48 9 9 68 69 58 48 48 52 ... 48 49 48 48 9 32 59 32 109 9

5 rows × 401 columns


In [26]:
# Now write all the sorted and reduced image feature sets to file
f = open('data/sorted-train-image-features-asm-40percent.csv', 'w')
sorted_train_image_asm_reduced.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-image-features-asm-40percent.csv', 'w')
sorted_test_image_asm_reduced.to_csv(f, index=False)
f.close()

4.5 Feature Reduction to 50%


In [27]:
# Now select 50% best train image asm features by variance
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = sorted_train_image_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
fsp = SelectPercentile(chi2, 50)
X_new = fsp.fit_transform(X,y)
X_new.shape


Out[27]:
(10868, 500)

In [28]:
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names


Out[28]:
array([  2,   4,   5,  15,  21,  22,  24,  25,  26,  27,  29,  30,  32,
        33,  34,  35,  41,  42,  43,  44,  48,  50, 125, 126, 135, 136,
       138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 151, 152,
       154, 155, 156, 157, 158, 160, 161, 162, 163, 164, 165, 167, 169,
       173, 174, 179, 186, 188, 190, 198, 201, 202, 205, 215, 216, 217,
       219, 220, 221, 222, 223, 224, 226, 227, 229, 236, 240, 241, 242,
       243, 244, 245, 246, 247, 248, 249, 252, 253, 260, 261, 262, 263,
       264, 265, 266, 267, 268, 269, 271, 272, 273, 282, 287, 291, 292,
       293, 294, 295, 296, 297, 307, 308, 310, 311, 312, 313, 314, 315,
       316, 317, 318, 319, 321, 323, 326, 327, 328, 330, 334, 337, 338,
       339, 340, 341, 343, 344, 345, 346, 349, 350, 351, 352, 353, 354,
       356, 357, 358, 359, 366, 367, 368, 370, 371, 372, 373, 374, 375,
       376, 378, 379, 380, 381, 384, 385, 386, 387, 388, 390, 391, 392,
       399, 400, 401, 402, 403, 404, 405, 408, 409, 410, 412, 413, 414,
       415, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431,
       436, 437, 439, 440, 441, 442, 443, 445, 446, 447, 448, 449, 450,
       451, 452, 453, 457, 458, 459, 460, 461, 464, 465, 466, 467, 477,
       478, 479, 480, 481, 482, 538, 539, 555, 556, 557, 558, 559, 560,
       561, 563, 564, 567, 568, 571, 572, 573, 580, 581, 582, 583, 584,
       585, 586, 587, 588, 589, 590, 597, 598, 600, 601, 602, 603, 606,
       607, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624,
       627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 640, 641,
       642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654,
       655, 656, 657, 658, 659, 662, 664, 670, 671, 672, 673, 674, 675,
       676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688,
       689, 691, 692, 693, 694, 695, 696, 701, 702, 703, 704, 708, 709,
       711, 712, 713, 714, 715, 717, 718, 719, 720, 721, 722, 723, 724,
       725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 738,
       739, 740, 743, 744, 751, 752, 753, 754, 755, 756, 757, 758, 759,
       760, 761, 762, 763, 765, 774, 775, 776, 777, 778, 779, 780, 781,
       782, 784, 785, 786, 787, 788, 789, 793, 798, 801, 802, 813, 814,
       818, 819, 820, 830, 831, 835, 836, 837, 838, 840, 841, 847, 848,
       849, 850, 851, 852, 853, 855, 856, 857, 866, 867, 868, 869, 870,
       873, 874, 875, 876, 877, 878, 879, 882, 898, 899, 904, 907, 908,
       919, 920, 923, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939,
       940, 941, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957,
       958, 959, 960, 961, 962, 963, 965, 966, 967, 968, 973, 974, 975,
       976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 989, 990, 991,
       992, 995, 996, 997, 998, 999])

In [29]:
data_trimmed = sorted_train_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_image_asm['filename'])
sorted_train_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_train_image_asm_reduced.head()


Out[29]:
filename ASM_1 ASM_3 ASM_4 ASM_14 ASM_20 ASM_21 ASM_23 ASM_24 ASM_25 ... ASM_984 ASM_988 ASM_989 ASM_990 ASM_991 ASM_994 ASM_995 ASM_996 ASM_997 ASM_998
0 01IsoiSMh5gxyDYTl4CB 116 120 116 9 32 32 32 32 32 ... 54 41 13 10 46 120 116 58 48 48
1 01SuzwMJEIXsK7A8dQbl 69 68 69 48 9 9 13 10 72 ... 32 10 46 116 101 58 48 48 52 48
2 01azqd4InC7m9JpocGv5 69 68 69 48 9 9 13 10 72 ... 32 10 46 116 101 58 48 48 52 48
3 01jsnpXSAlgw6aPeDxrU 69 68 69 48 9 9 13 10 72 ... 32 10 46 116 101 58 48 48 52 48
4 01kcPWA9K2BOxQeS5Rju 69 68 69 48 9 9 13 10 72 ... 83 83 69 71 77 84 32 72 69 65

5 rows × 501 columns


In [30]:
data_trimmed = sorted_test_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_test_image_asm['filename'])
sorted_test_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_test_image_asm_reduced.head()


Out[30]:
filename ASM_1 ASM_3 ASM_4 ASM_14 ASM_20 ASM_21 ASM_23 ASM_24 ASM_25 ... ASM_984 ASM_988 ASM_989 ASM_990 ASM_991 ASM_994 ASM_995 ASM_996 ASM_997 ASM_998
0 ITSUPtCmh7WdJcsYDwQ5 69 68 69 48 9 9 13 10 72 ... 114 32 32 32 32 58 9 80 111 114
1 Ig2DB5tSiEy1cJvV0zdw 69 68 69 48 9 9 13 10 72 ... 32 32 32 59 32 114 109 97 116 9
2 Jmo6eIhLZ4t9r8QsxEg5 69 68 69 48 9 9 13 10 72 ... 83 83 69 71 77 84 32 72 69 65
3 JtPFl4ewgdD78OzCMa3o 69 68 69 48 9 9 13 10 72 ... 83 83 69 71 77 84 32 72 69 65
4 K3ZtByPHGSFYNljDUEXp 69 68 69 48 9 9 13 10 72 ... 32 32 32 59 32 114 109 97 116 9

5 rows × 501 columns


In [31]:
# Now write all the sorted and reduced image feature sets to file
f = open('data/sorted-train-image-features-asm-50percent.csv', 'w')
sorted_train_image_asm_reduced.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-image-features-asm-50percent.csv', 'w')
sorted_test_image_asm_reduced.to_csv(f, index=False)
f.close()

In [ ]:


In [ ]:


In [ ]:

6. Run ExtraTreeClassifiers With 10-Fold Cross Validation

Now we can have a quick look at how well the feature set can be classified

In [19]:
def run_cv(X,y, clf):

    # Construct a kfolds object
    kf = KFold(len(y),n_folds=10,shuffle=True)
    y_prob = np.zeros((len(y),9))
    y_pred = np.zeros(len(y))
    
    # Iterate through folds
    for train_index, test_index in kf:
        print(test_index, train_index)
        X_train = X.loc[train_index,:]
        X_test = X.loc[test_index,:]
        y_train = y[train_index]

        clf.fit(X_train,y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)
    
    return y_prob, y_pred

In [20]:
ytrain = np.array(y)

In [21]:
X = data_reduced.iloc[:,1:]
X.shape


Out[21]:
(10868, 101)

In [22]:
# At last we can build a hypothesis
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,ytrain,clf1)
print("logloss = ", log_loss(y, p1))
print("score = ", accuracy_score(ytrain, pred1))
cm = confusion_matrix(y, pred1)
print(cm)


(array([    1,     3,     7, ..., 10840, 10852, 10864]), array([    0,     2,     4, ..., 10865, 10866, 10867]))
(array([   19,    21,    42, ..., 10814, 10833, 10838]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   14,    17,    24, ..., 10846, 10848, 10856]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    9,    20,    27, ..., 10857, 10861, 10866]), array([    0,     1,     2, ..., 10864, 10865, 10867]))
(array([    6,    25,    26, ..., 10827, 10832, 10841]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    8,    11,    23, ..., 10823, 10837, 10855]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    2,    37,    39, ..., 10824, 10828, 10834]), array([    0,     1,     3, ..., 10865, 10866, 10867]))
(array([    4,     5,    10, ..., 10851, 10865, 10867]), array([    0,     1,     2, ..., 10863, 10864, 10866]))
(array([    0,    15,    32, ..., 10836, 10847, 10853]), array([    1,     2,     3, ..., 10865, 10866, 10867]))
(array([   16,    18,    22, ..., 10860, 10862, 10863]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
('logloss = ', 0.041558532227727193)
('score = ', 0.99236290025763707)
[[1535    0    1    1    0    0    0    4    0]
 [   6 2468    2    0    0    2    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   0    0    1  470    0    0    0    4    0]
 [   1    1    0    0   38    0    0    2    0]
 [   3    1    0    1    0  741    0    2    3]
 [   0    0    0    0    0    0  398    0    0]
 [  28    1    1    6    0    1    2 1188    1]
 [   4    0    0    0    0    2    0    2 1005]]

In [23]:
# Finally shove the test feature set into the classifier
test_X = test_data_reduced.iloc[:,1:]
test_predictions = clf1.predict(test_X)
test_predictions


Out[23]:
array([2, 3, 4, ..., 3, 6, 7])

In [30]:
# Write out the predictions to a csv file
out_test_y = pd.DataFrame(columns=['filename', 'class'])
out_test_y['filename'] = test_data_reduced['filename']
out_test_y['class'] = pd.DataFrame(test_predictions, columns=['class'])
out_test_y.head()


Out[30]:
filename class
0 N0DQkgaq9wjO1fJLn2ME 2
1 VuvdGbYitmxa05lHrPnT 3
2 NRUDJPSHu4dAyFjzLIg9 4
3 ZmdXvIh5qHCOyJPwiE6g 2
4 rcb7LxNP6itSDnwgh3Km 3

5 rows × 2 columns


In [33]:
out_test_y.to_csv('data/test-label-etc-predictions.csv', index=False)

7. TEST/EXPERIMENTAL CODE ONLY


In [ ]:
# go through the features and delete any that sum to less than 200
colsum = X.sum(axis=0, numeric_only=True)

In [ ]:
zerocols = colsum[(colsum[:] == 0)]
zerocols

In [ ]:
zerocols = colsum[(colsum[:] < 110)]
zerocols.shape

In [ ]:
reduceX = X
for col in reduceX.columns:
  if sum(reduceX[col]) < 100:
    del reduceX[col]
    
reduceX.shape

In [ ]:
skb = SelectKBest(chi2, k=20)
X_kbestnew = skb.fit_transform(X, y)
X_kbestnew.shape

In [ ]:
y = [0]*labels.shape[0]
fnames = train_data['filename']
for i in range(len(y)):
  fname = train_data.loc[i,'filename']
  row = labels[labels['Id'] == fname]
  y[i] = row.iloc[0,1]

In [ ]:
# DO NOT USE BYTE IMAGE DATA
# Now select 10% best train image byte features by variance
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = sorted_train_image_byte.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
fsp = SelectPercentile(chi2, 10)
X_new = fsp.fit_transform(X,y)
X_new.shape

selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names

data_trimmed = sorted_train_image_byte.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_image_byte['filename'])
sorted_train_image_byte_reduced = data_fnames.join(data_trimmed)
sorted_train_image_byte_reduced.head()

data_trimmed = sorted_test_image_byte.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_test_image_byte['filename'])
sorted_test_image_byte_reduced = data_fnames.join(data_trimmed)
sorted_test_image_byte_reduced.head()