In [1]:

    
import numpy as np
import scipy as sp
import pandas as pd
import sklearn as skl
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, SelectPercentile
from sklearn.feature_selection import chi2
from sklearn.metrics import log_loss, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.cross_validation import cross_val_score, KFold
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

1. Feature Reduction For Training Data

   Select the top 10%, 20%, 30%, 40% and 50% of features with the most variance. Starting with 1006 ASM features 
   the process will select about 100, 200... features. Then write the reduced feature sets to files.



In [2]:

    
train_data = pd.read_csv('data/train-malware-features-asm.csv')
labels = pd.read_csv('data/trainLabels.csv')
sorted_train_data = train_data.sort(columns='filename', axis=0, ascending=True, inplace=False)
sorted_train_labels = labels.sort(columns='Id', axis=0, ascending=True, inplace=False)
X = sorted_train_data.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])



In [3]:

    
print(X.shape)









    



(10868, 1006)



In [4]:

    
print(y.shape)



In [5]:

    
sorted_train_data.head()









    Out[5]:






  
    
      
      filename
      edx
      esi
      es
      fs
      ds
      ss
      gs
      cs
      ah
      al
      ax
      bh
      bl
      bx
      ch
      cl
      cx
      dh
      dl
      
    
  
  
    
      2277
       01IsoiSMh5gxyDYTl4CB
        750
        496
       3
       0
       0
       0
       0
       0
       8
       224
       49
       34
       25
       0
       41
       191
       52
       38
       163
      ...
    
    
      2053
       01SuzwMJEIXsK7A8dQbl
       1121
         24
       3
       0
       1
       4
       0
       2
       6
        22
        7
        1
        4
       0
        3
        37
        2
        4
         9
      ...
    
    
      2144
       01azqd4InC7m9JpocGv5
       1493
       1900
       0
       0
       0
       0
       0
       0
       1
       398
        0
        0
       47
       0
        1
        77
        4
        1
        56
      ...
    
    
      1236
       01jsnpXSAlgw6aPeDxrU
        525
          4
       0
       0
       0
       0
       0
       0
       0
         0
        0
        0
        1
       0
        0
         1
        2
        0
         0
      ...
    
    
      339 
       01kcPWA9K2BOxQeS5Rju
         23
         35
       0
       0
       0
       0
       0
       0
       0
         3
        0
        0
        1
       0
        0
         1
        0
        0
         0
      ...
    
  

5 rows × 1007 columns



In [20]:

    
sorted_train_labels.head()









    Out[20]:






  
    
      
      Id
      Class
    
  
  
    
      1541
       01IsoiSMh5gxyDYTl4CB
       2
    
    
      8627
       01SuzwMJEIXsK7A8dQbl
       8
    
    
      9855
       01azqd4InC7m9JpocGv5
       9
    
    
      9856
       01jsnpXSAlgw6aPeDxrU
       9
    
    
      0   
       01kcPWA9K2BOxQeS5Rju
       1
    
  

5 rows × 2 columns



In [21]:

    
train_data.head()









    Out[21]:






  
    
      
      filename
      edx
      esi
      es
      fs
      ds
      ss
      gs
      cs
      ah
      al
      ax
      bh
      bl
      bx
      ch
      cl
      cx
      dh
      dl
      
    
  
  
    
      0
       4jKA1GUDv6TMNpPuIxER
       4049
       9957
       11
       3
        3
        3
       3
        3
       60
       791
       242
        9
       393
       31
       26
       477
       135
        8
       381
      ...
    
    
      1
       4ZBJzEqnW52fFUw0PG3v
        539
        513
       10
       0
       12
       16
       0
       12
       29
       152
         8
       53
        24
        2
       25
        53
         5
       52
        30
      ...
    
    
      2
       6m8NxLfg2MR0nwXFuEq5
        427
        482
        1
       0
        0
        0
       0
        0
       18
       163
        16
        0
        84
        0
        0
        84
         0
        1
       130
      ...
    
    
      3
       28U1hRkQ6Yl57493ZdXD
         51
         91
        0
       0
        0
        0
       0
        0
        0
         0
         0
        0
         0
        0
        0
         0
         0
        0
         0
      ...
    
    
      4
       45Wy3TxE98HfiXreOCSu
        644
        726
        8
       0
        0
        0
       0
        0
        2
        98
        89
        0
        12
        1
       13
       157
        80
        0
        70
      ...
    
  

5 rows × 1007 columns



In [23]:

    
y









    Out[23]:





array([2, 8, 9, ..., 4, 4, 4])

1.1 Feature Reduction to 10%



In [7]:

    
# find the top 10 percent variance features, from 1006 -> 101 features
fsp = SelectPercentile(chi2, 10)
X_new_10 = fsp.fit_transform(X,y)
X_new_10.shape









    Out[7]:





(10868, 101)



In [8]:

    
X_new_10









    Out[8]:





array([[  750,   496,     8, ...,    97,    70,  4572],
       [ 1121,    24,     6, ...,     0,    41,  2066],
       [ 1493,  1900,     1, ...,    48,    93, 13163],
       ..., 
       [  173,   245,    44, ...,     9,    50,   134],
       [  189,   153,    10, ...,     0,    11,  1527],
       [  153,   183,    37, ...,     9,    12,   281]])



In [9]:

    
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names









    Out[9]:





array([  1,   2,   9,  10,  11,  13,  14,  16,  17,  19,  20,  21,  22,
        23,  24,  25,  26,  27,  28,  30,  31,  32,  35,  36,  39,  40,
        41,  42,  43,  52,  55,  56,  57,  58,  60,  62,  63,  65,  66,
        67,  68,  69,  71,  73,  74,  76,  77,  78,  79,  80,  81,  82,
        85,  86,  88,  89,  93,  95,  99, 100, 107, 109, 112, 115, 116,
       118, 119, 140, 398, 400, 712, 913, 914, 917, 923, 924, 925, 926,
       927, 928, 929, 930, 932, 934, 935, 937, 939, 940, 943, 944, 945,
       946, 947, 957, 962, 964, 971, 978, 985, 986, 987])



In [10]:

    
data_trimmed = sorted_train_data.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_data['filename'])
data_reduced = data_fnames.join(data_trimmed)
data_reduced.head()









    Out[10]:






  
    
      
      filename
      edx
      esi
      ah
      al
      ax
      bl
      bx
      cl
      cx
      dl
      dx
      eax
      ebp
      ebx
      ecx
      edi
      esp
      add
      al.1
      
    
  
  
    
      2277
       01IsoiSMh5gxyDYTl4CB
        750
        496
       8
       224
       49
       25
       0
       191
       52
       163
       63
       1447
        905
       260
       1093
        393
       420
       323
        79
      ...
    
    
      2053
       01SuzwMJEIXsK7A8dQbl
       1121
         24
       6
        22
        7
        4
       0
        37
        2
         9
        3
       1220
       1544
        18
       1228
         24
       107
       427
         8
      ...
    
    
      2144
       01azqd4InC7m9JpocGv5
       1493
       1900
       1
       398
        0
       47
       0
        77
        4
        56
        2
       4438
        591
       810
       2317
       1284
       701
       622
       262
      ...
    
    
      1236
       01jsnpXSAlgw6aPeDxrU
        525
          4
       0
         0
        0
        1
       0
         1
        2
         0
        0
        942
        451
         5
        547
          5
        56
        32
         0
      ...
    
    
      339 
       01kcPWA9K2BOxQeS5Rju
         23
         35
       0
         3
        0
        1
       0
         1
        0
         0
        0
        137
         43
        19
         66
         15
        81
        11
         1
      ...
    
  

5 rows × 102 columns



In [11]:

    
data_reduced.to_csv('data/sorted-train-malware-features-asm-10percent.csv', index=False)
sorted_train_labels.to_csv('data/sorted-train-labels.csv', index=False)

1.2 Feature Reduction to 20%



In [17]:

    
# find the top 20 percent variance features, from 1006 -> 201 features
fsp = SelectPercentile(chi2, 20)
X_new_20 = fsp.fit_transform(X,y)
X_new_20.shape









    Out[17]:





(10868, 201)



In [18]:

    
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names









    Out[18]:





array([  1,   2,   3,   5,   6,   8,   9,  10,  11,  12,  13,  14,  15,
        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,
        30,  31,  32,  33,  34,  35,  36,  38,  39,  40,  41,  42,  43,
        45,  46,  47,  50,  52,  55,  56,  57,  58,  60,  62,  63,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  76,  77,  78,  79,
        80,  81,  82,  83,  85,  86,  87,  88,  89,  93,  95,  96,  97,
        98,  99, 100, 104, 105, 106, 107, 109, 111, 112, 113, 115, 116,
       117, 118, 119, 120, 122, 140, 180, 183, 199, 229, 261, 262, 264,
       265, 269, 271, 276, 286, 287, 288, 289, 290, 292, 293, 297, 298,
       299, 314, 329, 330, 333, 334, 337, 339, 346, 357, 398, 400, 420,
       424, 436, 459, 460, 492, 497, 510, 520, 523, 537, 559, 570, 581,
       624, 634, 658, 692, 712, 761, 806, 860, 868, 875, 897, 912, 913,
       914, 915, 917, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928,
       929, 930, 931, 932, 934, 935, 937, 939, 940, 942, 943, 944, 945,
       946, 947, 948, 949, 951, 954, 955, 957, 959, 962, 963, 964, 971,
       976, 978, 985, 986, 987, 999])



In [19]:

    
data_trimmed = sorted_train_data.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_data['filename'])
data_reduced = data_fnames.join(data_trimmed)
data_reduced.head()









    Out[19]:






  
    
      
      filename
      edx
      esi
      es
      ds
      ss
      cs
      ah
      al
      ax
      bh
      bl
      bx
      ch
      cl
      cx
      dh
      dl
      dx
      eax
      
    
  
  
    
      2277
       01IsoiSMh5gxyDYTl4CB
        750
        496
       3
       0
       0
       0
       8
       224
       49
       34
       25
       0
       41
       191
       52
       38
       163
       63
       1447
      ...
    
    
      2053
       01SuzwMJEIXsK7A8dQbl
       1121
         24
       3
       1
       4
       2
       6
        22
        7
        1
        4
       0
        3
        37
        2
        4
         9
        3
       1220
      ...
    
    
      2144
       01azqd4InC7m9JpocGv5
       1493
       1900
       0
       0
       0
       0
       1
       398
        0
        0
       47
       0
        1
        77
        4
        1
        56
        2
       4438
      ...
    
    
      1236
       01jsnpXSAlgw6aPeDxrU
        525
          4
       0
       0
       0
       0
       0
         0
        0
        0
        1
       0
        0
         1
        2
        0
         0
        0
        942
      ...
    
    
      339 
       01kcPWA9K2BOxQeS5Rju
         23
         35
       0
       0
       0
       0
       0
         3
        0
        0
        1
       0
        0
         1
        0
        0
         0
        0
        137
      ...
    
  

5 rows × 202 columns



In [20]:

    
data_reduced.to_csv('data/sorted-train-malware-features-asm-20percent.csv', index=False)

1.3 Feature Reduction to 30%



In [16]:

    
# find the top 30 percent variance features, from 1006 -> 301 features
fsp = SelectPercentile(chi2, 30)
X_new_30 = fsp.fit_transform(X,y)
X_new_30.shape









    Out[16]:





(10868, 302)



In [17]:

    
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names









    Out[17]:





array([  1,   2,   3,   5,   6,   8,   9,  10,  11,  12,  13,  14,  15,
        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,
        30,  31,  32,  33,  34,  35,  36,  38,  39,  40,  41,  42,  43,
        45,  46,  47,  48,  50,  51,  52,  55,  56,  57,  58,  60,  62,
        63,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  76,  77,
        78,  79,  80,  81,  82,  83,  85,  86,  87,  88,  89,  90,  93,
        95,  96,  97,  98,  99, 100, 104, 105, 106, 107, 108, 109, 111,
       112, 113, 115, 116, 117, 118, 119, 120, 121, 122, 138, 139, 140,
       153, 174, 180, 183, 199, 200, 204, 208, 210, 229, 232, 261, 262,
       264, 265, 266, 267, 268, 269, 270, 271, 274, 276, 279, 281, 284,
       285, 286, 287, 288, 289, 290, 292, 293, 296, 297, 298, 299, 301,
       309, 310, 311, 312, 313, 314, 315, 316, 317, 319, 321, 322, 325,
       327, 329, 330, 331, 332, 333, 334, 336, 337, 338, 339, 340, 341,
       345, 346, 348, 349, 355, 357, 358, 361, 363, 369, 398, 400, 407,
       419, 420, 424, 436, 437, 438, 440, 441, 459, 460, 492, 494, 497,
       501, 505, 506, 510, 511, 512, 518, 519, 520, 521, 523, 525, 526,
       527, 531, 532, 533, 536, 537, 543, 544, 545, 547, 549, 552, 559,
       566, 570, 581, 587, 597, 618, 623, 624, 634, 645, 649, 651, 658,
       667, 668, 674, 692, 705, 712, 731, 749, 761, 806, 807, 831, 835,
       856, 860, 868, 874, 875, 889, 897, 912, 913, 914, 915, 917, 919,
       920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930, 931, 932,
       934, 935, 937, 939, 940, 942, 943, 944, 945, 946, 947, 948, 949,
       951, 954, 955, 957, 959, 962, 963, 964, 971, 976, 978, 985, 986,
       987, 988, 999])



In [5]:

    
data_trimmed = sorted_train_data.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_data['filename'])
data_reduced = data_fnames.join(data_trimmed)
data_reduced.head()









    Out[5]:






  
    
      
      filename
      edx
      esi
      es
      ds
      ss
      cs
      ah
      al
      ax
      ...
      LPSTR
      int.1
      entry
      Software
      __imp_
      .rdata:
      .data:
      .text:
      case
      assume
    
  
  
    
      2277
      01IsoiSMh5gxyDYTl4CB
      750
      496
      3
      0
      0
      0
      8
      224
      49
      ...
      1
      119
      0
      0
      0
      97
      70
      4572
      0
      0
    
    
      2053
      01SuzwMJEIXsK7A8dQbl
      1121
      24
      3
      1
      4
      2
      6
      22
      7
      ...
      0
      5
      0
      0
      0
      0
      41
      2066
      0
      1
    
    
      2144
      01azqd4InC7m9JpocGv5
      1493
      1900
      0
      0
      0
      0
      1
      398
      0
      ...
      1
      62
      0
      0
      0
      48
      93
      13163
      0
      0
    
    
      1236
      01jsnpXSAlgw6aPeDxrU
      525
      4
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      2
      0
      0
      0
      28
      15
      2030
      0
      0
    
    
      339
      01kcPWA9K2BOxQeS5Rju
      23
      35
      0
      0
      0
      0
      0
      3
      0
      ...
      0
      0
      1
      0
      0
      33
      15
      445
      0
      0
    
  

5 rows × 303 columns



In [6]:

    
data_reduced.to_csv('data/sorted-train-malware-features-asm-30percent.csv', index=False)

1.4 Feature Reduction to 40%



In [15]:

    
# find the top 40 percent variance features, from 1006 -> 401 features
fsp = SelectPercentile(chi2, 40)
X_new_40 = fsp.fit_transform(X,y)
X_new_40.shape









    Out[15]:





(10868, 402)



In [16]:

    
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names









    Out[16]:





array([  1,   2,   3,   5,   6,   8,   9,  10,  11,  12,  13,  14,  15,
        16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,
        29,  30,  31,  32,  33,  34,  35,  36,  38,  39,  40,  41,  42,
        43,  45,  46,  47,  48,  49,  50,  51,  52,  55,  56,  57,  58,
        59,  60,  62,  63,  65,  66,  67,  68,  69,  70,  71,  72,  73,
        74,  76,  77,  78,  79,  80,  81,  82,  83,  85,  86,  87,  88,
        89,  90,  91,  93,  95,  96,  97,  98,  99, 100, 104, 105, 106,
       107, 108, 109, 111, 112, 113, 115, 116, 117, 118, 119, 120, 121,
       122, 131, 138, 139, 140, 153, 155, 162, 163, 171, 174, 178, 180,
       183, 184, 195, 196, 199, 200, 204, 208, 210, 215, 229, 232, 236,
       251, 258, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271,
       274, 275, 276, 277, 278, 279, 280, 281, 283, 284, 285, 286, 287,
       288, 289, 290, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301,
       304, 306, 307, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318,
       319, 321, 322, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333,
       334, 336, 337, 338, 339, 340, 341, 344, 345, 346, 348, 349, 351,
       355, 357, 358, 361, 363, 364, 367, 369, 371, 378, 380, 384, 389,
       393, 398, 400, 405, 407, 409, 419, 420, 424, 436, 437, 438, 440,
       441, 442, 444, 445, 449, 450, 456, 459, 460, 471, 473, 492, 494,
       496, 497, 499, 501, 502, 505, 506, 510, 511, 512, 514, 517, 518,
       519, 520, 521, 523, 525, 526, 527, 530, 531, 532, 533, 535, 536,
       537, 539, 540, 543, 544, 545, 547, 548, 549, 552, 555, 556, 557,
       559, 566, 567, 569, 570, 574, 578, 581, 583, 587, 595, 597, 606,
       618, 623, 624, 634, 645, 649, 651, 658, 667, 668, 674, 677, 687,
       692, 699, 702, 705, 712, 731, 737, 749, 761, 784, 786, 796, 800,
       806, 807, 813, 826, 830, 831, 832, 834, 835, 837, 840, 842, 843,
       855, 856, 860, 865, 868, 869, 870, 873, 874, 875, 879, 881, 883,
       889, 897, 912, 913, 914, 915, 917, 919, 920, 921, 922, 923, 924,
       925, 926, 927, 928, 929, 930, 931, 932, 934, 935, 937, 939, 940,
       942, 943, 944, 945, 946, 947, 948, 949, 951, 954, 955, 957, 959,
       961, 962, 963, 964, 971, 976, 978, 985, 986, 987, 988, 999])



In [18]:

    
data_trimmed = sorted_train_data.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_data['filename'])
data_reduced = data_fnames.join(data_trimmed)
data_reduced.head()









    Out[18]:






  
    
      
      filename
      edx
      esi
      es
      ds
      ss
      cs
      ah
      al
      ax
      ...
      LPSTR
      int.1
      entry
      Software
      __imp_
      .rdata:
      .data:
      .text:
      case
      assume
    
  
  
    
      2277
      01IsoiSMh5gxyDYTl4CB
      750
      496
      3
      0
      0
      0
      8
      224
      49
      ...
      1
      119
      0
      0
      0
      97
      70
      4572
      0
      0
    
    
      2053
      01SuzwMJEIXsK7A8dQbl
      1121
      24
      3
      1
      4
      2
      6
      22
      7
      ...
      0
      5
      0
      0
      0
      0
      41
      2066
      0
      1
    
    
      2144
      01azqd4InC7m9JpocGv5
      1493
      1900
      0
      0
      0
      0
      1
      398
      0
      ...
      1
      62
      0
      0
      0
      48
      93
      13163
      0
      0
    
    
      1236
      01jsnpXSAlgw6aPeDxrU
      525
      4
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      2
      0
      0
      0
      28
      15
      2030
      0
      0
    
    
      339
      01kcPWA9K2BOxQeS5Rju
      23
      35
      0
      0
      0
      0
      0
      3
      0
      ...
      0
      0
      1
      0
      0
      33
      15
      445
      0
      0
    
  

5 rows × 403 columns



In [6]:

    
data_reduced.to_csv('data/sorted-train-malware-features-asm-40percent.csv', index=False)

1.5 Feature Reduction to 50%



In [7]:

    
# find the top 50 percent variance features, from 1006 -> 503 features
fsp = SelectPercentile(chi2, 50)
X_new_50 = fsp.fit_transform(X,y)
X_new_50.shape









    Out[7]:





(10868, 503)



In [8]:

    
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names









    Out[8]:





array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  38,  39,  40,
        41,  42,  43,  45,  46,  47,  48,  49,  50,  51,  52,  53,  55,
        56,  57,  58,  59,  60,  62,  63,  65,  66,  67,  68,  69,  70,
        71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
        85,  86,  87,  88,  89,  90,  91,  93,  95,  96,  97,  98,  99,
       100, 104, 105, 106, 107, 108, 109, 111, 112, 113, 115, 116, 117,
       118, 119, 120, 121, 122, 129, 131, 134, 138, 139, 140, 143, 145,
       149, 153, 155, 158, 159, 160, 162, 163, 166, 171, 174, 178, 180,
       183, 184, 195, 196, 199, 200, 201, 204, 208, 210, 215, 217, 227,
       229, 231, 232, 236, 237, 251, 253, 258, 261, 262, 263, 264, 265,
       266, 267, 268, 269, 270, 271, 273, 274, 275, 276, 277, 278, 279,
       280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292,
       293, 294, 295, 296, 297, 298, 299, 300, 301, 303, 304, 305, 306,
       307, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320,
       321, 322, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334,
       336, 337, 338, 339, 340, 341, 344, 345, 346, 348, 349, 350, 351,
       353, 355, 356, 357, 358, 360, 361, 363, 364, 367, 369, 371, 378,
       380, 381, 383, 384, 387, 389, 393, 398, 400, 405, 407, 409, 418,
       419, 420, 424, 430, 436, 437, 438, 440, 441, 442, 443, 444, 445,
       449, 450, 454, 455, 456, 458, 459, 460, 461, 467, 471, 473, 475,
       476, 479, 480, 481, 486, 488, 489, 490, 491, 492, 494, 495, 496,
       497, 499, 500, 501, 502, 503, 504, 505, 506, 510, 511, 512, 514,
       517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 530, 531,
       532, 533, 535, 536, 537, 539, 540, 543, 544, 545, 547, 548, 549,
       552, 554, 555, 556, 557, 558, 559, 562, 563, 564, 566, 567, 569,
       570, 571, 572, 574, 575, 577, 578, 579, 581, 583, 584, 587, 591,
       593, 595, 597, 606, 610, 614, 618, 619, 623, 624, 627, 629, 630,
       634, 645, 649, 651, 658, 663, 667, 668, 672, 673, 674, 676, 677,
       682, 687, 692, 699, 702, 705, 709, 712, 717, 718, 730, 731, 734,
       736, 737, 740, 743, 744, 747, 749, 755, 761, 784, 786, 795, 796,
       798, 800, 806, 807, 808, 813, 825, 826, 830, 831, 832, 834, 835,
       837, 840, 841, 842, 843, 850, 855, 856, 860, 865, 868, 869, 870,
       873, 874, 875, 879, 881, 883, 889, 897, 912, 913, 914, 915, 917,
       918, 919, 920, 921, 922, 923, 924, 925, 926, 927, 928, 929, 930,
       931, 932, 934, 935, 937, 939, 940, 942, 943, 944, 945, 946, 947,
       948, 949, 951, 954, 955, 956, 957, 959, 961, 962, 963, 964, 971,
       972, 975, 976, 978, 985, 986, 987, 988, 999])



In [9]:

    
data_trimmed = sorted_train_data.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_data['filename'])
data_reduced = data_fnames.join(data_trimmed)
data_reduced.head()









    Out[9]:






  
    
      
      filename
      edx
      esi
      es
      fs
      ds
      ss
      gs
      cs
      ah
      ...
      entry
      rva
      exe
      Software
      __imp_
      .rdata:
      .data:
      .text:
      case
      assume
    
  
  
    
      2277
      01IsoiSMh5gxyDYTl4CB
      750
      496
      3
      0
      0
      0
      0
      0
      8
      ...
      0
      0
      1
      0
      0
      97
      70
      4572
      0
      0
    
    
      2053
      01SuzwMJEIXsK7A8dQbl
      1121
      24
      3
      0
      1
      4
      0
      2
      6
      ...
      0
      0
      1
      0
      0
      0
      41
      2066
      0
      1
    
    
      2144
      01azqd4InC7m9JpocGv5
      1493
      1900
      0
      0
      0
      0
      0
      0
      1
      ...
      0
      0
      2
      0
      0
      48
      93
      13163
      0
      0
    
    
      1236
      01jsnpXSAlgw6aPeDxrU
      525
      4
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      1
      0
      0
      28
      15
      2030
      0
      0
    
    
      339
      01kcPWA9K2BOxQeS5Rju
      23
      35
      0
      0
      0
      0
      0
      0
      0
      ...
      1
      0
      1
      0
      0
      33
      15
      445
      0
      0
    
  

5 rows × 504 columns



In [10]:

    
data_reduced.to_csv('data/sorted-train-malware-features-asm-50percent.csv', index=False)

2. Feature Reduction Of Test Data

Use columns names from reduced ASM train data feature set to select best ASM features from test data
and write to a file.



In [11]:

    
test_data = pd.read_csv('data/test-malware-features-asm.csv')
sorted_test_data = test_data.sort(columns='filename', axis=0, ascending=True, inplace=False)
sorted_test_data.shape









    Out[11]:





(10873, 1007)



In [19]:

    
sorted_test_data.head()









    Out[19]:






  
    
      
      filename
      edx
      esi
      es
      fs
      ds
      ss
      gs
      cs
      ah
      ...
      visualc
      ___security_cookie
      assume
      callvirtualalloc
      exportedentry
      hardware
      hkey_current_user
      hkey_local_machine
      sp-analysisfailed
      unableto
    
  
  
    
      7297
      ITSUPtCmh7WdJcsYDwQ5
      245
      434
      0
      0
      1
      0
      0
      0
      9
      ...
      0
      0
      7
      0
      0
      0
      0
      0
      0
      0
    
    
      3257
      Ig2DB5tSiEy1cJvV0zdw
      258
      437
      0
      0
      0
      0
      0
      0
      11
      ...
      0
      0
      6
      0
      0
      0
      0
      0
      0
      0
    
    
      4183
      Jmo6eIhLZ4t9r8QsxEg5
      238
      365
      0
      0
      0
      0
      0
      0
      8
      ...
      0
      0
      12
      0
      0
      0
      0
      0
      0
      0
    
    
      8084
      JtPFl4ewgdD78OzCMa3o
      241
      556
      1
      0
      1
      1
      0
      0
      4
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
    
      9774
      K3ZtByPHGSFYNljDUEXp
      92
      75
      0
      0
      0
      0
      0
      0
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      0
    
  

5 rows × 1007 columns



In [19]:

    
# Get the feature names from the reduced train dataframe
column_names = data_reduced.columns
print(column_names)









    



Index(['filename', 'edx', 'esi', 'es', 'ds', 'ss', 'cs', 'ah', 'al', 'ax',
       ...
       'LPSTR', 'int.1', 'entry', 'Software', '__imp_', '.rdata:', '.data:',
       '.text:', 'case', 'assume'],
      dtype='object', length=403)



In [20]:

    
# Extract the reduced feature set from the full test feature set
sorted_test_data_reduced = sorted_test_data.loc[:,column_names]
sorted_test_data_reduced.head()









    Out[20]:






  
    
      
      filename
      edx
      esi
      es
      ds
      ss
      cs
      ah
      al
      ax
      ...
      LPSTR
      int.1
      entry
      Software
      __imp_
      .rdata:
      .data:
      .text:
      case
      assume
    
  
  
    
      7297
      ITSUPtCmh7WdJcsYDwQ5
      245
      434
      0
      1
      0
      0
      9
      51
      1
      ...
      0
      0
      0
      0
      0
      20
      117
      1703
      0
      7
    
    
      3257
      Ig2DB5tSiEy1cJvV0zdw
      258
      437
      0
      0
      0
      0
      11
      60
      1
      ...
      0
      2
      0
      0
      0
      16
      97
      2012
      0
      6
    
    
      4183
      Jmo6eIhLZ4t9r8QsxEg5
      238
      365
      0
      0
      0
      0
      8
      51
      0
      ...
      1
      0
      0
      0
      0
      0
      0
      0
      0
      12
    
    
      8084
      JtPFl4ewgdD78OzCMa3o
      241
      556
      1
      1
      1
      0
      4
      52
      0
      ...
      0
      0
      0
      0
      0
      40
      115
      1947
      0
      0
    
    
      9774
      K3ZtByPHGSFYNljDUEXp
      92
      75
      0
      0
      0
      0
      0
      1
      0
      ...
      0
      1
      0
      0
      0
      16
      32
      2636
      0
      0
    
  

5 rows × 403 columns



In [16]:

    
sorted_test_data_reduced.to_csv('data/sorted-test-malware-features-asm-10percent.csv', index=False)



In [25]:

    
sorted_test_data_reduced.to_csv('data/sorted-test-malware-features-asm-20percent.csv', index=False)



In [22]:

    
sorted_test_data_reduced.to_csv('data/sorted-test-malware-features-asm-30percent.csv', index=False)



In [21]:

    
sorted_test_data_reduced.to_csv('data/sorted-test-malware-features-asm-40percent.csv', index=False)



In [14]:

    
sorted_test_data_reduced.to_csv('data/sorted-test-malware-features-asm-50percent.csv', index=False)

3. Sort and Write Byte Feature Sets

Sort the test file feature set data frames on the filename column and write to sorted data to file.



In [37]:

    
# First load the .asm training features and training labels
#sorted_train_data_asm = pd.read_csv('data/sorted-train-malware-features-asm-reduced.csv')
#sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv','r')

# Next load the .byte training features and sort
train_data_byte = pd.read_csv('data/train-malware-features-byte.csv')
sorted_train_data_byte = train_data_byte.sort(columns='filename', axis=0, ascending=True, inplace=False)

# Next load the .byte test features and sort
test_data_byte = pd.read_csv('data/test-malware-features-byte.csv')
sorted_test_data_byte = test_data_byte.sort(columns='filename', axis=0, ascending=True, inplace=False)

#combined_train_data = pd.DataFrame.merge(sorted_train_data_asm, sorted_train_data_byte, on='filename', how='inner', sort=False)

# Now write all the sorted feature sets to file
#f = open('data/sorted-train-features-combined.csv', 'w')
#combined_train_data.to_csv(f, index=False)
#f.close()
f = open('data/sorted-train-malware-features-byte.csv', 'w')
sorted_train_data_byte.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-malware-features-byte.csv', 'w')
sorted_test_data_byte.to_csv(f, index=False)
f.close()

4. Sort and Reduce Image Data for Test and Train Files



In [ ]:

    
# Load and sort asm image data for test and train files
train_image_asm = pd.read_csv('data/train-image-features-asm.csv')
sorted_train_image_asm = train_image_asm.sort(columns='filename', axis=0, ascending=True, inplace=False)

test_image_asm = pd.read_csv('data/test-image-features-asm.csv')
sorted_test_image_asm = test_image_asm.sort(columns='filename', axis=0, ascending=True, inplace=False)

# NOTE: byte file images have low standard deviation and mean variance, not very useful for learning.

# Load and sort byte image data for test and train files
# train_image_byte = pd.read_csv('data/train-image-features-byte.csv')
# sorted_train_image_byte = train_image_byte.sort(columns='filename', axis=0, ascending=True, inplace=False)

# test_image_byte = pd.read_csv('data/test-image-features-byte.csv')
#sorted_test_image_byte = test_image_byte.sort(columns='filename', axis=0, ascending=True, inplace=False)



In [4]:

    
# Now write all the sorted image feature sets to file
f = open('data/sorted-train-image-features-asm.csv', 'w')
sorted_train_image_asm.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-image-features-asm.csv', 'w')
sorted_test_image_asm.to_csv(f, index=False)
f.close()
#f = open('data/sorted-train-image-features-byte.csv', 'w')
#sorted_train_image_byte.to_csv(f, index=False)
#f.close()
#f = open('data/sorted-test-image-features-byte.csv', 'w')
#sorted_test_image_byte.to_csv(f, index=False)
#f.close()



In [29]:

    
sorted_train_image_asm.head()









    Out[29]:






  
    
      
      filename
      ASM_0
      ASM_1
      ASM_2
      ASM_3
      ASM_4
      ASM_5
      ASM_6
      ASM_7
      ASM_8
      ASM_9
      ASM_10
      ASM_11
      ASM_12
      ASM_13
      ASM_14
      ASM_15
      ASM_16
      ASM_17
      ASM_18
      
    
  
  
    
      0
       01IsoiSMh5gxyDYTl4CB
       46
       116
       101
       120
       116
       58
       48
       48
       52
       48
       49
       48
       48
       48
        9
       9
       9
       9
       9
      ...
    
    
      1
       01SuzwMJEIXsK7A8dQbl
       72
        69
        65
        68
        69
       82
       58
       48
       48
       52
       48
       48
       48
       48
       48
       9
       9
       9
       9
      ...
    
    
      2
       01azqd4InC7m9JpocGv5
       72
        69
        65
        68
        69
       82
       58
       48
       48
       52
       48
       48
       48
       48
       48
       9
       9
       9
       9
      ...
    
    
      3
       01jsnpXSAlgw6aPeDxrU
       72
        69
        65
        68
        69
       82
       58
       48
       48
       52
       48
       48
       48
       48
       48
       9
       9
       9
       9
      ...
    
    
      4
       01kcPWA9K2BOxQeS5Rju
       72
        69
        65
        68
        69
       82
       58
       49
       48
       48
       48
       48
       48
       48
       48
       9
       9
       9
       9
      ...
    
  

5 rows × 1001 columns

4.1 Feature Reduction to 10%



In [30]:

    
# Now select 10% best train image asm features by variance
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = sorted_train_image_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
fsp = SelectPercentile(chi2, 10)
X_new = fsp.fit_transform(X,y)
X_new.shape









    Out[30]:





(10868, 100)



In [31]:

    
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names









    Out[31]:





array([142, 145, 151, 156, 164, 173, 186, 188, 190, 215, 222, 223, 224,
       226, 227, 244, 245, 246, 247, 248, 261, 262, 272, 311, 312, 313,
       314, 315, 317, 318, 334, 338, 339, 340, 345, 351, 352, 353, 354,
       356, 366, 371, 373, 374, 375, 376, 378, 379, 380, 381, 405, 410,
       412, 413, 422, 423, 424, 425, 426, 427, 437, 460, 583, 584, 585,
       586, 614, 616, 617, 618, 619, 620, 629, 631, 641, 645, 646, 647,
       653, 655, 678, 681, 682, 683, 685, 712, 720, 724, 761, 852, 853,
       920, 950, 951, 952, 954, 956, 957, 958, 963])



In [32]:

    
data_trimmed = sorted_train_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_image_asm['filename'])
sorted_train_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_train_image_asm_reduced.head()









    Out[32]:






  
    
      
      filename
      ASM_141
      ASM_144
      ASM_150
      ASM_155
      ASM_163
      ASM_172
      ASM_185
      ASM_187
      ASM_189
      ASM_214
      ASM_221
      ASM_222
      ASM_223
      ASM_225
      ASM_226
      ASM_243
      ASM_244
      ASM_245
      ASM_246
      
    
  
  
    
      0
       01IsoiSMh5gxyDYTl4CB
       58
       52
         9
         9
       59
       115
       101
         9
       101
       118
       115
       115
       101
       98
       108
       116
       101
       120
       116
      ...
    
    
      1
       01SuzwMJEIXsK7A8dQbl
        9
        9
       124
       104
        9
        32
        32
       104
        32
        32
        32
        32
        32
       13
        10
         9
         9
         9
         9
      ...
    
    
      2
       01azqd4InC7m9JpocGv5
        9
        9
       124
       104
        9
        32
        32
       104
        32
        32
        32
        32
        32
       13
        10
         9
         9
         9
         9
      ...
    
    
      3
       01jsnpXSAlgw6aPeDxrU
        9
        9
       124
       104
        9
        32
        32
       104
        32
        32
        32
        32
        32
       13
        10
         9
         9
         9
         9
      ...
    
    
      4
       01kcPWA9K2BOxQeS5Rju
        9
        9
       124
       104
        9
        32
        32
       104
        32
        32
        32
        32
        32
       13
        10
         9
         9
         9
         9
      ...
    
  

5 rows × 101 columns



In [33]:

    
data_trimmed = sorted_test_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_test_image_asm['filename'])
sorted_test_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_test_image_asm_reduced.head()









    Out[33]:






  
    
      
      filename
      ASM_141
      ASM_144
      ASM_150
      ASM_155
      ASM_163
      ASM_172
      ASM_185
      ASM_187
      ASM_189
      ASM_214
      ASM_221
      ASM_222
      ASM_223
      ASM_225
      ASM_226
      ASM_243
      ASM_244
      ASM_245
      ASM_246
      
    
  
  
    
      0
       ITSUPtCmh7WdJcsYDwQ5
       9
       9
       124
       104
       9
       32
       32
       104
       32
       32
       32
       32
       32
       13
       10
       9
       9
       9
       9
      ...
    
    
      1
       Ig2DB5tSiEy1cJvV0zdw
       9
       9
       124
       104
       9
       32
       32
       104
       32
       32
       32
       32
       32
       13
       10
       9
       9
       9
       9
      ...
    
    
      2
       Jmo6eIhLZ4t9r8QsxEg5
       9
       9
       124
       104
       9
       32
       32
       104
       32
       32
       32
       32
       32
       13
       10
       9
       9
       9
       9
      ...
    
    
      3
       JtPFl4ewgdD78OzCMa3o
       9
       9
       124
       104
       9
       32
       32
       104
       32
       32
       32
       32
       32
       13
       10
       9
       9
       9
       9
      ...
    
    
      4
       K3ZtByPHGSFYNljDUEXp
       9
       9
       124
       104
       9
       32
       32
       104
       32
       32
       32
       32
       32
       13
       10
       9
       9
       9
       9
      ...
    
  

5 rows × 101 columns



In [8]:



In [34]:

    
# Now write all the sorted and reduced image feature sets to file
f = open('data/sorted-train-image-features-asm-10percent.csv', 'w')
sorted_train_image_asm_reduced.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-image-features-asm-10percent.csv', 'w')
sorted_test_image_asm_reduced.to_csv(f, index=False)
f.close()
#f = open('data/sorted-train-image-features-byte-reduced.csv', 'w')
#sorted_train_image_byte_reduced.to_csv(f, index=False)
#f.close()
#f = open('data/sorted-test-image-features-byte-reduced.csv', 'w')
#sorted_test_image_byte_reduced.to_csv(f, index=False)
#f.close()

4.2 Feature Reduction to 20%



In [35]:

    
# Now select 20% best train image asm features by variance
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = sorted_train_image_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
fsp = SelectPercentile(chi2, 20)
X_new = fsp.fit_transform(X,y)
X_new.shape









    Out[35]:





(10868, 200)



In [36]:

    
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names









    Out[36]:





array([ 34,  41,  42,  43,  44, 126, 138, 140, 141, 142, 143, 144, 145,
       146, 147, 148, 151, 155, 156, 158, 164, 167, 173, 186, 188, 190,
       198, 205, 215, 216, 220, 221, 222, 223, 224, 226, 227, 240, 243,
       244, 245, 246, 247, 248, 249, 253, 261, 262, 263, 264, 268, 272,
       282, 287, 292, 311, 312, 313, 314, 315, 317, 318, 334, 337, 338,
       339, 340, 344, 345, 346, 351, 352, 353, 354, 356, 359, 366, 371,
       372, 373, 374, 375, 376, 378, 379, 380, 381, 384, 405, 408, 410,
       412, 413, 414, 415, 422, 423, 424, 425, 426, 427, 436, 437, 439,
       449, 452, 460, 464, 466, 467, 539, 555, 556, 557, 558, 559, 560,
       561, 564, 572, 573, 581, 583, 584, 585, 586, 587, 597, 598, 614,
       615, 616, 617, 618, 619, 620, 621, 622, 624, 629, 630, 631, 632,
       633, 641, 642, 643, 645, 646, 647, 648, 653, 655, 659, 675, 676,
       677, 678, 679, 681, 682, 683, 685, 712, 713, 720, 722, 724, 725,
       734, 735, 755, 761, 763, 786, 852, 853, 920, 923, 932, 935, 936,
       937, 947, 948, 949, 950, 951, 952, 953, 954, 956, 957, 958, 959,
       961, 962, 963, 965, 991])



In [37]:

    
data_trimmed = sorted_train_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_image_asm['filename'])
sorted_train_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_train_image_asm_reduced.head()









    Out[37]:






  
    
      
      filename
      ASM_33
      ASM_40
      ASM_41
      ASM_42
      ASM_43
      ASM_125
      ASM_137
      ASM_139
      ASM_140
      ASM_141
      ASM_142
      ASM_143
      ASM_144
      ASM_145
      ASM_146
      ASM_147
      ASM_150
      ASM_154
      ASM_155
      
    
  
  
    
      0
       01IsoiSMh5gxyDYTl4CB
       120
       49
       48
       48
       48
       45
       116
       120
       116
       58
       48
       48
       52
       48
       49
       48
         9
        9
         9
      ...
    
    
      1
       01SuzwMJEIXsK7A8dQbl
        48
        9
        9
        9
        9
       10
        48
        48
        48
        9
        9
        9
        9
        9
        9
        9
       124
       84
       104
      ...
    
    
      2
       01azqd4InC7m9JpocGv5
        48
        9
        9
        9
        9
       10
        48
        48
        48
        9
        9
        9
        9
        9
        9
        9
       124
       84
       104
      ...
    
    
      3
       01jsnpXSAlgw6aPeDxrU
        48
        9
        9
        9
        9
       10
        48
        48
        48
        9
        9
        9
        9
        9
        9
        9
       124
       84
       104
      ...
    
    
      4
       01kcPWA9K2BOxQeS5Rju
        48
        9
        9
        9
        9
       10
        48
        48
        48
        9
        9
        9
        9
        9
        9
        9
       124
       84
       104
      ...
    
  

5 rows × 201 columns



In [39]:

    
data_trimmed = sorted_test_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_test_image_asm['filename'])
sorted_test_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_test_image_asm_reduced.head()









    Out[39]:






  
    
      
      filename
      ASM_33
      ASM_40
      ASM_41
      ASM_42
      ASM_43
      ASM_125
      ASM_137
      ASM_139
      ASM_140
      ASM_141
      ASM_142
      ASM_143
      ASM_144
      ASM_145
      ASM_146
      ASM_147
      ASM_150
      ASM_154
      ASM_155
      
    
  
  
    
      0
       ITSUPtCmh7WdJcsYDwQ5
       48
       9
       9
       9
       9
       10
       48
       48
       48
       9
       9
       9
       9
       9
       9
       9
       124
       84
       104
      ...
    
    
      1
       Ig2DB5tSiEy1cJvV0zdw
       48
       9
       9
       9
       9
       10
       48
       48
       48
       9
       9
       9
       9
       9
       9
       9
       124
       84
       104
      ...
    
    
      2
       Jmo6eIhLZ4t9r8QsxEg5
       48
       9
       9
       9
       9
       10
       48
       48
       48
       9
       9
       9
       9
       9
       9
       9
       124
       84
       104
      ...
    
    
      3
       JtPFl4ewgdD78OzCMa3o
       48
       9
       9
       9
       9
       10
       48
       48
       48
       9
       9
       9
       9
       9
       9
       9
       124
       84
       104
      ...
    
    
      4
       K3ZtByPHGSFYNljDUEXp
       48
       9
       9
       9
       9
       10
       48
       48
       48
       9
       9
       9
       9
       9
       9
       9
       124
       84
       104
      ...
    
  

5 rows × 201 columns



In [40]:

    
# Now write all the sorted and reduced image feature sets to file
f = open('data/sorted-train-image-features-asm-20percent.csv', 'w')
sorted_train_image_asm_reduced.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-image-features-asm-20percent.csv', 'w')
sorted_test_image_asm_reduced.to_csv(f, index=False)
f.close()

4.3 Feature Reduction to 30%



In [8]:

    
# Now select 30% best train image asm features by variance
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = sorted_train_image_asm.iloc[:,1:]
y = sorted_train_labels['Class'].values.tolist()
fsp = SelectPercentile(chi2, 30)
X_new = fsp.fit_transform(X,y)
X_new.shape









    Out[8]:





(10868, 300)



In [9]:

    
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names









    Out[9]:





array([ 29,  30,  32,  33,  34,  35,  41,  42,  43,  44,  48, 125, 126,
       138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 151, 155,
       156, 157, 158, 160, 161, 162, 163, 164, 165, 167, 169, 173, 174,
       179, 186, 188, 190, 198, 201, 202, 205, 215, 216, 220, 221, 222,
       223, 224, 226, 227, 240, 243, 244, 245, 246, 247, 248, 249, 252,
       253, 260, 261, 262, 263, 264, 265, 266, 267, 268, 271, 272, 282,
       287, 291, 292, 297, 307, 311, 312, 313, 314, 315, 317, 318, 327,
       328, 334, 337, 338, 339, 340, 343, 344, 345, 346, 351, 352, 353,
       354, 356, 357, 358, 359, 366, 370, 371, 372, 373, 374, 375, 376,
       378, 379, 380, 381, 384, 405, 408, 409, 410, 412, 413, 414, 415,
       422, 423, 424, 425, 426, 427, 436, 437, 439, 440, 441, 447, 448,
       449, 450, 451, 452, 460, 464, 465, 466, 467, 538, 539, 555, 556,
       557, 558, 559, 560, 561, 563, 564, 567, 568, 572, 573, 580, 581,
       582, 583, 584, 585, 586, 587, 588, 597, 598, 602, 613, 614, 615,
       616, 617, 618, 619, 620, 621, 622, 623, 624, 627, 629, 630, 631,
       632, 633, 634, 635, 641, 642, 643, 645, 646, 647, 648, 649, 650,
       651, 652, 653, 654, 655, 657, 658, 659, 674, 675, 676, 677, 678,
       679, 680, 681, 682, 683, 685, 692, 693, 695, 696, 711, 712, 713,
       719, 720, 722, 723, 724, 725, 728, 729, 730, 731, 733, 734, 735,
       738, 739, 752, 753, 755, 760, 761, 763, 777, 786, 830, 831, 847,
       848, 851, 852, 853, 856, 867, 873, 874, 875, 876, 878, 879, 919,
       920, 923, 931, 932, 935, 936, 937, 947, 948, 949, 950, 951, 952,
       953, 954, 956, 957, 958, 959, 960, 961, 962, 963, 965, 973, 978, 991])



In [10]:

    
data_trimmed = sorted_train_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_image_asm['filename'])
sorted_train_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_train_image_asm_reduced.head()









    Out[10]:






  
    
      
      filename
      ASM_28
      ASM_29
      ASM_31
      ASM_32
      ASM_33
      ASM_34
      ASM_40
      ASM_41
      ASM_42
      ...
      ASM_957
      ASM_958
      ASM_959
      ASM_960
      ASM_961
      ASM_962
      ASM_964
      ASM_972
      ASM_977
      ASM_990
    
  
  
    
      0
      01IsoiSMh5gxyDYTl4CB
      13
      10
      116
      101
      120
      116
      49
      48
      48
      ...
      122
      101
      9
      9
      32
      32
      32
      49
      53
      10
    
    
      1
      01SuzwMJEIXsK7A8dQbl
      68
      69
      58
      48
      48
      52
      9
      9
      9
      ...
      10
      46
      116
      101
      120
      116
      48
      9
      9
      116
    
    
      2
      01azqd4InC7m9JpocGv5
      68
      69
      58
      48
      48
      52
      9
      9
      9
      ...
      10
      46
      116
      101
      120
      116
      48
      9
      9
      116
    
    
      3
      01jsnpXSAlgw6aPeDxrU
      68
      69
      58
      48
      48
      52
      9
      9
      9
      ...
      10
      46
      116
      101
      120
      116
      48
      9
      9
      116
    
    
      4
      01kcPWA9K2BOxQeS5Rju
      68
      69
      58
      49
      48
      48
      9
      9
      9
      ...
      9
      9
      59
      32
      91
      48
      48
      89
      32
      71
    
  

5 rows × 301 columns



In [11]:

    
data_trimmed = sorted_test_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_test_image_asm['filename'])
sorted_test_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_test_image_asm_reduced.head()









    Out[11]:






  
    
      
      filename
      ASM_28
      ASM_29
      ASM_31
      ASM_32
      ASM_33
      ASM_34
      ASM_40
      ASM_41
      ASM_42
      ...
      ASM_957
      ASM_958
      ASM_959
      ASM_960
      ASM_961
      ASM_962
      ASM_964
      ASM_972
      ASM_977
      ASM_990
    
  
  
    
      0
      ITSUPtCmh7WdJcsYDwQ5
      68
      69
      58
      48
      48
      52
      9
      9
      9
      ...
      10
      46
      105
      99
      111
      100
      58
      48
      9
      32
    
    
      1
      Ig2DB5tSiEy1cJvV0zdw
      68
      69
      58
      48
      48
      52
      9
      9
      9
      ...
      32
      32
      59
      32
      13
      10
      116
      48
      9
      59
    
    
      2
      Jmo6eIhLZ4t9r8QsxEg5
      68
      69
      58
      48
      48
      52
      9
      9
      9
      ...
      9
      9
      59
      32
      91
      48
      48
      89
      32
      71
    
    
      3
      JtPFl4ewgdD78OzCMa3o
      68
      69
      58
      48
      48
      52
      9
      9
      9
      ...
      9
      9
      59
      32
      91
      48
      48
      89
      32
      71
    
    
      4
      K3ZtByPHGSFYNljDUEXp
      68
      69
      58
      48
      48
      52
      9
      9
      9
      ...
      32
      32
      59
      32
      13
      10
      116
      48
      9
      59
    
  

5 rows × 301 columns



In [12]:

    
# Now write all the sorted and reduced image feature sets to file
f = open('data/sorted-train-image-features-asm-30percent.csv', 'w')
sorted_train_image_asm_reduced.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-image-features-asm-30percent.csv', 'w')
sorted_test_image_asm_reduced.to_csv(f, index=False)
f.close()

4.4 Feature Reduction to 40%



In [22]:

    
# Now select 40% best train image asm features by variance
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = sorted_train_image_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
fsp = SelectPercentile(chi2, 40)
X_new = fsp.fit_transform(X,y)
X_new.shape









    Out[22]:





(10868, 400)



In [23]:

    
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names









    Out[23]:





array([ 15,  21,  22,  29,  30,  32,  33,  34,  35,  41,  42,  43,  44,
        48, 125, 126, 136, 138, 139, 140, 141, 142, 143, 144, 145, 146,
       147, 148, 151, 155, 156, 157, 158, 160, 161, 162, 163, 164, 165,
       167, 169, 173, 174, 179, 186, 188, 190, 198, 201, 202, 205, 215,
       216, 220, 221, 222, 223, 224, 226, 227, 236, 240, 242, 243, 244,
       245, 246, 247, 248, 249, 252, 253, 260, 261, 262, 263, 264, 265,
       266, 267, 268, 269, 271, 272, 282, 287, 291, 292, 293, 294, 295,
       296, 297, 307, 310, 311, 312, 313, 314, 315, 317, 318, 323, 326,
       327, 328, 330, 334, 337, 338, 339, 340, 341, 343, 344, 345, 346,
       349, 351, 352, 353, 354, 356, 357, 358, 359, 366, 370, 371, 372,
       373, 374, 375, 376, 378, 379, 380, 381, 384, 392, 399, 400, 401,
       402, 403, 404, 405, 408, 409, 410, 412, 413, 414, 415, 420, 422,
       423, 424, 425, 426, 427, 428, 429, 430, 431, 436, 437, 439, 440,
       441, 446, 447, 448, 449, 450, 451, 452, 453, 457, 458, 459, 460,
       461, 464, 465, 466, 467, 477, 478, 479, 480, 481, 482, 538, 539,
       555, 556, 557, 558, 559, 560, 561, 563, 564, 567, 568, 572, 573,
       580, 581, 582, 583, 584, 585, 586, 587, 588, 590, 597, 598, 600,
       602, 603, 607, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622,
       623, 624, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 641,
       642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654,
       655, 656, 657, 658, 659, 671, 672, 673, 674, 675, 676, 677, 678,
       679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 692, 693, 694,
       695, 696, 701, 702, 704, 711, 712, 713, 715, 718, 719, 720, 722,
       723, 724, 725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735,
       736, 738, 739, 740, 751, 752, 753, 754, 755, 756, 760, 761, 763,
       765, 777, 778, 781, 782, 785, 786, 801, 802, 814, 818, 819, 830,
       831, 836, 847, 848, 849, 850, 851, 852, 853, 856, 867, 868, 870,
       873, 874, 875, 876, 877, 878, 879, 882, 898, 919, 920, 923, 930,
       931, 932, 934, 935, 936, 937, 938, 947, 948, 949, 950, 951, 952,
       953, 954, 955, 956, 957, 958, 959, 960, 961, 962, 963, 965, 966,
       973, 974, 975, 976, 978, 989, 991, 992, 996, 999])



In [24]:

    
data_trimmed = sorted_train_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_image_asm['filename'])
sorted_train_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_train_image_asm_reduced.head()









    Out[24]:






  
    
      
      filename
      ASM_14
      ASM_20
      ASM_21
      ASM_28
      ASM_29
      ASM_31
      ASM_32
      ASM_33
      ASM_34
      ...
      ASM_972
      ASM_973
      ASM_974
      ASM_975
      ASM_977
      ASM_988
      ASM_990
      ASM_991
      ASM_995
      ASM_998
    
  
  
    
      0
      01IsoiSMh5gxyDYTl4CB
      9
      32
      32
      13
      10
      116
      101
      120
      116
      ...
      49
      66
      69
      51
      53
      41
      10
      46
      116
      48
    
    
      1
      01SuzwMJEIXsK7A8dQbl
      48
      9
      9
      68
      69
      58
      48
      48
      52
      ...
      9
      9
      9
      9
      9
      10
      116
      101
      48
      48
    
    
      2
      01azqd4InC7m9JpocGv5
      48
      9
      9
      68
      69
      58
      48
      48
      52
      ...
      9
      9
      9
      9
      9
      10
      116
      101
      48
      48
    
    
      3
      01jsnpXSAlgw6aPeDxrU
      48
      9
      9
      68
      69
      58
      48
      48
      52
      ...
      9
      9
      9
      9
      9
      10
      116
      101
      48
      48
    
    
      4
      01kcPWA9K2BOxQeS5Rju
      48
      9
      9
      68
      69
      58
      49
      48
      48
      ...
      89
      84
      69
      83
      32
      83
      71
      77
      32
      65
    
  

5 rows × 401 columns



In [25]:

    
data_trimmed = sorted_test_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_test_image_asm['filename'])
sorted_test_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_test_image_asm_reduced.head()









    Out[25]:






  
    
      
      filename
      ASM_14
      ASM_20
      ASM_21
      ASM_28
      ASM_29
      ASM_31
      ASM_32
      ASM_33
      ASM_34
      ...
      ASM_972
      ASM_973
      ASM_974
      ASM_975
      ASM_977
      ASM_988
      ASM_990
      ASM_991
      ASM_995
      ASM_998
    
  
  
    
      0
      ITSUPtCmh7WdJcsYDwQ5
      48
      9
      9
      68
      69
      58
      48
      48
      52
      ...
      48
      9
      9
      9
      9
      32
      32
      32
      9
      114
    
    
      1
      Ig2DB5tSiEy1cJvV0zdw
      48
      9
      9
      68
      69
      58
      48
      48
      52
      ...
      48
      49
      48
      48
      9
      32
      59
      32
      109
      9
    
    
      2
      Jmo6eIhLZ4t9r8QsxEg5
      48
      9
      9
      68
      69
      58
      48
      48
      52
      ...
      89
      84
      69
      83
      32
      83
      71
      77
      32
      65
    
    
      3
      JtPFl4ewgdD78OzCMa3o
      48
      9
      9
      68
      69
      58
      48
      48
      52
      ...
      89
      84
      69
      83
      32
      83
      71
      77
      32
      65
    
    
      4
      K3ZtByPHGSFYNljDUEXp
      48
      9
      9
      68
      69
      58
      48
      48
      52
      ...
      48
      49
      48
      48
      9
      32
      59
      32
      109
      9
    
  

5 rows × 401 columns



In [26]:

    
# Now write all the sorted and reduced image feature sets to file
f = open('data/sorted-train-image-features-asm-40percent.csv', 'w')
sorted_train_image_asm_reduced.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-image-features-asm-40percent.csv', 'w')
sorted_test_image_asm_reduced.to_csv(f, index=False)
f.close()

4.5 Feature Reduction to 50%



In [27]:

    
# Now select 50% best train image asm features by variance
sorted_train_image_asm = pd.read_csv('data/sorted-train-image-features-asm.csv')
sorted_test_image_asm = pd.read_csv('data/sorted-test-image-features-asm.csv')
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = sorted_train_image_asm.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
fsp = SelectPercentile(chi2, 50)
X_new = fsp.fit_transform(X,y)
X_new.shape









    Out[27]:





(10868, 500)



In [28]:

    
selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names









    Out[28]:





array([  2,   4,   5,  15,  21,  22,  24,  25,  26,  27,  29,  30,  32,
        33,  34,  35,  41,  42,  43,  44,  48,  50, 125, 126, 135, 136,
       138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 151, 152,
       154, 155, 156, 157, 158, 160, 161, 162, 163, 164, 165, 167, 169,
       173, 174, 179, 186, 188, 190, 198, 201, 202, 205, 215, 216, 217,
       219, 220, 221, 222, 223, 224, 226, 227, 229, 236, 240, 241, 242,
       243, 244, 245, 246, 247, 248, 249, 252, 253, 260, 261, 262, 263,
       264, 265, 266, 267, 268, 269, 271, 272, 273, 282, 287, 291, 292,
       293, 294, 295, 296, 297, 307, 308, 310, 311, 312, 313, 314, 315,
       316, 317, 318, 319, 321, 323, 326, 327, 328, 330, 334, 337, 338,
       339, 340, 341, 343, 344, 345, 346, 349, 350, 351, 352, 353, 354,
       356, 357, 358, 359, 366, 367, 368, 370, 371, 372, 373, 374, 375,
       376, 378, 379, 380, 381, 384, 385, 386, 387, 388, 390, 391, 392,
       399, 400, 401, 402, 403, 404, 405, 408, 409, 410, 412, 413, 414,
       415, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431,
       436, 437, 439, 440, 441, 442, 443, 445, 446, 447, 448, 449, 450,
       451, 452, 453, 457, 458, 459, 460, 461, 464, 465, 466, 467, 477,
       478, 479, 480, 481, 482, 538, 539, 555, 556, 557, 558, 559, 560,
       561, 563, 564, 567, 568, 571, 572, 573, 580, 581, 582, 583, 584,
       585, 586, 587, 588, 589, 590, 597, 598, 600, 601, 602, 603, 606,
       607, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624,
       627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 640, 641,
       642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654,
       655, 656, 657, 658, 659, 662, 664, 670, 671, 672, 673, 674, 675,
       676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688,
       689, 691, 692, 693, 694, 695, 696, 701, 702, 703, 704, 708, 709,
       711, 712, 713, 714, 715, 717, 718, 719, 720, 721, 722, 723, 724,
       725, 726, 727, 728, 729, 730, 731, 732, 733, 734, 735, 736, 738,
       739, 740, 743, 744, 751, 752, 753, 754, 755, 756, 757, 758, 759,
       760, 761, 762, 763, 765, 774, 775, 776, 777, 778, 779, 780, 781,
       782, 784, 785, 786, 787, 788, 789, 793, 798, 801, 802, 813, 814,
       818, 819, 820, 830, 831, 835, 836, 837, 838, 840, 841, 847, 848,
       849, 850, 851, 852, 853, 855, 856, 857, 866, 867, 868, 869, 870,
       873, 874, 875, 876, 877, 878, 879, 882, 898, 899, 904, 907, 908,
       919, 920, 923, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939,
       940, 941, 947, 948, 949, 950, 951, 952, 953, 954, 955, 956, 957,
       958, 959, 960, 961, 962, 963, 965, 966, 967, 968, 973, 974, 975,
       976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 989, 990, 991,
       992, 995, 996, 997, 998, 999])



In [29]:

    
data_trimmed = sorted_train_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_image_asm['filename'])
sorted_train_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_train_image_asm_reduced.head()









    Out[29]:






  
    
      
      filename
      ASM_1
      ASM_3
      ASM_4
      ASM_14
      ASM_20
      ASM_21
      ASM_23
      ASM_24
      ASM_25
      ...
      ASM_984
      ASM_988
      ASM_989
      ASM_990
      ASM_991
      ASM_994
      ASM_995
      ASM_996
      ASM_997
      ASM_998
    
  
  
    
      0
      01IsoiSMh5gxyDYTl4CB
      116
      120
      116
      9
      32
      32
      32
      32
      32
      ...
      54
      41
      13
      10
      46
      120
      116
      58
      48
      48
    
    
      1
      01SuzwMJEIXsK7A8dQbl
      69
      68
      69
      48
      9
      9
      13
      10
      72
      ...
      32
      10
      46
      116
      101
      58
      48
      48
      52
      48
    
    
      2
      01azqd4InC7m9JpocGv5
      69
      68
      69
      48
      9
      9
      13
      10
      72
      ...
      32
      10
      46
      116
      101
      58
      48
      48
      52
      48
    
    
      3
      01jsnpXSAlgw6aPeDxrU
      69
      68
      69
      48
      9
      9
      13
      10
      72
      ...
      32
      10
      46
      116
      101
      58
      48
      48
      52
      48
    
    
      4
      01kcPWA9K2BOxQeS5Rju
      69
      68
      69
      48
      9
      9
      13
      10
      72
      ...
      83
      83
      69
      71
      77
      84
      32
      72
      69
      65
    
  

5 rows × 501 columns



In [30]:

    
data_trimmed = sorted_test_image_asm.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_test_image_asm['filename'])
sorted_test_image_asm_reduced = data_fnames.join(data_trimmed)
sorted_test_image_asm_reduced.head()









    Out[30]:






  
    
      
      filename
      ASM_1
      ASM_3
      ASM_4
      ASM_14
      ASM_20
      ASM_21
      ASM_23
      ASM_24
      ASM_25
      ...
      ASM_984
      ASM_988
      ASM_989
      ASM_990
      ASM_991
      ASM_994
      ASM_995
      ASM_996
      ASM_997
      ASM_998
    
  
  
    
      0
      ITSUPtCmh7WdJcsYDwQ5
      69
      68
      69
      48
      9
      9
      13
      10
      72
      ...
      114
      32
      32
      32
      32
      58
      9
      80
      111
      114
    
    
      1
      Ig2DB5tSiEy1cJvV0zdw
      69
      68
      69
      48
      9
      9
      13
      10
      72
      ...
      32
      32
      32
      59
      32
      114
      109
      97
      116
      9
    
    
      2
      Jmo6eIhLZ4t9r8QsxEg5
      69
      68
      69
      48
      9
      9
      13
      10
      72
      ...
      83
      83
      69
      71
      77
      84
      32
      72
      69
      65
    
    
      3
      JtPFl4ewgdD78OzCMa3o
      69
      68
      69
      48
      9
      9
      13
      10
      72
      ...
      83
      83
      69
      71
      77
      84
      32
      72
      69
      65
    
    
      4
      K3ZtByPHGSFYNljDUEXp
      69
      68
      69
      48
      9
      9
      13
      10
      72
      ...
      32
      32
      32
      59
      32
      114
      109
      97
      116
      9
    
  

5 rows × 501 columns



In [31]:

    
# Now write all the sorted and reduced image feature sets to file
f = open('data/sorted-train-image-features-asm-50percent.csv', 'w')
sorted_train_image_asm_reduced.to_csv(f, index=False)
f.close()
f = open('data/sorted-test-image-features-asm-50percent.csv', 'w')
sorted_test_image_asm_reduced.to_csv(f, index=False)
f.close()



In [ ]:



In [ ]:



In [ ]:

6. Run ExtraTreeClassifiers With 10-Fold Cross Validation

Now we can have a quick look at how well the feature set can be classified



In [19]:

    
def run_cv(X,y, clf):

    # Construct a kfolds object
    kf = KFold(len(y),n_folds=10,shuffle=True)
    y_prob = np.zeros((len(y),9))
    y_pred = np.zeros(len(y))
    
    # Iterate through folds
    for train_index, test_index in kf:
        print(test_index, train_index)
        X_train = X.loc[train_index,:]
        X_test = X.loc[test_index,:]
        y_train = y[train_index]

        clf.fit(X_train,y_train)
        y_prob[test_index] = clf.predict_proba(X_test)
        y_pred[test_index] = clf.predict(X_test)
    
    return y_prob, y_pred



In [20]:

    
ytrain = np.array(y)



In [21]:

    
X = data_reduced.iloc[:,1:]
X.shape









    Out[21]:





(10868, 101)



In [22]:

    
# At last we can build a hypothesis
clf1 = ExtraTreesClassifier(n_estimators=1000, max_features=None, min_samples_leaf=1, min_samples_split=9, n_jobs=4, criterion='gini')
p1, pred1 = run_cv(X,ytrain,clf1)
print("logloss = ", log_loss(y, p1))
print("score = ", accuracy_score(ytrain, pred1))
cm = confusion_matrix(y, pred1)
print(cm)









    



(array([    1,     3,     7, ..., 10840, 10852, 10864]), array([    0,     2,     4, ..., 10865, 10866, 10867]))
(array([   19,    21,    42, ..., 10814, 10833, 10838]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([   14,    17,    24, ..., 10846, 10848, 10856]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    9,    20,    27, ..., 10857, 10861, 10866]), array([    0,     1,     2, ..., 10864, 10865, 10867]))
(array([    6,    25,    26, ..., 10827, 10832, 10841]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    8,    11,    23, ..., 10823, 10837, 10855]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
(array([    2,    37,    39, ..., 10824, 10828, 10834]), array([    0,     1,     3, ..., 10865, 10866, 10867]))
(array([    4,     5,    10, ..., 10851, 10865, 10867]), array([    0,     1,     2, ..., 10863, 10864, 10866]))
(array([    0,    15,    32, ..., 10836, 10847, 10853]), array([    1,     2,     3, ..., 10865, 10866, 10867]))
(array([   16,    18,    22, ..., 10860, 10862, 10863]), array([    0,     1,     2, ..., 10865, 10866, 10867]))
('logloss = ', 0.041558532227727193)
('score = ', 0.99236290025763707)
[[1535    0    1    1    0    0    0    4    0]
 [   6 2468    2    0    0    2    0    0    0]
 [   0    0 2942    0    0    0    0    0    0]
 [   0    0    1  470    0    0    0    4    0]
 [   1    1    0    0   38    0    0    2    0]
 [   3    1    0    1    0  741    0    2    3]
 [   0    0    0    0    0    0  398    0    0]
 [  28    1    1    6    0    1    2 1188    1]
 [   4    0    0    0    0    2    0    2 1005]]



In [23]:

    
# Finally shove the test feature set into the classifier
test_X = test_data_reduced.iloc[:,1:]
test_predictions = clf1.predict(test_X)
test_predictions









    Out[23]:





array([2, 3, 4, ..., 3, 6, 7])



In [30]:

    
# Write out the predictions to a csv file
out_test_y = pd.DataFrame(columns=['filename', 'class'])
out_test_y['filename'] = test_data_reduced['filename']
out_test_y['class'] = pd.DataFrame(test_predictions, columns=['class'])
out_test_y.head()









    Out[30]:






  
    
      
      filename
      class
    
  
  
    
      0
       N0DQkgaq9wjO1fJLn2ME
       2
    
    
      1
       VuvdGbYitmxa05lHrPnT
       3
    
    
      2
       NRUDJPSHu4dAyFjzLIg9
       4
    
    
      3
       ZmdXvIh5qHCOyJPwiE6g
       2
    
    
      4
       rcb7LxNP6itSDnwgh3Km
       3
    
  

5 rows × 2 columns



In [33]:

    
out_test_y.to_csv('data/test-label-etc-predictions.csv', index=False)

7. TEST/EXPERIMENTAL CODE ONLY



In [ ]:

    
# go through the features and delete any that sum to less than 200
colsum = X.sum(axis=0, numeric_only=True)



In [ ]:

    
zerocols = colsum[(colsum[:] == 0)]
zerocols



In [ ]:

    
zerocols = colsum[(colsum[:] < 110)]
zerocols.shape



In [ ]:

    
reduceX = X
for col in reduceX.columns:
  if sum(reduceX[col]) < 100:
    del reduceX[col]
    
reduceX.shape



In [ ]:

    
skb = SelectKBest(chi2, k=20)
X_kbestnew = skb.fit_transform(X, y)
X_kbestnew.shape



In [ ]:

    
y = [0]*labels.shape[0]
fnames = train_data['filename']
for i in range(len(y)):
  fname = train_data.loc[i,'filename']
  row = labels[labels['Id'] == fname]
  y[i] = row.iloc[0,1]



In [ ]:

    
# DO NOT USE BYTE IMAGE DATA
# Now select 10% best train image byte features by variance
sorted_train_labels = pd.read_csv('data/sorted-train-labels.csv')
X = sorted_train_image_byte.iloc[:,1:]
y = np.array(sorted_train_labels.iloc[:,1])
fsp = SelectPercentile(chi2, 10)
X_new = fsp.fit_transform(X,y)
X_new.shape

selected_names = fsp.get_support(indices=True)
selected_names = selected_names + 1
selected_names

data_trimmed = sorted_train_image_byte.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_train_image_byte['filename'])
sorted_train_image_byte_reduced = data_fnames.join(data_trimmed)
sorted_train_image_byte_reduced.head()

data_trimmed = sorted_test_image_byte.iloc[:,selected_names]
data_fnames = pd.DataFrame(sorted_test_image_byte['filename'])
sorted_test_image_byte_reduced = data_fnames.join(data_trimmed)
sorted_test_image_byte_reduced.head()

	filename	edx	esi	es	ds	ss	cs	ah	al	ax	bh	bl	ch	cl	cx	dh	dl
2277	01IsoiSMh5gxyDYTl4CB	750	496	3	0	0	0	8	224	49	34	25	41	191	52	38	163	...
2053	01SuzwMJEIXsK7A8dQbl	1121	24	3	1	4	2	6	22	7	1	4	3	37	2	4	9	...
2144	01azqd4InC7m9JpocGv5	1493	1900	0	0	0	0	1	398	0	0	47	1	77	4	1	56	...
1236	01jsnpXSAlgw6aPeDxrU	525	4	0	0	0	0	0	0	0	0	1	0	1	2	0	0	...
339	01kcPWA9K2BOxQeS5Rju	23	35	0	0	0	0	0	3	0	0	1	0	1	0	0	0	...

	Id	Class
1541	01IsoiSMh5gxyDYTl4CB	2
8627	01SuzwMJEIXsK7A8dQbl	8
9855	01azqd4InC7m9JpocGv5	9
9856	01jsnpXSAlgw6aPeDxrU	9
0	01kcPWA9K2BOxQeS5Rju	1

	filename	edx	esi	es	fs	ds	ss	gs	cs	ah	al	ax	bh	bl	bx	ch	cl	cx	dh	dl
0	4jKA1GUDv6TMNpPuIxER	4049	9957	11	3	3	3	3	3	60	791	242	9	393	31	26	477	135	8	381	...
1	4ZBJzEqnW52fFUw0PG3v	539	513	10	0	12	16	0	12	29	152	8	53	24	2	25	53	5	52	30	...
2	6m8NxLfg2MR0nwXFuEq5	427	482	1	0	0	0	0	0	18	163	16	0	84	0	0	84	0	1	130	...
3	28U1hRkQ6Yl57493ZdXD	51	91	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	0	...
4	45Wy3TxE98HfiXreOCSu	644	726	8	0	0	0	0	0	2	98	89	0	12	1	13	157	80	0	70	...

	filename	edx	esi	ah	al	ax	bl	cl	cx	dl	dx	eax	ebp	ebx	ecx	edi	esp	add	al.1
2277	01IsoiSMh5gxyDYTl4CB	750	496	8	224	49	25	191	52	163	63	1447	905	260	1093	393	420	323	79	...
2053	01SuzwMJEIXsK7A8dQbl	1121	24	6	22	7	4	37	2	9	3	1220	1544	18	1228	24	107	427	8	...
2144	01azqd4InC7m9JpocGv5	1493	1900	1	398	0	47	77	4	56	2	4438	591	810	2317	1284	701	622	262	...
1236	01jsnpXSAlgw6aPeDxrU	525	4	0	0	0	1	1	2	0	0	942	451	5	547	5	56	32	0	...
339	01kcPWA9K2BOxQeS5Rju	23	35	0	3	0	1	1	0	0	0	137	43	19	66	15	81	11	1	...

	filename	edx	esi	es	ds	ss	cs	ah	al	ax	...	LPSTR	int.1	entry	.rdata:	.data:	.text:	assume
2277	01IsoiSMh5gxyDYTl4CB	750	496	3	0	0	0	8	224	49	...	1	119	0	97	70	4572	0
2053	01SuzwMJEIXsK7A8dQbl	1121	24	3	1	4	2	6	22	7	...	0	5	0	0	41	2066	1
2144	01azqd4InC7m9JpocGv5	1493	1900	0	0	0	0	1	398	0	...	1	62	0	48	93	13163	0
1236	01jsnpXSAlgw6aPeDxrU	525	4	0	0	0	0	0	0	0	...	0	2	0	28	15	2030	0
339	01kcPWA9K2BOxQeS5Rju	23	35	0	0	0	0	0	3	0	...	0	0	1	33	15	445	0

	filename	edx	esi	es	ds	ss	ah	...	assume
7297	ITSUPtCmh7WdJcsYDwQ5	245	434	0	1	0	9	...	7
3257	Ig2DB5tSiEy1cJvV0zdw	258	437	0	0	0	11	...	6
4183	Jmo6eIhLZ4t9r8QsxEg5	238	365	0	0	0	8	...	12
8084	JtPFl4ewgdD78OzCMa3o	241	556	1	1	1	4	...	0
9774	K3ZtByPHGSFYNljDUEXp	92	75	0	0	0	0	...	0

	filename	edx	esi	es	ds	ss	ah	al	ax	...	LPSTR	int.1	.rdata:	.data:	.text:	assume
7297	ITSUPtCmh7WdJcsYDwQ5	245	434	0	1	0	9	51	1	...	0	0	20	117	1703	7
3257	Ig2DB5tSiEy1cJvV0zdw	258	437	0	0	0	11	60	1	...	0	2	16	97	2012	6
4183	Jmo6eIhLZ4t9r8QsxEg5	238	365	0	0	0	8	51	0	...	1	0	0	0	0	12
8084	JtPFl4ewgdD78OzCMa3o	241	556	1	1	1	4	52	0	...	0	0	40	115	1947	0
9774	K3ZtByPHGSFYNljDUEXp	92	75	0	0	0	0	1	0	...	0	1	16	32	2636	0

	filename	class
0	N0DQkgaq9wjO1fJLn2ME	2
1	VuvdGbYitmxa05lHrPnT	3
2	NRUDJPSHu4dAyFjzLIg9	4
3	ZmdXvIh5qHCOyJPwiE6g	2
4	rcb7LxNP6itSDnwgh3Km	3