In [1]:
# This is the second attempt at training an NN. To handle variable sequence
# lengths last time I had to feed examples one at a time, but this is
# abhorrently slow. I am trying another technique here, which is padding
# the sequences and then using a mask internally to permit variable
# lengths. If this doesn't work I will just have to pad the sequences.
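In [ ]:
# A minimal sketch (not part of the original run) of the padding-plus-masking idea:
# pad ragged integer sequences with 0, then let Embedding(mask_zero=True) tell the
# downstream LSTM to skip the padded timesteps. The toy data and layer sizes here
# are made up purely for illustration.
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences

toy_sequences = [[3, 1, 4], [1, 5], [9, 2, 6, 5]]   # variable lengths, indices start at 1
toy_X = pad_sequences(toy_sequences, maxlen=4)      # zero-padded, shape (3, 4)
toy_Y = np.array([[1, 0], [0, 1], [1, 0]])          # fake one-hot labels

toy_model = Sequential()
toy_model.add(Embedding(10, 8, mask_zero=True))     # index 0 is reserved for padding
toy_model.add(LSTM(4))
toy_model.add(Dense(2, activation="softmax"))
toy_model.compile(loss="categorical_crossentropy", optimizer="adam")
toy_model.fit(toy_X, toy_Y, nb_epoch=1, verbose=0)  # Keras 1.x argument name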
In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dropout, Dense
from keras.layers.recurrent import LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import Bidirectional
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
np.random.seed(42)
In [3]:
malware_classes = ["Agent", "AutoRun", "FraudLoad", "FraudPack", "Hupigon", "Krap",
"Lipler", "Magania", "None", "Poison", "Swizzor", "Tdss",
"VB", "Virut", "Zbot"]
# a function for writing predictions in the required format
def write_predictions(predictions, ids, outfile):
"""
assumes len(predictions) == len(ids), and that predictions[i] is the
index of the predicted class with the malware_classes list above for
the executable corresponding to ids[i].
outfile will be overwritten
"""
with open(outfile,"w+") as f:
# write header
f.write("Id,Prediction\n")
for i, history_id in enumerate(ids):
f.write("%s,%d\n" % (history_id, predictions[i]))
def classes_to_Y(classes):
output = []
for cls in classes:
output.append(malware_classes.index(cls))
return np.array(output)
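In [ ]:
# A quick illustration (not in the original run) of the helpers above:
# classes_to_Y maps class names to indices and write_predictions emits the
# "Id,Prediction" CSV. The ids and output path below are made up.
example_ids = ["abc123", "def456"]
example_preds = classes_to_Y(["Swizzor", "None"])   # -> array([10, 8])
write_predictions(example_preds, example_ids, "example_predictions.csv")
# example_predictions.csv now contains:
# Id,Prediction
# abc123,10
# def456,8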
In [4]:
# load training classes
classes = np.load("../data/features/train_classes.npy")
In [5]:
# The way masking works, 0 must be reserved as an "ignore this" symbol,
# so I have to increase the indices by 1 to free the zero index.
# (full_features, the per-example integer sequences built earlier, is assumed
# to already be in memory at this point.)
for i in xrange(len(full_features)):
    full_features[i] += 1  # add 1 to each of the arrays in the array
print full_features[0]
In [9]:
maxlengths = [len(x) for x in full_features]
In [19]:
fig = plt.figure(figsize=[12,6])
ax = fig.add_subplot(111)
ax.hist(maxlengths, bins=200);
ax.set_xlim(0,1000000)
Out[19]:
In [22]:
# Looks like I will have to truncate the sequences in order to get anything
# reasonable. There are some out to 6 million, but it looks like almost all
# could be captured at length 200000.
# Truncate to the first 200000 words
padded = pad_sequences(full_features, maxlen=200000, truncating='post')
print padded.shape # Should be a num_samples by num_features np array
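In [ ]:
# A small sanity check (illustrative only) of what pad_sequences does here:
# shorter sequences are left-padded with 0 and longer ones are cut at the end
# because truncating='post'. The toy maxlen of 5 stands in for 200000.
demo = pad_sequences([[1, 2, 3], [4, 5, 6, 7, 8, 9, 10]], maxlen=5, truncating='post')
print(demo)
# [[0 0 1 2 3]
#  [4 5 6 7 8]]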
In [23]:
# If this all looks good, save the array for later
np.save("../data/features/100_cutoff_alphabet_19679_padded_len200.npy", padded)
In [27]:
padded = 0
In [5]:
# In future, can load just with this line
full_features = np.load("../data/features/100_cutoff_alphabet_19679_padded_len200.npy")
In [6]:
# pull out training examples
X = full_features[:classes.shape[0],:]
X_test = full_features[classes.shape[0]:,:]
print X_test.shape
Y = classes_to_Y(classes)
Y_hot = np.zeros((classes.shape[0], 16))
for i, clazz in enumerate(Y):
    Y_hot[i, clazz] = 1
print Y_hot
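In [ ]:
# An equivalent one-liner for the one-hot loop above, assuming the to_categorical
# helper is available in this Keras version (keras.utils.np_utils in Keras 1.x).
from keras.utils.np_utils import to_categorical
Y_hot_alt = to_categorical(Y, 16)   # same (num_samples, 16) array as Y_hot
print((Y_hot_alt == Y_hot).all())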
In [7]:
# Just to check that worked ok.
print classes[21]
print Y[21]
print Y_hot[21]
print len(malware_classes)
In [8]:
# Now randomly select 100 samples to hold out
rand_index = np.random.permutation(np.arange(classes.shape[0]))
X_train = X[rand_index[100:]]
Y_train = Y_hot[rand_index[100:]]
X_validate = X[rand_index[:100]]
Y_validate = Y_hot[rand_index[:100]]
print X_train.shape
print Y_train.shape
print X_validate.shape
print Y_validate.shape
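In [ ]:
# An alternative hold-out (illustrative only, not used below): scikit-learn's
# train_test_split can stratify on the class labels so rare malware families
# land in both splits. sklearn is an extra dependency assumed here, and
# stratification will error if any class has only a single example.
from sklearn.model_selection import train_test_split
X_tr, X_val, Y_tr, Y_val = train_test_split(
    X, Y_hot, test_size=100, stratify=Y, random_state=42)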
In [9]:
# Clobbering to save memory
padded = 0
full_features = 0
classes = 0
X = 0
In [10]:
# The vocabulary size is the maximum integer index + 2,
# to allow for the 0 padding symbol (without padding it would be + 1).
vocab_size = 19681
# Length of the dense embedding, one vector for each int in the sequence
embedding_length = 256 # arbitrary
# Should be able to vary the batch size thanks to the mask
batch_size = 80
model = Sequential()
# Collapse the large input dimension into a 256 dimensional
# dense embedding
model.add(
    Embedding(vocab_size, embedding_length, mask_zero=True)
)
# Could add a Dropout layer next but will avoid for now
model.add(Bidirectional(
    LSTM(100, return_sequences=True)
))  # Arbitrary output size. TODO: make this stateful
# Why not 2!
model.add(LSTM(42))  # Arbitrary again
model.add(Dense(200, activation="sigmoid"))
model.add(Dense(16, activation="softmax"))
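In [ ]:
# Illustrative check (not in the original run): print the layer stack to confirm
# the masked Embedding -> BiLSTM -> LSTM -> Dense shapes before compiling.
model.summary()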
In [11]:
model.compile(loss='categorical_crossentropy',
              optimizer="adam",
              metrics=["accuracy"])
In [12]:
from keras.callbacks import ProgbarLogger, History, LambdaCallback
In [13]:
from __future__ import print_function  # future imports must precede other statements
import psutil
# LambdaCallback passes (batch/epoch, logs) to these hooks; *__ just swallows them
summarize = lambda *__: print([psutil.virtual_memory(), psutil.cpu_percent(percpu=True)])
In [ ]:
callbacks = [
    ProgbarLogger(),
    History(),
    LambdaCallback(
        on_batch_begin=summarize,
        on_batch_end=summarize,
        on_epoch_begin=summarize
    )]
model.fit(
    X_train, Y_train, batch_size=batch_size,
    nb_epoch=5, verbose=1, callbacks=callbacks,
    validation_data=(X_validate, Y_validate)
)
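In [ ]:
# A hedged sketch of how the held-back test portion could be scored and written
# out with write_predictions from above. The test ids are not loaded in this
# notebook, so test_ids and its path are assumptions (left commented out).
# test_ids = np.load("../data/features/test_ids.npy")   # hypothetical path
test_probs = model.predict(X_test, batch_size=batch_size)
test_preds = np.argmax(test_probs, axis=1)               # indices into malware_classes
# write_predictions(test_preds, test_ids, "rnn_predictions.csv")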
In [ ]:
print "ok"
In [ ]: