In [1]:
# This is the second attempt at training an NN. To handle variable sequence
# lengths last time I had to feed examples one at a time, but this is
# abhorrently slow. I am trying another technique here, which is padding
# the sequences and then using a mask internally to permit variable
# lengths. If this doesn't work I will just have to pad the sequences.
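In [ ]:
# A minimal sketch (not part of the original run) of the padding-plus-masking idea:
# pad ragged integer sequences with 0, then let Embedding(mask_zero=True) tell the
# downstream LSTM to skip the padded timesteps. The toy data and layer sizes here
# are made up purely for illustration.
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import pad_sequences

toy_sequences = [[3, 1, 4], [1, 5], [9, 2, 6, 5]]   # variable lengths, indices start at 1
toy_X = pad_sequences(toy_sequences, maxlen=4)      # zero-padded, shape (3, 4)
toy_Y = np.array([[1, 0], [0, 1], [1, 0]])          # fake one-hot labels

toy_model = Sequential()
toy_model.add(Embedding(10, 8, mask_zero=True))     # index 0 is reserved for padding
toy_model.add(LSTM(4))
toy_model.add(Dense(2, activation="softmax"))
toy_model.compile(loss="categorical_crossentropy", optimizer="adam")
toy_model.fit(toy_X, toy_Y, nb_epoch=1, verbose=0)  # Keras 1.x argument name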
In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dropout, Dense
from keras.layers.recurrent import LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import Bidirectional
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
np.random.seed(42)
In [3]:
malware_classes = ["Agent", "AutoRun", "FraudLoad", "FraudPack", "Hupigon", "Krap",
"Lipler", "Magania", "None", "Poison", "Swizzor", "Tdss",
"VB", "Virut", "Zbot"]
# a function for writing predictions in the required format
def write_predictions(predictions, ids, outfile):
"""
assumes len(predictions) == len(ids), and that predictions[i] is the
index of the predicted class with the malware_classes list above for
the executable corresponding to ids[i].
outfile will be overwritten
"""
with open(outfile,"w+") as f:
# write header
f.write("Id,Prediction\n")
for i, history_id in enumerate(ids):
f.write("%s,%d\n" % (history_id, predictions[i]))
def classes_to_Y(classes):
output = []
for cls in classes:
output.append(malware_classes.index(cls))
return np.array(output)
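In [ ]:
# A quick illustration (not in the original run) of the helpers above:
# classes_to_Y maps class names to indices and write_predictions emits the
# "Id,Prediction" CSV. The ids and output path below are made up.
example_ids = ["abc123", "def456"]
example_preds = classes_to_Y(["Swizzor", "None"])   # -> array([10, 8])
write_predictions(example_preds, example_ids, "example_predictions.csv")
# example_predictions.csv now contains:
# Id,Prediction
# abc123,10
# def456,8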
In [4]:
# load training classes
classes = np.load("../data/features/train_classes.npy")
In [5]:
# The way masking works, 0 must be reserved as an "ignore this" symbol,
# so I have to increase the indices by 1 to free the zero index.
# (full_features, the per-example integer sequences built earlier, is assumed
# to already be in memory at this point.)
for i in xrange(len(full_features)):
    full_features[i] += 1  # add 1 to each of the arrays in the array
print full_features[0]
In [9]:
maxlengths = [len(x) for x in full_features]
In [19]:
fig = plt.figure(figsize=[12,6])
ax = fig.add_subplot(111)
ax.hist(maxlengths, bins=200);
ax.set_xlim(0,1000000)
Out[19]:
In [22]:
# Looks like I will have to truncate the sequences in order to get anything
# reasonable. There are some out to 6 million, but it looks like almost all
# could be captured at length 200000.
# Truncate to the first 200000 words
padded = pad_sequences(full_features, maxlen=200000, truncating='post')
print padded.shape # Should be a num_samples by num_features np array
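In [ ]:
# A small sanity check (illustrative only) of what pad_sequences does here:
# shorter sequences are left-padded with 0 and longer ones are cut at the end
# because truncating='post'. The toy maxlen of 5 stands in for 200000.
demo = pad_sequences([[1, 2, 3], [4, 5, 6, 7, 8, 9, 10]], maxlen=5, truncating='post')
print(demo)
# [[0 0 1 2 3]
#  [4 5 6 7 8]]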
In [23]:
# If this all looks good, save the array for later
np.save("../data/features/100_cutoff_alphabet_19679_padded_len200.npy", padded)
In [27]:
padded = 0
In [5]:
# In future, can load just with this line
full_features = np.load("../data/features/100_cutoff_alphabet_19679_padded_len200.npy")
In [6]:
# pull out training examples
X = full_features[:classes.shape[0],:]
X_test = full_features[classes.shape[0]:,:]
print X_test.shape
Y = classes_to_Y(classes)
Y_hot = np.zeros((classes.shape[0], 16))
for i, clazz in enumerate(Y):
    Y_hot[i, clazz] = 1
print Y_hot
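In [ ]:
# An equivalent one-liner for the one-hot loop above, assuming the to_categorical
# helper is available in this Keras version (keras.utils.np_utils in Keras 1.x).
from keras.utils.np_utils import to_categorical
Y_hot_alt = to_categorical(Y, 16)   # same (num_samples, 16) array as Y_hot
print((Y_hot_alt == Y_hot).all())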
In [7]:
# Just to check that worked ok.
print classes[21]
print Y[21]
print Y_hot[21]
print len(malware_classes)
In [8]:
# Now randomly select 100 samples to hold out
rand_index = np.random.permutation(np.arange(classes.shape[0]))
X_train = X[rand_index[100:]]
Y_train = Y_hot[rand_index[100:]]
X_validate = X[rand_index[:100]]
Y_validate = Y_hot[rand_index[:100]]
print X_train.shape
print Y_train.shape
print X_validate.shape
print Y_validate.shape
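In [ ]:
# An alternative hold-out (illustrative only, not used below): scikit-learn's
# train_test_split can stratify on the class labels so rare malware families
# land in both splits. sklearn is an extra dependency assumed here, and
# stratification will error if any class has only a single example.
from sklearn.model_selection import train_test_split
X_tr, X_val, Y_tr, Y_val = train_test_split(
    X, Y_hot, test_size=100, stratify=Y, random_state=42)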
In [9]:
# Clobbering to save memory
padded = 0
full_features = 0
classes = 0
X = 0
In [10]:
# The vocabulary size is the maximum integer index + 2,
# to allow for the 0 padding symbol (without padding it would be + 1).
vocab_size = 19681
# Length of the dense embedding, one vector for each int in the sequence
embedding_length = 256 # arbitrary
# Should be able to vary the batch size thanks to the mask
batch_size = 80
model = Sequential()
# Collapse the large input dimension into a 256 dimensional
# dense embedding
model.add(
    Embedding(vocab_size, embedding_length, mask_zero=True)
)
# Could add a Dropout layer next but will avoid for now
model.add(Bidirectional(
    LSTM(100, return_sequences=True)
))  # Arbitrary output size. TODO: make this stateful
# Why not 2!
model.add(LSTM(42))  # Arbitrary again
model.add(Dense(200, activation="sigmoid"))
model.add(Dense(16, activation="softmax"))
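In [ ]:
# Illustrative check (not in the original run): print the layer stack to confirm
# the masked Embedding -> BiLSTM -> LSTM -> Dense shapes before compiling.
model.summary()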
In [11]:
model.compile(loss='categorical_crossentropy',
              optimizer="adam",
              metrics=["accuracy"])
In [12]:
from keras.callbacks import ProgbarLogger, History, LambdaCallback
In [13]:
from __future__ import print_function  # future imports must precede other statements
import psutil
# LambdaCallback passes (batch/epoch, logs) to these hooks; *__ just swallows them
summarize = lambda *__: print([psutil.virtual_memory(), psutil.cpu_percent(percpu=True)])
In [ ]:
callbacks = [
    ProgbarLogger(),
    History(),
    LambdaCallback(
        on_batch_begin=summarize,
        on_batch_end=summarize,
        on_epoch_begin=summarize
    )]
model.fit(
    X_train, Y_train, batch_size=batch_size,
    nb_epoch=5, verbose=1, callbacks=callbacks,
    validation_data=(X_validate, Y_validate)
)
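In [ ]:
# A hedged sketch of how the held-back test portion could be scored and written
# out with write_predictions from above. The test ids are not loaded in this
# notebook, so test_ids and its path are assumptions (left commented out).
# test_ids = np.load("../data/features/test_ids.npy")   # hypothetical path
test_probs = model.predict(X_test, batch_size=batch_size)
test_preds = np.argmax(test_probs, axis=1)               # indices into malware_classes
# write_predictions(test_preds, test_ids, "rnn_predictions.csv")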
In [ ]:
print "ok"
In [ ]: