In [1]:
# This is the third attempt at training a NN. I am doing sequence padding
# again, but truncating to as low a value as possible to make training
# faster and avoid memory issues (I've been having crashes on the current
# feature set).
In [1]:
# Force Theano to use multiple cores. A `!` shell assignment would only set
# the variable in a throwaway subshell, so use %env to set it in the
# kernel's environment.
%env OMP_NUM_THREADS=4
In [2]:
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dropout, Dense
from keras.layers.recurrent import LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import Bidirectional
from keras.preprocessing.sequence import pad_sequences
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
from theano import config
config.openmp = True
config.openmp_elemwise_minsize = 100000
In [4]:
np.random.seed(42)
In [5]:
malware_classes = ["Agent", "AutoRun", "FraudLoad", "FraudPack", "Hupigon", "Krap",
"Lipler", "Magania", "None", "Poison", "Swizzor", "Tdss",
"VB", "Virut", "Zbot"]
# a function for writing predictions in the required format
def write_predictions(predictions, ids, outfile):
"""
assumes len(predictions) == len(ids), and that predictions[i] is the
index of the predicted class with the malware_classes list above for
the executable corresponding to ids[i].
outfile will be overwritten
"""
with open(outfile,"w+") as f:
# write header
f.write("Id,Prediction\n")
for i, history_id in enumerate(ids):
f.write("%s,%d\n" % (history_id, predictions[i]))
def classes_to_Y(classes):
output = []
for cls in classes:
output.append(malware_classes.index(cls))
return np.array(output)
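In [ ]:
# Quick sanity check of classes_to_Y on a toy list; by the ordering above,
# "Agent" -> 0, "None" -> 8, "Zbot" -> 14.
print classes_to_Y(["Agent", "None", "Zbot"])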
In [6]:
# load training classes
classes = np.load("../data/features/train_classes.npy")
In [5]:
# load the training data: an array of variable-length int sequences
full_features = np.load("../data/features/100_cutoff_alphabet_19679_word_to_intseq.npy")
# The way masking works, 0 must be maintained as an "ignore this" symbol,
# so I have to increase the index by 1 to free the zero index.
for i in xrange(len(full_features)):
    full_features[i] += 1  # Add 1 to each of the arrays in the array
print full_features[0]
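In [ ]:
# Sanity check: after the +1 shift, no sequence should contain a 0, so the
# zero index really is free for masking.
print all((np.asarray(seq) > 0).all() for seq in full_features)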
In [6]:
maxlengths = [len(x) for x in full_features]
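In [ ]:
# A quick look at the length distribution (median / 90th / 99th percentile
# and max) to get a feel for how much a hard truncation will cut off.
print np.percentile(maxlengths, [50, 90, 99]), max(maxlengths)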
In [8]:
fig = plt.figure(figsize=[12,6])
ax = fig.add_subplot(111)
ax.hist(maxlengths, bins=1000);
ax.set_xlim(0,200000)
Out[8]: (0, 200000)
[figure: histogram of sequence lengths]
In [6]:
# Truncate to the first 5000 words
padded = pad_sequences(full_features, maxlen=5000, truncating='post')
print padded.shape # Should be a num_samples by num_features np array
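In [ ]:
# Toy example of the padding behavior relied on here: short sequences are
# left-padded with the 0 mask symbol, and truncating='post' drops the tail
# of long ones.
print pad_sequences([[1, 2], [3, 4, 5, 6]], maxlen=3, truncating='post')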
In [7]:
# If this all looks good, save the array for later
np.save("../data/features/100_cutoff_alphabet_19679_padded_len5.npy", padded)
In [8]:
padded = 0  # drop the big padded array to free memory (it's saved to disk above)
In [7]:
# In future, can load just with this line
full_features = np.load("../data/features/100_cutoff_alphabet_19679_padded_len5.npy")
In [8]:
# pull out training examples
X = full_features[:classes.shape[0], :]
X_test = full_features[classes.shape[0]:, :]
print X_test.shape
Y = classes_to_Y(classes)
Y_hot = np.zeros((classes.shape[0], 16))  # 16 columns, though only the 15 real classes are ever set
for i, clazz in enumerate(Y):
    Y_hot[i, clazz] = 1
print Y_hot
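In [ ]:
# The loop above is equivalent to indexing an identity matrix; a quick
# consistency check on the one-hot encoding.
assert (Y_hot == np.eye(16)[Y]).all()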
In [9]:
# Just to check that worked ok.
print classes[21]
print Y[21]
print Y_hot[21]
print len(malware_classes)
In [10]:
# Now randomly select 100 samples to hold out
rand_index = np.random.permutation(np.arange(classes.shape[0]))
X_train = X[rand_index[100:]]
Y_train = Y_hot[rand_index[100:]]
X_validate = X[rand_index[:100]]
Y_validate = Y_hot[rand_index[:100]]
print X_train.shape
print Y_train.shape
print X_validate.shape
print Y_validate.shape
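In [ ]:
# With classes this skewed, a 100-sample holdout may barely see the rare
# classes; check the per-class counts in the validation split.
print Y_validate.sum(axis=0)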
In [11]:
# Clobbering to save memory
padded = 0
full_features = 0
classes = 0
X = 0
Y_hot = 0
Y = 0
In [12]:
print "check"
In [12]:
# The vocabulary size is 2 + the maximum integer index, to allow for the
# padding/mask symbol at index 0 (without padding it would be 1 + the max).
vocab_size = 19681
# Length of the dense embedding, one for each int in the sequence
embedding_length = 256  # arbitrary
# Should be able to vary batch size with mask
batch_size = 100
model = Sequential()
# Collapse the large input dimension into a 256-dimensional
# dense embedding
model.add(
    Embedding(vocab_size, embedding_length, mask_zero=True)
)
# Could add a Dropout layer next but will avoid for now
model.add(Bidirectional(
    LSTM(150, return_sequences=True)
))  # Arbitrary output size. TODO make this stateful
model.add(Dropout(.20))  # Regularize
# Why not 2!
model.add(LSTM(500))  # Arbitrary again
model.add(Dropout(.30))  # Regularize
model.add(Dense(500, activation="tanh"))
model.add(Dropout(.50))
model.add(Dense(200, activation="tanh"))
model.add(Dropout(.2))
model.add(Dense(16, activation="softmax"))
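In [ ]:
# Inspect the layer stack and parameter counts before compiling.
model.summary()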
In [13]:
model.compile(loss='categorical_crossentropy',
              optimizer="adam",
              metrics=["accuracy"])
In [14]:
from keras.callbacks import ProgbarLogger, History, LambdaCallback, ModelCheckpoint
In [16]:
from __future__ import print_function
import psutil
summarize = lambda *__: print([psutil.virtual_memory(), psutil.cpu_percent(percpu=True)])
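In [ ]:
# The hand-entered class weights in the next cell look like rounded inverse
# class frequencies; this recomputes them from the training labels as a
# check (class 15 never occurs, so its weight is immaterial).
counts = Y_train.sum(axis=0)
print(np.round(counts.max() / np.maximum(counts, 1.0)))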
In [ ]:
callbacks = [
    ProgbarLogger(),
    History(),
    LambdaCallback(
        on_batch_begin=summarize,
        on_batch_end=summarize,
        on_epoch_begin=summarize),
    ModelCheckpoint(
        "5000_mask_best_drop_weighted.hdf5",
        verbose=1,
        monitor="val_acc",
        mode="max",
        save_best_only=True)
]
class_weights = {
    0: 14,
    1: 32,
    2: 43,
    3: 51,
    4: 39,
    5: 41,
    6: 30,
    7: 39,
    8: 1,
    9: 77,
    10: 3,
    11: 50,
    12: 4,
    13: 27,
    14: 40,
    15: 1
}
model.fit(
    X_train, Y_train, batch_size=batch_size,
    class_weight=class_weights,
    nb_epoch=5, verbose=1, callbacks=callbacks,
    validation_data=(X_validate, Y_validate)
)
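In [ ]:
# Plot the learning curves recorded by the History callback above
# (callbacks[1] is the History instance).
history = callbacks[1].history
fig = plt.figure(figsize=[12, 6])
ax = fig.add_subplot(111)
ax.plot(history['acc'], label='train acc')
ax.plot(history['val_acc'], label='val acc')
ax.legend();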
In [15]:
print "ok"
In [ ]:
predictions = model.predict(X_test)
class_preds = model.predict_classes(X_test)
class_prob = model.predict_proba(X_test)
np.save("../predictions/tiny_seq_LSTM.npy", predictions)
np.save("../predictions/tiny_seq_class_LSTM.npy", class_preds)
np.save("../predictions/tiny_seq_class_proba_LSTM.npy", class_prob)
In [ ]:
class_preds = model.predict_classes(X_test)
print(class_preds)
test_ids = np.load("../data/features/test_ids.npy")
print(test_ids)
write_predictions(class_preds, test_ids, "../predictions/LSTM5000_class_weight.csv")
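In [ ]:
# Sanity check: the distribution of predicted classes over the test set,
# for comparison against the heavily skewed training priors.
print(np.bincount(class_preds, minlength=16))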