In [1]:
# This notebook is my first attempt to build a neural network for
# classifying the sequences I extracted. I am using Keras because the API
# is very straightforward. The model is an LSTM because recurrent networks
# were designed to handle variable-length inputs, although some people in
# the Keras community appear to have forgotten that. I have an idea for an
# architecture that will handle variable lengths. If this turns out to be
# wrong, we can either (1) truncate and pad, (2) mask, or (3) use a
# different framework; a sketch of option (1) follows below.
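
In [ ]:
# A minimal sketch of fallback option (1): truncate/pad every sequence to
# a fixed length and mask the padding. Not used below; maxlen=500 is an
# arbitrary assumption, and it presumes index 0 can be reserved for
# padding (mask_zero=True on the Embedding would then skip those steps).
from keras.preprocessing.sequence import pad_sequences

def pad_to_fixed_length(sequences, maxlen=500):
    # pads short sequences with 0 and truncates long ones to maxlen
    return pad_sequences(sequences, maxlen=maxlen)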

In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dropout, Dense
from keras.layers.recurrent import LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import Bidirectional


Using TensorFlow backend.

In [ ]:
np.random.seed(42)

In [ ]:
malware_classes = ["Agent", "AutoRun", "FraudLoad", "FraudPack", "Hupigon", "Krap",
           "Lipler", "Magania", "None", "Poison", "Swizzor", "Tdss",
           "VB", "Virut", "Zbot"]

# a function for writing predictions in the required format
def write_predictions(predictions, ids, outfile):
    """
    assumes len(predictions) == len(ids), and that predictions[i] is the
    index of the predicted class with the malware_classes list above for
    the executable corresponding to ids[i].
    outfile will be overwritten
    """
    with open(outfile,"w+") as f:
        # write header
        f.write("Id,Prediction\n")
        for i, history_id in enumerate(ids):
            f.write("%s,%d\n" % (history_id, predictions[i]))

def classes_to_Y(classes):
    output = []
    for cls in classes:
        output.append(malware_classes.index(cls))
    return np.array(output)
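
In [ ]:
# Quick sanity check of classes_to_Y on two known labels:
# "Agent" sits at index 0 and "None" at index 8 in malware_classes
print classes_to_Y(["Agent", "None"])  # expect [0 8]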

In [4]:
# load training classes
classes = np.load("../data/features/train_classes.npy")

# load the sequence features: an object array of variable-length integer
# sequences, with the training rows first and the test rows after them
full_features = np.load("../data/features/100_cutoff_alphabet_19679_word_to_intseq.npy")

In [5]:
# pull out training examples
X = full_features[:classes.shape[0]]

X_test = full_features[classes.shape[0]:]
print X_test.shape

Y = classes_to_Y(classes)

# one-hot label matrix; 16 columns = 15 classes plus one unused slot
Y_hot = np.zeros((classes.shape[0], 16))
for i, clazz in enumerate(Y):
    Y_hot[i, clazz] = 1

print Y_hot


(3724,)
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  1.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  1.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]
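
In [ ]:
# Equivalent one-liner for the one-hot loop above: index an identity
# matrix with the integer labels (same result, just vectorized)
print np.array_equal(Y_hot, np.eye(16)[Y])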

In [6]:
# Just to check that worked ok.
print classes[21]
print Y[21]
print Y_hot[21]
print len(malware_classes)


None
8
[ 0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]
15

In [7]:
# Now randomly select 100 samples to hold out
rand_index = np.random.permutation(np.arange(classes.shape[0]))

X_train = X[rand_index[100:]]
Y_train = Y_hot[rand_index[100:]]
X_validate = X[rand_index[:100]]
Y_validate = Y_hot[rand_index[:100]]


print X_train.shape
print Y_train.shape
print X_validate.shape
print Y_validate.shape


(2986,)
(2986, 16)
(100,)
(100, 16)
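
In [ ]:
# The holdout is drawn uniformly at random, so rare classes may be absent
# from it entirely. A quick class-coverage check on the 100 held-out labels:
print np.bincount(Y[rand_index[:100]], minlength=len(malware_classes))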

In [8]:
# The vocabulary size is 1 + the maximum integer index. 
vocab_size = 19680

# Length of the dense embedding vector, one per int in the sequence
embedding_length = 256  # arbitrary

# Must be 1 so that every batch contains a single (variable-length) sequence
batch_size = 1
model = Sequential()
model = Sequential()

# Collapse the large input dimension into a 256-dimensional
# dense embedding
model.add(
    Embedding(vocab_size, embedding_length)
)

# Could add a Dropout layer next, but will avoid it for now.
# Arbitrary output size. TODO: make this stateful
model.add(Bidirectional(
    LSTM(100, return_sequences=True)
))

# Why not two!
model.add(Bidirectional(LSTM(42)))  # arbitrary again

model.add(Dense(200, activation="sigmoid"))
model.add(Dense(16, activation="softmax"))
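
In [ ]:
# batch_size must be 1 above because every example in a batch has to share
# one sequence length. A hedged alternative (not used below) is to bucket
# the training sequences by length so equal-length examples can be batched;
# bucket_by_length is my own helper, not a Keras API.
from collections import defaultdict

def bucket_by_length(sequences, labels):
    buckets = defaultdict(list)
    for seq, label in zip(sequences, labels):
        buckets[len(seq)].append((seq, label))
    # one (X_batch, Y_batch) pair per distinct sequence length
    for length, pairs in buckets.items():
        seqs, labs = zip(*pairs)
        yield np.array(list(seqs)), np.array(list(labs))

# usage would be roughly:
#   for X_batch, Y_batch in bucket_by_length(X_train, Y_train):
#       model.train_on_batch(X_batch, Y_batch)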

In [9]:
model.compile(loss="categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])

In [10]:
from keras.callbacks import ProgbarLogger, History

In [13]:
# Note: this still yields an object array, because the lists have unequal lengths
new_X_train = np.array([x.tolist() for x in X_train])

In [17]:
# callbacks = [ProgbarLogger(), History()]
model.fit(
    X_train, Y_train, batch_size=1,
    nb_epoch=5, verbose=2
    )


Epoch 1/5
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-17-4ec2a1f4c222> in <module>()
      2 model.fit(
      3     X_train, Y_train, batch_size=1,
----> 4     nb_epoch=5, verbose=2
      5     )

/n/scrb152/Software/Python/cs181/lib/python2.7/site-packages/keras/models.pyc in fit(self, x, y, batch_size, nb_epoch, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, **kwargs)
    625                               shuffle=shuffle,
    626                               class_weight=class_weight,
--> 627                               sample_weight=sample_weight)
    628 
    629     def evaluate(self, x, y, batch_size=32, verbose=1,

/n/scrb152/Software/Python/cs181/lib/python2.7/site-packages/keras/engine/training.pyc in fit(self, x, y, batch_size, nb_epoch, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight)
   1122                               verbose=verbose, callbacks=callbacks,
   1123                               val_f=val_f, val_ins=val_ins, shuffle=shuffle,
-> 1124                               callback_metrics=callback_metrics)
   1125 
   1126     def evaluate(self, x, y, batch_size=32, verbose=1, sample_weight=None):

/n/scrb152/Software/Python/cs181/lib/python2.7/site-packages/keras/engine/training.pyc in _fit_loop(self, f, ins, out_labels, batch_size, nb_epoch, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics)
    840                 batch_logs['size'] = len(batch_ids)
    841                 callbacks.on_batch_begin(batch_index, batch_logs)
--> 842                 outs = f(ins_batch)
    843                 if type(outs) != list:
    844                     outs = [outs]

/n/scrb152/Software/Python/cs181/lib/python2.7/site-packages/keras/backend/tensorflow_backend.pyc in __call__(self, inputs)
   1038             feed_dict[tensor] = value
   1039         session = get_session()
-> 1040         updated = session.run(self.outputs + [self.updates_op], feed_dict=feed_dict)
   1041         return updated[:len(self.outputs)]
   1042 

/n/scrb152/Software/Python/cs181/lib/python2.7/site-packages/tensorflow/python/client/session.pyc in run(self, fetches, feed_dict, options, run_metadata)
    380     try:
    381       result = self._run(None, fetches, feed_dict, options_ptr,
--> 382                          run_metadata_ptr)
    383       if run_metadata:
    384         proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/n/scrb152/Software/Python/cs181/lib/python2.7/site-packages/tensorflow/python/client/session.pyc in _run(self, handle, fetches, feed_dict, options, run_metadata)
    632                 ' to a larger type (e.g. int64).')
    633 
--> 634           np_val = np.asarray(subfeed_val, dtype=subfeed_dtype)
    635 
    636           if not subfeed_t.get_shape().is_compatible_with(np_val.shape):

/n/scrb152/Software/Python/cs181/lib/python2.7/site-packages/numpy/core/numeric.pyc in asarray(a, dtype, order)
    480 
    481     """
--> 482     return array(a, dtype, copy=False, order=order)
    483 
    484 def asanyarray(a, dtype=None, order=None):

ValueError: setting an array element with a sequence.
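
In [ ]:
# The ValueError above is presumably because X_train is an object array of
# unequal-length lists, which np.asarray cannot pack into one numeric
# tensor. Checking the dtype and a few lengths should confirm that.
print X_train.dtype
print [len(seq) for seq in X_train[:5]]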

In [ ]:
# fall back to manual training, one example per batch
for seq, label in zip(X_train, Y_train):
    metric = model.train_on_batch(np.array([seq]), np.array([label]))
    print "Trained on example; metric (loss, acc): {}".format(metric)


Trained on example; metric (loss, acc): [2.7084858, 0.0]
Trained on example; metric (loss, acc): [2.4121249, 0.0]
Trained on example; metric (loss, acc): [3.4390562, 0.0]
Trained on example; metric (loss, acc): [2.0780964, 0.0]
Trained on example; metric (loss, acc): [1.8925185, 1.0]
Trained on example; metric (loss, acc): [1.6950102, 1.0]
Trained on example; metric (loss, acc): [1.4848764, 1.0]
Trained on example; metric (loss, acc): [3.8131661, 0.0]
Trained on example; metric (loss, acc): [3.180295, 0.0]
Trained on example; metric (loss, acc): [3.2343106, 0.0]
Trained on example; metric (loss, acc): [2.9842432, 0.0]
Trained on example; metric (loss, acc): [0.95159495, 1.0]
Trained on example; metric (loss, acc): [0.8911801, 1.0]
Trained on example; metric (loss, acc): [3.5191693, 0.0]
Trained on example; metric (loss, acc): [2.610594, 0.0]
Trained on example; metric (loss, acc): [0.76259893, 1.0]
Trained on example; metric (loss, acc): [4.399488, 0.0]
Trained on example; metric (loss, acc): [0.70320916, 1.0]
Trained on example; metric (loss, acc): [3.6987491, 0.0]
Trained on example; metric (loss, acc): [2.2815373, 0.0]
Trained on example; metric (loss, acc): [0.66521698, 1.0]
Trained on example; metric (loss, acc): [5.0278625, 0.0]
Trained on example; metric (loss, acc): [0.69401133, 1.0]
Trained on example; metric (loss, acc): [2.9516208, 0.0]
Trained on example; metric (loss, acc): [0.68500137, 1.0]
Trained on example; metric (loss, acc): [3.9252605, 0.0]
Trained on example; metric (loss, acc): [0.6881097, 1.0]
Trained on example; metric (loss, acc): [0.72968042, 1.0]
Trained on example; metric (loss, acc): [0.71296477, 1.0]
Trained on example; metric (loss, acc): [2.7255709, 0.0]
Trained on example; metric (loss, acc): [1.9827336, 0.0]
Trained on example; metric (loss, acc): [0.65255147, 1.0]
Trained on example; metric (loss, acc): [1.9443451, 0.0]
Trained on example; metric (loss, acc): [2.5392272, 0.0]
Trained on example; metric (loss, acc): [3.4258537, 0.0]
Trained on example; metric (loss, acc): [2.3527081, 0.0]
Trained on example; metric (loss, acc): [0.67592138, 1.0]
Trained on example; metric (loss, acc): [0.82889223, 1.0]
Trained on example; metric (loss, acc): [2.0216384, 0.0]
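
In [ ]:
# The loop above makes a single pass over the data. A sketch of extending
# it to several shuffled epochs with a per-example validation pass via
# test_on_batch; n_epochs = 5 is arbitrary.
n_epochs = 5
for epoch in range(n_epochs):
    order = np.random.permutation(len(X_train))
    losses = []
    for i in order:
        loss, acc = model.train_on_batch(np.array([X_train[i]]),
                                         np.array([Y_train[i]]))
        losses.append(loss)
    val_losses = [model.test_on_batch(np.array([seq]), np.array([label]))[0]
                  for seq, label in zip(X_validate, Y_validate)]
    print "epoch {}: train loss {}, val loss {}".format(
        epoch, np.mean(losses), np.mean(val_losses))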

In [ ]:
predictions = model.predict(X_test)
class_preds = model.predict_classes(X_test)
class_prob = model.predict_proba(X_test)
np.save("../predictions/tiny_seq_LSTM.npy", predictions)
np.save("../predictions/tiny_seq_class_LSTM.npy", class_preds)
np.save("../predictions/tiny_seq_class_proba_LSTM.npy", class_prob)

In [ ]: