In [13]:
from sklearn.neural_network import MLPClassifier
from scipy.io import mmread
import numpy as np

In [14]:
malware_classes = ["Agent", "AutoRun", "FraudLoad", "FraudPack", "Hupigon", "Krap",
           "Lipler", "Magania", "None", "Poison", "Swizzor", "Tdss",
           "VB", "Virut", "Zbot"]

# a function for writing predictions in the required format
def write_predictions(predictions, ids, outfile):
    """
    assumes len(predictions) == len(ids), and that predictions[i] is the
    index of the predicted class with the malware_classes list above for
    the executable corresponding to ids[i].
    outfile will be overwritten
    """
    with open(outfile,"w+") as f:
        # write header
        f.write("Id,Prediction\n")
        for i, history_id in enumerate(ids):
            f.write("%s,%d\n" % (history_id, predictions[i]))

def classes_to_Y(classes):
    output = []
    for cls in classes:
        output.append(malware_classes.index(cls))
    return np.array(output)

In [17]:
# load training classes
# classes = np.load("../data/features/train_classes.npy")
classes = np.load("../data/features/train_classes_extra.npy")

# load sparse matrix of training data, convert csr to numpy array
# sparse_mat_train_test = mmread("../data/features/naive_word_hashed_full_features.mtx")

# sparse = sparse_mat_train_test.toarray()

sparse = np.load("../data/features/count_vector_full_10k_features_extra.npy")
# sparse = np.load("../data/features/count_vector_full_10k_features.npy")
# sparse = np.load("../data/features/count_vector_full_10k_features_tfidf.npy")

# pull out training examples
X = sparse[:classes.shape[0]]

X_test = sparse[classes.shape[0]:]
print X_test.shape

Y = classes_to_Y(classes)

# for manual CV
# X_CV = X[-300:]
# X = X[:-300]

# Y_CV = Y[-300:]
# Y = Y[:-300]


(3724, 10000)

In [12]:
total_Y_counts = {}
for y in Y:
    if malware_classes[y] in total_Y_counts:
        total_Y_counts[malware_classes[y]] += 1
    else:
        total_Y_counts[malware_classes[y]] = 1
print total_Y_counts


{'FraudPack': 27, 'None': 1463, 'Tdss': 30, 'Swizzor': 487, 'Agent': 110, 'VB': 336, 'Lipler': 46, 'Zbot': 34, 'FraudLoad': 32, 'Poison': 19, 'Virut': 53, 'AutoRun': 42, 'Magania': 39, 'Hupigon': 35, 'Krap': 33}

In [18]:
NN = MLPClassifier(hidden_layer_sizes=(400), activation="tanh")
NN.fit(X, Y)


Out[18]:
MLPClassifier(activation='tanh', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=400, learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [10]:
preds = NN.predict(X_CV)

mistakes = {}
for i in range(len(preds)):
    if preds[i] != Y_CV[i]:
        cls = malware_classes[Y_CV[i]]
        if cls in mistakes:
            mistakes[cls] += 1
        else:
            mistakes[cls] = 1
#         print malware_classes[Y_CV[i]]
print len(mistakes)
print mistakes


11
{'FraudPack': 2, 'None': 9, 'Tdss': 1, 'VB': 7, 'Agent': 4, 'Poison': 1, 'Virut': 5, 'AutoRun': 5, 'Magania': 1, 'Hupigon': 4, 'Krap': 4}

In [19]:
from sklearn.model_selection import cross_val_score
cv_score = cross_val_score(NN, X, Y, cv=5)
print cv_score
print sum(cv_score) / len(cv_score)


[ 0.86277603  0.87163233  0.87878788  0.88498403  0.86312399]
0.872260850556

In [ ]:


In [21]:
# 400, 200, tanh -> .862 #
# 400, 100, tanh -> .866 #
# 400, 50,  tanh -> .863 #
# 400, 50,  logi -> .867
# 200, 100, tanh -> .860 #
# 800,      tanh -> .866 #
# 400,      tanh -> .867 #
# 400,      logi -> .867 #
# 400,      relu -> .855 #
# 200,      tanh -> .862 #
# 100,      tanh -> .858 #
# 400, tanh, tfidf -> .867

In [21]:
test_pred = NN.predict(X_test)
print test_pred.shape


(3724,)

In [23]:
test_ids = np.load("../data/features/test_ids.npy")
print test_ids


['0015c8c9ff02fea9d0f45692b9eebfb4abff4e42f'
 '001f298a534ae4b0db7f2707169250aa215c3b5f2'
 '001f5fdaaa8bbe20303527198d09a30bb7ca3eb50' ...,
 'ff85866b215233b2fecdca2c2b8fda54ad24c86fd'
 'ff884224571e0476990574df5da76e0991db583af'
 'ffc47163a530c51ef2e6572d786aefbaed99890f2']

In [24]:
write_predictions(test_pred, test_ids, "../predictions/rfc_10.csv")

In [ ]: