In [13]:
import numpy as np
from scipy.io import mmread
from sklearn.base import clone
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier
In [14]:
# Malware family names; a family's position in this list is its integer class label.
malware_classes = [
    "Agent", "AutoRun", "FraudLoad", "FraudPack", "Hupigon",
    "Krap", "Lipler", "Magania", "None", "Poison",
    "Swizzor", "Tdss", "VB", "Virut", "Zbot",
]
# a function for writing predictions in the required format
def write_predictions(predictions, ids, outfile):
    """
    Write predictions to a CSV file in the required "Id,Prediction" format.

    predictions[i] is the index of the predicted class within the
    malware_classes list for the executable corresponding to ids[i].

    Args:
        predictions: sequence of integer class indices.
        ids: sequence of executable ids, same length as predictions.
        outfile: path of the CSV file to write; overwritten if it exists.

    Raises:
        ValueError: if predictions and ids differ in length. This is checked
            before the file is opened, so a mismatch never truncates or
            partially writes outfile.
    """
    if len(predictions) != len(ids):
        raise ValueError(
            "predictions and ids must have the same length (got %d and %d)"
            % (len(predictions), len(ids)))
    # plain "w" is enough: the file is only written, never read back
    with open(outfile, "w") as f:
        # write header
        f.write("Id,Prediction\n")
        for history_id, prediction in zip(ids, predictions):
            f.write("%s,%d\n" % (history_id, prediction))
def classes_to_Y(classes):
    """Map class names to their integer positions in malware_classes.

    Returns a numpy array holding one index per entry of `classes`, in order.
    A name that is not in malware_classes raises ValueError (from list.index).
    """
    return np.array([malware_classes.index(name) for name in classes])
In [17]:
# Load the training labels and the feature matrix, then split the matrix into
# training rows (X) and test rows (X_test).

# load training classes
# classes = np.load("../data/features/train_classes.npy")
classes = np.load("../data/features/train_classes_extra.npy")

# load sparse matrix of training data, convert csr to numpy array
# sparse_mat_train_test = mmread("../data/features/naive_word_hashed_full_features.mtx")
# sparse = sparse_mat_train_test.toarray()
sparse = np.load("../data/features/count_vector_full_10k_features_extra.npy")
# sparse = np.load("../data/features/count_vector_full_10k_features.npy")
# sparse = np.load("../data/features/count_vector_full_10k_features_tfidf.npy")

# The first len(classes) rows are taken as the labelled training examples and
# the remaining rows as the test set, so the matrix needs at least that many rows.
n_train = classes.shape[0]
assert sparse.shape[0] >= n_train, "feature matrix has fewer rows than there are training labels"

# pull out training examples
X = sparse[:n_train]
X_test = sparse[n_train:]
print(X_test.shape)  # parenthesised so the cell runs under both Python 2 and 3

Y = classes_to_Y(classes)

# for manual CV
# X_CV = X[-300:]
# X = X[:-300]
# Y_CV = Y[-300:]
# Y = Y[:-300]
In [12]:
# Tally how many training examples fall into each malware class.
total_Y_counts = {}
for y in Y:
    class_name = malware_classes[y]
    total_Y_counts[class_name] = total_Y_counts.get(class_name, 0) + 1
print(total_Y_counts)  # parenthesised so the cell runs under both Python 2 and 3
In [18]:
# Single hidden layer of 400 tanh units. hidden_layer_sizes is written as a
# one-element tuple: (400) without the trailing comma is just the integer 400.
# random_state is fixed so that repeated runs train the same model.
NN = MLPClassifier(hidden_layer_sizes=(400,), activation="tanh", random_state=0)
NN.fit(X, Y)
Out[18]:
In [10]:
# Count hold-out misclassifications per true class.
#
# The hold-out rows are carved off inside this cell so that it runs on a fresh
# kernel, and a separate copy of the classifier is trained without them so the
# error counts are not measured on rows the model was fitted on.
HOLDOUT_SIZE = 300
X_CV, Y_CV = X[-HOLDOUT_SIZE:], Y[-HOLDOUT_SIZE:]
holdout_model = clone(NN).fit(X[:-HOLDOUT_SIZE], Y[:-HOLDOUT_SIZE])

preds = holdout_model.predict(X_CV)
mistakes = {}
for predicted, actual in zip(preds, Y_CV):
    if predicted != actual:
        cls = malware_classes[actual]
        mistakes[cls] = mistakes.get(cls, 0) + 1
print(len(mistakes))  # number of classes with at least one misclassified example
print(mistakes)
In [19]:
# 5-fold cross-validation of the classifier configuration on the training set.
cv_score = cross_val_score(NN, X, Y, cv=5)
print(cv_score)         # per-fold scores
print(cv_score.mean())  # average over the folds
In [ ]:
In [21]:
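# Results log: hidden layer sizes, activation[, feature variant] -> score
# (presumably the mean 5-fold CV accuracy printed above; "logi" = logistic)
# 400, 200, tanh -> .862 #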
# 400, 100, tanh -> .866 #
# 400, 50, tanh -> .863 #
# 400, 50, logi -> .867
# 200, 100, tanh -> .860 #
# 800, tanh -> .866 #
# 400, tanh -> .867 #
# 400, logi -> .867 #
# 400, relu -> .855 #
# 200, tanh -> .862 #
# 100, tanh -> .858 #
# 400, tanh, tfidf -> .867
In [21]:
# Predict a class index for every test row.
test_pred = NN.predict(X_test)
print(test_pred.shape)  # parenthesised so the cell runs under both Python 2 and 3
In [23]:
# Load the ids that are written next to the predictions.
# NOTE(review): presumably stored in the same row order as X_test - confirm.
test_ids = np.load("../data/features/test_ids.npy")
print(test_ids)  # parenthesised so the cell runs under both Python 2 and 3
In [24]:
# Write the test-set predictions to the submission CSV.
# NOTE(review): the file name says "rfc" but the predictions come from the MLP
# above - confirm this is not overwriting another model's output.
submission_path = "../predictions/rfc_10.csv"
write_predictions(test_pred, test_ids, submission_path)
In [ ]: