In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from scipy.io import mmread
import numpy as np

In [3]:
ids = np.load("../data/features/train_ids.npy")
classes = np.load("../data/features/train_classes.npy")
print ids
print classes


['00269ea50001a6c699d0222032d45b74b2e7e8be9'
 '00278ec420236020d6121dffe0cc20034422e7228'
 '002d5615d19c851934dc481c607b6a74a6e9e536e' ...,
 'ffc7716423ff262f25a2463ab7d80b485c51ebb9d'
 'ffc94f4caf71d5cac3c3661606f6503f994c463c5'
 'ffdba6079b981688512353cF89ca7e1b8f4868263']
['None' 'Lipler' 'VB' ..., 'None' 'None' 'FraudPack']

In [2]:
malware_classes = ["Agent", "AutoRun", "FraudLoad", "FraudPack", "Hupigon", "Krap",
           "Lipler", "Magania", "None", "Poison", "Swizzor", "Tdss",
           "VB", "Virut", "Zbot"]

# a function for writing predictions in the required format
def write_predictions(predictions, ids, outfile):
    """
    assumes len(predictions) == len(ids), and that predictions[i] is the
    index of the predicted class with the malware_classes list above for
    the executable corresponding to ids[i].
    outfile will be overwritten
    """
    with open(outfile,"w+") as f:
        # write header
        f.write("Id,Prediction\n")
        for i, history_id in enumerate(ids):
            f.write("%s,%d\n" % (history_id, predictions[i]))

def classes_to_Y(classes):
    output = []
    for cls in classes:
        output.append(malware_classes.index(cls))
    return np.array(output)

In [3]:
# load training classes
classes = np.load("../data/features/train_classes.npy")

# load sparse matrix of training data
sparse_mat_train_test = mmread("../data/features/naive_word_hashed_full_features.mtx")

In [4]:
# convert csr to a numpy array
sparse = sparse_mat_train_test.toarray()

# pull out training examples
X = sparse[:classes.shape[0]]
# X_CV = X[-300:]
# X = X[:-300]

X_test = sparse[classes.shape[0]:]
print X_test.shape

Y = classes_to_Y(classes)
# Y_CV = Y[-300:]
# Y = Y[:-300]


(3724, 1048576)

In [5]:
RF = RandomForestClassifier()
RF.fit(X, Y)


Out[5]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [ ]:
# preds = RF.predict(X_CV)

# mistakes = 0
# for i in range(len(preds)):
#     if preds[i] != Y_CV[i]:
#         mistakes += 1
#         print malware_classes[Y_CV[i]]
# print mistakes

In [ ]:
test_pred = RF.predict(X_test)
print test_pred.shape

In [ ]:
# test_pred = RF.predict(X_test)

In [22]:
# print test_pred
test_ids = np.load("../data/features/test_ids.npy")
print test_ids


['0015c8c9ff02fea9d0f45692b9eebfb4abff4e42f'
 '001f298a534ae4b0db7f2707169250aa215c3b5f2'
 '001f5fdaaa8bbe20303527198d09a30bb7ca3eb50' ...,
 'ff85866b215233b2fecdca2c2b8fda54ad24c86fd'
 'ff884224571e0476990574df5da76e0991db583af'
 'ffc47163a530c51ef2e6572d786aefbaed99890f2']

In [23]:
# print test_ids
write_predictions(test_pred, test_ids, "../predictions/rfc_10.csv")

In [ ]: