In [17]:
import src.utils.utils as utils
%load_ext autoreload
%autoreload 2
import hash_bytecode
import itertools
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.cross_validation import cross_val_score, KFold
import sklearn.cross_validation as cv
from sklearn.feature_extraction.text import CountVectorizer
In [2]:
# Connect to MongoDB via the project helper and bind the `samples`
# collection — presumably the corpus of bytecode documents streamed
# below via hash_bytecode.stream_files; confirm in src/utils/utils.py.
db = utils.get_mongodb()
collection = db.samples
In [ ]:
In [3]:
# Initialize the hash_bytecode module for the 'train' split.
# NOTE(review): init()'s exact semantics live in hash_bytecode.py —
# presumably it selects which data split subsequent streaming reads from.
hash_bytecode.init('train')
In [4]:
# Lazy stream of samples from the Mongo collection; test=False presumably
# selects the labelled training documents — verify in hash_bytecode.
stream = hash_bytecode.stream_files(collection, test=False)
In [ ]:
In [ ]:
In [5]:
# Pull the first 1000 samples off the stream: X holds the documents
# (apparently each a list of text pieces — they are ''.join-ed in the
# next cell), y the matching labels.
X, y = hash_bytecode.get_batch(stream, 1000)
In [6]:
%%time
# flattens the list of docs into something the vectorizers can handle
X = [''.join(doc) for doc in X]
In [10]:
# Build the project's vectorizer; the bare `hvec` on the last line
# displays its configured parameters as the cell output.
hvec = hash_bytecode.make_vectorizer()
hvec
Out[10]:
In [ ]:
In [139]:
%%time
bow = hvec.transform(X[0:100])
In [140]:
# Display the sparse matrix repr for the 100-doc slice (taken from the
# 1000-doc batch) — shows shape and number of stored elements.
bow
Out[140]:
In [100]:
# Duplicate display of `bow` (same as the previous cell) — leftover from
# interactive exploration; safe to delete.
bow
Out[100]:
In [79]:
In [141]:
# Candidate models, all parallelized across cores (n_jobs=-1): two linear
# online learners and a depth-capped random-forest baseline.
pa = PassiveAggressiveClassifier(C=1, n_jobs=-1)
perceptron = Perceptron(n_jobs=-1, penalty='l2')
rf = RandomForestClassifier(n_jobs=-1, max_depth=10)
In [156]:
# Group the two incremental (partial_fit-capable) models.
# NOTE(review): `clfs` is never used in any later visible cell.
clfs = [perceptron, pa]
In [ ]:
In [143]:
%%time
rf_predicted= cv.cross_val_predict(rf, bow, y[0:100], cv=5)
In [145]:
# BUG FIX: `metrics` is never imported anywhere in this notebook, so this
# cell raised NameError; import it locally so the cell is self-contained.
from sklearn import metrics

# Per-class precision/recall/F1 for the random-forest out-of-fold predictions.
print(metrics.classification_report(y[0:100], rf_predicted))
In [146]:
%%time
p_predicted= cv.cross_val_predict(perceptron, bow, y[0:100], cv=10)
In [147]:
# BUG FIX: `metrics` is never imported anywhere in this notebook, so this
# cell raised NameError; import it locally so the cell is self-contained.
from sklearn import metrics

# Per-class precision/recall/F1 for the perceptron out-of-fold predictions.
print(metrics.classification_report(y[0:100], p_predicted))
In [150]:
%%time
pa_predicted= cv.cross_val_predict(pa, bow, y[0:100], cv=10)
In [149]:
# BUG FIX: `metrics` is never imported anywhere in this notebook, so this
# cell raised NameError; import it locally so the cell is self-contained.
from sklearn import metrics

# Per-class precision/recall/F1 for the passive-aggressive out-of-fold predictions.
print(metrics.classification_report(y[0:100], pa_predicted))
In [ ]:
# WARNING(review): this rebinds `cv`, which up to this point is the module
# alias `import sklearn.cross_validation as cv` — once this cell runs, the
# cv.cross_val_predict cells above can no longer be re-executed. Also
# `hexes` is only defined in a *later* cell (out-of-order execution); this
# cell fails under Restart & Run All. Rename one of the two `cv`s and move
# the `hexes` definition above this cell.
cv = CountVectorizer(vocabulary=hexes)
In [7]:
# Reusing the first streamed batch (X, y) as a held-out test set before
# continuing to train on the rest of the stream below.
X_test, y_test = X, y
In [ ]:
%%time
# Vectorize the held-out documents once, up front, so the training loop
# below can score against them cheaply.
# NOTE(review): depends on `cv = CountVectorizer(...)` (a cell above, but
# numbered later) having been run — another out-of-order dependency.
test_bow = cv.transform(X_test)
In [ ]:
# Accumulators for the learning curve: number of documents seen so far,
# and held-out accuracy of each online model at that point.
training_set_size, percp_scores, pa_scores = [], [], []
In [ ]:
# The 9 possible class labels (1..9), required by partial_fit on its
# first call; nsamples counts documents consumed from the stream.
classes = np.arange(1, 10)
nsamples = 0
In [ ]:
# Stream the rest of the corpus in mini-batches of 100 documents and train
# both online models incrementally, recording held-out accuracy every
# 1000 documents to build a learning curve.
for i in range(950):
    if i % 2 == 0:
        # BUG FIX: print("Working on batch %d", i) printed the format
        # string and i as two separate arguments; interpolate instead.
        print("Working on batch %d" % i)
    batch_text, batch_labels = hash_bytecode.get_batch(stream, 100)
    nsamples += len(batch_text)
    # BUG FIX: the original joined `X` (the held-out docs) here, so the
    # freshly fetched batch was discarded and every iteration trained on
    # the same test data. Join the batch that was just fetched.
    batch_text = [''.join(doc) for doc in batch_text]
    print('training batch bow')
    batch_bow = cv.transform(batch_text)
    perceptron.partial_fit(batch_bow, batch_labels, classes=classes)
    pa.partial_fit(batch_bow, batch_labels, classes=classes)
    if nsamples % 1000 == 0:
        training_set_size.append(nsamples)
        # BUG FIX: score() needs vectorized features, not the raw string
        # list X_test — use the precomputed test_bow.
        percp_scores.append(perceptron.score(test_bow, y_test))
        pa_scores.append(pa.score(test_bow, y_test))
In [ ]:
In [15]:
# Vocabulary of all 256 two-character lowercase hex byte tokens '00'..'ff'.
# BUG FIX: the original hex(i).split('x')[1] left values below 0x10
# unpadded ('0'..'f'); those one-character tokens never match the
# two-character byte tokens in the text, and CountVectorizer's default
# token_pattern requires 2+ word characters anyway, so bytes 0x00-0x0f
# were silently never counted.
hexes = {'{:02x}'.format(i) for i in range(256)}
In [29]:
# Include the '??' placeholder token (appears in the raw bytecode text —
# presumably marking unreadable bytes; confirm against the data format)
# so the vectorizer gives it its own count column.
hexes |= {'??'}
In [31]:
In [32]:
%%time
bow = cv.fit_transform(X_test[0:5])
In [33]:
Out[33]:
In [73]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [72]:
In [ ]:
In [ ]: