In [17]:
%load_ext autoreload
%autoreload 2

import itertools

import numpy as np
from sklearn import metrics
# NOTE: sklearn.cross_validation is the pre-0.18 module (now model_selection).
import sklearn.cross_validation as cv
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

import hash_bytecode
import src.utils.utils as utils


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [2]:
db = utils.get_mongodb()
collection = db.samples

In [ ]:


In [3]:
hash_bytecode.init('train')

In [4]:
stream = hash_bytecode.stream_files(collection, test=False)

In [ ]:


In [ ]:


In [5]:
X, y = hash_bytecode.get_batch(stream, 1000)


<pymongo.cursor.Cursor object at 0x7fe6a205b3c8>
0, about to get ikWOqdCK57rcjxP4SJZD in class 3
100, about to get 6CsTIZDSEdqNzi5tOKBo in class 9
200, about to get JUr6yTFW3ExqApBt5S2z in class 3
300, about to get 6NCw8nF9A7fUzYLyJIiD in class 3
400, about to get 0YWidX9hOD5sPrtTvc2M in class 2
500, about to get dNw1IRg58zQl0oFVJS2q in class 3
600, about to get 31KjQiNWATfPaozHtYOl in class 1
700, about to get eXUS3QBdChLqRxM8aYEs in class 3
800, about to get dncOwWDxEl3Ri7XCMmUz in class 3
900, about to get gKxq7SlW5RUHM0ru41Ta in class 8
id 1000, class, 1000, data size 1000

In [6]:
%%time
# flattens the list of docs into something the vectorizers can handle
X = [''.join(doc) for doc in X]


CPU times: user 5.18 s, sys: 2.54 s, total: 7.72 s
Wall time: 7.72 s

In [10]:
hvec = hash_bytecode.make_vectorizer()
hvec


Out[10]:
HashingVectorizer(analyzer='word', binary=False, decode_error='ignore',
         dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
         lowercase=True, n_features=134217728, ngram_range=(2, 4),
         non_negative=True, norm='l2', preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern=' ', tokenizer=None)

In [ ]:


In [139]:
%%time
bow = hvec.transform(X[0:100])


CPU times: user 4min 22s, sys: 8.87 s, total: 4min 31s
Wall time: 4min 31s

In [140]:
# from 1000
bow


Out[140]:
<100x33554432 sparse matrix of type '<class 'numpy.float64'>'
	with 200 stored elements in Compressed Sparse Row format>

In [100]:
bow


Out[100]:
<500x33554432 sparse matrix of type '<class 'numpy.float64'>'
	with 500 stored elements in Compressed Sparse Row format>

In [79]:


In [141]:
perceptron = Perceptron(n_jobs=-1, penalty='l2')
rf = RandomForestClassifier(n_jobs=-1, max_depth=10)
pa = PassiveAggressiveClassifier(C=1, n_jobs=-1)

In [156]:
clfs = [perceptron, pa]

In [ ]:


In [143]:
%%time
rf_predicted= cv.cross_val_predict(rf, bow, y[0:100], cv=5)


CPU times: user 19min 12s, sys: 17 s, total: 19min 29s
Wall time: 2min 47s

In [145]:
print(metrics.classification_report(y[0:100], rf_predicted))


             precision    recall  f1-score   support

          1       0.16      0.20      0.18        15
          2       0.67      0.14      0.24        28
          3       0.45      1.00      0.62        29
          4       0.20      0.20      0.20         5
          6       0.00      0.00      0.00         7
          7       0.00      0.00      0.00         4
          8       0.75      0.50      0.60         6
          9       0.00      0.00      0.00         6

avg / total       0.39      0.40      0.32       100


In [146]:
%%time
p_predicted= cv.cross_val_predict(perceptron, bow, y[0:100], cv=10)


CPU times: user 1min 20s, sys: 1min 7s, total: 2min 27s
Wall time: 33.6 s
/usr/local/lib/python3.4/dist-packages/sklearn/cross_validation.py:417: Warning: The least populated class in y has only 4 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=10.
  % (min_labels, self.n_folds)), Warning)

In [147]:
print(metrics.classification_report(y[0:100], p_predicted))


             precision    recall  f1-score   support

          1       0.16      0.27      0.20        15
          2       0.28      0.18      0.22        28
          3       0.27      0.31      0.29        29
          4       0.00      0.00      0.00         5
          6       0.10      0.14      0.12         7
          7       0.00      0.00      0.00         4
          8       0.00      0.00      0.00         6
          9       0.00      0.00      0.00         6

avg / total       0.19      0.19      0.18       100


In [150]:
%%time
pa_predicted= cv.cross_val_predict(pa, bow, y[0:100], cv=10)


CPU times: user 1min 16s, sys: 11.4 s, total: 1min 28s
Wall time: 33.2 s

In [149]:
print(metrics.classification_report(y[0:100], pa_predicted))


             precision    recall  f1-score   support

          1       0.17      0.27      0.21        15
          2       0.29      0.43      0.34        28
          3       0.37      0.45      0.41        29
          4       0.00      0.00      0.00         5
          6       0.00      0.00      0.00         7
          7       0.00      0.00      0.00         4
          8       0.00      0.00      0.00         6
          9       0.00      0.00      0.00         6

avg / total       0.21      0.29      0.25       100


In [ ]:
cv = CountVectorizer(vocabulary=hexes)

In [7]:
# resuing x, y as a test set before retraining
X_test, y_test = X, y

In [ ]:
%%time
test_bow = cv.transform(X_test)

In [ ]:
training_set_size = []
percp_scores = []
pa_scores = []

In [ ]:
classes = np.arange(9)  + 1
nsamples = 0

In [ ]:
for i in range(950):
    if i%2 == 0:
        print("Working on batch %d", i)
    
    batch_text, batch_labels = hash_bytecode.get_batch(stream, 100)
    nsamples += len(batch_text)
    batch_text = [''.join(doc) for doc in X]
    print('training batch bow')
    batch_bow = cv.transform(batch_text)
    perceptron.partial_fit(batch_bow, batch_labels, classes=classes)
    pa.partial_fit(batch_bow, batch_labels, classes=classes)
    if nsamples % 1000 == 0:
        training_set_size.append(nsamples)
        percp_scores.append(perceptron.score(X_test, y_test))
        pa_scores.append(pa.score(X_test, y_test))

In [ ]:


In [15]:
hexes = {i.split('x')[1] for i in [hex(i) for i in range(16**2)]}

In [29]:
hexes.add('??')

In [31]:


In [32]:
%%time
bow = cv.fit_transform(X_test[0:5])


CPU times: user 7.33 s, sys: 185 ms, total: 7.51 s
Wall time: 7.5 s

In [33]:



Out[33]:
<5x257 sparse matrix of type '<class 'numpy.int64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [73]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [72]:


In [ ]:


In [ ]: