In [17]:
%load_ext autoreload
%autoreload 2

import itertools

import numpy as np
from sklearn import metrics
# NOTE: sklearn.cross_validation is the pre-0.18 module (now model_selection).
import sklearn.cross_validation as cv
from sklearn.cross_validation import cross_val_score, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

import hash_bytecode
import src.utils.utils as utils


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

In [2]:
db = utils.get_mongodb()
collection = db.samples

In [ ]:


In [3]:
hash_bytecode.init('train')

In [4]:
stream = hash_bytecode.stream_files(collection, test=False)

In [ ]:


In [ ]:


In [5]:
X, y = hash_bytecode.get_batch(stream, 1000)


<pymongo.cursor.Cursor object at 0x7fe6a205b3c8>
0, about to get ikWOqdCK57rcjxP4SJZD in class 3
100, about to get 6CsTIZDSEdqNzi5tOKBo in class 9
200, about to get JUr6yTFW3ExqApBt5S2z in class 3
300, about to get 6NCw8nF9A7fUzYLyJIiD in class 3
400, about to get 0YWidX9hOD5sPrtTvc2M in class 2
500, about to get dNw1IRg58zQl0oFVJS2q in class 3
600, about to get 31KjQiNWATfPaozHtYOl in class 1
700, about to get eXUS3QBdChLqRxM8aYEs in class 3
800, about to get dncOwWDxEl3Ri7XCMmUz in class 3
900, about to get gKxq7SlW5RUHM0ru41Ta in class 8
id 1000, class, 1000, data size 1000

In [6]:
%%time
# flattens the list of docs into something the vectorizers can handle
X = [''.join(doc) for doc in X]


CPU times: user 5.18 s, sys: 2.54 s, total: 7.72 s
Wall time: 7.72 s

In [10]:
hvec = hash_bytecode.make_vectorizer()
hvec


Out[10]:
HashingVectorizer(analyzer='word', binary=False, decode_error='ignore',
         dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
         lowercase=True, n_features=134217728, ngram_range=(2, 4),
         non_negative=True, norm='l2', preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern=' ', tokenizer=None)

In [ ]:


In [139]:
%%time
bow = hvec.transform(X[0:100])


CPU times: user 4min 22s, sys: 8.87 s, total: 4min 31s
Wall time: 4min 31s

In [140]:
# from 1000
bow


Out[140]:
<100x33554432 sparse matrix of type '<class 'numpy.float64'>'
	with 200 stored elements in Compressed Sparse Row format>

In [100]:
bow


Out[100]:
<500x33554432 sparse matrix of type '<class 'numpy.float64'>'
	with 500 stored elements in Compressed Sparse Row format>

In [79]:


In [141]:
perceptron = Perceptron(n_jobs=-1, penalty='l2')
rf = RandomForestClassifier(n_jobs=-1, max_depth=10)
pa = PassiveAggressiveClassifier(C=1, n_jobs=-1)

In [156]:
clfs = [perceptron, pa]

In [ ]:


In [143]:
%%time
rf_predicted= cv.cross_val_predict(rf, bow, y[0:100], cv=5)


CPU times: user 19min 12s, sys: 17 s, total: 19min 29s
Wall time: 2min 47s

In [145]:
print(metrics.classification_report(y[0:100], rf_predicted))


             precision    recall  f1-score   support

          1       0.16      0.20      0.18        15
          2       0.67      0.14      0.24        28
          3       0.45      1.00      0.62        29
          4       0.20      0.20      0.20         5
          6       0.00      0.00      0.00         7
          7       0.00      0.00      0.00         4
          8       0.75      0.50      0.60         6
          9       0.00      0.00      0.00         6

avg / total       0.39      0.40      0.32       100


In [146]:
%%time
p_predicted= cv.cross_val_predict(perceptron, bow, y[0:100], cv=10)


CPU times: user 1min 20s, sys: 1min 7s, total: 2min 27s
Wall time: 33.6 s
/usr/local/lib/python3.4/dist-packages/sklearn/cross_validation.py:417: Warning: The least populated class in y has only 4 members, which is too few. The minimum number of labels for any class cannot be less than n_folds=10.
  % (min_labels, self.n_folds)), Warning)

In [147]:
print(metrics.classification_report(y[0:100], p_predicted))


             precision    recall  f1-score   support

          1       0.16      0.27      0.20        15
          2       0.28      0.18      0.22        28
          3       0.27      0.31      0.29        29
          4       0.00      0.00      0.00         5
          6       0.10      0.14      0.12         7
          7       0.00      0.00      0.00         4
          8       0.00      0.00      0.00         6
          9       0.00      0.00      0.00         6

avg / total       0.19      0.19      0.18       100


In [150]:
%%time
pa_predicted= cv.cross_val_predict(pa, bow, y[0:100], cv=10)


CPU times: user 1min 16s, sys: 11.4 s, total: 1min 28s
Wall time: 33.2 s

In [149]:
print(metrics.classification_report(y[0:100], pa_predicted))


             precision    recall  f1-score   support

          1       0.17      0.27      0.21        15
          2       0.29      0.43      0.34        28
          3       0.37      0.45      0.41        29
          4       0.00      0.00      0.00         5
          6       0.00      0.00      0.00         7
          7       0.00      0.00      0.00         4
          8       0.00      0.00      0.00         6
          9       0.00      0.00      0.00         6

avg / total       0.21      0.29      0.25       100


In [ ]:
cv = CountVectorizer(vocabulary=hexes)

In [7]:
# resuing x, y as a test set before retraining
X_test, y_test = X, y

In [ ]:
%%time
test_bow = cv.transform(X_test)

In [ ]:
training_set_size = []
percp_scores = []
pa_scores = []

In [ ]:
classes = np.arange(9)  + 1
nsamples = 0

In [ ]:
for i in range(950):
    if i%2 == 0:
        print("Working on batch %d", i)
    
    batch_text, batch_labels = hash_bytecode.get_batch(stream, 100)
    nsamples += len(batch_text)
    batch_text = [''.join(doc) for doc in X]
    print('training batch bow')
    batch_bow = cv.transform(batch_text)
    perceptron.partial_fit(batch_bow, batch_labels, classes=classes)
    pa.partial_fit(batch_bow, batch_labels, classes=classes)
    if nsamples % 1000 == 0:
        training_set_size.append(nsamples)
        percp_scores.append(perceptron.score(X_test, y_test))
        pa_scores.append(pa.score(X_test, y_test))

In [ ]:


In [15]:
hexes = {i.split('x')[1] for i in [hex(i) for i in range(16**2)]}

In [29]:
hexes.add('??')

In [31]:


In [32]:
%%time
bow = cv.fit_transform(X_test[0:5])


CPU times: user 7.33 s, sys: 185 ms, total: 7.51 s
Wall time: 7.5 s

In [33]:



Out[33]:
<5x257 sparse matrix of type '<class 'numpy.int64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [73]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [72]:


In [ ]:


In [ ]: