In [1]:
import src.utils.utils as utils
import gensim
from gensim.models.word2vec import Word2Vec
import sklearn.cross_validation as cv
from sklearn.ensemble import RandomForestClassifier

import numpy as np  # Make sure that numpy is imported


hello

In [2]:
def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    # 
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.index2word)
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    # 
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter
    counter = 0.
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    # 
    # Loop through the reviews
    for review in reviews:
       #
       # Print a status message every 1000th review
       if counter%1000. == 0.:
           print( "Review %d of %d" % (counter, len(reviews)))
       # 
       # Call the function (defined above) that makes average feature vectors
       reviewFeatureVecs[counter] = makeFeatureVec(review, model, \
           num_features)
       #
       # Increment the counter
       counter = counter + 1.
    return reviewFeatureVecs

In [3]:
887960960

In [4]:
testsample = malwares.samples.find_one({"class":"5"})


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-4-480b7e851eba> in <module>()
----> 1 testsample = malwares.samples.find_one({"class":"5"})

/usr/local/lib/python3.4/site-packages/pymongo/collection.py in find_one(self, spec_or_id, *args, **kwargs)
    722                            *args, **kwargs).max_time_ms(max_time_ms)
    723 
--> 724         for result in cursor.limit(-1):
    725             return result
    726         return None

/usr/local/lib/python3.4/site-packages/pymongo/cursor.py in __next__(self)
   1074             raise StopIteration
   1075         db = self.__collection.database
-> 1076         if len(self.__data) or self._refresh():
   1077             if self.__manipulate:
   1078                 return db._fix_outgoing(self.__data.popleft(),

/usr/local/lib/python3.4/site-packages/pymongo/cursor.py in _refresh(self)
   1018                               self.__skip, ntoreturn,
   1019                               self.__query_spec(), self.__fields,
-> 1020                               self.__uuid_subtype))
   1021             if not self.__id:
   1022                 self.__killed = True

/usr/local/lib/python3.4/site-packages/pymongo/cursor.py in __send_message(self, message)
    931 
    932             try:
--> 933                 res = client._send_message_with_response(message, **kwargs)
    934                 self.__connection_id, (response, sock, pool) = res
    935                 if self.__exhaust:

/usr/local/lib/python3.4/site-packages/pymongo/mongo_client.py in _send_message_with_response(self, message, _must_use_master, **kwargs)
   1203                 sock_info.sock.settimeout(kwargs["network_timeout"])
   1204 
-> 1205             response = self.__send_and_receive(message, sock_info)
   1206 
   1207             if not exhaust:

/usr/local/lib/python3.4/site-packages/pymongo/mongo_client.py in __send_and_receive(self, message, sock_info)
   1180         try:
   1181             sock_info.sock.sendall(data)
-> 1182             return self.__receive_message_on_socket(1, request_id, sock_info)
   1183         except:
   1184             sock_info.close()

/usr/local/lib/python3.4/site-packages/pymongo/mongo_client.py in __receive_message_on_socket(self, operation, rqst_id, sock_info)
   1172         assert operation == struct.unpack("<i", header[12:])[0]
   1173 
-> 1174         return self.__receive_data_on_socket(length - 16, sock_info)
   1175 
   1176     def __send_and_receive(self, message, sock_info):

/usr/local/lib/python3.4/site-packages/pymongo/mongo_client.py in __receive_data_on_socket(self, length, sock_info)
   1151         message = EMPTY
   1152         while length:
-> 1153             chunk = sock_info.sock.recv(length)
   1154             if chunk == EMPTY:
   1155                 raise ConnectionFailure("connection closed")

KeyboardInterrupt: 

In [5]:
data = malwares.reduced.find({'asm_info': {'$exists': True}})

In [6]:
data.count()


Out[6]:
435

In [ ]:


In [7]:
train = ((m['class'], m['asm_info']['seq']) for m in data)

In [8]:
train = list(train)

In [9]:
t = [t for t in train if len(t[1]) >2]

labs = [lab[0] for lab in t]
seq = [seq[1] for seq in t]
len(t)


Out[9]:
407

In [10]:
labels = np.array([int(l) for l in labs])

In [ ]:


In [ ]:


In [ ]:


In [12]:
model = Word2Vec(seq, workers=4, window=20)

In [13]:
train_vecs = getAvgFeatureVecs(seq, model, 100)
# test_vecs = getAvgFeatureVecs(test, model, 100)


Review 0 of 407

In [ ]:


In [75]:



Out[75]:
(248, 100)

In [76]:


In [103]:



Review 0 of 407
Review 0 of 105

In [78]:


In [105]:



Out[105]:
array([4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3,
       4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9])

In [30]:
forest = RandomForestClassifier( n_estimators=1000, max_depth=10, n_jobs=2)

In [40]:
res = forest.fit(train_vecs, labels)

In [108]:
scores = cv.cross_val_score(forest, train_vecs, labels, cv=10)

In [31]:
predicted = cv.cross_val_predict(forest, train_vecs, labels, cv=10)

In [32]:
predicted


Out[32]:
array([4, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 1, 1, 3, 3,
       4, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 2, 2, 1, 8, 8, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 9, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 1, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 5, 5, 5,
       5, 5, 2, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 1, 6, 6, 6, 6, 6, 6, 7, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 8, 4, 6, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 1, 6, 2, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 1, 1, 9, 9, 9, 1])

In [33]:
labels


Out[33]:
array([4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 3,
       4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9])

In [34]:
import sklearn.metrics as metrics

print(metrics.classification_report(labels, predicted))


             precision    recall  f1-score   support

          1       0.77      0.86      0.81        43
          2       0.86      0.86      0.86        50
          3       1.00      1.00      1.00        50
          4       0.92      0.97      0.94        34
          5       1.00      0.91      0.96        35
          6       0.96      0.96      0.96        50
          7       0.98      0.93      0.95        45
          8       0.94      0.94      0.94        50
          9       0.98      0.94      0.96        50

avg / total       0.93      0.93      0.93       407


In [42]:
feature_importance = res.feature_importances_

In [43]:
ind = np.argsort(feature_importance)[::-1]

In [44]:
for f in range(10):
    print("%d. feature %d (%f)" % (f + 1, ind[f], feature_importance[ind[f]]))


1. feature 49 (0.033371)
2. feature 14 (0.029397)
3. feature 57 (0.028274)
4. feature 23 (0.021922)
5. feature 56 (0.021283)
6. feature 86 (0.021065)
7. feature 77 (0.018875)
8. feature 76 (0.018426)
9. feature 44 (0.017547)
10. feature 65 (0.016839)

In [23]:
1 -len(preds[preds != 0])/len(labels)


Out[23]:
0.9287469287469288

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [ ]:


In [52]:
s = ' '
seqs = [s.join(se) for se in seq]

In [ ]:


In [64]:
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB

In [78]:
tfidf8 = TfidfVectorizer(ngram_range=(3,8))
tfidf8 = tfidf.fit_transform(seqs)

In [80]:
tfidf12 = TfidfVectorizer(ngram_range=(6,12))
tfidf12 = tfidf.fit_transform(seqs)

In [79]:
clf8 = MultinomialNB()
clf8.fit(tfidf8, labels)


Out[79]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [81]:
clf12 = MultinomialNB()
clf12.fit(tfidf12, labels)


Out[81]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [ ]:


In [83]:
predicted8 = cv.cross_val_predict(clf, tfidf8, labels, cv=10)
predicted12 = cv.cross_val_predict(clf, tfidf12, labels, cv=10)

In [84]:
print(metrics.classification_report(labels, predicted8))
print(metrics.classification_report(labels, predicted12))


             precision    recall  f1-score   support

          1       0.90      0.88      0.89        43
          2       0.96      1.00      0.98        50
          3       0.94      1.00      0.97        50
          4       1.00      0.94      0.97        34
          5       1.00      0.74      0.85        35
          6       1.00      0.92      0.96        50
          7       0.89      0.93      0.91        45
          8       0.78      0.94      0.85        50
          9       0.96      0.94      0.95        50

avg / total       0.93      0.93      0.93       407

             precision    recall  f1-score   support

          1       0.90      0.88      0.89        43
          2       0.96      1.00      0.98        50
          3       0.94      1.00      0.97        50
          4       1.00      0.94      0.97        34
          5       1.00      0.74      0.85        35
          6       1.00      0.92      0.96        50
          7       0.89      0.93      0.91        45
          8       0.78      0.94      0.85        50
          9       0.96      0.94      0.95        50

avg / total       0.93      0.93      0.93       407


In [ ]:


In [90]:
words = np.array([['for', 'i','am','dead'],['i','am','not','dead']])

In [93]:
words


Out[93]:
array([['for', 'i', 'am', 'dead'],
       ['i', 'am', 'not', 'dead']], 
      dtype='<U4')

In [97]:
from sklearn.decomposition import PCA

In [100]:
pca = PCA(n_components=2)

In [ ]:


In [ ]: