In [1]:
import src.utils.utils as utils
import gensim
from gensim.models.word2vec import Word2Vec
import sklearn.cross_validation as cv
from sklearn.ensemble import RandomForestClassifier
import numpy as np # Make sure that numpy is imported
In [2]:
def makeFeatureVec(words, model, num_features):
    """Average the Word2Vec vectors of all in-vocabulary words of one document.

    Parameters
    ----------
    words : iterable of str
        Tokens of a single document/review.
    model : gensim Word2Vec model
        Pre-1.0 gensim API: exposes `index2word` and `model[word]` lookup.
        NOTE(review): modern gensim moved these to `model.wv`; confirm the
        pinned gensim version before upgrading.
    num_features : int
        Dimensionality of the model's word vectors.

    Returns
    -------
    numpy.ndarray
        Shape ``(num_features,)``, dtype float32 — the mean vector of the
        in-vocabulary tokens, or an all-zero vector when none of the tokens
        are in the vocabulary.
    """
    # Pre-initialize an empty numpy array (for speed).
    featureVec = np.zeros((num_features,), dtype="float32")
    nwords = 0
    # index2word lists the model's vocabulary; a set makes membership O(1).
    index2word_set = set(model.index2word)
    # Sum the vectors of every token that is in the model's vocabulary.
    for word in words:
        if word in index2word_set:
            nwords += 1
            featureVec = np.add(featureVec, model[word])
    # BUG FIX: the original divided unconditionally, so a document with no
    # in-vocabulary token produced a NaN vector (0/0). Return zeros instead.
    if nwords == 0:
        return featureVec
    # Divide the summed vector by the token count to get the average.
    return np.divide(featureVec, nwords)
def getAvgFeatureVecs(reviews, model, num_features):
    """Compute the average Word2Vec feature vector for each review.

    Parameters
    ----------
    reviews : sequence of token lists
        Each element is one document as a list of words.
    model : gensim Word2Vec model
        Passed through to :func:`makeFeatureVec`.
    num_features : int
        Dimensionality of the model's word vectors.

    Returns
    -------
    numpy.ndarray
        Shape ``(len(reviews), num_features)``, dtype float32; row i is the
        averaged vector of ``reviews[i]``.
    """
    # Preallocate a 2D numpy array, for speed.
    reviewFeatureVecs = np.zeros((len(reviews), num_features), dtype="float32")
    # BUG FIX: the original counter was a Python float (0., incremented by 1.)
    # used directly as an array index — numpy rejects non-integer indices.
    # enumerate() yields a proper int index.
    for counter, review in enumerate(reviews):
        # Print a status message every 1000th review.
        if counter % 1000 == 0:
            print( "Review %d of %d" % (counter, len(reviews)))
        # Average this review's word vectors (defined above).
        reviewFeatureVecs[counter] = makeFeatureVec(review, model,
                                                    num_features)
    return reviewFeatureVecs
In [3]:
# NOTE(review): stray literal left over from interactive exploration; it is
# never used anywhere below — candidate for deletion.
887960960
In [4]:
# Fetch one sample document whose "class" field is the string "5".
# NOTE(review): `malwares` (apparently a MongoDB database handle) is not
# defined in any visible cell — presumably created in a deleted earlier cell;
# this cannot run on a fresh kernel. `testsample` is also never used below.
testsample = malwares.samples.find_one({"class":"5"})
In [5]:
# Query all documents in the `reduced` collection that carry an `asm_info`
# field; returns a cursor that is consumed by the cells below.
data = malwares.reduced.find({'asm_info': {'$exists': True}})
In [6]:
# Number of documents matched by the query above.
# NOTE(review): Cursor.count() was removed in recent pymongo versions
# (count_documents() on the collection is the replacement) — verify the
# pinned pymongo still supports this call.
data.count()
Out[6]:
In [ ]:
In [7]:
# Lazily pair each document's class label with its instruction sequence.
# NOTE: this is a generator — it can be consumed only once, and only while
# the `data` cursor is still live.
train = ((m['class'], m['asm_info']['seq']) for m in data)
In [8]:
# Materialize the generator into a list (this exhausts the MongoDB cursor).
train = list(train)
In [9]:
# Keep only samples whose instruction sequence has more than two tokens,
# then split the surviving (label, sequence) pairs into parallel lists.
# (Renamed the comprehension variables: the original shadowed `t` and `seq`.)
t = [pair for pair in train if len(pair[1]) > 2]
labs = [pair[0] for pair in t]
seq = [pair[1] for pair in t]
# Number of samples that survived the length filter (cell output).
len(t)
Out[9]:
In [10]:
# Class labels as an integer numpy array (labels arrive as strings from Mongo).
labels = np.array([int(l) for l in labs])
In [ ]:
In [ ]:
In [ ]:
In [12]:
# Train Word2Vec on the instruction sequences (default 100-dim vectors,
# context window of 20, 4 worker threads).
# NOTE(review): no seed is set and workers>1 makes training non-deterministic;
# results will vary between runs.
model = Word2Vec(seq, workers=4, window=20)
In [13]:
# Average each sequence's word vectors into one 100-dim feature vector per
# sample (100 must match the Word2Vec vector size used above).
train_vecs = getAvgFeatureVecs(seq, model, 100)
# test_vecs = getAvgFeatureVecs(test, model, 100)
In [ ]:
In [75]:
Out[75]:
In [76]:
In [103]:
In [78]:
In [105]:
Out[105]:
In [30]:
# Random forest: 1000 trees, depth capped at 10, fit with 2 parallel jobs.
# NOTE(review): no random_state is set, so fits are not reproducible.
forest = RandomForestClassifier( n_estimators=1000, max_depth=10, n_jobs=2)
In [40]:
# Fit on the averaged Word2Vec features; fit() returns the estimator itself,
# so `res` is the fitted forest (used below for feature importances).
res = forest.fit(train_vecs, labels)
In [108]:
# 10-fold cross-validation accuracy scores.
# NOTE(review): `scores` is never displayed or used below. Also,
# sklearn.cross_validation was removed in scikit-learn 0.20 — the modern
# module is sklearn.model_selection; verify the pinned sklearn version.
scores = cv.cross_val_score(forest, train_vecs, labels, cv=10)
In [31]:
# Out-of-fold predictions for every sample via 10-fold CV — used for the
# classification report below.
predicted = cv.cross_val_predict(forest, train_vecs, labels, cv=10)
In [32]:
# Display the cross-validated predictions.
predicted
Out[32]:
In [33]:
# Display the ground-truth labels for visual comparison with `predicted`.
labels
Out[33]:
In [34]:
# Per-class precision/recall/F1 for the Word2Vec + random-forest pipeline.
import sklearn.metrics as metrics
print(metrics.classification_report(labels, predicted))
In [42]:
# Importance of each of the 100 averaged-vector dimensions.
feature_importance = res.feature_importances_
In [43]:
# Feature indices sorted by importance, descending.
ind = np.argsort(feature_importance)[::-1]
In [44]:
# Print the ten most important feature dimensions with their importances.
for rank in range(10):
    print("%d. feature %d (%f)" % (rank + 1, ind[rank], feature_importance[ind[rank]]))
In [23]:
# NOTE(review): `preds` is not defined anywhere in this notebook — this cell
# cannot run on a fresh kernel. It presumably meant `predicted` (with 0 as a
# special class); confirm the intent before fixing or delete the cell.
1 -len(preds[preds != 0])/len(labels)
Out[23]:
In [49]:
# TF-IDF vectorizer for the n-gram baseline models below.
from sklearn.feature_extraction.text import TfidfVectorizer
In [ ]:
In [52]:
# Join each token sequence into a single space-separated string — the input
# format TfidfVectorizer expects.
s = ' '
seqs = [s.join(tokens) for tokens in seq]
In [ ]:
In [64]:
# Candidate classifiers for the TF-IDF features.
# NOTE(review): only MultinomialNB is used below; `svm` is imported but unused.
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
In [78]:
# Build a TF-IDF document-term matrix over the joined sequences using
# 3- to 8-grams; `tfidf8` ends up holding the sparse matrix, as the
# `clf8.fit(...)` cell below expects.
# BUG FIX: the second line previously called fit_transform on an undefined
# name `tfidf` (NameError on a fresh kernel); it must use the vectorizer
# created on the line above.
tfidf8 = TfidfVectorizer(ngram_range=(3,8))
tfidf8 = tfidf8.fit_transform(seqs)
In [80]:
# Same as the cell above but with 6- to 12-grams; `tfidf12` ends up holding
# the sparse document-term matrix.
# BUG FIX: previously called fit_transform on the undefined name `tfidf`;
# it must use the vectorizer created on the line above.
tfidf12 = TfidfVectorizer(ngram_range=(6,12))
tfidf12 = tfidf12.fit_transform(seqs)
In [79]:
# Multinomial naive Bayes on the 3-8-gram TF-IDF matrix.
clf8 = MultinomialNB()
clf8.fit(tfidf8, labels)
Out[79]:
In [81]:
# Multinomial naive Bayes on the 6-12-gram TF-IDF matrix.
clf12 = MultinomialNB()
clf12.fit(tfidf12, labels)
Out[81]:
In [ ]:
In [83]:
# Out-of-fold predictions (10-fold CV) for each TF-IDF baseline.
# BUG FIX: both calls previously used the undefined name `clf`; use the
# classifiers created above (clf8 / clf12) so each prediction matches its
# own n-gram matrix.
predicted8 = cv.cross_val_predict(clf8, tfidf8, labels, cv=10)
predicted12 = cv.cross_val_predict(clf12, tfidf12, labels, cv=10)
In [84]:
# Compare per-class metrics of the two n-gram ranges.
print(metrics.classification_report(labels, predicted8))
print(metrics.classification_report(labels, predicted12))
In [ ]:
In [90]:
# Tiny toy corpus left over from experimenting with arrays of tokens; not
# used by the analysis above — candidate for deletion.
words = np.array([['for', 'i','am','dead'],['i','am','not','dead']])
In [93]:
# Display the toy array.
words
Out[93]:
In [97]:
# PCA import for a (never-completed) 2-D projection experiment.
from sklearn.decomposition import PCA
In [100]:
# 2-component PCA — instantiated but never fitted or applied; dead code.
pca = PCA(n_components=2)
In [ ]:
In [ ]: