In [25]:
from docopt import docopt
import logging
import os
import hashlib
import glob
import getpass
import socket
import datetime
import re
import requests
from slimit.lexer import Lexer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import manifold
from sklearn import cross_validation
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


class Database():
    """A directory structure containing JavaScript samples and associated metadata"""
    def __init__(self, path):
        """Load or create the Database at path"""
        self.path = path
        self.subdirs = [self.path+'/'+sdir for sdir in ['malicious', 'legitimate', 'unknown']]
        if not self.check():
            self.create()

    def check(self):
        """Check if the database exists and contains the needed subdirs"""
        if not os.path.isdir(self.path):
            return False
        if not all(map(os.path.isdir, self.subdirs)):
            return False
        return True

    def create(self):
        """Create the directory structure needed to host a database"""
        for sdir in self.subdirs:
            os.makedirs(sdir)

    def lookup(self, sha1):
        """Return the snippet with the provided sha1, if it exists in the database"""
        logging.debug('Looking for snippet '+sha1)
        files=sum([glob.glob(sdir+'/*.js') for sdir in self.subdirs], [])
        matching_files = [f for f in files if os.path.basename(f)[:-3] == sha1]
        assert len(matching_files) <= 1, "Multiple snippets of the same hash in the DB"
        if matching_files:
            fname = matching_files[0]
            logging.debug('Found')
            return Snippet(filename=matching_files[0])
        logging.debug('Not found')
        return None

    def add_new(self, snippet, message):
        """Add to the database a snippet that wasn't there before, return the newly created snippet"""
        fname = self.path+'/'+snippet.status+'/'+snippet.sha1+'.js'
        with open(fname, 'wb') as f:
            f.write(snippet.data)
        snippet.filename = fname
        snippet.log(event='created', message=message)
        return snippet

    def move(self, snippet, status, message):
        """Moves an existing snippet from its status to another"""
        path = snippet.filename.split('/')
        new_name = '/'.join(path[:-2]+[status, path[-1]])
        os.rename(snippet.filename, new_name)
        #Now moving the logs
        os.rename(snippet.filename[:-2]+'log', new_name[:-2]+'log')
        snippet.filename = new_name
        old_status = snippet.status
        snippet.status = status
        snippet.log(event='moved from '+old_status, message=message)
        
    def legitimate_snippets(self):
        """The legitimate snippets in the DB"""
        return [Snippet(filename=fname) for fname in sorted(glob.glob(self.path+'/legitimate/*.js'))]
    
    def malicious_snippets(self):
        """The malicious snippets in the DB"""
        return [Snippet(filename=fname) for fname in sorted(glob.glob(self.path+'/malicious/*.js'))]
    
        
class Snippet():
    """A JavaScript snippet"""

    log_template = "{date} {severity} hostname={hostname} user={user} sha1={sha1} status={status} event={event} message={message}\n"

    def __init__(self, filename=None, status='unknown', sha1=None, data=None):
        """Create a new snippet instance, either from data and status, or from a file in a DB"""
        self.data = data
        if filename:
            self.loadFromFile(filename)
            return
        if not sha1:
            self.sha1 = hashlib.sha1(self.data).hexdigest()
        else:
            self.sha1 = sha1
        self.status = status

    def loadFromFile(self, fname):
        """Load a snippet from a file in a DB"""
        self.filename = fname
        self.__init__(data=open(fname, 'r').read(), status=fname.split('/')[-2], sha1=fname.split('/')[-1][:-3])

    def log(self, event, message):
        """Append a line to the snippet's log file"""
        if not self.filename:
            raise ValueError("We don't know which DB we belong to.")
        severity='ERROR:' if 'bad move' in event else 'WARNING:' if 'moved' in event else 'INFO:'
        log_message = self.log_template.format(date=datetime.datetime.now().isoformat(),
                                               severity=severity,
                                               hostname=socket.gethostname(),
                                               user=getpass.getuser(),
                                               sha1=self.sha1,
                                               status=self.status,
                                               event=event,
                                               message=message if message else '')
        log_fname = self.filename[:-3]+'.log'
        with open(log_fname, 'a') as f:
            f.write(log_message)

In [2]:
db = Database('sekoia_db')

In [3]:
def train_from_js_tokens(corpus):
    tokens_corpus = []
    for t in corpus:
        print('INFO: Parsing '+t.filename)
        try:
            lexer = Lexer()
            lexer.input(t.data)
            tokens_corpus.append(' '.join([token.type for token in lexer]))
        except TypeError as e:
            print('Error lexing '+t.filename+' : '+str(e))
    return train_tfidf(tokens_corpus)

In [4]:
def train_tfidf(corpus):
    '''Return the tf_idf transformer trained on the supplied corpus

    Args:
        corpus (list): List of text elements'''
    count_vect = CountVectorizer()
    train_counts = count_vect.fit_transform(corpus)
    tfidf_transformer = TfidfTransformer().fit(train_counts)
    def text2tfidf(text_list):
        '''Transform a list of text into a tfidf matrix'''
        return tfidf_transformer.transform(count_vect.transform(text_list)).toarray()
    return text2tfidf

In [5]:
corpus = db.legitimate_snippets()+db.malicious_snippets()
tfidf_transformer = train_from_js_tokens(corpus)


INFO: Parsing sekoia_db/legitimate/0034c7dacfeb469c20f9fb8bcad407a6dc82937b.js
INFO: Parsing sekoia_db/legitimate/0a7e9f4c559853b7d1bb10d045b93e207150283a.js
INFO: Parsing sekoia_db/legitimate/12717789ff0f8512728ec437724b90b4698cf718.js
INFO: Parsing sekoia_db/legitimate/1852661bd11a09ca9b9cb63d1aa6ff390fffaf4e.js
INFO: Parsing sekoia_db/legitimate/2674a226732a96f20320ae4b3233bdf9476ff218.js
INFO: Parsing sekoia_db/legitimate/27aa8ddb0c9457d659c51806103741b163f4650a.js
INFO: Parsing sekoia_db/legitimate/2e1a413df833abd6e09088d1e7f4f7507b156995.js
INFO: Parsing sekoia_db/legitimate/32ea2acd3d51248716d70e26b047efe35099d0ce.js
INFO: Parsing sekoia_db/legitimate/33dbfc1abb4f98ede991f25fb9882f286b900e4d.js
INFO: Parsing sekoia_db/legitimate/3a9910396919f0542104a28fb3e7edbaa19bc317.js
INFO: Parsing sekoia_db/legitimate/3f36e7886f2e5d26b3537cc04d5bebdec3762f59.js
INFO: Parsing sekoia_db/legitimate/4125aeafb14e79f2924405f48d11369594b5aab6.js
INFO: Parsing sekoia_db/legitimate/44fd3f7462bfc6d8e3b21cbadeffd14433c368f1.js
INFO: Parsing sekoia_db/legitimate/4a1757478d2750312e382ecc14e3802bdb135237.js
INFO: Parsing sekoia_db/legitimate/6919012e72e3ff6c90f853888ee81d8db5f2211f.js
INFO: Parsing sekoia_db/legitimate/6e17aac23eee73d0d46a46ec66d332ea3c7f4429.js
INFO: Parsing sekoia_db/legitimate/751927cc0f71697c6b6f680192ac84a6d7ff3787.js
INFO: Parsing sekoia_db/legitimate/7bb0efb0f319d4237be5968391d98880affb547a.js
INFO: Parsing sekoia_db/legitimate/7cb076e63cd15b63b72477a3f2ad069d7fd31b3b.js
INFO: Parsing sekoia_db/legitimate/7e928d526cddff5ec6fdd832308db0e185d3c895.js
INFO: Parsing sekoia_db/legitimate/979143a6b10532413aefba462844d59df29fb2c1.js
INFO: Parsing sekoia_db/legitimate/9cacd3f5e72886436a7fb2251c804855fa05424b.js
INFO: Parsing sekoia_db/legitimate/a4159c8eb6c3ab846d365f2e7ae91e17a9adaa96.js
INFO: Parsing sekoia_db/legitimate/ac23fd531a08cf183ab08b689396b413b6738188.js
INFO: Parsing sekoia_db/legitimate/ccf9a60f5625e75d5e99d15bad6cd41810890ae0.js
INFO: Parsing sekoia_db/legitimate/dac2653d236c6a44809aef6eef26ebd13a02623b.js
INFO: Parsing sekoia_db/legitimate/dd337855c92fa79080e914c166f46ef8ab0908b9.js
INFO: Parsing sekoia_db/legitimate/e2d0c34e25ebce60edc4814ed99c570bb6a803f7.js
INFO: Parsing sekoia_db/legitimate/f6c78b41176d1b555c75fb0f464012868cf88cc5.js
INFO: Parsing sekoia_db/legitimate/fa9eebd174f8272bb3a68c96db6f19188aa9eea2.js
INFO: Parsing sekoia_db/malicious/00493276ff45f6c942e304bccdb677247931c5c8.js
INFO: Parsing sekoia_db/malicious/05107d9b05e03bc1d7163e5bf5f9c604858f1975.js
INFO: Parsing sekoia_db/malicious/0f582f86a817ea86bbb1b4448809578792abc426.js
INFO: Parsing sekoia_db/malicious/11823e0dd5a2939932ab08c59e4d570e93a119af.js
INFO: Parsing sekoia_db/malicious/13f5b14f7b32d3f572cdaa12f5a9e823544c68bb.js
INFO: Parsing sekoia_db/malicious/153c5446ee082fc289eb39e68182b2e79232532e.js
INFO: Parsing sekoia_db/malicious/24394b117a0ffc93ae2eca2ff9bf156a8dc4bc87.js
INFO: Parsing sekoia_db/malicious/2692705a42a4b27f05eb4830e5154650fdca554a.js
INFO: Parsing sekoia_db/malicious/299a041f98054b155f1cb07b810cef83d23247f2.js
INFO: Parsing sekoia_db/malicious/46896658e9e2f0d3c1d64dd08abbf72b963f1a24.js
INFO: Parsing sekoia_db/malicious/50cb0dd7c2bcda6596c972c95a929f6ba435d670.js
INFO: Parsing sekoia_db/malicious/50deec2f718f46483aae4ae0fb5c90693a820689.js
INFO: Parsing sekoia_db/malicious/519f04888204f277cba0e925634a3dd9958d9fb2.js
INFO: Parsing sekoia_db/malicious/548a25b51845dec6720c800528a835fbd9977fab.js
INFO: Parsing sekoia_db/malicious/5c3c250a5884122e56e8d508cbc2a60ea289e524.js
INFO: Parsing sekoia_db/malicious/6a8d11ae8d2bb2df9a55f79bdad9850f53a62f8a.js
INFO: Parsing sekoia_db/malicious/72126896dcd4f9614512887660a2722534634fe2.js
INFO: Parsing sekoia_db/malicious/7f5881237d3bbb514fbb5426de9f0d476c688627.js
INFO: Parsing sekoia_db/malicious/883a6f301c1fa871d2b12dfaeeafa436751a3d00.js
INFO: Parsing sekoia_db/malicious/8a877ce3d18f80017a9b09ea3a9cd9883e74c12a.js
INFO: Parsing sekoia_db/malicious/8f77522ae890c7a8fdad75b0e65afb3a150c56e2.js
INFO: Parsing sekoia_db/malicious/9b3808a2151824586077a1bc82aeea1ff5ad1ab3.js
INFO: Parsing sekoia_db/malicious/9f252cd4303072abf2cc8237ed2cee4d8bc28684.js
INFO: Parsing sekoia_db/malicious/a481a0a3bb0135d5292741ab30e223f6dd7d37ef.js
INFO: Parsing sekoia_db/malicious/abe0507e4841e2176645bd5cf0ce49cfb53e143d.js
INFO: Parsing sekoia_db/malicious/aceba980a1d602d78c4313c585b504d61f3d0db6.js
INFO: Parsing sekoia_db/malicious/b81a665e11f75767e536b14fde540c9c96c7273c.js
INFO: Parsing sekoia_db/malicious/bc9c8a3f6a657c8527a7cacb5958b4a4a4b3fb92.js
INFO: Parsing sekoia_db/malicious/c081a10133219e3dc06004e05b87964bdb770b69.js
INFO: Parsing sekoia_db/malicious/c20dd57d6b2e3f133449cbc590bbaee1ed152f23.js
INFO: Parsing sekoia_db/malicious/c735542798670583d2e492be4d11a2863a0ef195.js
INFO: Parsing sekoia_db/malicious/c7bac1a640f74bf1e5128ec465b25f20145766f0.js
INFO: Parsing sekoia_db/malicious/cdd9aebd5c7e84949bc90593b45d77f7ac383d33.js
INFO: Parsing sekoia_db/malicious/cf4713e1ad5068267bfcac8c9227d9fad1fc0b09.js
INFO: Parsing sekoia_db/malicious/d01177b92831b2544dc28b61467d61e620711bcc.js
INFO: Parsing sekoia_db/malicious/d0202fd0e179a6cd8f3a8e2498dc7b5061a9db33.js
INFO: Parsing sekoia_db/malicious/d346b132f82427951672fbdf5ddffc9bb0148d78.js
INFO: Parsing sekoia_db/malicious/d97e9c3bf9508248ed7e6e81152029205162f9ee.js
INFO: Parsing sekoia_db/malicious/e2c0b62964ebfc6b0ebc83f5b5cff8ac393bb3e5.js
INFO: Parsing sekoia_db/malicious/e35d002c12f2b3411b6be81e7bec061f80a90112.js
INFO: Parsing sekoia_db/malicious/e8b73f1cf84eb9f0f0ded0525bf8c4e4bed3d6ae.js
INFO: Parsing sekoia_db/malicious/ec3e5487c526cf9f4a0c20e3317ddf046aac5f45.js
INFO: Parsing sekoia_db/malicious/f8322b51104c8ee6906e8fca968341fd8921baf2.js
INFO: Parsing sekoia_db/malicious/ffabce78341dc27fc7993efadd46fa54fbcb55dc.js

In [6]:
X_0 = tfidf_transformer([s.data for s in db.legitimate_snippets()])

In [9]:
X_1 = tfidf_transformer([s.data for s in db.malicious_snippets()])
X = np.vstack([X_0, X_1])
Y =np.zeros(len(X))
Y[-len(X_1):] = 1

In [10]:
def clf_eval(clf, X, Y):
    loo = cross_validation.LeaveOneOut(len(Y))
    Y_true = []
    Y_pred = []
    for train_index, test_index in loo:
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        clf.fit(X_train, Y_train)
        Y_pred.append(clf.predict(X_test))
        Y_true.append(Y_test)
    Y_true = np.array(Y_true)
    Y_pred = np.array(Y_pred)
    cm =  metrics.confusion_matrix(Y_true, Y_pred)
    print("Overall accuracy : "+str(sum(Y_true == Y_pred)/len(Y_true)))
    return cm

In [11]:
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

results = {}
for str_clf in ['LinearSVC', 'KNeighborsClassifier', 'SVC', 'BaggingClassifier', 'RandomForestClassifier',
                'ExtraTreesClassifier', 'AdaBoostClassifier', 'GradientBoostingClassifier']:
    clf = eval(str_clf+'()')
    print("Evaluating "+str_clf+"...")
    results[str_clf] = clf_eval(clf, X, Y)


Evaluating LinearSVC...
Overall accuracy : [ 0.97297297]
Evaluating KNeighborsClassifier...
Overall accuracy : [ 0.93243243]
Evaluating SVC...
Overall accuracy : [ 0.59459459]
Evaluating BaggingClassifier...
Overall accuracy : [ 0.95945946]
Evaluating RandomForestClassifier...
Overall accuracy : [ 1.]
Evaluating ExtraTreesClassifier...
Overall accuracy : [ 0.97297297]
Evaluating AdaBoostClassifier...
Overall accuracy : [ 0.94594595]
Evaluating GradientBoostingClassifier...
Overall accuracy : [ 0.94594595]

In [21]:
results_fpfn = {}
results_fpfn['Yes'] = [len(X_0),0]
results_fpfn['No'] = [0, len(X_1)]
for k in results:
    results_fpfn[k] = [results[k][0,1], results[k][1,0]]
for k in results_fpfn:
    plt.scatter(results_fpfn[k][0], results_fpfn[k][1])
    plt.text(results_fpfn[k][0], results_fpfn[k][1], k)
plt.title("Performance of different classification methods over the current database")
plt.xlabel("# of False Positives")
plt.ylabel("# of False Negatives")


Out[21]:
<matplotlib.text.Text at 0x11621b358>

In [27]:
def single_projection(X, s, l, color, labels=None):
    Y = l.fit_transform(X)
    plt.title(s)
    plt.scatter(Y[:, 0], Y[:, 1], c=color, alpha=0.7)
    plt.axis('tight')
    if labels:
        place_labels(labels, Y)

def project_on_plane(X, color, n_neighbors = 10, n_components = 2, title='2D projection', unique=None, labels=None):
    '''Give multiple 2D representations of a high-dimenstional dataset

    See http://scikit-learn.org/stable/auto_examples/manifold/plot_compare_methods.html'''
    fig = plt.figure(figsize=(15,8))
    learners = [['Isomap', manifold.Isomap(n_neighbors, n_components)],
                ['LLE', manifold.LocallyLinearEmbedding(n_neighbors, n_components, eigen_solver='auto')],
                ['LTSA', manifold.LocallyLinearEmbedding(n_neighbors, n_components, eigen_solver='auto', method='ltsa')],
                ['Hessian', manifold.LocallyLinearEmbedding(n_neighbors, n_components, eigen_solver='auto', method='hessian')],
                ["Modified", manifold.LocallyLinearEmbedding(n_neighbors, n_components, eigen_solver='auto', method='hessian')],
                ["MDS", manifold.MDS(n_components, max_iter=100, n_init=1)],
                ["Spectral Embedding", manifold.SpectralEmbedding(n_components=n_components,n_neighbors=n_neighbors)],
                ["t-SNE", manifold.TSNE(n_components=n_components, init='pca', random_state=0)]
                ]
    fig_num = 331
    if unique:
        s,l = [x for x in learners if x[0] == unique][0]
        single_projection(X, s, l, color, labels)
    else:
        for s, l in learners:
            ax = fig.add_subplot(fig_num)
            fig_num += 1
            single_projection(X, s, l, color, labels)
    plt.savefig(title+'.pdf')
    plt.show()

In [28]:
project_on_plane(X, ['r' if c == 1 else 'g' for c in Y])



In [29]:
project_on_plane(X, ['r' if c == 1 else 'g' for c in Y], unique='Isomap')



In [30]:
project_on_plane(X, ['r' if c == 1 else 'g' for c in Y], unique='MDS')



In [35]:
Y2d = manifold.Isomap(10, 2).fit_transform(X)
plt.scatter(Y2d[:, 0], Y2d[:, 1], c=['r' if c == 1 else 'g' for c in Y], alpha=0.7)


Out[35]:
<matplotlib.collections.PathCollection at 0x1179601d0>

In [73]:
clf = RandomForestClassifier().fit(X,Y)
isomap = manifold.Isomap(10, 2)
Y2d = isomap.fit_transform(X)
if hasattr(clf, "decision_function"):
    Z = clf.decision_function(X)
else:
    Z = clf.predict_proba(X)[:, 1]
plt.scatter(Y2d[:, 0], Y2d[:, 1], c=Z, alpha=0.7)
x_min = X.min(0)
x_span = X.max(0)-X.min(0)
rand_points = []
for i in range(0,100):
    r = np.random.random(len(x_min))*x_span - x_min
    rand_points.append(r)
rand_points = np.array(rand_points)
if hasattr(clf, "decision_function"):
    rand_values = clf.decision_function(rand_points)
else:
    rand_values = clf.predict_proba(rand_points)[:, 1]
Ybis2d = isomap.transform(rand_points)
plt.scatter(Ybis2d[:,0], Ybis2d[:,1], c=rand_values)


Out[73]:
<matplotlib.collections.PathCollection at 0x119098be0>

In [86]:
zi[:,:,0].shape


Out[86]:
(500, 500)

In [88]:
plt.figure(figsize=(15,10))
from scipy.interpolate import griddata
_Y = np.vstack([Y2d, Ybis2d])
_Z = np.vstack([Z[:,None], rand_values[:,None]])
xi = np.linspace(_Y[:,0].min(), _Y[:,0].max(), 500)
yi = np.linspace(_Y[:,1].min(), _Y[:,1].max(), 500)
zi = griddata(_Y, _Z, (xi[None,:], yi[:,None]), method='nearest')
CS = plt.contour(xi,yi,zi[:,:,0],15,linewidths=0.5,colors='k')
CS = plt.contourf(xi,yi,zi[:,:,0],15,cmap=plt.cm.jet)
plt.scatter(Y2d[:, 0], Y2d[:, 1], c=Z)
plt.savefig('Picasso.png')