In [ ]:
import sys
sys.path.reverse()  # reverse the module search path so entries normally searched last are tried first
from gensim import corpora, models, similarities

In [ ]:
# universal stop list and word delimiters
stoplist = set('- get head post 302 200 404 403'.split())
import re
delimiters = "/", "\\", " ", "&", "?"
regexPattern = '|'.join(map(re.escape, delimiters))
regexPattern
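
In [ ]:
# quick sanity check of the tokenizer: split a sample request path
# (this path is illustrative, not taken from the dataset)
re.split(regexPattern, '/firefox/releases/index.html?lang=en'.lower())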

In [ ]:
# learn words: build a vocabulary from a known-good access log
dictionary = corpora.Dictionary(
    re.split(regexPattern, line.lower())
    for line in open('www.mozilla.org.access_2013-12-31-13.good',
                     encoding='ascii', errors='ignore')
)

In [ ]:
dictionary.token2id

In [ ]:
len(dictionary)

In [ ]:
# learn some bad words from fuzzdb:
# https://code.google.com/p/fuzzdb/source/browse/trunk/attack-payloads/all-attacks/all-attacks-unix.txt
dictionary.add_documents(
    re.split(regexPattern, line.lower())
    for line in open('all-attacks-unix.txt', encoding='ascii', errors='ignore')
)

In [ ]:
dictionary.token2id

In [ ]:
len(dictionary)
#save if you want with: dictionary.save('xldictionary.dic')
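
In [ ]:
# optional round trip: persist the merged dictionary and reload it
# (filename taken from the comment above; Dictionary.load is the standard counterpart)
dictionary.save('xldictionary.dic')
dictionary = corpora.Dictionary.load('xldictionary.dic')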

In [ ]:
# make a corpus of good words only: one bag-of-words per log line, stop words dropped
corpusGood = [
    dictionary.doc2bow(
        [word for word in re.split(regexPattern, line.lower()) if word not in stoplist]
    )
    for line in open('www.mozilla.org.access_2013-12-31-13.good',
                     encoding='ascii', errors='ignore').readlines()[0:20000]
]

In [ ]:
len(corpusGood)

In [ ]:
# latent semantic indexing: http://en.wikipedia.org/wiki/Latent_semantic_indexing
lsiGood = models.LsiModel(corpusGood, id2word=dictionary, num_topics=200)
lsiGood.print_topics(1000)[0]

In [ ]:
agoodhit=r'''/thunderbird/js/jquery/jquery-1.5.1.min.js'''

In [ ]:
vecGood = dictionary.doc2bow(re.split(regexPattern, agoodhit.lower()))
veclsiGood = lsiGood[vecGood]  # convert the query to LSI space

In [ ]:
len(veclsiGood)

In [ ]:
from operator import itemgetter
sorted(veclsiGood, key=itemgetter(1), reverse=True)
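
In [ ]:
# sketch using the so-far-unused similarities import: index the good corpus in
# LSI space and rank its log lines by cosine similarity to the query
# (MatrixSimilarity keeps the whole index in RAM, which is fine at 20k lines)
index = similarities.MatrixSimilarity(lsiGood[corpusGood])
sims = index[veclsiGood]
sorted(enumerate(sims), key=itemgetter(1), reverse=True)[:5]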

In [ ]:
lsiGood.print_topics(1000)[4]

In [ ]:
badstoplist = set('- get foo or 1.0 1.1 302 200 404 403'.split())
# create a bad corpus from the attack payloads
corpusBad = [
    dictionary.doc2bow(
        [word for word in re.split(regexPattern, line.lower()) if word not in badstoplist]
    )
    for line in open('web-attacks-unix.txt', encoding='ascii', errors='ignore')
]

In [ ]:
lsiBad = models.LsiModel(corpusBad, id2word=dictionary, num_topics=1000)
#lsiBad.print_topics(10)

In [ ]:
abadhit=r'''GET ../../../../../../../../../../../../etc/passwd'''
#[word for word in re.split(regexPattern,abadhit.lower())]

In [ ]:
vecBad = dictionary.doc2bow(
    [word for word in re.split(regexPattern, abadhit.lower()) if word not in badstoplist]
)
veclsiBad = lsiBad[vecBad]
sorted(veclsiBad, key=itemgetter(1), reverse=True)

In [ ]:
lsiBad.print_topics(1000)[0]

In [ ]:
# if you didn't know, is the hit good or bad?
ahit = r'''GET http/1.1 ../../../../../../../../../../../../etc/passwd'''
vecHit = dictionary.doc2bow(
    [word for word in re.split(regexPattern, ahit.lower()) if word not in badstoplist]
)
veclsiBad = lsiBad[vecHit]
veclsiGood = lsiGood[vecHit]
# compare matches
badMatches = sorted(veclsiBad, key=itemgetter(1), reverse=True)
goodMatches = sorted(veclsiGood, key=itemgetter(1), reverse=True)

In [ ]:
print(len(goodMatches))
print(len(badMatches))

In [ ]:
print(badMatches[0],goodMatches[0])
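
In [ ]:
# naive heuristic, not a calibrated classifier: let whichever model projects
# the hit with the larger top-topic weight claim it
print('looks bad' if abs(badMatches[0][1]) > abs(goodMatches[0][1]) else 'looks good')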

In [ ]:
print(lsiBad.print_topics(1000)[0])
print(lsiGood.print_topics(1000)[0])
