In [ ]:
import sys
sys.path.reverse()  # reverse the module search path so entries normally searched last are tried first
from gensim import corpora, models, similarities

In [ ]:
# universal stop list and word delimiters
stoplist = set('- get head post 302 200 404 403'.split())
import re
delimiters = "/", "\\", " ", "&", "?"
regexPattern = '|'.join(map(re.escape, delimiters))
regexPattern
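
In [ ]:
# quick sanity check of the tokenizer: split a sample request path
# (this path is illustrative, not taken from the dataset)
re.split(regexPattern, '/firefox/releases/index.html?lang=en'.lower())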

In [ ]:
# learn words: build a vocabulary from a known-good access log
dictionary = corpora.Dictionary(
    re.split(regexPattern, line.lower())
    for line in open('www.mozilla.org.access_2013-12-31-13.good',
                     encoding='ascii', errors='ignore')
)

In [ ]:
dictionary.token2id

In [ ]:
len(dictionary)

In [ ]:
# learn some bad words from fuzzdb:
# https://code.google.com/p/fuzzdb/source/browse/trunk/attack-payloads/all-attacks/all-attacks-unix.txt
dictionary.add_documents(
    re.split(regexPattern, line.lower())
    for line in open('all-attacks-unix.txt', encoding='ascii', errors='ignore')
)

In [ ]:
dictionary.token2id

In [ ]:
len(dictionary)
#save if you want with: dictionary.save('xldictionary.dic')
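
In [ ]:
# optional round trip: persist the merged dictionary and reload it
# (filename taken from the comment above; Dictionary.load is the standard counterpart)
dictionary.save('xldictionary.dic')
dictionary = corpora.Dictionary.load('xldictionary.dic')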

In [ ]:
# make a corpus of good words only: one bag-of-words per log line, stop words dropped
corpusGood = [
    dictionary.doc2bow(
        [word for word in re.split(regexPattern, line.lower()) if word not in stoplist]
    )
    for line in open('www.mozilla.org.access_2013-12-31-13.good',
                     encoding='ascii', errors='ignore').readlines()[0:20000]
]

In [ ]:
len(corpusGood)

In [ ]:
# latent semantic indexing: http://en.wikipedia.org/wiki/Latent_semantic_indexing
lsiGood = models.LsiModel(corpusGood, id2word=dictionary, num_topics=200)
lsiGood.print_topics(1000)[0]

In [ ]:
agoodhit=r'''/thunderbird/js/jquery/jquery-1.5.1.min.js'''

In [ ]:
vecGood = dictionary.doc2bow(re.split(regexPattern, agoodhit.lower()))
veclsiGood = lsiGood[vecGood]  # convert the query to LSI space

In [ ]:
len(veclsiGood)

In [ ]:
from operator import itemgetter
sorted(veclsiGood, key=itemgetter(1), reverse=True)
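
In [ ]:
# sketch using the so-far-unused similarities import: index the good corpus in
# LSI space and rank its log lines by cosine similarity to the query
# (MatrixSimilarity keeps the whole index in RAM, which is fine at 20k lines)
index = similarities.MatrixSimilarity(lsiGood[corpusGood])
sims = index[veclsiGood]
sorted(enumerate(sims), key=itemgetter(1), reverse=True)[:5]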

In [ ]:
lsiGood.print_topics(1000)[4]

In [ ]:
badstoplist = set('- get foo or 1.0 1.1 302 200 404 403'.split())
# create a bad corpus from the attack payloads
corpusBad = [
    dictionary.doc2bow(
        [word for word in re.split(regexPattern, line.lower()) if word not in badstoplist]
    )
    for line in open('web-attacks-unix.txt', encoding='ascii', errors='ignore')
]

In [ ]:
lsiBad = models.LsiModel(corpusBad, id2word=dictionary, num_topics=1000)
#lsiBad.print_topics(10)

In [ ]:
abadhit=r'''GET ../../../../../../../../../../../../etc/passwd'''
#[word for word in re.split(regexPattern,abadhit.lower())]

In [ ]:
vecBad = dictionary.doc2bow(
    [word for word in re.split(regexPattern, abadhit.lower()) if word not in badstoplist]
)
veclsiBad = lsiBad[vecBad]
sorted(veclsiBad, key=itemgetter(1), reverse=True)

In [ ]:
lsiBad.print_topics(1000)[0]

In [ ]:
# if you didn't know, is the hit good or bad?
ahit = r'''GET http/1.1 ../../../../../../../../../../../../etc/passwd'''
vecHit = dictionary.doc2bow(
    [word for word in re.split(regexPattern, ahit.lower()) if word not in badstoplist]
)
veclsiBad = lsiBad[vecHit]
veclsiGood = lsiGood[vecHit]
# compare matches
badMatches = sorted(veclsiBad, key=itemgetter(1), reverse=True)
goodMatches = sorted(veclsiGood, key=itemgetter(1), reverse=True)

In [ ]:
print(len(goodMatches))
print(len(badMatches))

In [ ]:
print(badMatches[0],goodMatches[0])
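
In [ ]:
# naive heuristic, not a calibrated classifier: let whichever model projects
# the hit with the larger top-topic weight claim it
print('looks bad' if abs(badMatches[0][1]) > abs(goodMatches[0][1]) else 'looks good')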

In [ ]:
print(lsiBad.print_topics(1000)[0])
print(lsiGood.print_topics(1000)[0])
