In [ ]:
from collections import Counter, namedtuple
from itertools import combinations
import datetime
import hashlib
import sys
import time
import signal
import re
import gzip
%matplotlib
LogLine = namedtuple('LogLine', ['ts', 'text', 'processed'])
DataRecord = namedtuple('DataRecord', ['line', 'md5hash', 'stats', 'processedStats'])
In [ ]:
def openFile(name, mode):
    '''
    Open a plain or gzip-compressed file transparently, based on its extension.
    '''
    if name.lower().endswith('.gz'):
        return gzip.open(name, mode + 'b')
    else:
        return open(name, mode)
In [ ]:
def tuple2Str(a):
    '''
    Concatenate a 2-tuple of strings into a single token.
    Note that different pairs can alias to the same comparison:
    ('a', 'aaa'), ('aa', 'aa') and ('aaa', 'a') all map to 'aaaa'.
    '''
    return '%s%s' % a
# GOOD
def str2Super(X):
    '''
    Make "super words": the concatenation of every distinct pair of
    whitespace-separated tokens in the line, returned sorted.
    '''
    return sorted(map(tuple2Str, set(combinations(X.rstrip().split(), 2))))
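# A quick, hedged sanity check of str2Super ('error failed to open' is an
# assumed sample string, not a line from the log). It should print
# ['errorfailed', 'erroropen', 'errorto', 'failedopen', 'failedto', 'toopen'];
# as noted in tuple2Str, distinct pairs such as ('a', 'aaa') and ('aa', 'aa')
# would both collapse to the same super word 'aaaa'.
print str2Super('error failed to open')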
def processString(inText):
    '''
    Normalise a log line: replace timestamps, URLs, IP addresses, paths,
    file names, severity words and numbers with placeholder tokens, then
    strip punctuation and collapse whitespace.
    '''
    FLAGS = re.MULTILINE | re.DOTALL
    URL = ' URL '
    FILEPATH = ' FILEPATH '
    IPADDR = ' IPADDR '
    FILEANDLINE = ' FILEANDLINE '
    DATE = ' DATE '
    TIME = ' TIME '
    SILENTREMOVE = ''
    SPACE = ' '
    PERL = ' PERLFILE '
    CGI = ' CGIFILE '
    JPG = ' JPGFILE '
    AFILE = ' AFILE '
    LEVEL = ' LEVEL '
    INT = ' INT '
    badchars = [r'\[', r'\]', r'\(', r'\)', r'{', r'}', r':', r',', r'-']
    silentchars = [r'\"', r'\.', r'\'', r'\`', r'!']
    text = inText.lower()
    #text = re.sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", URL, text, flags=FLAGS)
    # example timestamp: 2010-04-01 00:39:21,914
    # NOTE: re.sub's fourth positional argument is count, not flags, so pass flags=FLAGS
    text = re.sub(r'(?:\d{2}:\d{2}:\d{2},\d{3})', TIME, text, flags=FLAGS)
    text = re.sub(r'(?:\d{4}-\d{2}-\d{2})', DATE, text, flags=FLAGS)
    text = re.sub(r'(?:\w+(\.?)+:\d+)', FILEANDLINE, text, flags=FLAGS)  # this one is bad
    text = re.sub(r'https?:\/\/\S+', URL, text, flags=FLAGS)
    #text = re.sub(r'(?:\w+\.cgi)', CGI, text, flags=FLAGS)
    #text = re.sub(r'(?:\w+\.jpg)', JPG, text, flags=FLAGS)
    text = re.sub(r'(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)', IPADDR, text, flags=FLAGS)
    text = re.sub(r'(\S+)\/([^\/]?)(?:\S+)', FILEPATH, text, flags=FLAGS)
    text = re.sub(r'(?:(\w+\.)+\w{1,3})', AFILE, text, flags=FLAGS)
    text = re.sub(r'alert|error|crit', LEVEL, text, flags=FLAGS)
    text = re.sub(r'(?:\d+)', INT, text, flags=FLAGS)
    for c in badchars:
        text = re.sub(c, SPACE, text, flags=FLAGS)
    for c in silentchars:
        text = re.sub(c, SILENTREMOVE, text, flags=FLAGS)
    text = re.sub(r'\s+', ' ', text, flags=FLAGS)
    print inText
    print text
    return text.strip()
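# Rough illustration of the normalisation above (hedged; the sample text is
# assumed, not taken from apache_error_1K.log): a fragment such as
#   '[error] [client 192.168.0.1] File does not exist: /var/www/favicon.ico'
# should come out approximately as
#   'LEVEL client IPADDR file does not exist FILEPATH'
# once the severity, IP-address and path substitutions, bracket stripping and
# whitespace collapse have run.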
def dataset_iterator(fIn, num_lines):
    '''
    Read log lines from a file into a known form (LogLine tuples).
    num_lines == -1 means read until EOF.
    '''
    lines_read = 0
    success_full = 0
    while num_lines == -1 or lines_read < num_lines:
        lines_read += 1
        line = fIn.readline()
        if len(line) == 0:
            break
        else:
            processed = processString(line[27:].strip())
            try:
                logtype = 1
                if logtype == 0:
                    # syslog way
                    ts = datetime.datetime.strptime(line[:14], '%b %d %H:%M:%S')
                    rest = line[15:].strip()
                    yield LogLine(ts.replace(year=2015), rest, processed)
                    success_full += 1
                if logtype == 1:
                    # apache weblog way
                    ts = datetime.datetime.strptime(line[1:25],
                                                    '%a %b %d %H:%M:%S %Y')
                    rest = line[27:].strip()
                    processed = processString(rest)
                    yield LogLine(ts, rest, processed)
                    success_full += 1
                    print processed
            except:
                # skip lines whose timestamp cannot be parsed
                pass
# TODO lookup faster hashes
def makeHash(s):
    '''
    Make an md5 hex digest of an input string.
    '''
    m = hashlib.md5()
    m.update(s)
    return m.hexdigest()
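# A possible answer to the TODO above -- a hedged sketch, not what the rest of
# this notebook uses: a non-cryptographic checksum such as zlib.crc32 is
# typically cheaper than md5 when the hash only serves as a dedup/bucket key.
# makeFastHash is a hypothetical helper and is not called below.
import zlib
def makeFastHash(s):
    return '%08x' % (zlib.crc32(s) & 0xffffffff)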
def init(inFile):
    '''
    Read every line of inFile and build a list of DataRecord tuples.
    '''
    totalS = time.time()
    print 'Attempting to open %s' % (inFile)
    out = sys.stdout
    a = openFile(inFile, 'r')
    D = list()
    G = dict()
    readCount = 0
    for r in dataset_iterator(a, -1):
        h = makeHash(r.text)
        s = str2Super(r.text)
        ps = str2Super(r.processed)
        D.append(DataRecord(r, h, s, ps))
        readCount += 1
    a.close()
    print 'Read %i items' % readCount
    return D
In [ ]:
D = init('apache_error_1K.log')
In [ ]:
allTuples = [' '.join(X.processedStats) for X in D]
allOriginal = [X.line.text for X in D]
allProcessed = [X.line.processed for X in D]
print 'original:', allOriginal[0]
print 'processed:', allProcessed[0]
print 'tupleified:', allTuples[0]
firstAndSecondOrders = list()
for i in range(len(allTuples)):
    firstAndSecondOrders.append(allTuples[i] + ' ' + allProcessed[i])
In [ ]:
genVectorsFromThis = firstAndSecondOrders
# peek at up to 100 unique feature strings to sanity-check the vectorizer input
s = set()
x = 0
for i in range(len(genVectorsFromThis)):
    s.add(genVectorsFromThis[i])
    x += 1
    if len(s) == 100:
        break
print x, len(s)
for i in s:
    print i
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=2, max_features=1000)
vz = vectorizer.fit_transform(genVectorsFromThis)
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
In [ ]:
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=80, random_state=0)
svd_tfidf = svd.fit_transform(vz[:1000])
In [ ]:
svd_tfidf.shape
In [ ]:
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_tfidf = tsne_model.fit_transform(svd_tfidf)
In [ ]:
tsne_tfidf.shape
In [ ]:
tsne_tfidf[0]
In [ ]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook
output_notebook()
plot_tfidf = bp.figure(plot_width=1024, plot_height=1024, title="LogLines (tf-idf)",
                       tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                       x_axis_type=None, y_axis_type=None, min_border=1)
plot_tfidf.scatter(x=tsne_tfidf[:,0], y=tsne_tfidf[:,1],
                   source=bp.ColumnDataSource({
                       "LogLine": allOriginal[:1000],
                       "LineProcessed": genVectorsFromThis[:1000]
                   }))
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"LogLine": "@LogLine \n(PROCESSED: \"@LineProcessed\")"}
show(plot_tfidf)
In [ ]:
from sklearn.cluster import MiniBatchKMeans
num_clusters=20
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1,
                               init_size=1000, batch_size=1000, verbose=False, max_iter=1000)
kmeans = kmeans_model.fit(vz)
kmeans_clusters = kmeans.predict(vz)
kmeans_distances = kmeans.transform(vz)
In [ ]:
for i, line in enumerate(genVectorsFromThis):
    if i < 5:
        print("Cluster " + str(kmeans_clusters[i]) + ": " + line + " (distance: " + str(kmeans_distances[i][kmeans_clusters[i]]) + ")")
In [ ]:
sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(num_clusters):
    sys.stdout.write("Cluster %d:" % i)
    for j in sorted_centroids[i, :10]:
        sys.stdout.write(' %s' % terms[j])
    print
In [ ]:
tsne_kmeans = tsne_model.fit_transform(kmeans_distances[:10000])
In [ ]:
import numpy as np
colormap = np.array([
"#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c",
"#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5",
"#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f",
"#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])
plot_kmeans = bp.figure(plot_width=1024, plot_height=1024, title="tuple k-means",
                        tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                        x_axis_type=None, y_axis_type=None, min_border=1)
plot_kmeans.scatter(x=tsne_kmeans[:,0], y=tsne_kmeans[:,1],
                    color=colormap[kmeans_clusters][:10000],
                    source=bp.ColumnDataSource({
                        "LogLine": allOriginal[:1000],
                        "LineProcessed": genVectorsFromThis[:1000],
                        "cluster": kmeans_clusters[:10000]
                    }))
hover = plot_kmeans.select(dict(type=HoverTool))
hover.tooltips={"LogLine": "@LogLine \n(PROCESSED: \"@LineProcessed\")"}
show(plot_kmeans)
In [ ]:
import lda
from sklearn.feature_extraction.text import CountVectorizer
cvectorizer = CountVectorizer(min_df=4, max_features=10000, stop_words='english')
cvz = cvectorizer.fit_transform(genVectorsFromThis)
n_topics = 20
n_iter = 1000
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
X_topics = lda_model.fit_transform(cvz)
In [ ]:
n_top_words = 10
topic_summaries = []
topic_word = lda_model.topic_word_ # get the topic words
vocab = cvectorizer.get_feature_names()
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
    topic_summaries.append(' '.join(topic_words))
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))
In [ ]:
tsne_lda = tsne_model.fit_transform(X_topics[:10000])
In [ ]:
doc_topic = lda_model.doc_topic_
lda_keys = []
for i, logLine in enumerate(genVectorsFromThis):
    lda_keys += [doc_topic[i].argmax()]
In [ ]:
plot_lda = bp.figure(plot_width=800, plot_height=800, title="loglines (LDA)",
                     tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
                     x_axis_type=None, y_axis_type=None, min_border=1)
plot_lda.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1],
                 color=colormap[lda_keys][:10000],
                 source=bp.ColumnDataSource({
                     "LogLine": allOriginal[:10000],
                     "LineProcessed": genVectorsFromThis[:10000],
                     "topic_key": lda_keys[:10000]
                 }))
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips={"LogLine": "@LogLine -PROCESSED:[@LineProcessed) -topic: @topic_key)"}
show(plot_lda)