In [ ]:
from collections import Counter, namedtuple
from itertools import combinations
import datetime
import hashlib
import sys
import time
import signal
import re
import gzip
%matplotlib
# a raw log line: timestamp, original text, and normalized ("processed") text
LogLine = namedtuple('LogLine', ['ts', 'text', 'processed'])
# a full record: the LogLine, its md5 hash, and the word-pair ("super word")
# features built from the raw and processed text
DataRecord = namedtuple('DataRecord', ['line', 'md5hash', 'stats', 'processedStats'])

In [ ]:
def openFile(name, mode):
    '''Open a plain or gzipped file; gzip is opened in text mode.'''
    if name.lower().endswith('.gz'):
        return gzip.open(name, mode + 't')
    else:
        return open(name, mode)

In [ ]:
def tuple2Str(a):
    '''
        Concatenate a 2-tuple of strings into one string.
        Note: distinct pairs can collide on the same concatenation,
        e.g. ('a','aaa'), ('aa','aa'), and ('aaa','a') all map to 'aaaa'.
    '''
    return '%s%s' % a
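
# quick demonstration of the collision noted above (illustrative values)
assert tuple2Str(('a', 'aaa')) == tuple2Str(('aa', 'aa')) == 'aaaa'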


def str2Super(X):
    '''
        Build "super words": the sorted concatenations of every unordered
        pair of whitespace-separated tokens in X.
    '''
    return sorted(map(tuple2Str, set(combinations(X.rstrip().split(), 2))))
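
# example (illustrative): token pairs become concatenated "super words"
# str2Super('a b c') -> ['ab', 'ac', 'bc']
assert str2Super('a b c') == ['ab', 'ac', 'bc']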


def processString(inText):
    FLAGS = re.MULTILINE | re.DOTALL
    URL = ' URL '
    FILEPATH = ' FILEPATH '
    IPADDR = ' IPADDR '
    FILEANDLINE = ' FILEANDLINE '
    DATE = ' DATE '
    TIME = ' TIME '
    SILENTREMOVE = ''
    SPACE = ' '
    PERL = ' PERLFILE '
    CGI = ' CGIFILE '
    JPG = ' JPGFILE '
    AFILE = ' AFILE '
    LEVEL = ' LEVEL '
    INT = ' INT '

    badchars = [r'\[', r'\]', r'\(', r'\)', r'{', r'}', r':', r',', r'-']
    silentchars = [r'\"', r'\.', r'\'', r'\`', r'!']
    text = inText.lower()

    #text = re.sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", URL, text, flags=FLAGS)
    # example timestamp: 2010-04-01 00:39:21,914

    # NOTE: the 4th positional argument of re.sub is count, not flags,
    # so the flags must be passed by keyword
    text = re.sub(r'(?:\d{2}:\d{2}:\d{2},\d{3})', TIME, text, flags=FLAGS)
    text = re.sub(r'(?:\d{4}-\d{2}-\d{2})', DATE, text, flags=FLAGS)
    text = re.sub(r'(?:\w+(\.?)+:\d+)', FILEANDLINE, text, flags=FLAGS)  # NOTE: overly greedy, can over-match
    text = re.sub(r'https?:\/\/\S+', URL, text, flags=FLAGS)
    #text = re.sub(r'(?:\w+\.cgi)', CGI, text, flags=FLAGS)
    #text = re.sub(r'(?:\w+\.jpg)', JPG, text, flags=FLAGS)
    text = re.sub(r'(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)', IPADDR, text, flags=FLAGS)
    text = re.sub(r'(\S+)\/([^\/]?)(?:\S+)', FILEPATH, text, flags=FLAGS)
    text = re.sub(r'(?:(\w+\.)+\w{1,3})', AFILE, text, flags=FLAGS)
    text = re.sub(r'alert|error|crit', LEVEL, text, flags=FLAGS)

    text = re.sub(r'(?:\d+)', INT, text, flags=FLAGS)

    # delimiting punctuation becomes whitespace...
    for c in badchars:
        text = re.sub(c, SPACE, text, flags=FLAGS)

    # ...while quoting/sentence punctuation is dropped entirely
    for c in silentchars:
        text = re.sub(c, SILENTREMOVE, text, flags=FLAGS)

    text = re.sub(r'\s+', ' ', text, flags=FLAGS)

    print(inText)
    print(text)

    return text.strip()
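
# Illustrative example (output reconstructed by hand, not from a live run):
#   processString('[error] [client 10.0.0.1] file does not exist: /var/www/favicon.ico')
#   would come out roughly as: 'LEVEL client IPADDR file does not exist FILEPATH'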
    


def dataset_iterator(fIn, num_lines):
    '''
        Read log lines from a file and yield them in a known form.
        num_lines == -1 means read the whole file.
    '''
    lines_read = 0
    successful = 0
    while num_lines == -1 or lines_read < num_lines:
        lines_read += 1
        line = fIn.readline()
        if len(line) == 0:
            break
        try:
            logtype = 1
            if logtype == 0:
                # syslog style, e.g. 'Apr  1 00:39:21 host message...'
                ts = datetime.datetime.strptime(line[:14], '%b %d %H:%M:%S')
                rest = line[15:].strip()
                processed = processString(rest)
                yield LogLine(ts.replace(year=2015), rest, processed)
                successful += 1
            if logtype == 1:
                # apache error-log style, e.g. '[Thu Apr 01 00:39:21 2010] message...'
                ts = datetime.datetime.strptime(line[1:25],
                                                '%a %b %d %H:%M:%S %Y')
                rest = line[27:].strip()
                processed = processString(rest)
                yield LogLine(ts, rest, processed)
                successful += 1

            print(processed)
        except ValueError:
            # skip lines whose timestamp does not parse
            pass
 
# TODO lookup faster hashes
def makeHash(s):
    '''
        Return the md5 hex digest of an input string.
    '''
    m = hashlib.md5()
    m.update(s.encode('utf-8'))
    return m.hexdigest()
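
# Hedged sketch for the TODO above: when the hash only de-duplicates lines
# (no security requirement), a non-cryptographic checksum such as zlib.crc32
# is typically faster than md5, at the cost of far weaker collision resistance.
import zlib

def makeFastHash(s):
    '''illustrative alternative to makeHash; not used by the pipeline below'''
    return '%08x' % (zlib.crc32(s.encode('utf-8')) & 0xffffffff)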



def init(inFile):

    totalS = time.time()

    print('Attempting to open %s' % inFile)

    a = openFile(inFile, 'r')
    D = list()

    readCount = 0

    for r in dataset_iterator(a, -1):
        h = makeHash(r.text)
        s = str2Super(r.text)
        ps = str2Super(r.processed)
        D.append(DataRecord(r, h, s, ps))
        readCount += 1

    a.close()

    print('Read %i items in %.2f seconds' % (readCount, time.time() - totalS))

    return D

In [ ]:
D = init('apache_error_1K.log')

In [ ]:
allTuples = [" ".join(X.processedStats) for X in D]
allOriginal = [X.line.text for X in D]
allProcessed = [X.line.processed for X in D]
print('original:', allOriginal[0])
print('processed:', allProcessed[0])
print('tupleified:', allTuples[0])

# combine first-order tokens (the processed line) with second-order "super word" pairs
firstAndSecondOrders = [t + ' ' + p for t, p in zip(allTuples, allProcessed)]

In [ ]:
genVectorsFromThis = firstAndSecondOrders

# peek at the data: how many lines must we read before seeing 100 unique vectors?
s = set()
x = 0
for v in genVectorsFromThis:
    s.add(v)
    x += 1
    if len(s) == 100:
        break
print(x, len(s))
for i in s:
    print(i)

In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(min_df=2, max_features=1000)
vz = vectorizer.fit_transform(genVectorsFromThis)
# NOTE: sklearn >= 1.2 renames get_feature_names() to get_feature_names_out()
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
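
In [ ]:
# Hedged sanity check (illustrative): the rarest terms should carry the
# highest idf weights. Uses only the tfidf dict built above.
for term, score in sorted(tfidf.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print('%s: %.3f' % (term, score))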

In [ ]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=80, random_state=0)
svd_tfidf = svd.fit_transform(vz[:1000])
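
In [ ]:
# Hedged check (illustrative): how much of the tf-idf variance the 80 SVD
# components retain; closer to 1.0 means less information lost.
print('explained variance: %.3f' % svd.explained_variance_ratio_.sum())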

In [ ]:
svd_tfidf.shape

In [ ]:
from sklearn.manifold import TSNE

tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_tfidf = tsne_model.fit_transform(svd_tfidf)

In [ ]:
tsne_tfidf.shape

In [ ]:
tsne_tfidf[0]

In [ ]:
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

output_notebook()
plot_tfidf = bp.figure(plot_width=1024, plot_height=1024, title="LogLines (tf-idf)",
    tools="pan,wheel_zoom,box_zoom,reset,hover,save",  # "previewsave" is now the "save" tool
    x_axis_type=None, y_axis_type=None, min_border=1)

plot_tfidf.scatter(x=tsne_tfidf[:,0], y=tsne_tfidf[:,1],
                    source=bp.ColumnDataSource({
                        "LogLine": allOriginal[:1000],
                        "LineProcessed":genVectorsFromThis[:1000]
                    }))

hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"LogLine": "@LogLine \n(PROCESSED: \"@LineProcessed\")"}
show(plot_tfidf)

In [ ]:
from sklearn.cluster import MiniBatchKMeans

num_clusters=20
kmeans_model = MiniBatchKMeans(n_clusters=num_clusters, init='k-means++', n_init=1, 
                         init_size=1000, batch_size=1000, verbose=False, max_iter=1000)
kmeans = kmeans_model.fit(vz)
kmeans_clusters = kmeans.predict(vz)
kmeans_distances = kmeans.transform(vz)
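
In [ ]:
# Hedged sanity check (illustrative) on the choice of num_clusters: silhouette
# values near 1 mean tight, well-separated clusters; near 0 means overlap.
from sklearn.metrics import silhouette_score
print('silhouette: %.3f' % silhouette_score(vz, kmeans_clusters))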

In [ ]:
for i, line in enumerate(genVectorsFromThis):
    if i < 5:
        print("Cluster %d: %s (distance: %.3f)" % (kmeans_clusters[i], line, kmeans_distances[i][kmeans_clusters[i]]))

In [ ]:
sorted_centroids = kmeans.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(num_clusters):
    sys.stdout.write("Cluster %d:" % i)
    for j in sorted_centroids[i, :10]:
        sys.stdout.write(' %s' % terms[j])
    print()

In [ ]:
tsne_kmeans = tsne_model.fit_transform(kmeans_distances[:10000])

In [ ]:
import numpy as np

colormap = np.array([
    "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c", 
    "#98df8a", "#d62728", "#ff9896", "#9467bd", "#c5b0d5", 
    "#8c564b", "#c49c94", "#e377c2", "#f7b6d2", "#7f7f7f", 
    "#c7c7c7", "#bcbd22", "#dbdb8d", "#17becf", "#9edae5"
])

plot_kmeans = bp.figure(plot_width=1024, plot_height=1024, title="tuple k-means",
    tools="pan,wheel_zoom,box_zoom,reset,hover,save",
    x_axis_type=None, y_axis_type=None, min_border=1)

# every ColumnDataSource column must match the length of the x/y data
n = tsne_kmeans.shape[0]
plot_kmeans.scatter(x=tsne_kmeans[:, 0], y=tsne_kmeans[:, 1],
                    color=colormap[kmeans_clusters][:n],
                    source=bp.ColumnDataSource({
                         "LogLine": allOriginal[:n],
                         "LineProcessed": genVectorsFromThis[:n],
                         "cluster": kmeans_clusters[:n]
                    }))
hover = plot_kmeans.select(dict(type=HoverTool))
hover.tooltips = {"LogLine": "@LogLine (PROCESSED: \"@LineProcessed\", cluster: @cluster)"}
show(plot_kmeans)

In [ ]:
import lda
from sklearn.feature_extraction.text import CountVectorizer

cvectorizer = CountVectorizer(min_df=4, max_features=10000, stop_words='english')
cvz = cvectorizer.fit_transform(genVectorsFromThis)

n_topics = 20
n_iter = 1000
lda_model = lda.LDA(n_topics=n_topics, n_iter=n_iter)
X_topics = lda_model.fit_transform(cvz)
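
In [ ]:
# Hedged convergence check (illustrative): the lda package exposes the final
# model log likelihood; higher (less negative) generally indicates a better fit.
print('log likelihood: %.1f' % lda_model.loglikelihood())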

In [ ]:
n_top_words = 10
topic_summaries = []

topic_word = lda_model.topic_word_  # topic-word distributions
vocab = cvectorizer.get_feature_names()
for i, topic_dist in enumerate(topic_word):
    # take the n_top_words highest-probability words, in descending order
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

In [ ]:
tsne_lda = tsne_model.fit_transform(X_topics[:10000])

In [ ]:
doc_topic = lda_model.doc_topic_
# dominant topic per log line
lda_keys = [doc_topic[i].argmax() for i in range(len(genVectorsFromThis))]

In [ ]:
plot_lda = bp.figure(plot_width=800, plot_height=800, title="loglines (LDA)",
    tools="pan,wheel_zoom,box_zoom,reset,hover,save",
    x_axis_type=None, y_axis_type=None, min_border=1)

# every ColumnDataSource column must match the length of the x/y data
n = tsne_lda.shape[0]
plot_lda.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1],
                 color=colormap[lda_keys][:n],
                 source=bp.ColumnDataSource({
                    "LogLine": allOriginal[:n],
                    "LineProcessed": genVectorsFromThis[:n],
                    "topic_key": lda_keys[:n]
                }))
hover = plot_lda.select(dict(type=HoverTool))
hover.tooltips = {"LogLine": "@LogLine (PROCESSED: \"@LineProcessed\", topic: @topic_key)"}
show(plot_lda)
