URL Access Analysis

Imports


In [ ]:
%matplotlib inline
from IPython.display import display
import pandas as pd
import numpy as np
import os
import re
from collections import OrderedDict
from performance_tools.urls_flow.backends import ElasticURLFlowBackend

Pandas options


In [ ]:
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 10000)
pd.set_option('display.width', 200)

CSV input file


In [ ]:
_csv_filename = 'ebo_production_test_16-03_20-03.csv'
csv_file = os.path.realpath(os.path.join(os.path.curdir, _csv_filename))

Get data from backend

Query parameters


In [ ]:
host = '127.0.0.1'
port = 9200
query = """(type: \"nginx-access\") AND (host: \"hostname\") AND
        (NOT ((referrer: (
                    \"site_media\" OR
                    \"files\" OR
                    \"static\" OR
                    \"login\" OR
                    \"ping\" OR
                    \"apple-touch\" OR
                    \"robots.txt\" OR
                    \"favicon.ico\" OR
                    \"index.php\" OR
                    \"password/reset\")) OR
        '(request: (\"site_media\" OR
                    \"files\" OR
                    \"static\" OR
                    \"login\" OR
                    \"ping\" OR
                    \"apple-touch\" OR
                    \"robots.txt\" OR
                    \"favicon.ico\" OR
                    \"index.php\" OR
                    \"password/reset\"
        ))))"""
date_from = '2015-03-16T08:00:00.000Z'
date_to = '2015-03-20T20:00:00.000Z'
size = 50

Backend instance


In [ ]:
es = ElasticURLFlowBackend(host=host, port=port, query=query, date_from=date_from, date_to=date_to, size=size)

Regular expression to normalize IDs in each URL


In [ ]:
# Matches path segments that look like identifiers: an optional 3-4 char
# alphanumeric prefix followed by TR/BEN/CLI/CON/PI and digits, a bare
# three-letter uppercase code, or a numeric/hex/comma-separated ID list.
REGEX = re.compile(r'/(([A-Z0-9]{3,4})?(TR|BEN|CLI|CON|PI)\d+|[A-Z]{3}|[-0-9a-fA-F,_]*[-0-9,_]+[-0-9a-fA-F,_]*)')
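
A quick sanity check of the pattern on a few made-up paths (the URLs below are illustrative only; real requests come from the nginx access log):


In [ ]:
# Hypothetical sample paths; '<id>' is an arbitrary placeholder token.
for url in ('/clients/CLI12345/edit', '/orders/ABCDTR987/', '/items/42,43,44/'):
    print(REGEX.sub('/<id>', url))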

Write data to CSV


In [ ]:
# Export the query results to CSV, normalizing URL IDs with REGEX.
es.to_csv(csv_file, REGEX, verbose=0)

Analysis


In [ ]:
# Trim NOISE (10%) of the samples, split evenly across both tails:
# keep values between the 5th and 95th percentiles.
NOISE = 0.1
LOWER_QUANTILE = NOISE / 2
UPPER_QUANTILE = 1 - (NOISE / 2)

Read CSV file


In [ ]:
df = pd.read_csv(csv_file)
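
A quick peek at the loaded frame: the analysis below relies on the Request, Referrer and Time columns, so it is worth confirming they parsed as expected.


In [ ]:
# Sanity check on the columns used in the rest of the notebook.
display(df.head())
df.dtypes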

Statistical data

Number of requests


In [ ]:
df['Request'].count()

Time stats


In [ ]:
df_without_noise = df[(df.Time >= df.Time.quantile(LOWER_QUANTILE)) &
                      (df.Time <= df.Time.quantile(UPPER_QUANTILE))]
df_without_noise = df_without_noise.reset_index(drop=True)

stats = OrderedDict((
    ('Sum', df_without_noise.Time.sum()),
    ('Mean', df_without_noise.Time.mean()),
    ('Std', df_without_noise.Time.std()),
    ('Min', df_without_noise.Time.min()),
    ('Max', df_without_noise.Time.max()),
))

pd.DataFrame(list(stats.values()), index=list(stats.keys()), columns=['Time'])

Request stats


In [ ]:
def _get_stats(x, functions):
    # Trim the noisy tails of each group, computing both bounds on the
    # untrimmed data (the same band as the global time stats above).
    low = x.Time.quantile(LOWER_QUANTILE)
    high = x.Time.quantile(UPPER_QUANTILE)
    x = x[(x.Time >= low) & (x.Time <= high)]
    values = []
    index = []
    for i, f in functions.items():
        values.append(f(x.Time))
        index.append(i)
    return pd.Series(values, index=index)

functions = OrderedDict((
    ('Count By Week', len),
    # Float division: the queried range covers 5 days (16-20 March).
    ('Count By Day', lambda x: len(x) / 5.0),
    ('Mean', np.mean),
    ('Std', np.std),
    ('Max', np.max),
    ('Min', np.min),
    ('Sum', np.sum),
    ('Median', np.median),
))
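
A minimal sketch of what the groupby/apply below produces, using a tiny synthetic frame (the column names match the real CSV; the values are invented):


In [ ]:
_toy = pd.DataFrame({
    'Request': ['/a'] * 5 + ['/b'] * 5,
    'Time': [10, 11, 12, 13, 200, 50, 55, 60, 65, 900],
})
# Values outside each group's 5th-95th percentile band are dropped
# before the statistics are computed, so the extremes do not skew them.
_toy.groupby('Request').apply(_get_stats, functions)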

Stats by each request


In [ ]:
df.groupby('Request').apply(_get_stats, functions)

Stats by request and referrer


In [ ]:
df.groupby(['Referrer', 'Request']).apply(_get_stats, functions)