In [ ]:
%pylab inline
from IPython.display import display
import pandas as pd
import numpy as np
import os
import re
from collections import OrderedDict
from performance_tools.urls_flow.backends import ElasticURLFlowBackend
In [ ]:
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 10000)
pd.set_option('display.width', 200)
In [ ]:
_csv_filename = 'ebo_production_test_16-03_20-03.csv'
csv_file = os.path.realpath(os.path.join(os.path.curdir, _csv_filename))
In [ ]:
host = '127.0.0.1'
port = 9200
query = """(type: \"nginx-access\") AND (host: \"hostname\") AND
(NOT ((referrer: (
\"site_media\" OR
\"files\" OR
\"static\" OR
\"login\" OR
\"ping\" OR
\"apple-touch\" OR
\"robots.txt\" OR
\"favicon.ico\" OR
\"index.php\" OR
\"password/reset\")) OR
(request: (\"site_media\" OR
\"files\" OR
\"static\" OR
\"login\" OR
\"ping\" OR
\"apple-touch\" OR
\"robots.txt\" OR
\"favicon.ico\" OR
\"index.php\" OR
\"password/reset\"
))))"""
date_from = '2015-03-16T08:00:00.000Z'
date_to = '2015-03-20T20:00:00.000Z'
size = 50
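In [ ]:
# Note: the query window covers 5 calendar days (16-20 March inclusive);
# the hard-coded divisor in the 'Count By Day' statistic further down
# relies on this.
pd.Timestamp(date_to) - pd.Timestamp(date_from)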
In [ ]:
es = ElasticURLFlowBackend(host=host, port=port, query=query, date_from=date_from, date_to=date_to, size=size)
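In [ ]:
# Optional connectivity check before running the export. This is a sketch,
# not part of the ElasticURLFlowBackend API, and assumes the `requests`
# package is installed; a 200 response means Elasticsearch is reachable.
import requests
requests.get('http://{0}:{1}'.format(host, port)).status_code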
In [ ]:
REGEX = re.compile(r'/(([A-Z0-9]{3,4})?(TR|BEN|CLI|CON|PI)\d+|[A-Z]{3}|[-0-9a-fA-F,_]*[-0-9,_]+[-0-9a-fA-F,_]*)')
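In [ ]:
# Illustrative only, with hypothetical paths: the pattern matches path
# segments that look like object identifiers (TR/BEN/CLI/CON/PI codes,
# three-letter codes, numeric or hex-like fragments), so URLs that differ
# only by id can be collapsed into one logical request. The actual
# substitution performed inside ElasticURLFlowBackend.to_csv is not shown
# here; this cell only demonstrates what the regex captures.
[(p, REGEX.sub('/<id>', p))
 for p in ('/clients/CLI42/details', '/reports/TR1001', '/files/a1b2c3-4d5e')]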
In [ ]:
es.to_csv(csv_file, REGEX, verbose=0)
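In [ ]:
# Quick peek at the exported file before loading it fully (the column
# names Request, Referrer and Time are assumed from the analysis below).
pd.read_csv(csv_file, nrows=5)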
In [ ]:
NOISE = 0.1
LOWER_QUANTILE = NOISE / 2
UPPER_QUANTILE = 1 - (NOISE / 2)
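In [ ]:
# Illustration with toy data: NOISE = 0.1 drops roughly 5% from each tail,
# keeping the central ~90% of observations.
_toy = pd.Series(np.arange(100))
_kept = _toy[(_toy >= _toy.quantile(LOWER_QUANTILE)) & (_toy <= _toy.quantile(UPPER_QUANTILE))]
len(_toy), len(_kept)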
In [ ]:
df = pd.read_csv(csv_file)
In [ ]:
df['Request'].count()
In [ ]:
lower_bound = df.Time.quantile(LOWER_QUANTILE)
upper_bound = df.Time.quantile(UPPER_QUANTILE)
df_without_noise = df[(df.Time >= lower_bound) & (df.Time <= upper_bound)]
df_without_noise = df_without_noise.reset_index(drop=True)
stats = {
'Sum': df_without_noise.Time.sum(),
'Mean': df_without_noise.Time.mean(),
'Std': df_without_noise.Time.std(),
'Min': df_without_noise.Time.min(),
'Max': df_without_noise.Time.max(),
}
pd.DataFrame(list(stats.values()), index=list(stats.keys()), columns=['Time'])
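In [ ]:
# Cross-check: pandas' built-in describe() reports the same mean, std,
# min and max (plus quartiles) for the trimmed series.
df_without_noise.Time.describe()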
In [ ]:
def _get_stats(x, functions):
    # Trim the noisy tails of each group (same quantile bounds as above)
    # before computing the statistics.
    lower_bound = x.Time.quantile(LOWER_QUANTILE)
    upper_bound = x.Time.quantile(UPPER_QUANTILE)
    x = x[(x.Time >= lower_bound) & (x.Time <= upper_bound)]
    values = []
    index = []
    for i, f in functions.items():
        values.append(f(x.Time))
        index.append(i)
    return pd.Series(values, index=index)
functions = OrderedDict((
    ('Count By Week', len),
    ('Count By Day', lambda x: len(x) / 5),  # 5-day window (16-20 March)
    ('Mean', np.mean),
    ('Std', np.std),  # note: np.std uses ddof=0, unlike Series.std()
    ('Max', np.max),
    ('Min', np.min),
    ('Sum', np.sum),
    ('Median', np.median),
))
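In [ ]:
# Sanity check (a sketch, not part of the original pipeline): applying
# _get_stats to the whole frame yields one row of the same statistics
# that the per-Request groupby below produces per group.
_get_stats(df, functions)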
In [ ]:
df.groupby('Request').apply(_get_stats, functions)
In [ ]:
df.groupby(['Referrer', 'Request']).apply(_get_stats, functions)
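In [ ]:
# Convenience view (a sketch, not part of the original pipeline): rank the
# per-request statistics by total time to surface the most expensive
# endpoints first. Assumes pandas >= 0.17 for sort_values().
request_stats = df.groupby('Request').apply(_get_stats, functions)
request_stats.sort_values('Sum', ascending=False).head(20)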