In [1]:
import numpy as np
import pandas as pd
from cStringIO import StringIO

def parse_raw_graphite_metric(line):
    """Parse one line of Graphite ``rawData`` output into a float Series.

    A line is expected to look like ``name,since,until,step|v1,v2,...``
    where ``since`` and ``until`` are epoch seconds, ``step`` is the sample
    spacing in seconds, and a missing sample is spelled ``None``.

    Parameters
    ----------
    line : str
        A single raw-data line, with or without its trailing newline.

    Returns
    -------
    pandas.Series
        Float series named after the target and indexed by UTC timestamps
        that start at ``since`` and advance by ``step`` seconds; ``None``
        samples become NaN.

    Raises
    ------
    ValueError
        If the line has no ``|`` separator, if the header does not end in
        three comma-separated numeric fields, or if a sample is neither a
        number nor ``None``.
    """
    head, body = line.rstrip('\r\n').split('|', 1)
    # Target names may themselves contain commas (e.g. ``sumSeries(a,b)``),
    # so peel the three numeric fields off the right-hand end of the header.
    name, since, until, freq = head.rsplit(',', 3)
    since = float(since) * 1e9  # epoch seconds -> nanoseconds
    until = float(until) * 1e9
    freq = '%ss' % int(freq)
    # Convert while reading so no string/object Series has to be patched up.
    values = [
        np.nan if value == 'None' else float(value)
        for value in body.split(',')
    ]
    index = pd.date_range(start=since, end=until, freq=freq, tz='UTC')
    # The generated range can be longer than the sample list; keep exactly
    # one timestamp per sample.
    index = index[:len(values)]
    return pd.Series(data=values, index=index, name=name, dtype=float)

def parse_raw_graphite(content):
    """Parse a complete Graphite ``rawData`` response into a DataFrame.

    Each non-blank line of ``content`` is handed to
    ``parse_raw_graphite_metric``; the resulting Series become the columns
    of the returned frame, in the order they appear in ``content``.

    Parameters
    ----------
    content : str
        The response body, one metric per line.

    Returns
    -------
    pandas.DataFrame
        One column per metric, labelled with the metric name. An empty
        frame is returned when ``content`` holds no metric lines.
    """
    # in testing on a 16MiB 350 metric data dump:
    # - str.split         ~ 11ms
    # - custom lazy split ~  9ms
    # - cStringIO         ~  6ms
    metrics = [
        parse_raw_graphite_metric(line)
        for line in StringIO(content)
        if line.strip()  # tolerate blank lines such as a trailing newline
    ]
    if not metrics:
        return pd.DataFrame()
    # ``DataFrame.from_items`` is deprecated and absent from newer pandas;
    # concatenating along the columns keeps the metric order, and explicit
    # keys pin the column labels to the metric names.
    return pd.concat(
        metrics,
        axis=1,
        keys=[metric.name for metric in metrics],
    )

In [35]:
import requests

# Graphite render request: four summed req/sec targets over the last 14 days,
# returned as text because of ``rawData=True``.
url = 'https://metric.mtsvc.net/render/?width=586&height=308&_salt=1399928295.7&from=-14days&target=sum(servers.cs.c01.web.c*.reqpersec)&target=sum(servers.cs.c02.web.c*.reqpersec)&target=sum(servers.cs.c03.web.n*.reqpersec)&target=sum(servers.cs.c10.web.n*.reqpersec)&rawData=True'

# Bound the wait and fail loudly on an HTTP error, so an error page is never
# handed to the parser as if it were metric data.
rv = requests.get(url, timeout=60)
rv.raise_for_status()

In [6]:
# Parse the response body into a frame with one column per target.
df = parse_raw_graphite(rv.text)
# Preview the first rows.
df.head()


Out[6]:
sumSeries(servers.cs.c01.web.c*.reqpersec) sumSeries(servers.cs.c02.web.c*.reqpersec) sumSeries(servers.cs.c03.web.n*.reqpersec) sumSeries(servers.cs.c10.web.n*.reqpersec)
2014-04-28 21:00:00+00:00 1577.437469 1331.267760 1736.593220 1631.097701
2014-04-28 21:01:00+00:00 1584.454545 1383.644068 1603.033333 1495.600526
2014-04-28 21:02:00+00:00 1572.783333 1395.336612 1513.811749 1568.283333
2014-04-28 21:03:00+00:00 1608.983333 1403.661017 1502.593220 1574.456011
2014-04-28 21:04:00+00:00 1543.783333 1323.067797 1542.133333 1505.099649

5 rows × 4 columns


In [12]:
# Map every full target name to its third dot-separated component
# (the cluster id in the names shown above, e.g. 'c01').
rename = dict(
    (column, column.split('.')[2])
    for column in df.columns
)
rename


Out[12]:
{'sumSeries(servers.cs.c01.web.c*.reqpersec)': 'c01',
 'sumSeries(servers.cs.c02.web.c*.reqpersec)': 'c02',
 'sumSeries(servers.cs.c03.web.n*.reqpersec)': 'c03',
 'sumSeries(servers.cs.c10.web.n*.reqpersec)': 'c10'}

In [13]:
# Swap the long target names for the short ids held in `rename`.
# This rebinds `df`, so from here on the columns no longer carry the
# dotted target names.
df = df.rename(columns=rename)
# Preview the relabelled frame.
df.head()


Out[13]:
c01 c02 c03 c10
2014-04-28 21:00:00+00:00 1577.437469 1331.267760 1736.593220 1631.097701
2014-04-28 21:01:00+00:00 1584.454545 1383.644068 1603.033333 1495.600526
2014-04-28 21:02:00+00:00 1572.783333 1395.336612 1513.811749 1568.283333
2014-04-28 21:03:00+00:00 1608.983333 1403.661017 1502.593220 1574.456011
2014-04-28 21:04:00+00:00 1543.783333 1323.067797 1542.133333 1505.099649

5 rows × 4 columns


In [20]:
%pylab inline

try:
    df['c01'].plot()
except:
    "There was a formatter problem"


Populating the interactive namespace from numpy and matplotlib

In [24]:
# Distribution of the c01 values, split into 100 bins.
df['c01'].hist(bins=100)


Out[24]:
<matplotlib.axes.AxesSubplot at 0x10a8f2ad0>

In [27]:
import seaborn as sns

# Hex-binned joint distribution of c01 against c10. The two series are
# passed by keyword because newer seaborn releases reject positional x/y.
sns.jointplot(x=df['c01'], y=df['c10'], kind="hex")


Out[27]:
<seaborn.axisgrid.JointGrid at 0x10b895850>

In [34]:
# Box plot of the whole frame, passed positionally.
# NOTE(review): the meaning of boxplot's first positional argument differs
# between seaborn releases (newer ones take the frame as `data=`) — confirm
# against the installed version before changing this call.
sns.boxplot(df)


Out[34]:
<matplotlib.axes.AxesSubplot at 0x10c1c02d0>