Kubeflow Stats

  • Notebook containing some plots of Kubeflow usage based on spartakus metrics

In [19]:
import datetime
import numpy as np

In [2]:
# Install and pin to versions that seem to work together
!pip3 install pandas-gbq==0.10.0  google-cloud-bigquery==1.11.2 google-api-core==1.8.2
!pip3 install matplotlib


Collecting pandas-gbq==0.10.0
  Using cached https://files.pythonhosted.org/packages/6a/65/bc46678a5550c0cef1700d7292319deae716751af3f6158250d6a3a454ed/pandas_gbq-0.10.0-py2.py3-none-any.whl
Collecting google-cloud-bigquery==1.11.2
  Using cached https://files.pythonhosted.org/packages/b3/33/236bdc6f5204bed8f69aecbabbe1a9b3a5be51f959fb51eba0545181ffa0/google_cloud_bigquery-1.11.2-py2.py3-none-any.whl
Collecting google-api-core==1.8.2
  Using cached https://files.pythonhosted.org/packages/7d/73/e4877e921fe59307ec6b1b0b0c2ad9fde2d1c6bab8dd06ec913891a20dc6/google_api_core-1.8.2-py2.py3-none-any.whl
Collecting pydata-google-auth (from pandas-gbq==0.10.0)
  Using cached https://files.pythonhosted.org/packages/89/c5/03b68c114bc2c2bcaa2e40fdf269a14361fa75b70a09415e8bad65413b75/pydata_google_auth-0.1.3-py2.py3-none-any.whl
Collecting google-auth-oauthlib (from pandas-gbq==0.10.0)
  Using cached https://files.pythonhosted.org/packages/74/a2/1323b1bce9935ac948cd4863509de16cf852cd80b12dd29e648c65fea93d/google_auth_oauthlib-0.4.0-py2.py3-none-any.whl
Collecting pandas>=0.19.0 (from pandas-gbq==0.10.0)
  Using cached https://files.pythonhosted.org/packages/1d/9a/7eb9952f4b4d73fbd75ad1d5d6112f407e695957444cb695cbb3cdab918a/pandas-0.25.0-cp36-cp36m-manylinux1_x86_64.whl
Collecting google-auth (from pandas-gbq==0.10.0)
  Using cached https://files.pythonhosted.org/packages/c5/9b/ed0516cc1f7609fb0217e3057ff4f0f9f3e3ce79a369c6af4a6c5ca25664/google_auth-1.6.3-py2.py3-none-any.whl
Collecting setuptools (from pandas-gbq==0.10.0)
  Using cached https://files.pythonhosted.org/packages/ec/51/f45cea425fd5cb0b0380f5b0f048ebc1da5b417e48d304838c02d6288a1e/setuptools-41.0.1-py2.py3-none-any.whl
Collecting protobuf>=3.6.0 (from google-cloud-bigquery==1.11.2)
  Using cached https://files.pythonhosted.org/packages/dc/0e/e7cdff89745986c984ba58e6ff6541bc5c388dd9ab9d7d312b3b1532584a/protobuf-3.9.0-cp36-cp36m-manylinux1_x86_64.whl
Collecting google-cloud-core<0.30dev,>=0.29.0 (from google-cloud-bigquery==1.11.2)
  Using cached https://files.pythonhosted.org/packages/0c/f2/3c225e7a69cb27d283b68bff867722bd066bc1858611180197f711815ea5/google_cloud_core-0.29.1-py2.py3-none-any.whl
Collecting google-resumable-media>=0.3.1 (from google-cloud-bigquery==1.11.2)
  Using cached https://files.pythonhosted.org/packages/e2/5d/4bc5c28c252a62efe69ed1a1561da92bd5af8eca0cdcdf8e60354fae9b29/google_resumable_media-0.3.2-py2.py3-none-any.whl
Collecting six>=1.10.0 (from google-api-core==1.8.2)
  Using cached https://files.pythonhosted.org/packages/73/fb/00a976f728d0d1fecfe898238ce23f502a721c0ac0ecfedb80e0d88c64e9/six-1.12.0-py2.py3-none-any.whl
Collecting requests<3.0.0dev,>=2.18.0 (from google-api-core==1.8.2)
  Using cached https://files.pythonhosted.org/packages/51/bd/23c926cd341ea6b7dd0b2a00aba99ae0f828be89d72b2190f27c11d4b7fb/requests-2.22.0-py2.py3-none-any.whl
Collecting pytz (from google-api-core==1.8.2)
  Using cached https://files.pythonhosted.org/packages/3d/73/fe30c2daaaa0713420d0382b16fbb761409f532c56bdcc514bf7b6262bb6/pytz-2019.1-py2.py3-none-any.whl
Collecting googleapis-common-protos!=1.5.4,<2.0dev,>=1.5.3 (from google-api-core==1.8.2)
Collecting requests-oauthlib>=0.7.0 (from google-auth-oauthlib->pandas-gbq==0.10.0)
  Using cached https://files.pythonhosted.org/packages/c2/e2/9fd03d55ffb70fe51f587f20bcf407a6927eb121de86928b34d162f0b1ac/requests_oauthlib-1.2.0-py2.py3-none-any.whl
Collecting python-dateutil>=2.6.1 (from pandas>=0.19.0->pandas-gbq==0.10.0)
  Using cached https://files.pythonhosted.org/packages/41/17/c62faccbfbd163c7f57f3844689e3a78bae1f403648a6afb1d0866d87fbb/python_dateutil-2.8.0-py2.py3-none-any.whl
Collecting numpy>=1.13.3 (from pandas>=0.19.0->pandas-gbq==0.10.0)
  Using cached https://files.pythonhosted.org/packages/87/2d/e4656149cbadd3a8a0369fcd1a9c7d61cc7b87b3903b85389c70c989a696/numpy-1.16.4-cp36-cp36m-manylinux1_x86_64.whl
Collecting pyasn1-modules>=0.2.1 (from google-auth->pandas-gbq==0.10.0)
  Using cached https://files.pythonhosted.org/packages/91/f0/b03e00ce9fddf4827c42df1c3ce10c74eadebfb706231e8d6d1c356a4062/pyasn1_modules-0.2.5-py2.py3-none-any.whl
Collecting cachetools>=2.0.0 (from google-auth->pandas-gbq==0.10.0)
  Using cached https://files.pythonhosted.org/packages/2f/a6/30b0a0bef12283e83e58c1d6e7b5aabc7acfc4110df81a4471655d33e704/cachetools-3.1.1-py2.py3-none-any.whl
Collecting rsa>=3.1.4 (from google-auth->pandas-gbq==0.10.0)
  Using cached https://files.pythonhosted.org/packages/02/e5/38518af393f7c214357079ce67a317307936896e961e35450b70fad2a9cf/rsa-4.0-py2.py3-none-any.whl
Collecting idna<2.9,>=2.5 (from requests<3.0.0dev,>=2.18.0->google-api-core==1.8.2)
  Using cached https://files.pythonhosted.org/packages/14/2c/cd551d81dbe15200be1cf41cd03869a46fe7226e7450af7a6545bfc474c9/idna-2.8-py2.py3-none-any.whl
Collecting chardet<3.1.0,>=3.0.2 (from requests<3.0.0dev,>=2.18.0->google-api-core==1.8.2)
  Using cached https://files.pythonhosted.org/packages/bc/a9/01ffebfb562e4274b6487b4bb1ddec7ca55ec7510b22e4c51f14098443b8/chardet-3.0.4-py2.py3-none-any.whl
Collecting certifi>=2017.4.17 (from requests<3.0.0dev,>=2.18.0->google-api-core==1.8.2)
  Using cached https://files.pythonhosted.org/packages/69/1b/b853c7a9d4f6a6d00749e94eb6f3a041e342a885b87340b79c1ef73e3a78/certifi-2019.6.16-py2.py3-none-any.whl
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 (from requests<3.0.0dev,>=2.18.0->google-api-core==1.8.2)
  Using cached https://files.pythonhosted.org/packages/e6/60/247f23a7121ae632d62811ba7f273d0e58972d75e58a94d329d51550a47d/urllib3-1.25.3-py2.py3-none-any.whl
Collecting oauthlib>=3.0.0 (from requests-oauthlib>=0.7.0->google-auth-oauthlib->pandas-gbq==0.10.0)
  Using cached https://files.pythonhosted.org/packages/58/5e/289e98ff5ad1a321945803000c5f10f5f90eba346d13139ecdd075cfbe17/oauthlib-3.0.2-py2.py3-none-any.whl
Collecting pyasn1<0.5.0,>=0.4.1 (from pyasn1-modules>=0.2.1->google-auth->pandas-gbq==0.10.0)
  Using cached https://files.pythonhosted.org/packages/7b/7c/c9386b82a25115cccf1903441bba3cbadcfae7b678a20167347fa8ded34c/pyasn1-0.4.5-py2.py3-none-any.whl
Installing collected packages: pyasn1, pyasn1-modules, six, cachetools, rsa, google-auth, setuptools, oauthlib, idna, chardet, certifi, urllib3, requests, requests-oauthlib, google-auth-oauthlib, pydata-google-auth, protobuf, pytz, googleapis-common-protos, google-api-core, google-cloud-core, google-resumable-media, google-cloud-bigquery, python-dateutil, numpy, pandas, pandas-gbq
Successfully installed cachetools-3.1.1 certifi-2019.6.16 chardet-3.0.4 google-api-core-1.8.2 google-auth-1.6.3 google-auth-oauthlib-0.4.0 google-cloud-bigquery-1.11.2 google-cloud-core-0.29.1 google-resumable-media-0.3.2 googleapis-common-protos-1.6.0 idna-2.8 numpy-1.16.4 oauthlib-3.0.2 pandas-0.25.0 pandas-gbq-0.10.0 protobuf-3.9.0 pyasn1-0.4.5 pyasn1-modules-0.2.5 pydata-google-auth-0.1.3 python-dateutil-2.8.0 pytz-2019.1 requests-2.22.0 requests-oauthlib-1.2.0 rsa-4.0 setuptools-41.0.1 six-1.12.0 urllib3-1.25.3
Collecting matplotlib
  Using cached https://files.pythonhosted.org/packages/57/4f/dd381ecf6c6ab9bcdaa8ea912e866dedc6e696756156d8ecc087e20817e2/matplotlib-3.1.1-cp36-cp36m-manylinux1_x86_64.whl
Collecting pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 (from matplotlib)
  Using cached https://files.pythonhosted.org/packages/9f/dc/b205465a60baca8e04a1555a84d9c79f910661765056f071fb6fc2db4841/pyparsing-2.4.1-py2.py3-none-any.whl
Collecting python-dateutil>=2.1 (from matplotlib)
  Using cached https://files.pythonhosted.org/packages/41/17/c62faccbfbd163c7f57f3844689e3a78bae1f403648a6afb1d0866d87fbb/python_dateutil-2.8.0-py2.py3-none-any.whl
Collecting cycler>=0.10 (from matplotlib)
  Using cached https://files.pythonhosted.org/packages/f7/d2/e07d3ebb2bd7af696440ce7e754c59dd546ffe1bbe732c8ab68b9c834e61/cycler-0.10.0-py2.py3-none-any.whl
Collecting numpy>=1.11 (from matplotlib)
  Using cached https://files.pythonhosted.org/packages/87/2d/e4656149cbadd3a8a0369fcd1a9c7d61cc7b87b3903b85389c70c989a696/numpy-1.16.4-cp36-cp36m-manylinux1_x86_64.whl
Collecting kiwisolver>=1.0.1 (from matplotlib)
  Using cached https://files.pythonhosted.org/packages/f8/a1/5742b56282449b1c0968197f63eae486eca2c35dcd334bab75ad524e0de1/kiwisolver-1.1.0-cp36-cp36m-manylinux1_x86_64.whl
Collecting six>=1.5 (from python-dateutil>=2.1->matplotlib)
  Using cached https://files.pythonhosted.org/packages/73/fb/00a976f728d0d1fecfe898238ce23f502a721c0ac0ecfedb80e0d88c64e9/six-1.12.0-py2.py3-none-any.whl
Collecting setuptools (from kiwisolver>=1.0.1->matplotlib)
  Using cached https://files.pythonhosted.org/packages/ec/51/f45cea425fd5cb0b0380f5b0f048ebc1da5b417e48d304838c02d6288a1e/setuptools-41.0.1-py2.py3-none-any.whl
Installing collected packages: pyparsing, six, python-dateutil, cycler, numpy, setuptools, kiwisolver, matplotlib
Successfully installed cycler-0.10.0 kiwisolver-1.1.0 matplotlib-3.1.1 numpy-1.16.4 pyparsing-2.4.1 python-dateutil-2.8.0 setuptools-41.0.1 six-1.12.0

In [3]:
# Inline all matplotlib plots
%matplotlib inline

In [4]:
from google.cloud import bigquery

In [5]:
# NOTE: The RuntimeWarnings (if any) are harmless. See ContinuumIO/anaconda-issues#6678.
import pandas as pd
from pandas.io import gbq

In [6]:
# Note: access to this project is restricted
PROJECT="kubeflow-usage"

In [7]:
# We need to filter out empty timestamps because they can't be converted to dates
# Why would pings be missing a timestamp

# Get data for last year
max_age_hours = 24 * 365
query = """
    select TIMESTAMP_SECONDS(cast(timestamp as int64)) as timestamp, clusteriD FROM usage.collector 
    where timestamp is not null and length(timestamp) > 0 and
    timestamp_diff(CURRENT_TIMESTAMP(), TIMESTAMP_SECONDS(cast(timestamp as int64)), HOUR) < {max_age_hours}
""".format(max_age_hours=max_age_hours)

data=gbq.read_gbq(str(query), dialect='standard', project_id=PROJECT)


/home/jlewi/.local/lib/python3.6/site-packages/google/auth/_default.py:66: UserWarning: Your application has authenticated using end user credentials from Google Cloud SDK. We recommend that most server applications use service accounts instead. If your application continues to use end user credentials from Cloud SDK, you might receive a "quota exceeded" or "API not enabled" error. For more information about service accounts, see https://cloud.google.com/docs/authentication/
  warnings.warn(_CLOUD_SDK_CREDENTIALS_WARNING)

Compute cluster stats


In [8]:
# Good reference for working with aggregations
# https://www.shanelynn.ie/summarising-aggregation-and-grouping-data-in-python-pandas/
aggregations = {
    "timestamp": {
        "min_timestamp": "min",
        "max_timestamp": "max",
        "count": "count"
    }    
}
stats = data.groupby("clusteriD").agg(aggregations)


/home/jlewi/.local/lib/python3.6/site-packages/pandas/core/groupby/generic.py:1455: FutureWarning: using a dict with renaming is deprecated and will be removed
in a future version.

For column-specific groupby renaming, use named aggregation

    >>> df.groupby(...).agg(name=('column', aggfunc))

  return super().aggregate(arg, *args, **kwargs)

In [9]:
# Drop the multi-level index to make it easier to work with
stats.columns = stats.columns.droplevel(level=0)

In [10]:
stats["age"] = stats["max_timestamp"] - stats["min_timestamp"]

In [11]:
long_stats = stats[stats["age"] > datetime.timedelta(hours=24)]

Number of new deployments

  • Compute the number of new deployments in the last 28 days
  • The number of new deployments will be noisy due to lots of very short lived deployments
  • So we also show the number of new deployments that ended up living a minimum of 12 hours

In [12]:
new_per_day = pd.Series(data=1, index=stats["min_timestamp"])
new_per_day = new_per_day.sort_index()
new_deployments = new_per_day.rolling('28d').sum()

In [13]:
from matplotlib import pyplot as plt
plt.plot(new_deployments.index, new_deployments.values)
plt.title("New Kubeflow Deployments Last 28 Days")
plt.ylabel('Number of created deployments', fontsize='large');


/home/jlewi/.local/lib/python3.6/site-packages/pandas/plotting/_matplotlib/converter.py:102: FutureWarning: Using an implicitly registered datetime converter for a matplotlib plotting method. The converter was registered by pandas on import. Future versions of pandas will require you to explicitly register matplotlib converters.

To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()
  warnings.warn(msg, FutureWarning)

In [14]:
new_per_day_long = pd.Series(data=1, index=long_stats["min_timestamp"])
new_per_day_long = new_per_day_long.sort_index()
new_deployments_long = new_per_day_long.rolling('28d').sum()

In [15]:
from matplotlib import pyplot as plt
plt.plot(new_deployments_long.index, new_deployments_long.values)
plt.title("New Kubeflow Deployments Last 28 Days with minimum duration 24 hours")
plt.ylabel('Number of created deployments', fontsize='large');


Number of active deployments

  • Compute the number of active deployments at any given time
  • We do this by creating a time series of +1 at the first ping and -1 at the last ping
  • We then integrate over this time series
  • This means we filter out any deployments for which we only have a single ping which means we should exclude clusters with a lifetime of less than 1 day

In [20]:
# We need to drop unknown_cluster and empty string because those represent
# multiple clusters and the min and max timestamp will not correspond to the duration
# of those clusters

filtered_stats = stats[np.logical_and(stats.index != "" , stats.index != "unknown_cluster")]

create_times = pd.Series(data=1, index=filtered_stats["min_timestamp"])
delete_times = pd.Series(data=-1, index=filtered_stats["max_timestamp"])

# Drop delete_times and create_times which are less than 24 hours old.
# We assume those clusters are still active. So we don't want to interpret the
# max ping timestamp as a delete timestamp
now = datetime.datetime.now(delete_times.index.tzinfo)

min_hours = 48
create_times = create_times[now - create_times.index > datetime.timedelta(hours=min_hours)]
delete_times = delete_times[now - delete_times.index > datetime.timedelta(hours=min_hours)]

In [21]:
deltas = pd.concat([create_times, delete_times])
deltas = deltas.sort_index()
active_deployments= deltas.cumsum()

In [22]:
plt.plot(active_deployments.index, active_deployments.values)
plt.title("Active Kubeflow Deployments with minimum duration 24 hours")
plt.ylabel('Number of deployments', fontsize='large');


Compute histogram of Kubeflow deployment age for


In [23]:
last_28 = stats[now - stats["min_timestamp"] < datetime.timedelta(days=28)]
age_counts = pd.Series(data=1, index=last_28["age"])
age_counts=age_counts.sort_index()
age_stats = age_counts.cumsum()

In [24]:
plt.plot(age_stats.index.total_seconds() / (24.0 * 3600), age_stats.values)
plt.title("Kubeflow deployment duration for most recent 28 days")
plt.ylabel('Kubeflow deployment count', fontsize='large');


---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-24-3c9c67a53827> in <module>
----> 1 plt.plot(age_stats.index.total_seconds() / (24.0 * 3600), age_stats.values)
      2 plt.title("Kubeflow deployment duration for most recent 28 days")
      3 plt.ylabel('Kubeflow deployment count', fontsize='large');

~/.local/lib/python3.6/site-packages/matplotlib/pyplot.py in plot(scalex, scaley, data, *args, **kwargs)
   2793     return gca().plot(
   2794         *args, scalex=scalex, scaley=scaley, **({"data": data} if data
-> 2795         is not None else {}), **kwargs)
   2796 
   2797 

~/.local/lib/python3.6/site-packages/matplotlib/axes/_axes.py in plot(self, scalex, scaley, data, *args, **kwargs)
   1664         """
   1665         kwargs = cbook.normalize_kwargs(kwargs, mlines.Line2D._alias_map)
-> 1666         lines = [*self._get_lines(*args, data=data, **kwargs)]
   1667         for line in lines:
   1668             self.add_line(line)

~/.local/lib/python3.6/site-packages/matplotlib/axes/_base.py in __call__(self, *args, **kwargs)
    223                 this += args[0],
    224                 args = args[1:]
--> 225             yield from self._plot_args(this, kwargs)
    226 
    227     def get_next_color(self):

~/.local/lib/python3.6/site-packages/matplotlib/axes/_base.py in _plot_args(self, tup, kwargs)
    397             func = self._makefill
    398 
--> 399         ncx, ncy = x.shape[1], y.shape[1]
    400         if ncx > 1 and ncy > 1 and ncx != ncy:
    401             cbook.warn_deprecated(

IndexError: tuple index out of range

In [ ]:
# Compute a plot of age stats for clusters that have a minimum age of 1 day
# Excluding short lived clusters filters out short lived automated clusters from testing 
# and other data
last_28_long = last_28[last_28["age"] > datetime.timedelta(hours=24)]
age_counts_long = pd.Series(data=1, index=last_28_long["age"])
age_counts_long=age_counts_long.sort_index()
age_stats_long = age_counts_long.cumsum()
age_cpdf_long = age_stats_long / age_stats_long.max()

In [ ]:
plt.plot(age_stats_long.index.total_seconds() / (24.0 * 3600), age_stats_long.values)
plt.title("Age of Kubeflow deployments for clusters created in last 28 days")
plt.xlabel("Deployment Age(days)")
plt.ylabel('Cumulative deployment count', fontsize='large');

In [ ]:
plt.plot(age_cpdf_long.index.total_seconds() / (24.0 * 3600), age_cpdf_long.values)
now_day = now.strftime("%Y-%m-%d")
plt.title("Age of Kubeflow deployments for clusters created in last 28 days\nTotal deployments={0}\n{1}".format(age_stats_long.max(), now_day))
plt.xlabel("Deployment Age(days)")
plt.ylabel('Cumulative distribution', fontsize='large');

In [ ]:
last_28_long["age"].quantile([.25, .5, .75, .9])