In [10]:
import pandas
import numpy as np
import mpld3
import matplotlib
import matplotlib.pyplot as plt
import datetime

matplotlib.style.use('ggplot')

%matplotlib inline

In [11]:
data = pandas.read_csv('/tmp/inspector_demo.csv',
                       parse_dates=['run_start_timestamp', 'run_check_start_timestamp', 'run_check_end_timestamp'],
                       date_parser=lambda d: datetime.datetime.strptime(d, "%Y-%m-%d %H:%M:%S"))
print(type(data['run_start_timestamp'][0]))
data.head()


<class 'pandas.tslib.Timestamp'>
Out[11]:
instance_name database_name table_name table_partitioned run_start_timestamp run_mode partition_key partition_value check_name check_policy_type ... run_check_start_timestamp run_check_end_timestamp run_check_mode run_check_rc run_check_violation_cnt run_check_anomaly_score run_check_scope run_check_unit run_check_severity_score run_check_validated
0 prod westwind cust_asset_events 1 2015-01-01 02:00:00 incremental date_id 2015001 date_id_fk quality ... 2015-01-01 02:00:00 2015-01-01 02:00:01 incremental 0 0 0 0 rows 0 NaN
1 prod westwind cust_asset_events 1 2015-01-01 02:00:00 incremental date_id 2015001 event_type_id_fk quality ... 2015-01-01 02:00:01 2015-01-01 02:00:02 incremental 0 0 0 0 rows 0 NaN
2 prod westwind cust_asset_events 1 2015-01-01 02:00:00 incremental date_id 2015001 stats_not_stale data-management ... 2015-01-01 02:00:02 2015-01-01 02:00:03 full 0 0 0 0 tables 0 NaN
3 prod westwind cust_asset_events 1 2015-01-01 02:00:00 incremental date_id 2015001 table_not_empty quality ... 2015-01-01 02:00:03 2015-01-01 02:00:04 incremental 0 0 0 0 tables 0 NaN
4 prod westwind cust_asset_events 1 2015-01-01 02:00:00 incremental date_id 2015001 asset_id_fk quality ... 2015-01-01 02:00:04 2015-01-01 02:00:05 incremental 0 0 0 0 rows 0 NaN

5 rows × 21 columns


In [12]:
[group['run_check_violation_cnt'].sum() for key, group in data.groupby('instance_name')]


Out[12]:
[62180046357]

In [36]:
history_raw = [pandas.Series(df['run_check_violation_cnt'].values, index=df['run_start_timestamp'].values)
                    for df in [group[['run_start_timestamp', 'run_check_violation_cnt']]
                               for key, group in data.groupby('instance_name')]]
# Resample each timeseries by minute
history = [hist.resample('H', how='count') for hist in history_raw]
history[0].where(history[0].values >= datetime.datetime.now() - datetime.timedelta(days=7))


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-36-459794804a7e> in <module>()
      4 # Resample each timeseries by minute
      5 history = [hist.resample('H', how='count') for hist in history_raw]
----> 6 history[0].where(history[0].values >= datetime.datetime.now() - datetime.timedelta(days=7))

TypeError: unorderable types: int() >= datetime.datetime()

In [6]:
series[0].resample('D', how='count')


Out[6]:
2015-01-01    4380
Freq: D, dtype: int64

In [ ]:
plots = [plt.figure() for h in history]

In [ ]: