notebook.community

Edit and run



In [10]:

    
import pandas
import numpy as np
import mpld3
import matplotlib
import matplotlib.pyplot as plt
import datetime

matplotlib.style.use('ggplot')

%matplotlib inline



In [11]:

    
data = pandas.read_csv('/tmp/inspector_demo.csv',
                       parse_dates=['run_start_timestamp', 'run_check_start_timestamp', 'run_check_end_timestamp'],
                       date_parser=lambda d: datetime.datetime.strptime(d, "%Y-%m-%d %H:%M:%S"))
print(type(data['run_start_timestamp'][0]))
data.head()









    



<class 'pandas.tslib.Timestamp'>






    Out[11]:






  
    
      
      instance_name
      database_name
      table_name
      table_partitioned
      run_start_timestamp
      run_mode
      partition_key
      partition_value
      check_name
      check_policy_type
      ...
      run_check_start_timestamp
      run_check_end_timestamp
      run_check_mode
      run_check_rc
      run_check_violation_cnt
      run_check_anomaly_score
      run_check_scope
      run_check_unit
      run_check_severity_score
      run_check_validated
    
  
  
    
      0
      prod
      westwind
      cust_asset_events
      1
      2015-01-01 02:00:00
      incremental
      date_id
      2015001
      date_id_fk
      quality
      ...
      2015-01-01 02:00:00
      2015-01-01 02:00:01
      incremental
      0
      0
      0
      0
      rows
      0
      NaN
    
    
      1
      prod
      westwind
      cust_asset_events
      1
      2015-01-01 02:00:00
      incremental
      date_id
      2015001
      event_type_id_fk
      quality
      ...
      2015-01-01 02:00:01
      2015-01-01 02:00:02
      incremental
      0
      0
      0
      0
      rows
      0
      NaN
    
    
      2
      prod
      westwind
      cust_asset_events
      1
      2015-01-01 02:00:00
      incremental
      date_id
      2015001
      stats_not_stale
      data-management
      ...
      2015-01-01 02:00:02
      2015-01-01 02:00:03
      full
      0
      0
      0
      0
      tables
      0
      NaN
    
    
      3
      prod
      westwind
      cust_asset_events
      1
      2015-01-01 02:00:00
      incremental
      date_id
      2015001
      table_not_empty
      quality
      ...
      2015-01-01 02:00:03
      2015-01-01 02:00:04
      incremental
      0
      0
      0
      0
      tables
      0
      NaN
    
    
      4
      prod
      westwind
      cust_asset_events
      1
      2015-01-01 02:00:00
      incremental
      date_id
      2015001
      asset_id_fk
      quality
      ...
      2015-01-01 02:00:04
      2015-01-01 02:00:05
      incremental
      0
      0
      0
      0
      rows
      0
      NaN
    
  

5 rows × 21 columns



In [12]:

    
[group['run_check_violation_cnt'].sum() for key, group in data.groupby('instance_name')]









    Out[12]:





[62180046357]



In [36]:

    
history_raw = [pandas.Series(df['run_check_violation_cnt'].values, index=df['run_start_timestamp'].values)
                    for df in [group[['run_start_timestamp', 'run_check_violation_cnt']]
                               for key, group in data.groupby('instance_name')]]
# Resample each timeseries by minute
history = [hist.resample('H', how='count') for hist in history_raw]
history[0].where(history[0].values >= datetime.datetime.now() - datetime.timedelta(days=7))









    



---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-36-459794804a7e> in <module>()
      4 # Resample each timeseries by minute
      5 history = [hist.resample('H', how='count') for hist in history_raw]
----> 6 history[0].where(history[0].values >= datetime.datetime.now() - datetime.timedelta(days=7))

TypeError: unorderable types: int() >= datetime.datetime()



In [6]:

    
series[0].resample('D', how='count')









    Out[6]:





2015-01-01    4380
Freq: D, dtype: int64



In [ ]:

    
plots = [plt.figure() for h in history]



In [ ]:

	instance_name	database_name	table_name	table_partitioned	run_start_timestamp	run_mode	partition_key	partition_value	check_name	check_policy_type	...	run_check_start_timestamp	run_check_end_timestamp	run_check_mode	run_check_unit	run_check_validated
0	prod	westwind	cust_asset_events	1	2015-01-01 02:00:00	incremental	date_id	2015001	date_id_fk	quality	...	2015-01-01 02:00:00	2015-01-01 02:00:01	incremental	rows	NaN
1	prod	westwind	cust_asset_events	1	2015-01-01 02:00:00	incremental	date_id	2015001	event_type_id_fk	quality	...	2015-01-01 02:00:01	2015-01-01 02:00:02	incremental	rows	NaN
2	prod	westwind	cust_asset_events	1	2015-01-01 02:00:00	incremental	date_id	2015001	stats_not_stale	data-management	...	2015-01-01 02:00:02	2015-01-01 02:00:03	full	tables	NaN
3	prod	westwind	cust_asset_events	1	2015-01-01 02:00:00	incremental	date_id	2015001	table_not_empty	quality	...	2015-01-01 02:00:03	2015-01-01 02:00:04	incremental	tables	NaN
4	prod	westwind	cust_asset_events	1	2015-01-01 02:00:00	incremental	date_id	2015001	asset_id_fk	quality	...	2015-01-01 02:00:04	2015-01-01 02:00:05	incremental	rows	NaN