In [6]:
import os; os.sys.path.append(os.path.dirname(os.path.abspath('.'))) # for relative imports
from utils.nab_data import NABData
import numpy as np
import pandas as pd

In [54]:
import numpy as np
import pandas as pd
import os
from collections import defaultdict, OrderedDict
from natural.number import ordinal
from natural.date import compress
from dateutil.parser import parse

class NABData(object):
    data_dir = os.path.join(os.path.dirname(os.path.abspath('.')), 'data')
    
    def __init__(self):
        self._load()
    
    def _dir_iter(self):
        for root, dirs, files in os.walk(self.data_dir):
            for filename in filter(lambda x: x.find('csv') != -1, files):
                yield root.split('/')[-1], filename
                
    @staticmethod
    def _format_timeseries(df):
        df['timestamp'] = df['timestamp'].map(parse)
        return df.set_index('timestamp').sort_index()
    
    @staticmethod
    def _filename2key(filename):
        return filename[:filename.find('.csv')]
                
    def _load(self):
        self.files = defaultdict(dict)
        cache_miss = False
        hdf_path = self._hdf_path()
        for folder, filename in self._dir_iter():
            try:
                self.files[folder][filename] = pd.read_hdf(hdf_path, os.path.join(folder, self._filename2key(filename)))
            except (KeyError, IOError):
                self.files[folder][filename] = self._format_timeseries(pd.read_csv(os.path.join(self.data_dir, folder, filename)))
                cache_miss = True
        if cache_miss: 
            self._save_hdf()
            
    def _hdf_path(self):
        return os.path.join(self.data_dir, 'cache.h5')
    
    def _save_hdf(self):
        hdf_path = self._hdf_path()
        for category, filename, df in self._files_iter():
            key = os.path.join(category, self._filename2key(filename))
            df.to_hdf(hdf_path, key)
            
    def __getitem__(self, items):
        return self.files[items]
    
    def _files_iter(self):
        for cat_key in self.files.keys():
            for filename in self.files[cat_key].keys():
                yield (cat_key, filename, self.files[cat_key][filename])
    
    def summary(self):
        data = []
        cols = ['category', 'file', 'length', 'features', 'period', 'periods_vary', 'min', 
                'max', 'mean', 'std', '25th_percentile', '50th_percentile', '75th_percentile']
        for category, filename, df in self._files_iter():
            row = dict([('category', category), ('file', filename)])
            row['length'] = len(df)
            row['features'] = len(df.columns)
            row['period'] = compress(df.index[1]-df.index[0])
            row['periods_vary'] = int(np.unique(np.diff(df.index)).shape == (1,))
            row['min'] = df['value'].min()
            row['max'] = df['value'].max()
            row['mean'] = df['value'].mean()
            row['std'] = df['value'].std()
            for q in [25, 50, 75]:
                row['{}_percentile'.format(ordinal(q))] = np.percentile(df['value'], q)
            data.append(row)
        return self._round_float_cols(pd.DataFrame(data)[cols])
    
    @staticmethod
    def _round_float_cols(df, digits=2):
        for col in df.columns: 
            if df[col].dtype in [np.float64, np.float32]:
                df[col] = np.round(df[col], digits)
        return df

In [11]:
df = pd.read_csv('/home/jstrong/src/NAB/data/realKnownCause/nyc_taxi.csv')
df.head()


Out[11]:
timestamp value
0 2014-07-01 00:00:00 10844
1 2014-07-01 00:30:00 8127
2 2014-07-01 01:00:00 6210
3 2014-07-01 01:30:00 4656
4 2014-07-01 02:00:00 3820

In [45]:
#data = NABData()
np.unique(np.diff(data['realKnownCause']['nyc_taxi.csv'].index)).shape


realKnownCause
Out[45]:
(1,)

In [55]:
data = NABData()
data.summary()


/home/jstrong/src/envs/neural/local/lib/python2.7/site-packages/tables/path.py:100: NaturalNameWarning: object name is not a valid Python identifier: 'iio_us-east-1_i-a2eb1cd9_NetworkIn'; it does not match the pattern ``^[a-zA-Z_][a-zA-Z0-9_]*$``; you will not be able to use natural naming to access this object; using ``getattr()`` will still work, though
  NaturalNameWarning)
/home/jstrong/src/envs/neural/local/lib/python2.7/site-packages/tables/path.py:100: NaturalNameWarning: object name is not a valid Python identifier: 'exchange-2_cpm_results'; it does not match the pattern ``^[a-zA-Z_][a-zA-Z0-9_]*$``; you will not be able to use natural naming to access this object; using ``getattr()`` will still work, though
  NaturalNameWarning)
/home/jstrong/src/envs/neural/local/lib/python2.7/site-packages/tables/path.py:100: NaturalNameWarning: object name is not a valid Python identifier: 'exchange-2_cpc_results'; it does not match the pattern ``^[a-zA-Z_][a-zA-Z0-9_]*$``; you will not be able to use natural naming to access this object; using ``getattr()`` will still work, though
  NaturalNameWarning)
/home/jstrong/src/envs/neural/local/lib/python2.7/site-packages/tables/path.py:100: NaturalNameWarning: object name is not a valid Python identifier: 'exchange-3_cpm_results'; it does not match the pattern ``^[a-zA-Z_][a-zA-Z0-9_]*$``; you will not be able to use natural naming to access this object; using ``getattr()`` will still work, though
  NaturalNameWarning)
/home/jstrong/src/envs/neural/local/lib/python2.7/site-packages/tables/path.py:100: NaturalNameWarning: object name is not a valid Python identifier: 'exchange-3_cpc_results'; it does not match the pattern ``^[a-zA-Z_][a-zA-Z0-9_]*$``; you will not be able to use natural naming to access this object; using ``getattr()`` will still work, though
  NaturalNameWarning)
/home/jstrong/src/envs/neural/local/lib/python2.7/site-packages/tables/path.py:100: NaturalNameWarning: object name is not a valid Python identifier: 'exchange-4_cpc_results'; it does not match the pattern ``^[a-zA-Z_][a-zA-Z0-9_]*$``; you will not be able to use natural naming to access this object; using ``getattr()`` will still work, though
  NaturalNameWarning)
/home/jstrong/src/envs/neural/local/lib/python2.7/site-packages/tables/path.py:100: NaturalNameWarning: object name is not a valid Python identifier: 'exchange-4_cpm_results'; it does not match the pattern ``^[a-zA-Z_][a-zA-Z0-9_]*$``; you will not be able to use natural naming to access this object; using ``getattr()`` will still work, though
  NaturalNameWarning)
Out[55]:
category file length features period periods_vary min max mean std 25th_percentile 50th_percentile 75th_percentile
0 realTweets Twitter_volume_IBM.csv 15893 1 5m 1 0.00 1.390000e+02 4.39 5.50 1.00 3.00 6.00
1 realTweets Twitter_volume_GOOG.csv 15842 1 5m 1 0.00 4.650000e+02 20.74 18.56 11.00 16.00 26.00
2 realTweets Twitter_volume_FB.csv 15833 1 5m 1 0.00 1.258000e+03 17.81 19.74 9.00 14.00 22.00
3 realTweets Twitter_volume_CRM.csv 15902 1 5m 1 0.00 2.090000e+02 3.35 4.61 1.00 2.00 5.00
4 realTweets Twitter_volume_KO.csv 15851 1 5m 1 0.00 2.241000e+03 11.40 24.80 5.00 8.00 13.00
5 realTweets Twitter_volume_AMZN.csv 15831 1 5m 1 0.00 1.673000e+03 53.30 30.55 36.00 50.00 65.00
6 realTweets Twitter_volume_UPS.csv 15866 1 5m 1 0.00 2.310000e+02 5.46 21.57 0.00 2.00 4.00
7 realTweets Twitter_volume_AAPL.csv 15902 1 5m 1 0.00 1.347900e+04 85.55 321.05 29.00 47.00 76.00
8 realTweets Twitter_volume_CVS.csv 15853 1 5m 1 0.00 5.000000e+01 0.36 1.09 0.00 0.00 0.00
9 realTweets Twitter_volume_PFE.csv 15858 1 5m 1 0.00 3.600000e+01 0.87 1.46 0.00 0.00 1.00
10 realKnownCause nyc_taxi.csv 10320 1 30m 1 8.00 3.919700e+04 15137.57 6939.50 10262.00 16778.00 19838.75
11 realKnownCause rogue_agent_key_hold.csv 1882 1 5m 0 0.00 9.000000e-01 0.04 0.06 0.00 0.00 0.07
12 realKnownCause rogue_agent_key_updown.csv 5315 1 5m 0 0.00 2.882100e+02 0.49 5.37 0.00 0.00 0.00
13 realKnownCause machine_temperature_system_failure.csv 22695 1 5m 0 2.08 1.085100e+02 85.93 13.75 83.08 89.41 94.02
14 realKnownCause ec2_request_latency_system_failure.csv 4032 1 5m 0 22.86 9.925000e+01 45.16 2.29 43.94 45.02 46.36
15 realKnownCause cpu_utilization_asg_misconfiguration.csv 18050 1 5m 1 11.53 1.000000e+02 38.28 15.64 30.79 32.00 35.66
16 realKnownCause ambient_temperature_system_failure.csv 7267 1 1h 0 57.46 8.622000e+01 71.24 4.25 68.37 71.86 74.43
17 artificialNoAnomaly art_daily_small_noise.csv 4032 1 5m 1 18.00 8.798000e+01 42.44 28.08 19.84 21.61 75.45
18 artificialNoAnomaly art_daily_perfect_square_wave.csv 4032 1 5m 1 20.00 8.000000e+01 42.50 29.05 20.00 20.00 80.00
19 artificialNoAnomaly art_flatline.csv 4032 1 5m 1 45.00 4.500000e+01 45.00 0.00 45.00 45.00 45.00
20 artificialNoAnomaly art_daily_no_noise.csv 4032 1 5m 1 20.00 8.000000e+01 42.50 27.95 20.00 20.29 79.62
21 artificialNoAnomaly art_noisy.csv 4032 1 5m 1 8.00 1.900000e+01 13.49 3.15 10.79 13.45 16.23
22 realAWSCloudwatch elb_request_count_8c0756.csv 4032 1 5m 0 1.00 6.560000e+02 61.84 56.66 15.00 48.00 89.00
23 realAWSCloudwatch ec2_cpu_utilization_24ae8d.csv 4032 1 5m 1 0.07 2.340000e+00 0.13 0.09 0.13 0.13 0.13
24 realAWSCloudwatch rds_cpu_utilization_e47b3b.csv 4032 1 5m 1 12.63 7.623000e+01 18.93 5.61 15.84 16.68 25.52
25 realAWSCloudwatch ec2_cpu_utilization_fe7f93.csv 4032 1 5m 1 1.80 9.967000e+01 5.78 11.81 2.18 2.58 3.43
26 realAWSCloudwatch ec2_cpu_utilization_825cc2.csv 4032 1 5m 0 18.72 9.912000e+01 89.79 12.08 89.08 92.45 94.30
27 realAWSCloudwatch ec2_cpu_utilization_53ea38.csv 4032 1 5m 1 1.60 2.660000e+00 1.83 0.10 1.77 1.80 1.87
28 realAWSCloudwatch rds_cpu_utilization_cc0c53.csv 4032 1 5m 0 5.19 2.510000e+01 8.11 3.65 6.01 6.08 7.10
29 realAWSCloudwatch ec2_cpu_utilization_ac20cd.csv 4032 1 5m 0 2.46 9.974000e+01 40.99 21.92 33.15 34.66 37.63
30 realAWSCloudwatch iio_us-east-1_i-a2eb1cd9_NetworkIn.csv 1243 1 5m 1 789781.00 6.151940e+07 4615221.91 4534241.67 2576003.70 3795175.80 5152488.80
31 realAWSCloudwatch ec2_network_in_257a54.csv 4032 1 5m 0 38516.60 2.451260e+08 570809.85 4607792.94 219341.75 234245.50 251774.75
32 realAWSCloudwatch ec2_disk_write_bytes_1ef3de.csv 4730 1 5m 0 0.00 5.474570e+08 6581560.77 40385680.18 0.00 0.00 0.00
33 realAWSCloudwatch ec2_network_in_5abac7.csv 4730 1 5m 0 42.00 8.285420e+06 118714.64 775718.73 42.00 68.40 108.00
34 realAWSCloudwatch ec2_cpu_utilization_c6585a.csv 4032 1 5m 1 0.06 1.600000e+00 0.09 0.09 0.07 0.07 0.07
35 realAWSCloudwatch ec2_disk_write_bytes_c0d644.csv 4032 1 5m 1 0.00 8.639640e+08 17331273.32 79696644.73 0.00 0.00 0.00
36 realAWSCloudwatch ec2_cpu_utilization_77c1ca.csv 4032 1 5m 1 0.06 9.990000e+01 10.52 26.93 0.10 0.10 0.10
37 realAWSCloudwatch ec2_cpu_utilization_5f5533.csv 4032 1 5m 1 34.77 6.809000e+01 43.11 4.30 39.30 42.92 46.01
38 realAWSCloudwatch grok_asg_anomaly.csv 4621 1 5m 1 0.00 4.562000e+01 27.68 13.14 33.33 33.44 33.56
39 realAdExchange exchange-2_cpm_results.csv 1624 1 1h 0 0.00 1.050000e+00 0.34 0.16 0.21 0.30 0.46
40 realAdExchange exchange-2_cpc_results.csv 1624 1 1h 0 0.03 2.300000e-01 0.10 0.03 0.08 0.10 0.12
41 realAdExchange exchange-3_cpm_results.csv 1538 1 1h 0 0.32 5.500000e+00 0.77 0.34 0.56 0.70 0.90
42 realAdExchange exchange-3_cpc_results.csv 1538 1 1h 0 0.04 1.030000e+00 0.14 0.08 0.10 0.12 0.15
43 realAdExchange exchange-4_cpc_results.csv 1643 1 1h 0 0.02 3.130000e+00 0.09 0.13 0.06 0.07 0.10
44 realAdExchange exchange-4_cpm_results.csv 1643 1 1h 0 0.12 1.644000e+01 0.53 0.75 0.36 0.47 0.58
45 artificialWithAnomaly art_daily_nojump.csv 4032 1 5m 1 18.00 8.797000e+01 40.82 27.64 19.70 21.38 74.79
46 artificialWithAnomaly art_daily_jumpsdown.csv 4032 1 5m 1 18.00 8.800000e+01 41.51 27.51 19.99 21.64 74.86
47 artificialWithAnomaly art_daily_jumpsup.csv 4032 1 5m 1 18.00 1.649500e+02 44.49 32.43 19.99 21.65 76.44
48 artificialWithAnomaly art_daily_flatmiddle.csv 4032 1 5m 1 -22.00 8.796000e+01 18.98 45.37 -19.93 -17.54 74.43
49 artificialWithAnomaly art_increase_spike_density.csv 4032 1 5m 1 0.00 2.000000e+01 0.42 2.87 0.00 0.00 0.00
50 artificialWithAnomaly art_load_balancer_spikes.csv 4032 1 5m 1 0.00 3.220000e+00 0.11 0.44 0.00 0.00 0.00
51 realTraffic speed_6005.csv 2500 1 10m 0 20.00 1.090000e+02 81.91 8.75 77.00 82.00 88.00
52 realTraffic TravelTime_387.csv 2500 1 14m 0 9.00 5.059000e+03 325.09 399.56 133.00 201.00 366.00
53 realTraffic speed_7578.csv 1127 1 5m 0 1.00 9.000000e+01 64.05 9.24 63.00 66.00 68.00
54 realTraffic occupancy_t4013.csv 2500 1 5m 0 0.00 4.306000e+01 7.24 4.37 4.06 6.83 9.83
55 realTraffic occupancy_6005.csv 2380 1 5m 0 0.00 2.228000e+01 4.50 3.40 1.94 3.83 6.17
56 realTraffic TravelTime_451.csv 2162 1 10m 0 22.00 5.578000e+03 327.22 444.74 146.00 203.00 332.00
57 realTraffic speed_t4013.csv 2495 1 5m 0 11.00 7.700000e+01 62.93 5.19 61.00 63.00 65.00

In [36]:
data.files.keys()


Out[36]:
['realTweets',
 'realKnownCause',
 'artificialNoAnomaly',
 'realAWSCloudwatch',
 'realAdExchange',
 'artificialWithAnomaly',
 'realTraffic']

In [15]:
data = NABData()
data.load()


------ data
------------ README.md
-------- realTweets
-------------- Twitter_volume_IBM.csv
-------------- Twitter_volume_CVS.csv
-------------- Twitter_volume_AMZN.csv
-------------- Twitter_volume_UPS.csv
-------------- Twitter_volume_CRM.csv
-------------- Twitter_volume_FB.csv
-------------- Twitter_volume_AAPL.csv
-------------- Twitter_volume_GOOG.csv
-------------- Twitter_volume_PFE.csv
-------------- Twitter_volume_KO.csv
-------- artificialNoAnomaly
-------------- art_noisy.csv
-------------- art_daily_small_noise.csv
-------------- art_flatline.csv
-------------- art_daily_perfect_square_wave.csv
-------------- art_daily_no_noise.csv
-------- realAdExchange
-------------- exchange-4_cpm_results.csv
-------------- exchange-4_cpc_results.csv
-------------- exchange-3_cpc_results.csv
-------------- exchange-2_cpm_results.csv
-------------- exchange-3_cpm_results.csv
-------------- exchange-2_cpc_results.csv
-------- artificialWithAnomaly
-------------- art_daily_nojump.csv
-------------- art_load_balancer_spikes.csv
-------------- art_daily_jumpsdown.csv
-------------- art_daily_flatmiddle.csv
-------------- art_daily_jumpsup.csv
-------------- art_increase_spike_density.csv
-------- realTraffic
-------------- TravelTime_451.csv
-------------- occupancy_t4013.csv
-------------- TravelTime_387.csv
-------------- speed_t4013.csv
-------------- speed_7578.csv
-------------- speed_6005.csv
-------------- occupancy_6005.csv
-------- realKnownCause
-------------- rogue_agent_key_updown.csv
-------------- ambient_temperature_system_failure.csv
-------------- ec2_request_latency_system_failure.csv
-------------- rogue_agent_key_hold.csv
-------------- nyc_taxi.csv
-------------- machine_temperature_system_failure.csv
-------------- cpu_utilization_asg_misconfiguration.csv
-------- realAWSCloudwatch
-------------- ec2_disk_write_bytes_c0d644.csv
-------------- iio_us-east-1_i-a2eb1cd9_NetworkIn.csv
-------------- rds_cpu_utilization_e47b3b.csv
-------------- ec2_cpu_utilization_ac20cd.csv
-------------- grok_asg_anomaly.csv
-------------- ec2_cpu_utilization_c6585a.csv
-------------- ec2_network_in_5abac7.csv
-------------- ec2_cpu_utilization_5f5533.csv
-------------- ec2_disk_write_bytes_1ef3de.csv
-------------- ec2_cpu_utilization_53ea38.csv
-------------- ec2_cpu_utilization_825cc2.csv
-------------- ec2_cpu_utilization_fe7f93.csv
-------------- ec2_cpu_utilization_77c1ca.csv
-------------- rds_cpu_utilization_cc0c53.csv
-------------- ec2_cpu_utilization_24ae8d.csv
-------------- ec2_network_in_257a54.csv
-------------- elb_request_count_8c0756.csv

In [ ]: