In [6]:
import os; os.sys.path.append(os.path.dirname(os.path.abspath('.'))) # for relative imports
from utils.nab_data import NABData
import numpy as np
import pandas as pd
In [54]:
import numpy as np
import pandas as pd
import os
from collections import defaultdict, OrderedDict
from natural.number import ordinal
from natural.date import compress
from dateutil.parser import parse
class NABData(object):
    """Loader for the Numenta Anomaly Benchmark (NAB) CSV corpus.

    Walks ``data_dir`` for ``*.csv`` files, parses each into a
    timestamp-indexed DataFrame, and caches everything in a single HDF5
    file so subsequent loads are fast.  Access series via
    ``data[category][filename]``.
    """

    # Resolved against the current working directory ('.'), not this file's
    # location — assumes the notebook is run from the expected repo layout.
    data_dir = os.path.join(os.path.dirname(os.path.abspath('.')), 'data')

    def __init__(self):
        # Eagerly load every series (from HDF cache when available).
        self._load()

    def _dir_iter(self):
        """Yield ``(category, filename)`` for every .csv under data_dir."""
        for root, _dirs, files in os.walk(self.data_dir):
            for filename in files:
                # Only true .csv files: the previous substring test
                # (find('csv') != -1) also matched e.g. 'csv_notes.txt'.
                if filename.endswith('.csv'):
                    # basename() is portable; splitting on '/' breaks on Windows.
                    yield os.path.basename(root), filename

    @staticmethod
    def _format_timeseries(df):
        """Parse the 'timestamp' column and make it the sorted index."""
        df['timestamp'] = df['timestamp'].map(parse)
        return df.set_index('timestamp').sort_index()

    @staticmethod
    def _filename2key(filename):
        """Strip a trailing '.csv': 'nyc_taxi.csv' -> 'nyc_taxi'.

        Names without '.csv' are returned unchanged (previously
        ``find`` returned -1 and silently chopped the last character).
        """
        idx = filename.find('.csv')
        return filename[:idx] if idx != -1 else filename

    def _hdf_key(self, folder, filename):
        # HDF5 keys always use '/' regardless of OS; os.path.join would
        # produce '\\'-separated keys on Windows.
        return '/'.join((folder, self._filename2key(filename)))

    def _load(self):
        """Populate ``self.files[category][filename] -> DataFrame``.

        Tries the HDF5 cache first (EAFP); on a miss, parses the CSV and
        records that the cache needs rewriting.
        """
        self.files = defaultdict(dict)
        cache_miss = False
        hdf_path = self._hdf_path()
        for folder, filename in self._dir_iter():
            try:
                self.files[folder][filename] = pd.read_hdf(
                    hdf_path, self._hdf_key(folder, filename))
            except (KeyError, IOError):
                # Cache file absent or key missing: fall back to the CSV.
                csv_path = os.path.join(self.data_dir, folder, filename)
                self.files[folder][filename] = self._format_timeseries(
                    pd.read_csv(csv_path))
                cache_miss = True
        if cache_miss:
            self._save_hdf()

    def _hdf_path(self):
        """Path of the single HDF5 cache file."""
        return os.path.join(self.data_dir, 'cache.h5')

    def _save_hdf(self):
        """Write every loaded DataFrame into the HDF5 cache."""
        hdf_path = self._hdf_path()
        for category, filename, df in self._files_iter():
            df.to_hdf(hdf_path, self._hdf_key(category, filename))

    def __getitem__(self, items):
        # NOTE: self.files is a defaultdict, so an unknown category yields
        # an empty dict rather than raising KeyError.
        return self.files[items]

    def _files_iter(self):
        """Yield ``(category, filename, df)`` for every loaded series."""
        for category, by_file in self.files.items():
            for filename, df in by_file.items():
                yield category, filename, df

    def summary(self):
        """Return a DataFrame of per-series descriptive statistics."""
        cols = ['category', 'file', 'length', 'features', 'period',
                'periods_vary', 'min', 'max', 'mean', 'std',
                '25th_percentile', '50th_percentile', '75th_percentile']
        data = []
        for category, filename, df in self._files_iter():
            row = {'category': category, 'file': filename}
            row['length'] = len(df)
            row['features'] = len(df.columns)
            # Guard: a 0/1-row frame has no first interval to report.
            row['period'] = (compress(df.index[1] - df.index[0])
                             if len(df) > 1 else None)
            # Bug fix: the original used ``== (1,)``, which flagged 1 when
            # there was exactly ONE unique interval, i.e. when the period
            # does NOT vary. 1 now truly means "sampling periods vary".
            row['periods_vary'] = int(
                np.unique(np.diff(df.index)).shape != (1,))
            row['min'] = df['value'].min()
            row['max'] = df['value'].max()
            row['mean'] = df['value'].mean()
            row['std'] = df['value'].std()
            for q in [25, 50, 75]:
                row['{}_percentile'.format(ordinal(q))] = np.percentile(
                    df['value'], q)
            data.append(row)
        return self._round_float_cols(pd.DataFrame(data)[cols])

    @staticmethod
    def _round_float_cols(df, digits=2):
        """Round float32/float64 columns in place to ``digits`` decimals."""
        for col in df.columns:
            if df[col].dtype in [np.float64, np.float32]:
                df[col] = np.round(df[col], digits)
        return df
In [11]:
# Spot-check one NAB series parsed straight from CSV.
# NOTE(review): absolute, machine-specific path — will only run on this host.
df = pd.read_csv('/home/jstrong/src/NAB/data/realKnownCause/nyc_taxi.csv')
df.head()
Out[11]:
In [45]:
#data = NABData()
# Count distinct sampling intervals for the nyc_taxi series:
# shape (1,) means the series is sampled at one perfectly regular period.
np.unique(np.diff(data['realKnownCause']['nyc_taxi.csv'].index)).shape
Out[45]:
In [55]:
# Load the full corpus (HDF cache or CSVs) and show per-series statistics.
data = NABData()
data.summary()
Out[55]:
In [36]:
data.files.keys()
Out[36]:
In [15]:
# Bug fix: NABData defines no public `load` — `data.load()` raised
# AttributeError. Loading already happens in __init__; `_load()` is only
# needed here to force a reload after the constructor.
data = NABData()
data._load()
In [ ]: