Tables

This module simplifies handling of input and output tables stored as .csv files. For now, it assumes that all files belonging to a data set named 'xyz' are kept in a directory 'xyz.d'. Eventually there will be a way of archiving sets of files in .zip files as well.
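
For example, with the default root directory 'data' and a data set named 'frame-6' (the data set used in the tests below), the on-disk layout looks like this:

data/
    frame-6.d/
        nodes.csv          # input table 'nodes'
        out/
            nodes.csv      # tables written back with prefix='out'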


In [1]:
from __future__ import print_function

import pandas as pd
import os, os.path
import StringIO
import hashlib
from IPython.core.magic import register_cell_magic
import re

In [2]:
class Table(pd.DataFrame):
    
    _internal_names = pd.DataFrame._internal_names + ['filename']
    _internal_names_set = set(_internal_names)
    
    _metadata = ['tablename','dsname']
            
    def __init__(self,*args,**kwargs):
        dsname = kwargs.pop('dsname',None)
        tablename = kwargs.pop('tablename',None)
        filename = kwargs.pop('filename',None)
        super(Table,self).__init__(*args,**kwargs)   # not super(self.__class__,...), which breaks under further subclassing
        if dsname is not None:
            self.dsname = dsname
        if tablename is not None:
            self.tablename = tablename
        if filename is not None:
            self.filename = filename
        
    @property
    def _constructor(self):
        return self.__class__
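
The boilerplate above is the standard pandas subclassing pattern: _constructor makes operations that build new frames return Table rather than a plain DataFrame, _metadata lists the attributes that pandas should try to carry over to derived objects, and _internal_names registers filename as a regular instance attribute that is not carried over. A minimal sketch (the column names mirror the 'nodes' table used in the tests below):

t = Table({'NODEID': ['A','B'], 'X': [0,0], 'Y': [0,4000]},
          tablename='nodes', dsname='frame-6')
sub = t[['NODEID','X']]           # column selection goes through _constructor
print(type(sub))                  # __main__.Table, not a plain DataFrame
# Whether tablename/dsname follow the derived object depends on the pandas
# version running __finalize__ on this path:
print(getattr(sub, 'tablename', None))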

In [3]:
class DataSet(object):    
    
    ROOT = 'data'
    DSNAME = None     # default data set name
    DSTYPE = 'dir'    # someday we will allow 'zip' for zip archives
    #DSTYPE = 'cell'  # for CSV data provided via %%Table cell magic
    #DSTYPE = 'data'  # for dataframe data provided directly
    CELLDATA = {}     # csv text from %%Table magic cells, indexed by table name
    DATATABLES = {}   # dataframes directly provided by client, indexed by table name
    
    def __init__(self):
        raise NotImplementedError("Cannot create instance of class '{}'".format(self.__class__.__name__))
    
    @classmethod
    def set_root(cls,root):
        assert os.path.exists(root)
        cls.ROOT = root
    
    @classmethod
    def set_source(cls,dsname,dstype=None):
        if dstype is None:
            dirname = cls.ROOT + '/' + dsname + '.d'
            if os.path.exists(dirname):
                dstype = 'dir'
            else:
                dstype = 'unknown'
        assert dstype in ['dir','cell','data']
        cls.DSNAME = dsname
        cls.DSTYPE = dstype
        cls.CELLDATA = {}
        cls.DATATABLES = {}
        
    @classmethod
    def set_data(cls,tablename,table):
        cls.DATATABLES[tablename] = table
        if tablename in cls.CELLDATA:
            del cls.CELLDATA[tablename]
            
    @classmethod
    def set_cell(cls,tablename,celltext):
        cls.CELLDATA[tablename] = celltext
        if tablename in cls.DATATABLES:
            del cls.DATATABLES[tablename]
            
    @classmethod
    def _file_name(cls,tablename,prefix=None):
        n = tablename
        if prefix:
            n = prefix + '/' + tablename
        return cls.ROOT + '/' + cls.DSNAME + '.d/' + n + '.csv'
    
    @classmethod
    def get_table(cls,tablename,optional=False,prefix=None,columns=None,extrasok=True):
        stream = None
        filename = None
        t = None
        if tablename in cls.DATATABLES:
            t = cls.DATATABLES[tablename]
        else:
            if tablename in cls.CELLDATA:
                stream = StringIO.StringIO(cls.CELLDATA[tablename])
            else:
                filename = cls._file_name(tablename,prefix=prefix)
                if os.path.exists(filename):
                    stream = open(filename,'r')
            if stream is None:
                if optional:
                    d = pd.DataFrame(columns=columns)
                else:
                    raise ValueError("Table '{}' does not exist.".format(tablename))
            else:
                d = pd.read_csv(stream,index_col=None,skipinitialspace=True)
            t = Table(d,dsname=cls.DSNAME,tablename=tablename,filename=filename)

        if columns is None:
            return t
        prov = set(t.columns)
        reqd = set(columns)
        if reqd-prov:
            raise ValueError("Columns missing for table '{}': {}. Required columns are: {}"
                             .format(tablename,list(reqd-prov),columns))
        if prov-reqd:
            if not extrasok:
                raise ValueError("Extra columns for table '{}': {}. Required columns are: '{}'"
                                .format(tablename,list(prov-reqd),columns))
            t = t[columns]
        return t
    
    # NOTE: the instance methods below still expect 'ds_name', 'table_name',
    # 'data' and 'file_name' attributes rather than the Table metadata
    # ('dsname', 'tablename', 'filename') defined above; DataSet itself is
    # never instantiated, so they are only reached by the untested cells
    # after 'To Here' below.
    def write(self,ds_name=None,precision=None,index=False,prefix=None,makedir=False):
        if ds_name is None:
            ds_name = self.ds_name
        dirname = self.ROOT + '/' + ds_name + '.d'
        if makedir and not os.path.exists(dirname):
            os.mkdir(dirname)
        if prefix is not None:
            dirname = dirname + '/' + prefix
            if makedir and not os.path.exists(dirname):
                os.mkdir(dirname)
        self.file_name = file_name = dirname + '/' + self.table_name + '.csv'
        float_format = None
        if precision is not None:
            float_format = '%.{:d}g'.format(precision)
        self.data.to_csv(file_name,index=index,float_format=float_format)
        return file_name
        
    def basename(self,file_name=None):
        if file_name is None:
            file_name = self.file_name
        return os.path.basename(file_name)
    
    def signature(self):
        file_name = self.file_name
        return (self.table_name,file_name,signature(file_name))
    
    def __len__(self):
        return len(self.data)
    
def signature(file_name):
    with open(file_name,mode='rb') as f:
        m = hashlib.sha256(f.read())
    return m.hexdigest()
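
To make the path conventions concrete: with the defaults above, get_table resolves a table name to ROOT/DSNAME.d/[prefix/]tablename.csv, and signature() returns the SHA-256 digest of a file's bytes so a data set can be fingerprinted. A small sketch, assuming the 'frame-6' directory used in the tests below exists under data/:

DataSet.set_source('frame-6')                      # detects dstype='dir' if data/frame-6.d exists
print(DataSet._file_name('nodes'))                 # data/frame-6.d/nodes.csv
print(DataSet._file_name('nodes', prefix='out'))   # data/frame-6.d/out/nodes.csv
print(signature(DataSet._file_name('nodes')))      # hex digest of the csv file contents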

In [4]:
##test:
DataSet.DSNAME = 'frame-6'
t = DataSet.get_table('nodes')
t


Out[4]:
NODEID X Y Z
0 A 0 0 5000
1 B 0 4000 5000
2 C 8000 4000 5000
3 D 8000 0 5000

In [5]:
##test:
type(t)


Out[5]:
__main__.Table

In [6]:
##test:
t.loc[:,['X','Y']] /= 3.
t


Out[6]:
NODEID X Y Z
0 A 0.000000 0.000000 5000
1 B 0.000000 1333.333333 5000
2 C 2666.666667 1333.333333 5000
3 D 2666.666667 0.000000 5000

In [7]:
##test:
type(t)


Out[7]:
__main__.Table

In [8]:
##test:
vars(t)


Out[8]:
{'_data': BlockManager
 Items: Index([u'NODEID', u'X', u'Y', u'Z'], dtype='object')
 Axis 1: RangeIndex(start=0, stop=4, step=1)
 IntBlock: slice(3, 4, 1), 1 x 4, dtype: int64
 ObjectBlock: slice(0, 1, 1), 1 x 4, dtype: object
 FloatBlock: slice(1, 2, 1), 1 x 4, dtype: float64
 FloatBlock: slice(2, 3, 1), 1 x 4, dtype: float64,
 '_iloc': <pandas.core.indexing._iLocIndexer at 0x7fde07bca3d0>,
 '_item_cache': {},
 '_loc': <pandas.core.indexing._LocIndexer at 0x7fde07bcad90>,
 'dsname': 'frame-6',
 'filename': 'data/frame-6.d/nodes.csv',
 'is_copy': None,
 'tablename': 'nodes'}

In [9]:
##test:
DataSet.get_table('nodes',columns=['NODEID','Y','X'])


Out[9]:
NODEID Y X
0 A 0 0
1 B 4000 0
2 C 4000 8000
3 D 0 8000

In [10]:
##test:
try:
    t = DataSet.get_table('nodes',columns=['NODEID','Y','X'],extrasok=False)
except Exception as e:
    print('***',e)
    t = None
t


*** Extra columns for table 'nodes': ['Z']. Required columns are: '['NODEID', 'Y', 'X']'

In [11]:
##test:
try:
    t = DataSet.get_table('nodes',columns=['NODEID','Y','X','C','D'])
except Exception as e:
    print('***',e)
    t = None
t


*** Columns missing for table 'nodes': ['C', 'D']. Required columns are: ['NODEID', 'Y', 'X', 'C', 'D']

In [12]:
##test:
try:
    t = DataSet.get_table('nodesxxx',columns=['NODEID','Y','X'],extrasok=False)
except Exception as e:
    print('***',e)
    t = None
t


*** Table 'nodesxxx' does not exist.

In [13]:
##test:
try:
    t = DataSet.get_table('nodesxxx',columns=['NODEID','Y','X'],extrasok=False,optional=True)
except Exception as e:
    print('***',e)
    t = None
t


Out[13]:
NODEID Y X

In [14]:
@register_cell_magic('Table')
def cell_table(line,cell):
    mo = re.match(r'\s*(\S+)\s*$',line)
    if not mo:
        raise ValueError('Usage: %%Table tablename')
    tablename = mo.group(1)
    DataSet.DSTYPE = 'cell'
    DataSet.set_cell(tablename,cell)
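
The magic itself is thin: it checks that the line holds exactly one table name, flips the source type to 'cell', and stores the raw cell text; get_table later parses that text with pandas.read_csv through a StringIO stream. The same effect is available without IPython, e.g. from a plain script (a sketch):

csv_text = "NODEID,X,Y,Z\nA,0,0,5001\nB,0,4000,5002\n"
DataSet.DSTYPE = 'cell'
DataSet.set_cell('nodes', csv_text)
t = DataSet.get_table('nodes')     # parsed from the stored text, not from disk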

In [15]:
%%Table nodes
NODEID,X,Y,Z
A,0,0,5001
B,0,4000,5002
C,8000,4000,5003
D,8000,0,5004

In [16]:
##test:
DataSet.DSTYPE


Out[16]:
'cell'

In [17]:
##test:
DataSet.CELLDATA


Out[17]:
{u'nodes': u'NODEID,X,Y,Z\nA,0,0,5001\nB,0,4000,5002\nC,8000,4000,5003\nD,8000,0,5004'}

In [18]:
##test:
t = DataSet.get_table('nodes',columns=['NODEID','Y','Z'])
t


Out[18]:
NODEID Y Z
0 A 0 5001
1 B 4000 5002
2 C 4000 5003
3 D 0 5004

In [19]:
##test:
DataSet.set_data('nodesxx',t)
tt = DataSet.get_table('nodesxx',columns=['NODEID','Y','Z'])
tt


Out[19]:
NODEID Y Z
0 A 0 5001
1 B 4000 5002
2 C 4000 5003
3 D 0 5004

In [22]:
##test:
tt.tablename


Out[22]:
'nodes'

In [21]:
##test:
vars(tt)


Out[21]:
{'_data': BlockManager
 Items: Index([u'NODEID', u'Y', u'Z'], dtype='object')
 Axis 1: RangeIndex(start=0, stop=4, step=1)
 ObjectBlock: slice(0, 1, 1), 1 x 4, dtype: object
 IntBlock: slice(1, 3, 1), 2 x 4, dtype: int64,
 '_iloc': <pandas.core.indexing._iLocIndexer at 0x7fde07bcaf50>,
 '_item_cache': {},
 'dsname': 'frame-6',
 'is_copy': <weakref at 0x7fde07bea6d8; dead>,
 'tablename': 'nodes'}

To Here


In [ ]:
##test:
t.write(precision=7,prefix='out',makedir=True)
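
As noted in the class above, write() still expects the older ds_name/table_name/data attributes, so this call is left untested here. Spelled out as plain pandas calls, what write(precision=7, prefix='out', makedir=True) is meant to do is roughly this (a sketch; paths assume ROOT='data' and the 'frame-6' data set):

import os
dirname = 'data/frame-6.d/out'
if not os.path.exists(dirname):
    os.makedirs(dirname)                          # makedir=True
t.to_csv(dirname + '/nodes.csv', index=False,
         float_format='%.7g')                     # precision=7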

In [ ]:
##test:
t.signature()

In [ ]:
##test:
vars(t)

In [ ]:
##test:
t.read()

In [ ]:
##test:
vars(t)
