Data Input Tables and Sources

Module to simplify handling of input and output tables (as .csv text and/or files).

There are two main classes defined here:

  • class Table - this is a slightly augmented subclass of a pandas DataFrame; augmented to retain the name of the table and the source of the data.
  • class DataSource - this class manages the various possible data sources to provide a unified API for programs that need the data. The most important method, '.read_table()', returns a single named Table. Currently, it tries the following sources in order, returning the first one it finds:
    1. a table provided directly by an application via the '.set_table()' method.
    2. a table provided as CSV data directly in a cell using the '%%Table' cell magic.
    3. a table in a CSV file with path name: 'root/dsname.d/tablename.csv'
      • root is the root directory of all data sets, default 'data'.
      • dsname is a data set name specified when an analysis is started by instantiating Frame2D.
      • tablename is the name of the table.

In [1]:
from salib import extend
import pandas as pd
import os, os.path
try:
    from StringIO import StringIO
except ImportError:
    from io import StringIO
import hashlib
from IPython.core.magic import register_cell_magic
import re

class Table


In [2]:
class Table(pd.DataFrame):
    
    """A Table is just like a pandas DataFrame except that it has
    a table name, a data set name, and a file name - the latter two describing
    the source of the data."""
    
    _internal_names = pd.DataFrame._internal_names + ['filename','tablename']
    _internal_names_set = set(_internal_names)

    _metadata = ['dsname']
            
    def __init__(self,*args,**kwargs):
        dsname = kwargs.pop('dsname',None)
        tablename = kwargs.pop('tablename',None)
        filename = kwargs.pop('filename',None)
        super(self.__class__,self).__init__(*args,**kwargs)
        if dsname is not None:
            self.dsname = dsname
        if tablename is not None:
            self.tablename = tablename
        if filename is not None:
            self.filename = filename
        
    @property
    def _constructor(self):
        return self.__class__

In [3]:
##test:
t = Table(data=[(10,20.,'a'),(11,22.,'b'),(12,23.,'c')],
          columns=['I','F','S'],tablename='Test',dsname='Notebook')
t


Out[3]:
I F S
0 10 20.0 a
1 11 22.0 b
2 12 23.0 c

In [4]:
##test:
t.dtypes


Out[4]:
I      int64
F    float64
S     object
dtype: object

In [5]:
##test:
t.tablename, t.dsname


Out[5]:
('Test', 'Notebook')

In [6]:
##test:
t2 = t[['S','I']]
t2


Out[6]:
S I
0 a 10
1 b 11
2 c 12

In [7]:
##test:
hasattr(t2,'tablename'), hasattr(t2,'dsname')


Out[7]:
(False, True)

In [8]:
##test:
t2.dsname


Out[8]:
'Notebook'

In [9]:
##test:
t = pd.DataFrame(data=[(10,20.,'a'),(11,22.,'b'),(12,23.,'c')],columns=['I','F','S'])
u = Table(data=t,dsname='foo',copy=False)
u


Out[9]:
I F S
0 10 20.0 a
1 11 22.0 b
2 12 23.0 c

In [10]:
##test:
u['F'] *= 3
u


Out[10]:
I F S
0 10 60.0 a
1 11 66.0 b
2 12 69.0 c

In [11]:
##test:
t


Out[11]:
I F S
0 10 60.0 a
1 11 66.0 b
2 12 69.0 c

In [12]:
##test:
u.dsname


Out[12]:
'foo'

class DataSource

Class to unify the source of tables. For now, this assumes that a table:

  • has been specified directly via '.set_table', or
  • has been provided in CSV form using the cell magic '%%Table', or
  • is in a CSV file available in a directory '<root>/xyz.d', where 'xyz' is the 'data set name'.

Eventually, we will have a way of archiving sets of files in .zip files.


In [13]:
class DataSource(object):
    
    ROOT = 'data'
    DSNAME = None     # default data set name
    DSTYPE = 'dir'    # someday we will allow 'zip' for zip archives
    #DSTYPE = 'cell'  # for CSV data provided via %%Table cell magic
    #DSTYPE = 'data'  # for dataframe data provided directly
    CELLDATA = {}     # csv text from %%Table magic cells, indexed by table name
    TABLES = {}       # dataframes directly provided by client, indexed by table name
    
    DATASOURCE = None # the one and only data source
    
    def __init__(self):
        cls = self.__class__
        if cls.DATASOURCE is not None:
            raise ValueError("Can only create one instance of class '{}'".format(cls.__name__))
        self.root = cls.ROOT
        self.dsname = cls.DSNAME
        self.prefix = None
        self.dstype = cls.DSTYPE
        self.celldata = cls.CELLDATA
        self.tables = cls.TABLES
        cls.DATASOURCE = self

In [14]:
##test:
d = DataSource()
vars(d)


Out[14]:
{'celldata': {},
 'dsname': None,
 'dstype': 'dir',
 'prefix': None,
 'root': 'data',
 'tables': {}}

In [15]:
##test:
try:
    d2 = DataSource()
except Exception as e:
    print('*'*5,e)
    d2 = None
d2


***** Can only create one instance of class 'DataSource'

In [16]:
@extend
class DataSource:
    
    @classmethod
    def set_root(cls,newroot):
        self = cls.DATASOURCE
        if not os.path.exists(newroot):
            raise ValueError("Root '{}' does not exist.".format(newroot))
        self.root = newroot

    @classmethod
    def set_source(cls,dsname,dstype=None):
        self = cls.DATASOURCE
        if dsname is not None:
            if dstype is None:
                dirname = self.root + '/' + dsname + '.d'
                if os.path.exists(dirname):
                    dstype = 'dir'
                else:
                    dstype = 'unknown'
            if dstype not in ['dir','cell','data']:
                raise ValueError("dstype '{}' is invalid.".format(dstype))
        self.dsname = dsname
        self.dstype = dstype
        self.celldata = {}
        self.tables = {}
        
    @classmethod
    def set_table(cls,tablename,table):
        self = cls.DATASOURCE
        self.tables[tablename] = table
        if tablename in self.celldata:
            del self.celldata[tablename]
    
    @classmethod
    def set_celldata(cls,tablename,celltext):
        self = cls.DATASOURCE
        self.celldata[tablename] = celltext
        if tablename in self.tables:
            del self.tables[tablename]
    
    def _file_name(self,tablename,prefix=None):
        n = tablename
        if prefix:
            n = prefix + '/' + tablename
        return self.root + '/' + self.dsname + '.d/' + n + '.csv'

In [17]:
##test:
DataSource.DATASOURCE = None
ds = DataSource()
vars(ds)


Out[17]:
{'celldata': {},
 'dsname': None,
 'dstype': 'dir',
 'prefix': None,
 'root': 'data',
 'tables': {}}

In [18]:
##test:
try:
    DataSource.set_root('foo')
except Exception as e:
    print('*'*5,e)
vars(ds)


***** Root 'foo' does not exist.
Out[18]:
{'celldata': {},
 'dsname': None,
 'dstype': 'dir',
 'prefix': None,
 'root': 'data',
 'tables': {}}

In [19]:
##test:
DataSource.set_root('img')
vars(ds)


Out[19]:
{'celldata': {},
 'dsname': None,
 'dstype': 'dir',
 'prefix': None,
 'root': 'img',
 'tables': {}}

In [20]:
##test:
DataSource.set_root('data')

In [21]:
##test:
DataSource.set_source('frame-1')
vars(ds)


Out[21]:
{'celldata': {},
 'dsname': 'frame-1',
 'dstype': 'dir',
 'prefix': None,
 'root': 'data',
 'tables': {}}

In [22]:
##test:
DataSource.set_table('joints',[dict(NODEID='A',X=10,Y=20),dict(NODEID='B',Y=20,X=30)])
vars(ds)


Out[22]:
{'celldata': {},
 'dsname': 'frame-1',
 'dstype': 'dir',
 'prefix': None,
 'root': 'data',
 'tables': {'joints': [{'NODEID': 'A', 'X': 10, 'Y': 20},
   {'NODEID': 'B', 'X': 30, 'Y': 20}]}}

In [23]:
##test:
DataSource.set_celldata('joints','NODEID,X,Y\nA,10,20\nB,30,20')
vars(ds)


Out[23]:
{'celldata': {'joints': 'NODEID,X,Y\nA,10,20\nB,30,20'},
 'dsname': 'frame-1',
 'dstype': 'dir',
 'prefix': None,
 'root': 'data',
 'tables': {}}

In [24]:
##test:
ds._file_name('joints')


Out[24]:
'data/frame-1.d/joints.csv'

In [25]:
##test:
ds._file_name('joints',prefix='lcase1')


Out[25]:
'data/frame-1.d/lcase1/joints.csv'

Reading Tables


In [26]:
@extend
class DataSource:
    
    @classmethod
    def read_table(cls,tablename,optional=False,prefix=None,columns=None,extrasok=True):
        self = cls.DATASOURCE
        stream = None
        filename = None
        t = None
        
        def _chk(t,columns=columns):
            if columns is None:
                return t
            prov = set(t.columns)
            reqd = set(columns)
            if reqd-prov:
                raise ValueError("Columns missing for table '{}': {}. Required columns are: {}"
                                 .format(tablename,list(reqd-prov),columns))
            if prov-reqd:
                if not extrasok:
                    raise ValueError("Extra columns for table '{}': {}. Required columns are: '{}'"
                                    .format(tablename,list(prov-reqd),columns))
                t = t[columns]
            return t            
            
        if tablename in self.tables:
            return _chk(self.tables[tablename])

        if tablename in self.celldata:
            stream = StringIO(self.celldata[tablename])
        else:
            if self.dsname is not None:
                filename = self._file_name(tablename,prefix=prefix)
                if os.path.exists(filename):
                    stream = open(filename,'r')
        if stream is None:
            if optional:
                d = pd.DataFrame(columns=columns)
            else:
                raise ValueError("Table '{}' does not exist.".format(tablename))
        else:
            d = pd.read_csv(stream,index_col=None,skipinitialspace=True)
        t = Table(d,dsname=self.dsname,tablename=tablename,filename=filename)
        return _chk(t)

In [27]:
##test:
DataSource.set_source('frame-6')
t = DataSource.read_table('nodes')
t


Out[27]:
NODEID X Y Z
0 A 0 0 5000
1 B 0 4000 5000
2 C 8000 4000 5000
3 D 8000 0 5000

In [28]:
##test:
type(t)


Out[28]:
__main__.Table

In [29]:
##test:
len(t)


Out[29]:
4

In [30]:
##test:
t[['X','Y']] /= 3.
t


Out[30]:
NODEID X Y Z
0 A 0.000000 0.000000 5000
1 B 0.000000 1333.333333 5000
2 C 2666.666667 1333.333333 5000
3 D 2666.666667 0.000000 5000

In [31]:
##test:
vars(t)


Out[31]:
{'_data': BlockManager
 Items: Index(['NODEID', 'X', 'Y', 'Z'], dtype='object')
 Axis 1: RangeIndex(start=0, stop=4, step=1)
 IntBlock: slice(3, 4, 1), 1 x 4, dtype: int64
 ObjectBlock: slice(0, 1, 1), 1 x 4, dtype: object
 FloatBlock: slice(1, 2, 1), 1 x 4, dtype: float64
 FloatBlock: slice(2, 3, 1), 1 x 4, dtype: float64,
 '_iloc': <pandas.core.indexing._iLocIndexer at 0x7f35e27a56a0>,
 '_item_cache': {},
 '_loc': <pandas.core.indexing._LocIndexer at 0x7f35e27a59e8>,
 'dsname': 'frame-6',
 'filename': 'data/frame-6.d/nodes.csv',
 'is_copy': None,
 'tablename': 'nodes'}

In [32]:
##test:
DataSource.read_table('nodes',columns=['NODEID','Y','X'])


Out[32]:
NODEID Y X
0 A 0 0
1 B 4000 0
2 C 4000 8000
3 D 0 8000

In [33]:
##test:
try:
    t = DataSource.read_table('nodes',columns=['NODEID','Y','X'],extrasok=False)
except Exception as e:
    print('***',e)
    t = None
t


*** Extra columns for table 'nodes': ['Z']. Required columns are: '['NODEID', 'Y', 'X']'

In [34]:
##test:
try:
    t = DataSource.read_table('nodes',columns=['NODEID','Y','X','C','D'])
except Exception as e:
    print('***',e)
    t = None
t


*** Columns missing for table 'nodes': ['C', 'D']. Required columns are: ['NODEID', 'Y', 'X', 'C', 'D']

In [35]:
##test:
try:
    t = DataSource.read_table('nodesxxx',columns=['NODEID','Y','X'],extrasok=False)
except Exception as e:
    print('***',e)
    t = None
t


*** Table 'nodesxxx' does not exist.

In [36]:
##test:
try:
    t = DataSource.read_table('nodesxxx',columns=['NODEID','Y','X'],extrasok=False,optional=True)
except Exception as e:
    print('***',e)
    t = None
t


Out[36]:
NODEID Y X

In [37]:
@register_cell_magic('Table')
def cell_table(line,celltext):
    mo = re.match(r'\s*(\S+)\s*$',line)
    if not mo:
        raise ValueError('Usage: %%Table tablename')
    tablename = mo.group(1)
    global DataSource
    DataSource.set_celldata(tablename,celltext)

In [38]:
%%Table nodes
NODEID,X,Y,Z
A,0.,0.,50001
B,0,4000,50002
C,8000,4000,50003
D,8000,0,50004

In [39]:
##test:
t2 = DataSource.read_table('nodes')
t2


Out[39]:
NODEID X Y Z
0 A 0.0 0.0 50001
1 B 0.0 4000.0 50002
2 C 8000.0 4000.0 50003
3 D 8000.0 0.0 50004

In [40]:
##test:
DataSource.set_table('nodes',t2+t2)
t3 = DataSource.read_table('nodes')
t3


Out[40]:
NODEID X Y Z
0 AA 0.0 0.0 100002
1 BB 0.0 8000.0 100004
2 CC 16000.0 8000.0 100006
3 DD 16000.0 0.0 100008

In [41]:
##test:
vars(t2)


Out[41]:
{'_data': BlockManager
 Items: Index(['NODEID', 'X', 'Y', 'Z'], dtype='object')
 Axis 1: RangeIndex(start=0, stop=4, step=1)
 FloatBlock: slice(1, 3, 1), 2 x 4, dtype: float64
 IntBlock: slice(3, 4, 1), 1 x 4, dtype: int64
 ObjectBlock: slice(0, 1, 1), 1 x 4, dtype: object,
 '_iloc': <pandas.core.indexing._iLocIndexer at 0x7f35e27a5208>,
 '_item_cache': {},
 'dsname': 'frame-6',
 'is_copy': None,
 'tablename': 'nodes'}

Writing Tables


In [42]:
@extend
class DataSource:
    
    @classmethod
    def write_table(cls,table,root=None,dsname=None,tablename=None,prefix=None,precision=None,index=False,makedir=False):
        self = cls.DATASOURCE
        if root is None:
            root = self.root
        if dsname is None:
            dsname = self.dsname
        if tablename is None:
            tablename = table.tablename
        dirname = root + '/' + dsname + '.d'
        if makedir and not os.path.exists(dirname):
            os.mkdir(dirname)
        if prefix is not None:
            dirname = dirname + '/' + prefix
            if makedir and not os.path.exists(dirname):
                os.mkdir(dirname)
                
        table.tablename = tablename
        table.dsname = dsname
        table.filename = filename = dirname + '/' + tablename + '.csv'
        
        float_format = None
        if precision is not None:
            float_format = '%.{:d}g'.format(precision)
        table.to_csv(filename,index=index,float_format=float_format)
        return filename

In [43]:
@extend
class Table:
    
    def signature(self):
        filename = self.filename
        if os.path.exists(filename):
            return (self.tablename,self.filename,signature(filename))
        raise ValueError("Table {}: filename: {} - does not exist.".format(self.tablename,self.filename))
    
def signature(filename):
    f = open(filename,mode='rb')
    m = hashlib.sha256(f.read())
    f.close()
    return m.hexdigest()

In [44]:
DataSource.DATASOURCE = None
__ds__ = DataSource()

In [45]:
%%Table nodes
NODEID,X,Y,Z
A,0.,0.,6002.
B,0,4000,7003
C,8000,4000,8004
D,8000,0,9005

In [46]:
##test:
t = DataSource.read_table('nodes')
t


Out[46]:
NODEID X Y Z
0 A 0.0 0.0 6002.0
1 B 0.0 4000.0 7003.0
2 C 8000.0 4000.0 8004.0
3 D 8000.0 0.0 9005.0

In [47]:
##test:
t[['X','Z']] /= 3
t


Out[47]:
NODEID X Y Z
0 A 0.000000 0.0 2000.666667
1 B 0.000000 4000.0 2334.333333
2 C 2666.666667 4000.0 2668.000000
3 D 2666.666667 0.0 3001.666667

In [48]:
##test:
vars(t)


Out[48]:
{'_data': BlockManager
 Items: Index(['NODEID', 'X', 'Y', 'Z'], dtype='object')
 Axis 1: RangeIndex(start=0, stop=4, step=1)
 FloatBlock: slice(1, 4, 1), 3 x 4, dtype: float64
 ObjectBlock: slice(0, 1, 1), 1 x 4, dtype: object,
 '_iloc': <pandas.core.indexing._iLocIndexer at 0x7f35e2a41c50>,
 '_item_cache': {},
 '_loc': <pandas.core.indexing._LocIndexer at 0x7f35e2796748>,
 'is_copy': None,
 'tablename': 'nodes'}

In [49]:
##test:
try:
    DataSource.write_table(t,dsname='test',prefix='pfx',tablename='nodes2')
except Exception as e:
    print('*'*5,e)


***** [Errno 2] No such file or directory: 'data/test.d/pfx/nodes2.csv'

In [50]:
##test:
%rm -rf data/test.d
try:
    r = DataSource.write_table(t,dsname='test',prefix='pfx',tablename='nodes2',makedir=True,precision=15)
except Exception as e:
    print('*'*5,e)
r


Out[50]:
'data/test.d/pfx/nodes2.csv'

In [51]:
##test:
%cat data/test.d/pfx/nodes2.csv


NODEID,X,Y,Z
A,0,0,2000.66666666667
B,0,4000,2334.33333333333
C,2666.66666666667,4000,2668
D,2666.66666666667,0,3001.66666666667

In [52]:
##test:
t.signature()


Out[52]:
('nodes2',
 'data/test.d/pfx/nodes2.csv',
 '112e27f0a17e1ca9a580922707bb2fcf63146625708d2f2ec5e6f879b176e2b7')

In [53]:
##test:
%rm -rf data/test.d

In [54]:
##test:
vars(t)


Out[54]:
{'_data': BlockManager
 Items: Index(['NODEID', 'X', 'Y', 'Z'], dtype='object')
 Axis 1: RangeIndex(start=0, stop=4, step=1)
 FloatBlock: slice(1, 4, 1), 3 x 4, dtype: float64
 ObjectBlock: slice(0, 1, 1), 1 x 4, dtype: object,
 '_iloc': <pandas.core.indexing._iLocIndexer at 0x7f35e2a41c50>,
 '_item_cache': {},
 '_loc': <pandas.core.indexing._LocIndexer at 0x7f35e2796748>,
 'dsname': 'test',
 'filename': 'data/test.d/pfx/nodes2.csv',
 'is_copy': None,
 'tablename': 'nodes2'}

In [55]:
DataSource.DATASOURCE = None
__ds__ = DataSource()

In [ ]: