Module to simplify handling of input and output tables (as .csv text and/or files).
There are two main classes defined here:

Table - a pandas DataFrame subclass that additionally carries a table name, a
data set name, and a source file name (the latter two describing where the
data came from).

DataSource - a singleton that unifies the sources of input tables. Its
'.read_table()' method returns a single named Table. Currently, it tries the
following sources in order, returning the first one it finds: a dataframe
provided directly via the '.set_table()' method, csv text provided via the
'%%Table' cell magic, or the file '<root>/<dsname>.d/<tablename>.csv' (the
default root is 'data').
In [1]:
from salib import extend
import pandas as pd
import os, os.path
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
import hashlib
from IPython.core.magic import register_cell_magic
import re
In [2]:
class Table(pd.DataFrame):
    """A Table is just like a pandas DataFrame except that it has
    a table name, a data set name, and a file name - the latter two describing
    the source of the data."""

    # Names pandas must treat as plain instance attributes rather than
    # trying to resolve them as column names.
    _internal_names = pd.DataFrame._internal_names + ['filename','tablename']
    _internal_names_set = set(_internal_names)
    # Metadata propagated (via __finalize__) to frames derived from this one.
    _metadata = ['dsname']

    def __init__(self,*args,**kwargs):
        """Accept the usual DataFrame arguments plus optional keyword
        arguments 'dsname', 'tablename' and 'filename'."""
        # Pop our extra keyword arguments before delegating to DataFrame,
        # which would otherwise reject them.
        dsname = kwargs.pop('dsname',None)
        tablename = kwargs.pop('tablename',None)
        filename = kwargs.pop('filename',None)
        # BUGFIX: name the class explicitly instead of using self.__class__.
        # With super(self.__class__,self), any subclass of Table (which
        # _constructor below deliberately creates) recurses infinitely,
        # because self.__class__ names the subclass, not Table.
        super(Table,self).__init__(*args,**kwargs)
        if dsname is not None:
            self.dsname = dsname
        if tablename is not None:
            self.tablename = tablename
        if filename is not None:
            self.filename = filename

    @property
    def _constructor(self):
        # Slicing and similar operations return an instance of the same
        # (sub)class, so _metadata is preserved on derived frames.
        return self.__class__
In [3]:
##test:
t = Table(data=[(10,20.,'a'),(11,22.,'b'),(12,23.,'c')],
columns=['I','F','S'],tablename='Test',dsname='Notebook')
t
Out[3]:
In [4]:
##test:
t.dtypes
Out[4]:
In [5]:
##test:
t.tablename, t.dsname
Out[5]:
In [6]:
##test:
t2 = t[['S','I']]
t2
Out[6]:
In [7]:
##test:
hasattr(t2,'tablename'), hasattr(t2,'dsname')
Out[7]:
In [8]:
##test:
t2.dsname
Out[8]:
In [9]:
##test:
t = pd.DataFrame(data=[(10,20.,'a'),(11,22.,'b'),(12,23.,'c')],columns=['I','F','S'])
u = Table(data=t,dsname='foo',copy=False)
u
Out[9]:
In [10]:
##test:
u['F'] *= 3
u
Out[10]:
In [11]:
##test:
t
Out[11]:
In [12]:
##test:
u.dsname
Out[12]:
Class to unify the source of tables. For now, this assumes that a table comes
from one of: a dataframe provided directly via '.set_table', csv text provided
via the '%%Table' cell magic, or a .csv file in the directory '<root>/xyz.d',
where 'xyz' is the 'data set name'. Eventually, we will have a way of
archiving sets of files in .zip files.
In [13]:
class DataSource(object):
    """Singleton registry describing where input tables come from.

    Only one instance may ever exist; it is held in the class
    attribute DATASOURCE."""

    ROOT = 'data'
    DSNAME = None    # default data set name
    DSTYPE = 'dir'   # someday we will allow 'zip' for zip archives
    #DSTYPE = 'cell' # for CSV data provided via %%Table cell magic
    #DSTYPE = 'data' # for dataframe data provided directly
    CELLDATA = {}    # csv text from %%Table magic cells, indexed by table name
    TABLES = {}      # dataframes directly provided by client, indexed by table name
    DATASOURCE = None # the one and only data source

    def __init__(self):
        """Initialize instance state from the class-level defaults and
        register this instance as the one-and-only data source."""
        klass = type(self)
        # Enforce the singleton: constructing a second instance is an error.
        if klass.DATASOURCE is not None:
            raise ValueError("Can only create one instance of class '{}'".format(klass.__name__))
        self.root = klass.ROOT
        self.dsname = klass.DSNAME
        self.dstype = klass.DSTYPE
        self.prefix = None
        self.celldata = klass.CELLDATA
        self.tables = klass.TABLES
        klass.DATASOURCE = self
In [14]:
##test:
d = DataSource()
vars(d)
Out[14]:
In [15]:
##test:
try:
d2 = DataSource()
except Exception as e:
print('*'*5,e)
d2 = None
d2
In [16]:
@extend
class DataSource:

    @classmethod
    def set_root(cls,newroot):
        """Change the root directory under which data set directories live."""
        self = cls.DATASOURCE
        if not os.path.exists(newroot):
            raise ValueError("Root '{}' does not exist.".format(newroot))
        self.root = newroot

    @classmethod
    def set_source(cls,dsname,dstype=None):
        """Select the current data set name; infer the type when not given."""
        self = cls.DATASOURCE
        if dsname is not None and dstype is None:
            # a directory <root>/<dsname>.d implies a 'dir' type source
            dirname = self.root + '/' + dsname + '.d'
            dstype = 'dir' if os.path.exists(dirname) else 'unknown'
        if dstype not in ['dir','cell','data']:
            raise ValueError("dstype '{}' is invalid.".format(dstype))
        self.dsname = dsname
        self.dstype = dstype
        # selecting a new source discards all per-table overrides
        self.celldata = {}
        self.tables = {}

    @classmethod
    def set_table(cls,tablename,table):
        """Provide a table directly; shadows any cell data of the same name."""
        self = cls.DATASOURCE
        self.tables[tablename] = table
        self.celldata.pop(tablename,None)

    @classmethod
    def set_celldata(cls,tablename,celltext):
        """Provide csv text for a table; shadows any direct table of the same name."""
        self = cls.DATASOURCE
        self.celldata[tablename] = celltext
        self.tables.pop(tablename,None)

    def _file_name(self,tablename,prefix=None):
        """Return the .csv path for tablename within the current data set."""
        part = tablename if not prefix else prefix + '/' + tablename
        return self.root + '/' + self.dsname + '.d/' + part + '.csv'
In [17]:
##test:
DataSource.DATASOURCE = None
ds = DataSource()
vars(ds)
Out[17]:
In [18]:
##test:
try:
DataSource.set_root('foo')
except Exception as e:
print('*'*5,e)
vars(ds)
Out[18]:
In [19]:
##test:
DataSource.set_root('img')
vars(ds)
Out[19]:
In [20]:
##test:
DataSource.set_root('data')
In [21]:
##test:
DataSource.set_source('frame-1')
vars(ds)
Out[21]:
In [22]:
##test:
DataSource.set_table('joints',[dict(NODEID='A',X=10,Y=20),dict(NODEID='B',Y=20,X=30)])
vars(ds)
Out[22]:
In [23]:
##test:
DataSource.set_celldata('joints','NODEID,X,Y\nA,10,20\nB,30,20')
vars(ds)
Out[23]:
In [24]:
##test:
ds._file_name('joints')
Out[24]:
In [25]:
##test:
ds._file_name('joints',prefix='lcase1')
Out[25]:
In [26]:
@extend
class DataSource:

    @classmethod
    def read_table(cls,tablename,optional=False,prefix=None,columns=None,extrasok=True):
        """Return the named Table, trying the sources in order: a table
        provided via set_table(), csv text provided via set_celldata()
        (i.e. the %%Table cell magic), then the csv file in the current
        data set directory.

        optional: if True, a missing table yields an empty table instead
                  of raising ValueError.
        prefix:   optional sub-directory under the data set directory.
        columns:  if given, these columns are required; extra columns are
                  either dropped (selecting 'columns' in order) or, when
                  extrasok is False, cause a ValueError.
        """
        self = cls.DATASOURCE

        def _chk(t,columns=columns):
            # Validate provided columns against the required ones.
            if columns is None:
                return t
            prov = set(t.columns)
            reqd = set(columns)
            if reqd-prov:
                raise ValueError("Columns missing for table '{}': {}. Required columns are: {}"
                                 .format(tablename,list(reqd-prov),columns))
            if prov-reqd:
                if not extrasok:
                    raise ValueError("Extra columns for table '{}': {}. Required columns are: '{}'"
                                     .format(tablename,list(prov-reqd),columns))
                # drop the extras by selecting exactly the required columns
                t = t[columns]
            return t

        # 1) a table provided directly by the client
        if tablename in self.tables:
            return _chk(self.tables[tablename])

        stream = None
        filename = None
        # 2) csv text from a %%Table cell
        if tablename in self.celldata:
            stream = StringIO(self.celldata[tablename])
        # 3) a csv file in the data set directory
        elif self.dsname is not None:
            filename = self._file_name(tablename,prefix=prefix)
            if os.path.exists(filename):
                stream = open(filename,'r')
        if stream is None:
            if optional:
                d = pd.DataFrame(columns=columns)
            else:
                raise ValueError("Table '{}' does not exist.".format(tablename))
        else:
            try:
                d = pd.read_csv(stream,index_col=None,skipinitialspace=True)
            finally:
                # BUGFIX: the original never closed the file handle opened
                # above, leaking it (StringIO close is harmless).
                stream.close()
        t = Table(d,dsname=self.dsname,tablename=tablename,filename=filename)
        return _chk(t)
In [27]:
##test:
DataSource.set_source('frame-6')
t = DataSource.read_table('nodes')
t
Out[27]:
In [28]:
##test:
type(t)
Out[28]:
In [29]:
##test:
len(t)
Out[29]:
In [30]:
##test:
t[['X','Y']] /= 3.
t
Out[30]:
In [31]:
##test:
vars(t)
Out[31]:
In [32]:
##test:
DataSource.read_table('nodes',columns=['NODEID','Y','X'])
Out[32]:
In [33]:
##test:
try:
t = DataSource.read_table('nodes',columns=['NODEID','Y','X'],extrasok=False)
except Exception as e:
print('***',e)
t = None
t
In [34]:
##test:
try:
t = DataSource.read_table('nodes',columns=['NODEID','Y','X','C','D'])
except Exception as e:
print('***',e)
t = None
t
In [35]:
##test:
try:
t = DataSource.read_table('nodesxxx',columns=['NODEID','Y','X'],extrasok=False)
except Exception as e:
print('***',e)
t = None
t
In [36]:
##test:
try:
t = DataSource.read_table('nodesxxx',columns=['NODEID','Y','X'],extrasok=False,optional=True)
except Exception as e:
print('***',e)
t = None
t
Out[36]:
In [37]:
@register_cell_magic('Table')
def cell_table(line,celltext):
    """Implement the '%%Table tablename' cell magic: record the cell body
    as csv text for the named table in the global DataSource."""
    match = re.match(r'\s*(\S+)\s*$',line)
    if match is None:
        raise ValueError('Usage: %%Table tablename')
    global DataSource
    DataSource.set_celldata(match.group(1),celltext)
In [38]:
%%Table nodes
NODEID,X,Y,Z
A,0.,0.,50001
B,0,4000,50002
C,8000,4000,50003
D,8000,0,50004
In [39]:
##test:
t2 = DataSource.read_table('nodes')
t2
Out[39]:
In [40]:
##test:
DataSource.set_table('nodes',t2+t2)
t3 = DataSource.read_table('nodes')
t3
Out[40]:
In [41]:
##test:
vars(t2)
Out[41]:
In [42]:
@extend
class DataSource:

    @classmethod
    def write_table(cls,table,root=None,dsname=None,tablename=None,prefix=None,precision=None,index=False,makedir=False):
        """Write table as csv to <root>/<dsname>.d[/<prefix>]/<tablename>.csv,
        creating directories when makedir is True, and record the table's
        provenance (tablename, dsname, filename) on the table itself.
        Returns the file name written."""
        self = cls.DATASOURCE
        # fall back to the data source's current settings / table metadata
        root = self.root if root is None else root
        dsname = self.dsname if dsname is None else dsname
        tablename = table.tablename if tablename is None else tablename
        dirname = root + '/' + dsname + '.d'
        if makedir and not os.path.exists(dirname):
            os.mkdir(dirname)
        if prefix is not None:
            dirname = dirname + '/' + prefix
            if makedir and not os.path.exists(dirname):
                os.mkdir(dirname)
        filename = dirname + '/' + tablename + '.csv'
        # record where this table now lives
        table.tablename = tablename
        table.dsname = dsname
        table.filename = filename
        float_format = '%.{:d}g'.format(precision) if precision is not None else None
        table.to_csv(filename,index=index,float_format=float_format)
        return filename
In [43]:
@extend
class Table:

    def signature(self):
        """Return (tablename, filename, sha256-hex-digest) identifying the
        file this table was read from. Raises ValueError if the file no
        longer exists."""
        filename = self.filename
        if not os.path.exists(filename):
            raise ValueError("Table {}: filename: {} - does not exist.".format(self.tablename,self.filename))
        # the module-level signature() function computes the file's hash
        return (self.tablename,self.filename,signature(filename))
def signature(filename):
    """Return the SHA-256 hex digest of the contents of file 'filename'."""
    # 'with' guarantees the handle is closed even if read() raises
    # (the original open()/close() pair leaked the handle on error).
    with open(filename,mode='rb') as f:
        return hashlib.sha256(f.read()).hexdigest()
In [44]:
DataSource.DATASOURCE = None
__ds__ = DataSource()
In [45]:
%%Table nodes
NODEID,X,Y,Z
A,0.,0.,6002.
B,0,4000,7003
C,8000,4000,8004
D,8000,0,9005
In [46]:
##test:
t = DataSource.read_table('nodes')
t
Out[46]:
In [47]:
##test:
t[['X','Z']] /= 3
t
Out[47]:
In [48]:
##test:
vars(t)
Out[48]:
In [49]:
##test:
try:
DataSource.write_table(t,dsname='test',prefix='pfx',tablename='nodes2')
except Exception as e:
print('*'*5,e)
In [50]:
##test:
%rm -rf data/test.d
try:
r = DataSource.write_table(t,dsname='test',prefix='pfx',tablename='nodes2',makedir=True,precision=15)
except Exception as e:
print('*'*5,e)
r
Out[50]:
In [51]:
##test:
%cat data/test.d/pfx/nodes2.csv
In [52]:
##test:
t.signature()
Out[52]:
In [53]:
##test:
%rm -rf data/test.d
In [54]:
##test:
vars(t)
Out[54]:
In [55]:
DataSource.DATASOURCE = None
__ds__ = DataSource()
In [ ]: