Module to simplify handling of input and output tables (as .csv text and/or files).
There are two main classes defined here:

Table - a pandas DataFrame subclass that additionally carries a table name, a
data set name, and a source file name (the latter two describing where the
data came from).

DataSource - a singleton that unifies the sources of input tables. Its
'.read_table()' method returns a single named Table. Currently, it tries the
following sources in order, returning the first one it finds: a dataframe
provided directly via the '.set_table()' method, csv text provided via the
'%%Table' cell magic, or the file '<root>/<dsname>.d/<tablename>.csv' (the
default root is 'data').
In [1]:
from salib import extend
import pandas as pd
import os, os.path
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
import hashlib
from IPython.core.magic import register_cell_magic
import re
In [2]:
class Table(pd.DataFrame):
    """A Table is just like a pandas DataFrame except that it has
    a table name, a data set name, and a file name - the latter two describing
    the source of the data."""

    # Names pandas must treat as plain instance attributes rather than
    # trying to resolve them as column names.
    _internal_names = pd.DataFrame._internal_names + ['filename','tablename']
    _internal_names_set = set(_internal_names)
    # Metadata propagated (via __finalize__) to frames derived from this one.
    _metadata = ['dsname']

    def __init__(self,*args,**kwargs):
        """Accept the usual DataFrame arguments plus optional keyword
        arguments 'dsname', 'tablename' and 'filename'."""
        # Pop our extra keyword arguments before delegating to DataFrame,
        # which would otherwise reject them.
        dsname = kwargs.pop('dsname',None)
        tablename = kwargs.pop('tablename',None)
        filename = kwargs.pop('filename',None)
        # BUGFIX: name the class explicitly instead of using self.__class__.
        # With super(self.__class__,self), any subclass of Table (which
        # _constructor below deliberately creates) recurses infinitely,
        # because self.__class__ names the subclass, not Table.
        super(Table,self).__init__(*args,**kwargs)
        if dsname is not None:
            self.dsname = dsname
        if tablename is not None:
            self.tablename = tablename
        if filename is not None:
            self.filename = filename

    @property
    def _constructor(self):
        # Slicing and similar operations return an instance of the same
        # (sub)class, so _metadata is preserved on derived frames.
        return self.__class__
In [3]:
##test:
t = Table(data=[(10,20.,'a'),(11,22.,'b'),(12,23.,'c')],
columns=['I','F','S'],tablename='Test',dsname='Notebook')
t
Out[3]:
In [4]:
##test:
t.dtypes
Out[4]:
In [5]:
##test:
t.tablename, t.dsname
Out[5]:
In [6]:
##test:
t2 = t[['S','I']]
t2
Out[6]:
In [7]:
##test:
hasattr(t2,'tablename'), hasattr(t2,'dsname')
Out[7]:
In [8]:
##test:
t2.dsname
Out[8]:
In [9]:
##test:
t = pd.DataFrame(data=[(10,20.,'a'),(11,22.,'b'),(12,23.,'c')],columns=['I','F','S'])
u = Table(data=t,dsname='foo',copy=False)
u
Out[9]:
In [10]:
##test:
u['F'] *= 3
u
Out[10]:
In [11]:
##test:
t
Out[11]:
In [12]:
##test:
u.dsname
Out[12]:
Class to unify the source of tables. For now, this assumes that a table comes
from one of: a dataframe provided directly via '.set_table', csv text provided
via the '%%Table' cell magic, or a .csv file in the directory '<root>/xyz.d',
where 'xyz' is the 'data set name'. Eventually, we will have a way of
archiving sets of files in .zip files.
In [13]:
class DataSource(object):
    """Singleton registry describing where input tables come from.

    Only one instance may ever exist; it is held in the class
    attribute DATASOURCE."""

    ROOT = 'data'
    DSNAME = None    # default data set name
    DSTYPE = 'dir'   # someday we will allow 'zip' for zip archives
    #DSTYPE = 'cell' # for CSV data provided via %%Table cell magic
    #DSTYPE = 'data' # for dataframe data provided directly
    CELLDATA = {}    # csv text from %%Table magic cells, indexed by table name
    TABLES = {}      # dataframes directly provided by client, indexed by table name
    DATASOURCE = None # the one and only data source

    def __init__(self):
        """Initialize instance state from the class-level defaults and
        register this instance as the one-and-only data source."""
        klass = type(self)
        # Enforce the singleton: constructing a second instance is an error.
        if klass.DATASOURCE is not None:
            raise ValueError("Can only create one instance of class '{}'".format(klass.__name__))
        self.root = klass.ROOT
        self.dsname = klass.DSNAME
        self.dstype = klass.DSTYPE
        self.prefix = None
        self.celldata = klass.CELLDATA
        self.tables = klass.TABLES
        klass.DATASOURCE = self
In [14]:
##test:
d = DataSource()
vars(d)
Out[14]:
In [15]:
##test:
try:
d2 = DataSource()
except Exception as e:
print('*'*5,e)
d2 = None
d2
In [16]:
@extend
class DataSource:

    @classmethod
    def set_root(cls,newroot):
        """Change the root directory under which data set directories live."""
        self = cls.DATASOURCE
        if not os.path.exists(newroot):
            raise ValueError("Root '{}' does not exist.".format(newroot))
        self.root = newroot

    @classmethod
    def set_source(cls,dsname,dstype=None):
        """Select the current data set name; infer the type when not given."""
        self = cls.DATASOURCE
        if dsname is not None and dstype is None:
            # a directory <root>/<dsname>.d implies a 'dir' type source
            dirname = self.root + '/' + dsname + '.d'
            dstype = 'dir' if os.path.exists(dirname) else 'unknown'
        if dstype not in ['dir','cell','data']:
            raise ValueError("dstype '{}' is invalid.".format(dstype))
        self.dsname = dsname
        self.dstype = dstype
        # selecting a new source discards all per-table overrides
        self.celldata = {}
        self.tables = {}

    @classmethod
    def set_table(cls,tablename,table):
        """Provide a table directly; shadows any cell data of the same name."""
        self = cls.DATASOURCE
        self.tables[tablename] = table
        self.celldata.pop(tablename,None)

    @classmethod
    def set_celldata(cls,tablename,celltext):
        """Provide csv text for a table; shadows any direct table of the same name."""
        self = cls.DATASOURCE
        self.celldata[tablename] = celltext
        self.tables.pop(tablename,None)

    def _file_name(self,tablename,prefix=None):
        """Return the .csv path for tablename within the current data set."""
        part = tablename if not prefix else prefix + '/' + tablename
        return self.root + '/' + self.dsname + '.d/' + part + '.csv'
In [17]:
##test:
DataSource.DATASOURCE = None
ds = DataSource()
vars(ds)
Out[17]:
In [18]:
##test:
try:
DataSource.set_root('foo')
except Exception as e:
print('*'*5,e)
vars(ds)
Out[18]:
In [19]:
##test:
DataSource.set_root('img')
vars(ds)
Out[19]:
In [20]:
##test:
DataSource.set_root('data')
In [21]:
##test:
DataSource.set_source('frame-1')
vars(ds)
Out[21]:
In [22]:
##test:
DataSource.set_table('joints',[dict(NODEID='A',X=10,Y=20),dict(NODEID='B',Y=20,X=30)])
vars(ds)
Out[22]:
In [23]:
##test:
DataSource.set_celldata('joints','NODEID,X,Y\nA,10,20\nB,30,20')
vars(ds)
Out[23]:
In [24]:
##test:
ds._file_name('joints')
Out[24]:
In [25]:
##test:
ds._file_name('joints',prefix='lcase1')
Out[25]:
In [26]:
@extend
class DataSource:

    @classmethod
    def read_table(cls,tablename,optional=False,prefix=None,columns=None,extrasok=True):
        """Return the named Table, trying the sources in order: a table
        provided via set_table(), csv text provided via set_celldata()
        (i.e. the %%Table cell magic), then the csv file in the current
        data set directory.

        optional: if True, a missing table yields an empty table instead
                  of raising ValueError.
        prefix:   optional sub-directory under the data set directory.
        columns:  if given, these columns are required; extra columns are
                  either dropped (selecting 'columns' in order) or, when
                  extrasok is False, cause a ValueError.
        """
        self = cls.DATASOURCE

        def _chk(t,columns=columns):
            # Validate provided columns against the required ones.
            if columns is None:
                return t
            prov = set(t.columns)
            reqd = set(columns)
            if reqd-prov:
                raise ValueError("Columns missing for table '{}': {}. Required columns are: {}"
                                 .format(tablename,list(reqd-prov),columns))
            if prov-reqd:
                if not extrasok:
                    raise ValueError("Extra columns for table '{}': {}. Required columns are: '{}'"
                                     .format(tablename,list(prov-reqd),columns))
                # drop the extras by selecting exactly the required columns
                t = t[columns]
            return t

        # 1) a table provided directly by the client
        if tablename in self.tables:
            return _chk(self.tables[tablename])

        stream = None
        filename = None
        # 2) csv text from a %%Table cell
        if tablename in self.celldata:
            stream = StringIO(self.celldata[tablename])
        # 3) a csv file in the data set directory
        elif self.dsname is not None:
            filename = self._file_name(tablename,prefix=prefix)
            if os.path.exists(filename):
                stream = open(filename,'r')
        if stream is None:
            if optional:
                d = pd.DataFrame(columns=columns)
            else:
                raise ValueError("Table '{}' does not exist.".format(tablename))
        else:
            try:
                d = pd.read_csv(stream,index_col=None,skipinitialspace=True)
            finally:
                # BUGFIX: the original never closed the file handle opened
                # above, leaking it (StringIO close is harmless).
                stream.close()
        t = Table(d,dsname=self.dsname,tablename=tablename,filename=filename)
        return _chk(t)
In [27]:
##test:
DataSource.set_source('frame-6')
t = DataSource.read_table('nodes')
t
Out[27]:
In [28]:
##test:
type(t)
Out[28]:
In [29]:
##test:
len(t)
Out[29]:
In [30]:
##test:
t[['X','Y']] /= 3.
t
Out[30]:
In [31]:
##test:
vars(t)
Out[31]:
In [32]:
##test:
DataSource.read_table('nodes',columns=['NODEID','Y','X'])
Out[32]:
In [33]:
##test:
try:
t = DataSource.read_table('nodes',columns=['NODEID','Y','X'],extrasok=False)
except Exception as e:
print('***',e)
t = None
t
In [34]:
##test:
try:
t = DataSource.read_table('nodes',columns=['NODEID','Y','X','C','D'])
except Exception as e:
print('***',e)
t = None
t
In [35]:
##test:
try:
t = DataSource.read_table('nodesxxx',columns=['NODEID','Y','X'],extrasok=False)
except Exception as e:
print('***',e)
t = None
t
In [36]:
##test:
try:
t = DataSource.read_table('nodesxxx',columns=['NODEID','Y','X'],extrasok=False,optional=True)
except Exception as e:
print('***',e)
t = None
t
Out[36]:
In [37]:
@register_cell_magic('Table')
def cell_table(line,celltext):
    """Implement the '%%Table tablename' cell magic: record the cell body
    as csv text for the named table in the global DataSource."""
    match = re.match(r'\s*(\S+)\s*$',line)
    if match is None:
        raise ValueError('Usage: %%Table tablename')
    global DataSource
    DataSource.set_celldata(match.group(1),celltext)
In [38]:
%%Table nodes
NODEID,X,Y,Z
A,0.,0.,50001
B,0,4000,50002
C,8000,4000,50003
D,8000,0,50004
In [39]:
##test:
t2 = DataSource.read_table('nodes')
t2
Out[39]:
In [40]:
##test:
DataSource.set_table('nodes',t2+t2)
t3 = DataSource.read_table('nodes')
t3
Out[40]:
In [41]:
##test:
vars(t2)
Out[41]:
In [42]:
@extend
class DataSource:

    @classmethod
    def write_table(cls,table,root=None,dsname=None,tablename=None,prefix=None,precision=None,index=False,makedir=False):
        """Write table as csv to <root>/<dsname>.d[/<prefix>]/<tablename>.csv,
        creating directories when makedir is True, and record the table's
        provenance (tablename, dsname, filename) on the table itself.
        Returns the file name written."""
        self = cls.DATASOURCE
        # fall back to the data source's current settings / table metadata
        root = self.root if root is None else root
        dsname = self.dsname if dsname is None else dsname
        tablename = table.tablename if tablename is None else tablename
        dirname = root + '/' + dsname + '.d'
        if makedir and not os.path.exists(dirname):
            os.mkdir(dirname)
        if prefix is not None:
            dirname = dirname + '/' + prefix
            if makedir and not os.path.exists(dirname):
                os.mkdir(dirname)
        filename = dirname + '/' + tablename + '.csv'
        # record where this table now lives
        table.tablename = tablename
        table.dsname = dsname
        table.filename = filename
        float_format = '%.{:d}g'.format(precision) if precision is not None else None
        table.to_csv(filename,index=index,float_format=float_format)
        return filename
In [43]:
@extend
class Table:

    def signature(self):
        """Return (tablename, filename, sha256-hex-digest) identifying the
        file this table was read from. Raises ValueError if the file no
        longer exists."""
        filename = self.filename
        if not os.path.exists(filename):
            raise ValueError("Table {}: filename: {} - does not exist.".format(self.tablename,self.filename))
        # the module-level signature() function computes the file's hash
        return (self.tablename,self.filename,signature(filename))
def signature(filename):
    """Return the SHA-256 hex digest of the contents of file 'filename'."""
    # 'with' guarantees the handle is closed even if read() raises
    # (the original open()/close() pair leaked the handle on error).
    with open(filename,mode='rb') as f:
        return hashlib.sha256(f.read()).hexdigest()
In [44]:
DataSource.DATASOURCE = None
__ds__ = DataSource()
In [45]:
%%Table nodes
NODEID,X,Y,Z
A,0.,0.,6002.
B,0,4000,7003
C,8000,4000,8004
D,8000,0,9005
In [46]:
##test:
t = DataSource.read_table('nodes')
t
Out[46]:
In [47]:
##test:
t[['X','Z']] /= 3
t
Out[47]:
In [48]:
##test:
vars(t)
Out[48]:
In [49]:
##test:
try:
DataSource.write_table(t,dsname='test',prefix='pfx',tablename='nodes2')
except Exception as e:
print('*'*5,e)
In [50]:
##test:
%rm -rf data/test.d
try:
r = DataSource.write_table(t,dsname='test',prefix='pfx',tablename='nodes2',makedir=True,precision=15)
except Exception as e:
print('*'*5,e)
r
Out[50]:
In [51]:
##test:
%cat data/test.d/pfx/nodes2.csv
In [52]:
##test:
t.signature()
Out[52]:
In [53]:
##test:
%rm -rf data/test.d
In [54]:
##test:
vars(t)
Out[54]:
In [55]:
DataSource.DATASOURCE = None
__ds__ = DataSource()
In [ ]: