`klepto`

Unified Persistent Storage to Memory, Disk, or Database

<h2 align="center" style="color:gray">Mike McKerns</h2>

<h3 align="center" style="color:gray">California Institute of Technology; the UQ Foundation</h3>

motivating example: N-dimensional search, with storage of the interpolated function


In [2]:
%%file global_search.py
"""
Uses Ensemble Solvers to provide 'pseudo-global' search.
"""
# the ensemble solvers
from mystic.solvers import BuckshotSolver, LatticeSolver

# the local solvers
from mystic.solvers import PowellDirectionalSolver, NelderMeadSimplexSolver

# cost function
from mystic.models import griewangk as model

# if available, use a multiprocessing worker pool
try:
    from pathos.multiprocessing import ProcessingPool as Pool
except ImportError:
    from mystic.python import PythonSerial as Pool

# tools
from mystic.termination import VTR, ChangeOverGeneration as COG
from mystic.termination import NormalizedChangeOverGeneration as NCOG
from mystic.monitors import LoggingMonitor, VerboseMonitor, Monitor

def solve(cost, bounds, sprayer, seeker, stop, npts, _map, id=None, disp=True):
    """Run the ensemble solver 'sprayer', seeded with 'npts' instances of
    the local solver 'seeker', against the 'cost' function.

    Each generation is logged by a LoggingMonitor; the solved ensemble
    solver instance is returned for inspection of results.
    """
    # log every generation of the search
    monitor = LoggingMonitor(1)

    # problem dimensionality and box constraints from the bounds pairs
    ndim = len(bounds)
    lower, upper = zip(*bounds)

    # build, configure, and launch the ensemble solver
    ensemble = sprayer(ndim, npts)
    ensemble.id = id
    ensemble.SetNestedSolver(seeker)
    ensemble.SetMapper(_map)
    ensemble.SetGenerationMonitor(monitor)
    ensemble.SetStrictRanges(min=lower, max=upper)
    ensemble.Solve(cost, stop, disp=disp)
    return ensemble

def print_results(solver, tol=8):
    """Print each member solver's (best solution, best energy) pair,
    with every float rounded to 'tol' digits."""
    for member in solver._allSolvers:
        solution = tuple(round(coord, tol) for coord in member.bestSolution)
        energy = round(member.bestEnergy, tol)
        print (solution, energy)

def memoize(solver, archive, tol=1):
    """Record each member solver's best solution -> best energy mapping
    in 'archive' via a klepto cache (keys rounded to 'tol' digits).

    Returns the cached function; use its .info() for hit/miss statistics.
    """
    from klepto import inf_cache
    from klepto.keymaps import keymap

    # keys are built from the positional args only ('**' and 'out' ignored)
    cached = inf_cache(tol=tol, ignore=('**','out'), cache=archive, keymap=keymap())

    @cached
    def memo(*args, **kwds):
        # the stored value is whatever was supplied as 'out'
        return kwds['out']

    for member in solver._allSolvers:
        memo(*tuple(member.bestSolution), out=float(member.bestEnergy))
    return memo


if __name__ == '__main__':
    # required for multiprocessing support on Windows (no-op elsewhere)
    try:
        from pathos.helpers import freeze_support
        freeze_support()
    except ImportError:
        pass

    tol = 8   # rounding precision
    ndim = 2  # model dimensionality
    bounds = ndim * [(-9.5,9.5)] # griewangk
    npts = 25 # number of solvers
    sprayer = BuckshotSolver
    seeker = PowellDirectionalSolver
    stop = NCOG(1e-4)
    mpmap = Pool().map
    disp = False # print optimization summary
    retry = 1 #9 # max consecutive iteration retries without a cache 'miss'

    # persistent on-disk archive, keyed by model name and dimensionality
    from klepto.archives import dir_archive
    ar_name = '__%s_%sD_cache__' % (model.im_class.__name__,ndim)
    archive = dir_archive(ar_name, serialized=True, cached=False)

    count = 0 if retry else -1 #XXX: 'rerun' much shorter... unless clear
    sid = 0  # keep track of which solver is which across multiple runs
    # outer loop: tolerate up to 'retry' consecutive runs that add nothing new
    while retry > count: # stop after retry consecutive no new results
        _size = -1
        size = osize = len(archive) #XXX: better compare 'size' or 'len(vals)'?
        # inner loop: rerun the ensemble until the archive stops growing
        while size > _size: # stop if no new results
            solver = solve(model, bounds, sprayer, seeker, stop, npts, mpmap, sid, disp)
            sid += len(solver._allSolvers)
#           print_results(solver, tol=tol)
            _size = size
            info = memoize(solver, archive, tol=1).info()
            print info
            size = info.size
        if size == osize: count = count + 1
        else: count = 0

    #NOTE: len(size) = # of dirs; len(vals) = # unique dirs
    # report the best value found and archive statistics
    vals = archive.values()
    _min = min(vals)
    _nmin = len([v for v in vals if round(v, tol) == round(_min, tol)])
    print "min: %s (count=%s)" % (_min, _nmin)
    print "pts: %s (values=%s, size=%s)" % (len(vals), len(set(vals)), len(archive))


Writing global_search.py

In [4]:
%%file global_search_max.py
"""
Uses Ensemble Solvers to provide 'pseudo-global' search.
"""
# the ensemble solvers
from mystic.solvers import BuckshotSolver, LatticeSolver

# the local solvers
from mystic.solvers import PowellDirectionalSolver, NelderMeadSimplexSolver

# cost function
from mystic.models import griewangk as model
# negate the model so that minimizing 'invmodel' maximizes 'model'
# (a def instead of a lambda assignment, per PEP 8; also gives a useful __name__)
def invmodel(*args, **kwds):
    return -model(*args, **kwds)

# if available, use a multiprocessing worker pool
try:
    from pathos.multiprocessing import ProcessingPool as Pool
except ImportError:
    from mystic.python import PythonSerial as Pool

# tools
from mystic.termination import VTR, ChangeOverGeneration as COG
from mystic.termination import NormalizedChangeOverGeneration as NCOG
from mystic.monitors import LoggingMonitor, VerboseMonitor, Monitor

def solve(cost, bounds, sprayer, seeker, stop, npts, _map, id=None, disp=True):
    """Run the ensemble solver 'sprayer' (seeded with 'npts' instances of
    the local solver 'seeker') on 'cost', logging each generation to
    'inv.txt'. Returns the solved ensemble solver instance.
    """
    # configure monitor (logs to 'inv.txt' for the inverted-model search)
    stepmon = LoggingMonitor(1, filename='inv.txt')

    # get dimensions
    ndim = len(bounds)
    _min, _max = zip(*bounds)

    # solve with ensemble solver
    solver = sprayer(ndim, npts)
    solver.id = id
    solver.SetNestedSolver(seeker)
    solver.SetMapper(_map)
    solver.SetGenerationMonitor(stepmon)
    solver.SetStrictRanges(min=_min, max=_max)
    solver.Solve(cost, stop, disp=disp)
    return solver

def print_results(solver, tol=8):
    """Print (best solution, best energy), rounded to 'tol' digits,
    for each member solver in the ensemble."""
    for _solver in solver._allSolvers:
        bestSol = tuple(round(s, tol) for s in _solver.bestSolution)
        bestRes = round(_solver.bestEnergy, tol)
        print (bestSol, bestRes)

def memoize(solver, archive, tol=1):
    """Store each member solver's best solution -> best energy pair in
    'archive' via a klepto cache, rounding keys to 'tol' digits.

    Returns the cached function; use its .info() for hit/miss statistics.
    """
    from klepto import inf_cache
    from klepto.keymaps import keymap

    km = keymap()
    # ignore '**' and 'out' when forming keys: the key is the solution point
    ca = inf_cache(tol=tol, ignore=('**','out'), cache=archive, keymap=km)

    @ca
    def memo(*args, **kwds):
        # the cached value is simply the supplied 'out' keyword
        return kwds['out']

    for _solver in solver._allSolvers:
        bestSol = tuple(_solver.bestSolution)
        bestRes = float(_solver.bestEnergy)
        memo(*bestSol, out=bestRes)
    return memo


if __name__ == '__main__':
    # required for multiprocessing support on Windows (no-op elsewhere)
    try:
        from pathos.helpers import freeze_support
        freeze_support()
    except ImportError:
        pass

    tol = 8   # rounding precision
    ndim = 2  # model dimensionality
    bounds = ndim * [(-9.5,9.5)] # griewangk
    npts = 25 # number of solvers
    sprayer = BuckshotSolver
    seeker = PowellDirectionalSolver
    stop = NCOG(1e-4)
    mpmap = Pool().map
    disp = False # print optimization summary
    retry = 1 #9 # max consecutive iteration retries without a cache 'miss'

    # persistent on-disk archive for the inverted (maximization) search
    from klepto.archives import dir_archive
    ar_name = '__%s_%sD_invcache__' % (model.im_class.__name__,ndim)
    archive = dir_archive(ar_name, serialized=True, cached=False)

    count = 0 if retry else -1 #XXX: 'rerun' much shorter... unless clear
    sid = 0  # keep track of which solver is which across multiple runs
    # outer loop: tolerate up to 'retry' consecutive runs that add nothing new
    while retry > count: # stop after retry consecutive no new results
        _size = -1
        size = osize = len(archive) #XXX: better compare 'size' or 'len(vals)'?
        # inner loop: rerun the ensemble until the archive stops growing
        while size > _size: # stop if no new results
            # minimize the negated model == maximize the model
            solver = solve(invmodel, bounds, sprayer, seeker, stop, npts, mpmap, sid, disp)
            sid += len(solver._allSolvers)
#           print_results(solver, tol=tol)
            _size = size
            info = memoize(solver, archive, tol=1).info()
            print info
            size = info.size
        if size == osize: count = count + 1
        else: count = 0

    #NOTE: len(size) = # of dirs; len(vals) = # unique dirs
    vals = archive.values()
    _min = min(vals)
    _nmin = len([v for v in vals if round(v, tol) == round(_min, tol)])
    # archived values are -f(x), so the model's maximum is -min(archived)
    print "max: %s (count=%s)" % (-_min, _nmin)
    print "pts: %s (values=%s, size=%s)" % (len(vals), len(set(vals)), len(archive))


Writing global_search_max.py

In [1]:
%%file interp.py
"""Build a smooth interpolated surrogate of the model from logged search data.

Reads solver trajectories from 'log.txt' (minimization) and 'inv.txt'
(maximization), fits a radial basis function interpolator to the merged
points, plots the interpolated surface against the data, and pickles the
interpolator into a klepto file archive.
"""
import sys
import numpy as np
from scipy.interpolate import Rbf, griddata
# http://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html
from mystic.munge import *
from mystic.models import griewangk as model
i,xy,z = logfile_reader('log.txt')
i,ixy,iz = logfile_reader('inv.txt')
#xy,z = ixy,iz
del i

############# tunable settings
shift = 0
scale = 0
N = 10000.
M = 200
args = {
'smooth': 0,
'function': 'thin_plate',
}
#############


# stack the logged points as (x, y, z) rows, then merge both runs
xyz = np.vstack((np.array(xy).T,z))
ixyz = np.vstack((np.array(ixy).T,iz))
ixyz[-1,:] = -ixyz[-1]  # undo the negation used by the maximization run
del ixy,iz
xyz = np.hstack((xyz, ixyz))

x = xyz.T[:,0]
y = xyz.T[:,1]
z = xyz.T[:,2]

#HACK: remove any duplicate points by adding noise
_x = x + np.random.normal(scale=1e-8, size=x.shape)
_y = y + np.random.normal(scale=1e-8, size=y.shape)

from mpl_toolkits.mplot3d import axes3d
import matplotlib.pyplot as plt
from matplotlib import cm

figure = plt.figure()
kwds = {'projection':'3d'}
ax = figure.gca(**kwds)
ax.autoscale(tight=True)

# downsample to roughly N points if there are more (Rbf scales poorly)
if len(z) > N:
    N = max(int(round(len(z)/N)),1)
    print "for speed, sampling {} down to {}".format(len(z),len(z)/N)
    x, _x, y, _y, z = x[::N], _x[::N], y[::N], _y[::N], z[::N]
#   ax.plot(x, y, z, 'ko', linewidth=2, markersize=4)
#   plt.show()
#   sys.exit(0)


# fit the radial basis function interpolator to the (noise-jittered) points
f = Rbf(_x, _y, z, **args)


# evaluate the interpolator on an M x M grid over the sampled range
M = complex('{}j'.format(M))
xgrid,ygrid = np.mgrid[x.min():x.max():M, y.min():y.max():M]
z_ = f(xgrid, ygrid)


# sanity check: compare data extrema against the interpolated surface
mz = np.argmin(z)
xx,yy = x[mz],y[mz]
print "min: {}; min@f: {}".format(z[mz], f(xx,yy))
mz = np.argmax(z)
xx,yy = x[mz],y[mz]
print "max: {}; max@f: {}".format(z[mz], f(xx,yy))


# scaling used by model plotter
if scale:
    if shift:
        z_ = np.asarray(z_)+shift
        z = np.asarray(z)+shift
    z_ = np.log(4*np.asarray(z_)*scale+1)+2
    z = np.log(4*np.asarray(z)*scale+1)+2


# draw the interpolated surface (wireframe) plus the raw data points
density = 9
d = max(11 - density, 1)
ax.plot_wireframe(xgrid, ygrid, z_, rstride=d, cstride=d)
#ax.plot_surface(xgrid, ygrid, z_, rstride=d, cstride=d, cmap=cm.jet, linewidth=0, antialiased=False)
ax.plot(x, y, z, 'ko', linewidth=2, markersize=4)

plt.show()
figure.savefig('griewangk.png')


# pickle the interpolator so it can later serve as a cheap model surrogate
import klepto
try:
    from klepto.archives import file_archive
    archive = file_archive('models.pkl', serialized=True, cached=False)
    archive[model.im_class.__name__.lower()] = f
except Exception:
    print "serialization failed"


Overwriting interp.py

In [2]:
%matplotlib inline

In [3]:
!python2.7 -c "from mystic.models import griewangk as model; print model.__doc__"


evaluates an N-dimensional Griewangk's function for a list of coeffs

f(x) = f_0(x) - f_1(x) + 1

Where:
f_0(x) = \sum_(i=0)^(N-1) x_(i)^(2) / 4000.
and:
f_1(x) = \prod_(i=0)^(N-1) \cos( x_i / (i+1)^(1/2) )

Inspect with mystic_model_plotter using::
    mystic.models.griewangk -b "-10:10:.1, -10:10:.1" -d -x 5

The minimum is f(x)=0.0 for x_i=0.0

In [4]:
import mystic
import mystic.models
mystic.model_plotter(mystic.models.griewangk, bounds="-10:10:.1, -10:10:.1", depth=True)



In [5]:
!python2.7 global_search.py


CacheInfo(hit=8, miss=17, load=0, maxsize=None, size=17)
CacheInfo(hit=22, miss=3, load=0, maxsize=None, size=20)
CacheInfo(hit=23, miss=2, load=0, maxsize=None, size=22)
CacheInfo(hit=25, miss=0, load=0, maxsize=None, size=22)
CacheInfo(hit=25, miss=0, load=0, maxsize=None, size=22)
min: 0.0 (count=1)
pts: 17 (values=7, size=22)

In [6]:
mystic.model_plotter(mystic.models.griewangk, 'log.txt', bounds="-10:10:.1, -10:10:.1")



In [7]:
!python2.7 global_search_max.py


CacheInfo(hit=10, miss=15, load=0, maxsize=None, size=15)
CacheInfo(hit=20, miss=5, load=0, maxsize=None, size=20)
CacheInfo(hit=23, miss=2, load=0, maxsize=None, size=22)
CacheInfo(hit=24, miss=1, load=0, maxsize=None, size=23)
CacheInfo(hit=25, miss=0, load=0, maxsize=None, size=23)
CacheInfo(hit=25, miss=0, load=0, maxsize=None, size=23)
max: 2.04197668684 (count=4)
pts: 18 (values=6, size=23)

In [8]:
!python2.7 interp.py


/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scipy/interpolate/rbf.py:135: RuntimeWarning: divide by zero encountered in log
  result = r**2 * log(r)
/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scipy/interpolate/rbf.py:135: RuntimeWarning: invalid value encountered in multiply
  result = r**2 * log(r)
min: 0.0; min@f: 3.17231751978e-08
max: 2.041977; max@f: 2.04197724257

In [9]:
from IPython.display import Image
Image(filename='griewangk.png')


Out[9]:

In [10]:
import klepto.archives as karch
models = karch.file_archive('models.pkl', serialized=True, cached=False)
griewangk = models['griewangk']
print griewangk(0,0)


-1.23982317746e-08

In [12]:
import os
os.chdir('branins')

In [8]:
!python2.7 interp.py


/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scipy/interpolate/rbf.py:135: RuntimeWarning: divide by zero encountered in log
  result = r**2 * log(r)
/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scipy/interpolate/rbf.py:135: RuntimeWarning: invalid value encountered in multiply
  result = r**2 * log(r)
min: 0.397887; min@f: 0.397674560547
max: 12005.217981; max@f: 12005.2183838

In [13]:
Image(filename='branins.png')


Out[13]:

In [14]:
os.chdir('../ackley')

In [10]:
!python2.7 interp.py


for speed, sampling 24169 down to 12084
/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scipy/interpolate/rbf.py:135: RuntimeWarning: divide by zero encountered in log
  result = r**2 * log(r)
/opt/local/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/scipy/interpolate/rbf.py:135: RuntimeWarning: invalid value encountered in multiply
  result = r**2 * log(r)
min: 0.0; min@f: 0.000259548425674
max: 19.3590300028; max@f: 19.3583301306

In [15]:
Image(filename='ackley.png')


Out[15]:

In [11]:
os.chdir('..')

dictionary interface to caching and archiving

basic use (direct access to archive)


In [1]:
from klepto.archives import sqltable_archive
d = sqltable_archive(cached=False)  # uncached: writes go straight to the table

#from klepto.archives import sql_archive
#d = sql_archive('mysql://user:pass@localhost/defaultdb', cached=False)

# values may be primitives or even callables (stored serialized)
d['1'] = 1
d['2'] = 2
d['max'] = max
squared = lambda x:x**2
d['squared'] = squared

# max(squared(2), 1)
print d['max'](d['squared'](d['2']), d['1'])


4

In [2]:
from klepto.archives import dir_archive

def check_numpy(archive):
    """Smoke-test that 'archive' round-trips numpy objects.

    Stores an array, a large arange, a float infinity, a callable, and a
    structured dtype, then asserts each reads back equal. Any mapping with
    __setitem__/__getitem__ works as 'archive'. Returns None; raises
    AssertionError on a mismatch.
    """
    import numpy as np
    d = archive
    x = np.array([1,2,3,4,5])
    y = np.arange(1000)
    # NOTE: 'np.int' (an alias for the builtin int) was deprecated in
    # numpy 1.20 and removed in 1.24; the builtin yields the same dtype
    t = np.dtype([('int',int),('float32',np.float32)])
    d['a'] = x
    d['b'] = y
    d['c'] = np.inf
    d['d'] = np.ptp
    d['e'] = t
    assert all(d['a'] == x)
    assert all(d['b'] == y)
    assert d['c'] == np.inf
    assert d['d'](x) == np.ptp(x)
    assert d['e'] == t
    return

# exercise each dir_archive configuration against the numpy round-trip test
archive = dir_archive(cached=False)
check_numpy(archive)

archive = dir_archive(cached=False,fast=True)  # 'fast' serialization variant
check_numpy(archive)

archive = dir_archive(cached=False,compression=3)  # compressed storage
check_numpy(archive)

archive = dir_archive(cached=False,memmode='r+')  # memmapped read/write mode
check_numpy(archive)

archive = dir_archive(cached=False,serialized=False)  # store without serialization
check_numpy(archive)

# remove the default archive directory created above
from pox import rmtree
rmtree('memo')

print "done"


done

alternate archive backends


In [3]:
import klepto.archives
print '\n'.join([i for i in dir(klepto.archives) if not i.startswith('_') and i.endswith('archive')])


dict_archive
dir_archive
file_archive
null_archive
sql_archive
sqltable_archive

using a cache layer


In [4]:
from klepto.archives import file_archive
original = file_archive('foo')
print "empty:", original

original['a'] = 1
original['b'] = 2
print "a,b in cache:", original
print "not in archive:", original.archive
original.dump()
print "a,b in archive:", original.archive


empty: file_archive('foo', {}, cached=True)
a,b in cache: file_archive('foo', {'a': 1, 'b': 2}, cached=True)
not in archive: file_archive('foo', {}, cached=False)
a,b in archive: file_archive('foo', {'a': 1, 'b': 2}, cached=False)

In [5]:
print original.keys()
print original.values()
print original.items()
original.setdefault('c',3)

print "a,b,c in cache:", original
print "c not in archive:", original.archive


['a', 'b']
[1, 2]
[('a', 1), ('b', 2)]
a,b,c in cache: file_archive('foo', {'a': 1, 'c': 3, 'b': 2}, cached=True)
c not in archive: file_archive('foo', {'a': 1, 'b': 2}, cached=False)

In [6]:
duplicate = file_archive('foo')
print "new empty:", duplicate
duplicate.load()
print "load from file:", duplicate

original.dump()
duplicate.load()
print "c from original:", duplicate


new empty: file_archive('foo', {}, cached=True)
load from file: file_archive('foo', {'a': 1, 'b': 2}, cached=True)
c from original: file_archive('foo', {'a': 1, 'c': 3, 'b': 2}, cached=True)

In [7]:
duplicate['d'] = 4
print "d in cache:", duplicate.archive

duplicate.sync() 
print "sync'd:", duplicate.archive

print "d in archive:", original.archive
print "d not in cache:", original

original.sync()
print "sync'd:", original


d in cache: file_archive('foo', {'a': 1, 'c': 3, 'b': 2}, cached=False)
sync'd: file_archive('foo', {'a': 1, 'c': 3, 'b': 2, 'd': 4}, cached=False)
d in archive: file_archive('foo', {'a': 1, 'c': 3, 'b': 2, 'd': 4}, cached=False)
d not in cache: file_archive('foo', {'a': 1, 'c': 3, 'b': 2}, cached=True)
sync'd: file_archive('foo', {'a': 1, 'c': 3, 'b': 2, 'd': 4}, cached=True)

In [8]:
print '\n'.join([i for i in dir(duplicate) if not i.startswith("_")])


archive
archived
clear
copy
drop
dump
fromkeys
get
has_key
items
iteritems
iterkeys
itervalues
keys
load
open
pop
popitem
setdefault
sync
update
values
viewitems
viewkeys
viewvalues

In [9]:
!rm -rf foo

decorator interface for memoization

basic use (caching, looking up stored values)


In [10]:
import dill
import klepto

# square function, memoized with klepto's LRU cache
@klepto.lru_cache()
def squared(x):
    return x**2

print squared(2)
print squared(4)
print squared(6)
print squared
print squared.__cache__()

_s = dill.loads(dill.dumps(squared))
assert _s.lookup(4) == 16
assert squared.__cache__() == _s.__cache__()


4
16
36
<function squared at 0x112bfb1b8>
null_archive({-8145078762942369576: 4, -8145078762938039476: 36, -8145078762935874426: 16}, cached=True)

In [11]:
from klepto.safe import inf_cache as memoized
from klepto.archives import sql_archive

# memoize 'add' into a fresh in-memory SQL archive
@memoized(cache=sql_archive())
def add(x,y):
    return x+y

add(1,2)
add(1,2)  # cache hit
add(1,3)
print add.info()


# redefine 'add', reusing the cache from the first definition:
# previously computed results are loaded rather than recomputed
@memoized(cache=add.__cache__())
def add(x,y):
    return x+y

add(1,2)
add(2,2)
add(1,3)
print "cache hit:", add.info().hit
print "cache miss:", add.info().miss


CacheInfo(hit=1, miss=2, load=0, maxsize=None, size=2)
cache hit: 2
cache miss: 1

masking inputs


In [12]:
from klepto import lru_cache as memoize
from klepto.keymaps import hashmap
hasher = hashmap(algorithm='md5')

class Adder(object):
    """A simple class with a memoized method"""

    # keys are md5-hashed; 'self' and all keyword args ('**') are ignored
    # when forming keys, so the cache is shared across instances and
    # keywords like 'debug' do not cause cache misses
    @memoize(keymap=hasher, ignore=('self','**'))
    def __call__(self, x, *args, **kwds):
        debug = kwds.get('debug', False)
        if debug:
            print ('debug:', x, args, kwds)
        return sum((x,)+args)
    add = __call__  # alias: instance.add(...) is the same memoized call


add = Adder()
assert add(2,0) == 2
assert add(2,0,z=4) == 2          # cached (ignore z)
assert add(2,0,debug=False) == 2  # cached (ignore debug)
assert add(1,2,debug=False) == 3
assert add(1,2,debug=True) == 3   # cached (ignore debug)
assert add(4) == 4
assert add(x=4) == 4              # cached

plus = Adder()
assert plus(4,debug=True) == 4    # cached (ignore debug)
assert plus(2,0,3) == 5
assert plus(2,0,4) == 6

info = add.__call__.info()
assert info.hit == 5
assert info.miss == 5
cache = add.__call__.__cache__()
assert sorted(cache.values()) == [2,3,4,5,6]

# careful... remember we have self as first argument
key = add.__call__.key(add,2,0)
assert cache[key] == add.__call__.__wrapped__(add,2,0)
assert cache[key] == add.__call__.lookup(add,2,0)

print "OK"


OK

In [13]:
print key
print cache[key]
print add.__call__.__wrapped__


cfd9809c56982c2baf98916f82b19a4e
2
<function __call__ at 0x112c08320>

In [14]:
from klepto import inf_cache

class _Foo(object):
    # memoized method; ignoring 'self' keys the cache on (x, y) only
    @inf_cache(ignore='self')
    def bar(self, x,y):
        return x+y

_fu = _Foo()
_fu.bar(1,2)
_fu.bar(2,2)
_fu.bar(2,3)
_fu.bar(1,2)
assert len(_fu.bar.__cache__()) == 3
assert _fu.bar.__cache__()[_fu.bar.key(_fu,1,2)] == 3
assert _fu.bar.lookup(_fu,1,2) == 3

print "OK"


OK

In [15]:
# function analogue of Adder.__call__: md5-hashed keys, with 'self'
# and all keyword arguments ignored during key generation
@memoize(keymap=hasher, ignore=('self','**'))
def _add(x, *args, **kwds):
    debug = kwds.get('debug', False)
    if debug:
        print ('debug:', x, args, kwds)
    return sum((x,)+args)

_add(2,0)
_add(2,0,z=4)
_add(2,0,debug=False)
_add(1,2,debug=False)
_add(1,2,debug=True)
_add(4)
_add(x=4)
_add(4,debug=True)
_add(2,0,3)
_add(2,0,4)

_cache =  _add.__cache__()
_func = _add.__wrapped__

# do a lookup
assert _add.lookup(2,0) == _func(2,0)

# generate the key, and do a look-up
key = _add.key(2,0)
assert _cache[key] == _func(2,0)

# look-up the key again, doing a little more work...
from klepto import keygen
lookup = keygen('self','**')(_func)
lookup.register(hasher)
key = lookup(2,0)
assert _cache[key] == _func(2,0)

# since we have the 'key lookup', let's play with lookup a bit
# check the inputs passed match the function signature
assert lookup.valid()
# actually call the function
assert lookup.call() == _func(2,0)

print "OK"


OK

In [16]:
print _add.lookup(2,0)
print _add.key(2,0)


2
cfd9809c56982c2baf98916f82b19a4e

function signature discovery and validation


In [17]:
from functools import partial
def foo(x,y,z,a=1,b=2):
    """Return the sum of the three required and two default arguments."""
    subtotal = x + y + z
    return subtotal + a + b

p = partial(foo, 0,1)

print p(1,2,3)
print p(1,2,3,4,b=5)


7
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-17-1c3b1706ea89> in <module>()
      6 
      7 print p(1,2,3)
----> 8 print p(1,2,3,4,b=5)

TypeError: foo() takes at most 5 arguments (7 given)

In [18]:
from klepto import validate
assert validate(p, 1,2,3) == None
validate(p, 1,2,3,4,b=5)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-18-a0f83e539d60> in <module>()
      1 from klepto import validate
      2 assert validate(p, 1,2,3) == None
----> 3 validate(p, 1,2,3,4,b=5)

/Users/mmckerns/lib/python2.7/site-packages/klepto-0.1.2.dev0-py2.7.egg/klepto/_inspect.pyc in validate(func, *args, **kwds)
    212     if var_args and not hasargs:
    213         var_kwds = set(kwds) - set(named)
--> 214         raise TypeError("%s() takes at most %d arguments (%d given)" % (func.__name__, len(named)+len(p_args), len(p_args)+len(args)+len(kwds)))
    215 
    216     # check any varkwds; FAIL if func doesn't take varkwds

TypeError: foo() takes at most 5 arguments (7 given)

In [19]:
from klepto import signature
print signature(p)

# takes args = (z,a,b), kwds = {a,y,b}
# cannot take (x,y) as kwds (fixed by the partial)
# no *args, **kwds used in the function


(('z', 'a', 'b'), {'a': 1, '!y': 1, 'b': 2, '!x': 0}, '', '')

In [20]:
def bar(x,y,z,a=1,b=2,*args):
    """Return x+y+z+a+b; extra positional args are accepted but ignored."""
    return sum((x, y, z, a, b))

p = partial(bar, 0,b=10)
print signature(p)

# a partial with a 'fixed' x, and positionally 'unsettable' b
# takes *args, but not **kwds


(('y', 'z', 'a', '!b'), {'a': 1, '!x': 0, 'b': 10}, 'args', '')

rounding


In [21]:
from klepto.rounding import *

@deep_round(tol=1)
def add(x,y):
    return x+y

result = add(2.54, 5.47)
assert result == 8.0

# rounds each float, regardless of depth in an object
result = add([2.54, 5.47],['x','y'])
assert result == [2.5, 5.5, 'x', 'y']

# rounds each float, regardless of depth in an object
result = add([2.54, 5.47],['x',[8.99, 'y']])
assert result == [2.5, 5.5, 'x', [9.0, 'y']]


@simple_round(tol=1)
def add(x,y):
    return x+y

result = add(2.54, 5.47)
assert result == 8.0

# does not round elements of iterables, only rounds at the top-level
result = add([2.54, 5.47],['x','y'])
assert result == [2.54, 5.4699999999999998, 'x', 'y']

# does not round elements of iterables, only rounds at the top-level
result = add([2.54, 5.47],['x',[8.99, 'y']])
assert result == [2.54, 5.4699999999999998, 'x', [8.9900000000000002, 'y']]

print "OK"


OK

exchange strategies


In [24]:
from klepto.archives import file_archive
from random import choice, seed

def _test_hits(algorithm, maxsize=20, keymap=None, rangelimit=5,
               tries=1000, archived=False, purge=False):
    """Exercise a klepto caching decorator with random lookups and
    return its CacheInfo (hit/miss/load/maxsize/size) statistics."""

    @algorithm(maxsize=maxsize, keymap=keymap, purge=purge)
    def f(x, y):
        return 3*x+y

    if archived:
        # back the in-memory cache with an uncached file archive
        f.archive(file_archive('cache.pkl',cached=False))

    # sample points: ints plus their float twins
    domain = list(range(rangelimit))
    domain += [float(i) for i in domain]

    for i in range(tries):
        r = f(choice(domain), choice(domain))
    f.dump()

    #print(f.info())
    return f.info()


import os
import sys
from klepto import *
    
caches = [rr_cache, mru_cache, lru_cache, lfu_cache, inf_cache, no_cache]

# clean-up
if os.path.exists('cache.pkl'): os.remove('cache.pkl')

print ("WITHOUT ARCHIVE")
for cache in caches:
    msg = cache.__name__ + ":"
    msg += "%s" % str(_test_hits(cache, maxsize=100, rangelimit=20, tries=100))
    print (msg)

print ("\nWITH ARCHIVE (purging)")
for cache in caches:
    msg = cache.__name__ + ":"
    msg += "%s" % str(_test_hits(cache, maxsize=100,
                                 rangelimit=20, tries=100, archived=True))
    print (msg)
    
print ("\nWITH ARCHIVE (non-purging)")
for cache in caches:
    msg = cache.__name__ + ":"
    msg += "%s" % str(_test_hits(cache, maxsize=50, purge=False,
                                 rangelimit=20, tries=100, archived=True))
    print (msg)


# clean-up
if os.path.exists('cache.pkl'): os.remove('cache.pkl')


WITHOUT ARCHIVE
rr_cache:CacheInfo(hit=12, miss=88, load=0, maxsize=100, size=88)
mru_cache:CacheInfo(hit=12, miss=88, load=0, maxsize=100, size=88)
lru_cache:CacheInfo(hit=13, miss=87, load=0, maxsize=100, size=87)
lfu_cache:CacheInfo(hit=9, miss=91, load=0, maxsize=100, size=91)
inf_cache:CacheInfo(hit=14, miss=86, load=0, maxsize=None, size=86)
no_cache:CacheInfo(hit=0, miss=100, load=0, maxsize=0, size=0)

WITH ARCHIVE (purging)
rr_cache:CacheInfo(hit=12, miss=88, load=0, maxsize=100, size=88)
mru_cache:CacheInfo(hit=12, miss=60, load=28, maxsize=100, size=88)
lru_cache:CacheInfo(hit=9, miss=51, load=40, maxsize=100, size=91)
lfu_cache:CacheInfo(hit=11, miss=39, load=50, maxsize=100, size=89)
inf_cache:CacheInfo(hit=13, miss=28, load=59, maxsize=None, size=87)
no_cache:CacheInfo(hit=0, miss=39, load=61, maxsize=0, size=0)

WITH ARCHIVE (non-purging)
rr_cache:CacheInfo(hit=7, miss=20, load=73, maxsize=50, size=50)
mru_cache:CacheInfo(hit=2, miss=18, load=80, maxsize=50, size=50)
lru_cache:CacheInfo(hit=6, miss=10, load=84, maxsize=50, size=50)
lfu_cache:CacheInfo(hit=11, miss=9, load=80, maxsize=50, size=49)
inf_cache:CacheInfo(hit=13, miss=8, load=79, maxsize=None, size=87)
no_cache:CacheInfo(hit=0, miss=9, load=91, maxsize=0, size=0)

encoding and hashing

cryptographic key generation and encoding


In [25]:
from klepto.crypto import *

assert string('1') == '1'
assert string('1', encoding='repr') == "'1'"

x = [1,2,3,'4',"'5'", min]
assert hash(x, 'sha1') == '3bdd73e79be4277dcb874d193b8dd08a46bc6885'
assert string(x) == '[1, 2, 3, \'4\', "\'5\'", <built-in function min>]'
assert string(x, encoding='repr') == '[1, 2, 3, \'4\', "\'5\'", <built-in function min>]'
assert string(x, encoding='utf_8') == '[1, 2, 3, \'4\', "\'5\'", <built-in function min>]'
assert string(x, encoding='unicode') == unicode('[1, 2, 3, \'4\', "\'5\'", <built-in function min>]')
assert string(x, encoding='zlib_codec') == 'x\x9c\x8b6\xd4Q0\xd2Q0\xd6QP7Q\xd7QPR7UW\xd2Q\xb0I*\xcd\xcc)\xd1\xcd\xccSH+\xcdK.\xc9\xcc\xcfS\xc8\xcd\xcc\xb3\x8b\x05\x00\xf6(\x0c\x9c'
assert string(x, encoding='hex_codec') == '5b312c20322c20332c202734272c2022273527222c203c6275696c742d696e2066756e6374696f6e206d696e3e5d'

print "OK"


OK

In [26]:
print algorithms()
print serializers()
print encodings()


(None, 'md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512')
(None, 'pickle', 'json', 'cPickle', 'dill')
(None, 'bz2_codec', 'cp1140', 'euc_jp', 'cp932', 'punycode', 'euc_jisx0213', 'aliases', 'hex_codec', 'cp500', 'uu_codec', 'big5hkscs', 'mac_romanian', 'mbcs', 'euc_jis_2004', 'iso2022_jp_3', 'iso2022_jp_2', 'iso2022_jp_1', 'gbk', 'iso2022_jp_2004', 'unicode_internal', 'utf_16_be', 'quopri_codec', 'cp424', 'iso2022_jp', 'mac_iceland', 'raw_unicode_escape', 'hp_roman8', 'iso2022_kr', 'cp875', 'iso8859_6', 'cp1254', 'utf_32_be', 'gb2312', 'cp850', 'shift_jis', 'cp852', 'cp855', 'iso8859_3', 'cp857', 'cp856', 'cp775', 'unicode_escape', 'cp1026', 'mac_latin2', 'utf_32', 'mac_cyrillic', 'base64_codec', 'ptcp154', 'palmos', 'mac_centeuro', 'euc_kr', 'hz', 'utf_8', 'utf_32_le', 'mac_greek', 'utf_7', 'mac_turkish', 'utf_8_sig', 'mac_arabic', 'tactis', 'cp949', 'zlib_codec', 'big5', 'iso8859_9', 'iso8859_8', 'iso8859_5', 'iso8859_4', 'iso8859_7', 'cp874', 'iso8859_1', 'utf_16_le', 'iso8859_2', 'charmap', 'gb18030', 'cp1006', 'shift_jis_2004', 'mac_roman', 'ascii', 'string_escape', 'iso8859_15', 'iso8859_14', 'tis_620', 'iso8859_16', 'iso8859_11', 'iso8859_10', 'iso8859_13', 'cp720', 'cp950', 'utf_16', 'cp869', 'mac_farsi', 'rot_13', 'cp860', 'cp861', 'cp862', 'cp863', 'cp864', 'cp865', 'cp866', 'shift_jisx0213', 'johab', 'mac_croatian', 'cp1255', 'latin_1', 'cp1257', 'cp1256', 'cp1251', 'cp1250', 'cp1253', 'cp1252', 'cp437', 'cp1258', 'undefined', 'cp737', 'koi8_r', 'cp037', 'koi8_u', 'iso2022_jp_ext', 'cp858', 'idna', 'str', 'repr', 'unicode', 'bytes')

In [27]:
from klepto.keymaps import * 
from klepto.crypto import *

h = hashmap(algorithm='md5')
p = picklemap(serializer='dill')
hp = p + h

assert p(1) == pickle(1, serializer='dill')
assert h(1) == 'c4ca4238a0b923820dcc509a6f75849b'
assert hp(1) == 'ee7f32117eab97ec5460313282938c93'
assert h(p(1)) == hp(1)
assert hp.inner(1) == p(1)
assert hp.outer(1) == h(1)

print "OK"


OK

Current developments:

accelerate HPC applications

The main process reads and writes to a database through a cached function, while N-worker processes write to the database and try to fill in the "holes" in the database. Works in parallel and across network boundaries.

asynchronous sync

We have research-grade asynchronous cache-to-archive synchronization. Functions interact with fast local caches, and asynchronously sync with their archive backends. Working on distributed archive-to-archive synchronization to seamlessly distribute archived data everywhere.

HDF5 backend

In the process of adding an h5py backend, and also "dill-augmented" hickle serialization.

adding interpolation strategies

Some preliminary work done with kriging interpolation, but API decisions need to be made before this is committed.

easier to add new exchange strategies

Slight refactoring needed so a cache exchange strategy is a first-class object, as opposed to being produced by subclassing a cache.

New release coming very soon...

https://github.com/uqfoundation/klepto

Help is definitely welcome!