In [1]:
from IPython.display import Image
import matplotlib.pyplot as plt
%run talktools
Aug 30th 2014
<img src="MO_Master_W.jpg" width="150" align="right">
Patrick Peglar - UK Met Office, Exeter
( this talk: https://github.com/pp-mo/biggus_talk )
Biggus makes data analysis on large datasets easier
The key technique is "lazy evaluation"
Aka "don't do it until you have to"
This goes by several near-interchangeable names:
Virtual arrays can be processed in the usual ways
The 'deferred' operation is the source of all the benefits ...
In [2]:
import biggus

# Report the installed biggus version.
# Single-argument print(...) parses and prints identically on Python 2 and 3.
print(biggus.__version__)
In [3]:
import numpy as np

# A small concrete (2, 3) array and its per-row mean, computed eagerly by numpy.
array_1 = np.array([[1., 5., 2.], [7., 6., 5.]])
mean_a1 = array_1.mean(axis=1)

# Each label is printed on its own: the Python 2 print statement emits no
# separator after a string ending in '\n', so two single-argument calls
# reproduce the original output exactly, and also run on Python 3.
print('simple array :')
print(array_1)
print('')
print('mean over axis 1 :')
print(mean_a1)
In [4]:
from biggus import NumpyArrayAdapter as npwrap

# Wrap the concrete numpy array in a biggus adapter.
lazy_1 = npwrap(array_1)

# The doubled space is intentional: the original Python 2 statement printed a
# label ending in ' ' followed by the usual item separator.
print('a lazy array :  %s' % (lazy_1,))
In [5]:
# Build the mean over axis 1 as a biggus object; printing it shows the object
# itself, while .ndarray() is called to obtain the values.
lazy_mean = biggus.mean(lazy_1, axis=1)
print('lazy mean : %s' % (lazy_mean,))
print('')

# Labels ending in a newline are printed separately from their values so the
# output matches the Python 2 print statement on both Python 2 and 3.
print('lazy mean *result* :')
print(lazy_mean.ndarray())
print('')
print('same as original ...:')
print(mean_a1)
In [6]:
# Wrap a private copy of the data, so the in-place edit below cannot change
# `array_1`, which earlier cells print and compare against. Without the copy,
# re-running those cells after this one shows altered values.
source_2 = array_1.copy()
lazy_mean2 = biggus.mean(npwrap(source_2), axis=1)
print(lazy_mean2)

# Modify the wrapped data *after* the mean has been defined, then evaluate.
# NOTE(review): presumably this shows that the calculation is deferred until
# .ndarray() is called - confirm against the rendered output.
source_2[0, :] = -1
print(lazy_mean2.ndarray())
Code using biggus looks very similar, but
The uniformity here is key:
In [7]:
class constant_array(object):
    """
    A minimal array-like source in which every element has the same value.

    It provides only ``shape``, ``dtype`` and ``__getitem__``, and announces
    each indexing operation on stdout so that data accesses are visible.

    Args:
        shape: tuple of ints giving the array dimensions.
        value: the constant filling the array (default 0.0). Its numpy
            dtype becomes the ``dtype`` of this object.

    """

    def __init__(self, shape, value=0.0):
        self.shape = shape
        self.dtype = np.array(value).dtype
        self._value = value

    def __getitem__(self, indices):
        """Report the access, then return the indexed section as numpy data."""
        # Format into one string: a single-argument print(...) behaves the same
        # on Python 2 and 3. The 1-tuple keeps a tuple of indices from being
        # unpacked by the '%' operator.
        print(' !!! accessing : %s' % (indices,))
        # Build the template in our declared dtype, so the returned section
        # agrees with `self.dtype` (np.ones defaults to float64, which made
        # integer-valued constants come back as floats).
        template = np.ones(self.shape, dtype=self.dtype)
        return self._value * template[indices]
# Wrap the home-made constant source in a biggus adapter, then take a section.
lazy_const_234 = npwrap(constant_array((2, 3, 4), value=3.5))
print('lazy_234: %s' % (lazy_const_234,))
const_section = lazy_const_234[0, 1]
print('section: %s' % (const_section,))

# Print the heading *before* evaluating, as the Python 2 print statement did:
# any ' !!! accessing' message emitted during .ndarray() then appears between
# the heading and the values, exactly as in the original output.
print('\nresult:')
print(const_section.ndarray())
In [8]:
# The same demonstration using biggus' own ConstantArray.
# NOTE(review): the name says "2x3" but the shape passed is (2, 3, 4); the
# name is kept unchanged in case other material refers to it.
lazy_const_2x3 = biggus.ConstantArray((2, 3, 4), 3.77)
print(lazy_const_2x3)
const_section = lazy_const_2x3[0, 1]
print(const_section)
print(const_section.ndarray())
In [9]:
# Four length-3 arrays filled with 1.0, 2.0, 3.0 and 4.0 respectively.
arrays = [num * np.ones((3)) for num in range(1,5)]
print('array#2: %s' % (arrays[2],))
print('')

# Wrap each one in a biggus adapter and combine them with ArrayStack.
lazy_arrays = np.array([npwrap(array) for array in arrays])
stack = biggus.ArrayStack(lazy_arrays)
print(stack)
print(stack.ndarray())
In [10]:
# Make one of the component arrays smaller than the others.
# The change is made on a copy of the tile array: `lazy_arrays` was also used
# to build `stack` in the previous cell and is indexed again later, so it is
# left untouched here.
mosaic_tiles = lazy_arrays.copy()
mosaic_tiles[2] = mosaic_tiles[2][:1]
# Combine into one
mosaic = biggus.LinearMosaic(mosaic_tiles, axis=0)
print(mosaic)
print(mosaic.ndarray())
Note:
With a nested construction, we can make a complex patchwork + extract 2d regions seamlessly ...
<img src="TiledIndexing.png" width="600">
The pink cells show separate data sources, joined together with nested 'LinearMosaic's.
The extracted area is a new virtual array, seamlessly extracted across the original boundaries.
In [11]:
# A statistic over the first of the wrapped arrays; printing shows the biggus
# object returned by biggus.mean, not evaluated values.
mean = biggus.mean(lazy_arrays[0], axis=0)
print(mean)
The other operations simply follow the same pattern.
Chunking is managed by an "evaluation engine" (see later on ...)
However, a statistical operation can specify constraints on its chunking process.
In [12]:
# Element-wise arithmetic between a wrapped numpy array and a constant array.
values = npwrap(np.array([1.0, 2.0, 3.0]))
constants = biggus.ConstantArray((3,), 1000.0)

# Named `lazy_sum` rather than `sum`, so the builtin sum() is not shadowed for
# the remainder of the notebook. The printed labels are unchanged.
lazy_sum = biggus.add(values, constants)
print('sum: %s' % (lazy_sum,))
print('sum results:')
print(lazy_sum.ndarray())
The implementation of these is much more straightforward than statistics :
In [13]:
# Make test data (ordinary numpy arrays)
src_a = np.arange(10.0) % 3
src_b = np.arange(40.0).reshape(10, 4) % 7
print src_a
In [14]:
# Process in biggus to make 3 lazy results.
A = npwrap(src_a)
B = npwrap(src_b)
B_mean = biggus.mean(B, axis=1)
diff = biggus.sub(A, B_mean)
X = biggus.count(diff, axis=0)
Y = biggus.mean(diff, axis=0)
Z = biggus.std(diff, axis=0)
print X
In [15]:
# Evaluate.
x, y, z = biggus.ndarrays((X, Y, Z))
print x, y, z
Current status
Future directions
"Forget all your memory worries; calculate faster and easier with ..."
https://github.com/SciTools/biggus
We look forward to your contribution !
( this talk : https://github.com/pp-mo/biggus_talk )
In [16]:
def ta(id, dims):
    """Return a biggus-wrapped numpy array of shape `dims`, every element equal to `id`."""
    tile_values = id * np.ones(dims)
    return npwrap(tile_values)
# Two rows of tiles, each joined along axis 1. The tile shapes sum to a width
# of 9 in both rows (5+2+2 and 2+2+5), with heights of 3 and 5.
row1 = biggus.LinearMosaic(np.array([ta(1, (3, 5)), ta(2, (3, 2)), ta(3, (3, 2))]), axis=1)
row2 = biggus.LinearMosaic(np.array([ta(5, (5, 2)), ta(6, (5, 2)), ta(7, (5, 5))]), axis=1)

# Nest the two rows along axis 0, then index a region of the combined array.
tiles2d = biggus.LinearMosaic(np.array([row1, row2]), axis=0)
part2d = tiles2d[1:6, 1:7]

whole, part = tiles2d.ndarray(), part2d.ndarray()
print('whole: %s' % (whole,))
print('part: %s' % (part,))

plt.figure(figsize=(12,4))

# Left panel: the whole patchwork, with the extracted region [1:6, 1:7]
# outlined in red (x spans 1..7, y spans 1..6).
plt.subplot(1, 2, 1)
plt.pcolormesh(whole, edgecolor='black')
plt.plot([1, 7, 7, 1, 1], [1, 1, 6, 6, 1], color='red', linewidth=5)

# Right panel: the extracted region alone. The trailing ';' stops the
# notebook echoing the return value of the last call as cell output.
plt.subplot(1, 2, 2)
plt.pcolormesh(part, edgecolor='black');
Out[16]: