Advanced NumPy


In [1]:
from __future__ import division
from numpy.random import randn
from pandas import Series
import numpy as np
np.set_printoptions(precision=4)
import sys; sys.path.append('book_scripts')
%cd book_scripts


/home/phillip/Documents/code/py/pandas-book/rev_539000/book_scripts

ndarray object internals

NumPy dtype hierarchy


In [2]:
ints = np.ones(10, dtype=np.uint16)
floats = np.ones(10, dtype=np.float32)
np.issubdtype(ints.dtype, np.integer)
np.issubdtype(floats.dtype, np.floating)


Out[2]:
True

In [3]:
np.float64.mro()


Out[3]:
[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

Advanced array manipulation

Reshaping arrays


In [4]:
arr = np.arange(8)
arr
arr.reshape((4, 2))


Out[4]:
array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])

In [5]:
arr.reshape((4, 2)).reshape((2, 4))


Out[5]:
array([[0, 1, 2, 3],
       [4, 5, 6, 7]])

In [6]:
arr = np.arange(15)
arr.reshape((5, -1))


Out[6]:
array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [7]:
other_arr = np.ones((3, 5))
other_arr.shape
arr.reshape(other_arr.shape)


Out[7]:
array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [8]:
arr = np.arange(15).reshape((5, 3))
arr
arr.ravel()


Out[8]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [9]:
arr.flatten()


Out[9]:
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

C vs. Fortran order


In [10]:
arr = np.arange(12).reshape((3, 4))
arr
arr.ravel()
arr.ravel('F')


Out[10]:
array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

Concatenating and splitting arrays


In [11]:
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
np.concatenate([arr1, arr2], axis=0)
np.concatenate([arr1, arr2], axis=1)


Out[11]:
array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [12]:
np.vstack((arr1, arr2))
np.hstack((arr1, arr2))


Out[12]:
array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])

In [13]:
from numpy.random import randn
arr = randn(5, 2)
arr
first, second, third = np.split(arr, [1, 3])
first
second
third


Out[13]:
array([[ 0.0865, -0.0964],
       [ 1.7154,  0.3276]])

Stacking helpers:


In [14]:
arr = np.arange(6)
arr1 = arr.reshape((3, 2))
arr2 = randn(3, 2)
np.r_[arr1, arr2]
np.c_[np.r_[arr1, arr2], arr]


Out[14]:
array([[ 0.    ,  1.    ,  0.    ],
       [ 2.    ,  3.    ,  1.    ],
       [ 4.    ,  5.    ,  2.    ],
       [-0.477 ,  1.153 ,  3.    ],
       [ 0.0919, -0.3852,  4.    ],
       [-1.891 , -1.4744,  5.    ]])

In [15]:
np.c_[1:6, -10:-5]


Out[15]:
array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

Repeating elements: tile and repeat


In [16]:
arr = np.arange(3)
arr.repeat(3)


Out[16]:
array([0, 0, 0, 1, 1, 1, 2, 2, 2])

In [17]:
arr.repeat([2, 3, 4])


Out[17]:
array([0, 0, 1, 1, 1, 2, 2, 2, 2])

In [18]:
arr = randn(2, 2)
arr
arr.repeat(2, axis=0)


Out[18]:
array([[ 0.8373, -0.0382],
       [ 0.8373, -0.0382],
       [-2.3026, -3.1157],
       [-2.3026, -3.1157]])

In [19]:
arr.repeat([2, 3], axis=0)
arr.repeat([2, 3], axis=1)


Out[19]:
array([[ 0.8373,  0.8373, -0.0382, -0.0382, -0.0382],
       [-2.3026, -2.3026, -3.1157, -3.1157, -3.1157]])

In [20]:
arr
np.tile(arr, 2)


Out[20]:
array([[ 0.8373, -0.0382,  0.8373, -0.0382],
       [-2.3026, -3.1157, -2.3026, -3.1157]])

In [21]:
arr
np.tile(arr, (2, 1))
np.tile(arr, (3, 2))


Out[21]:
array([[ 0.8373, -0.0382,  0.8373, -0.0382],
       [-2.3026, -3.1157, -2.3026, -3.1157],
       [ 0.8373, -0.0382,  0.8373, -0.0382],
       [-2.3026, -3.1157, -2.3026, -3.1157],
       [ 0.8373, -0.0382,  0.8373, -0.0382],
       [-2.3026, -3.1157, -2.3026, -3.1157]])

Fancy indexing equivalents: take and put


In [22]:
arr = np.arange(10) * 100
inds = [7, 1, 2, 6]
arr[inds]


Out[22]:
array([700, 100, 200, 600])

In [23]:
arr.take(inds)
arr.put(inds, 42)
arr
arr.put(inds, [40, 41, 42, 43])
arr


Out[23]:
array([  0,  41,  42, 300, 400, 500,  43,  40, 800, 900])

In [24]:
inds = [2, 0, 2, 1]
arr = randn(2, 4)
arr
arr.take(inds, axis=1)


Out[24]:
array([[ 0.7526, -0.5752,  0.7526, -0.9173],
       [ 0.5017,  0.8759,  0.5017, -0.4772]])

Broadcasting


In [25]:
arr = np.arange(5)
arr
arr * 4


Out[25]:
array([ 0,  4,  8, 12, 16])

In [26]:
arr = randn(4, 3)
arr.mean(0)
demeaned = arr - arr.mean(0)
demeaned
demeaned.mean(0)


Out[26]:
array([ 0.,  0., -0.])

In [27]:
arr
row_means = arr.mean(1)
row_means.reshape((4, 1))
demeaned = arr - row_means.reshape((4, 1))
demeaned.mean(1)


Out[27]:
array([ 0., -0.,  0., -0.])

Broadcasting over other axes


In [28]:
arr - arr.mean(1)


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-28-7b87b85a20b2> in <module>()
----> 1 arr - arr.mean(1)

ValueError: operands could not be broadcast together with shapes (4,3) (4,) 

In [29]:
arr - arr.mean(1).reshape((4, 1))


Out[29]:
array([[-1.6838, -0.3736,  2.0574],
       [ 1.513 , -0.6511, -0.8619],
       [ 0.4619,  0.3708, -0.8327],
       [ 0.0346, -0.6972,  0.6626]])

In [30]:
arr = np.zeros((4, 4))
arr_3d = arr[:, np.newaxis, :]
arr_3d.shape


Out[30]:
(4, 1, 4)

In [31]:
arr_1d = np.random.normal(size=3)
arr_1d[:, np.newaxis]
arr_1d[np.newaxis, :]


Out[31]:
array([[ 0.6034,  0.4693,  0.6303]])

In [32]:
arr = randn(3, 4, 5)
depth_means = arr.mean(2)
depth_means
demeaned = arr - depth_means[:, :, np.newaxis]
demeaned.mean(2)


Out[32]:
array([[ 0.,  0., -0.,  0.],
       [ 0.,  0.,  0., -0.],
       [ 0., -0., -0., -0.]])

In [34]:
def demean_axis(arr, axis=0):
    means = arr.mean(axis)

    # This generalized things like [:, :, np.newaxis] to N dimensions
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    return arr - means[indexer]

Setting array values by broadcasting


In [35]:
arr = np.zeros((4, 3))
arr[:] = 5
arr


Out[35]:
array([[ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.]])

In [36]:
col = np.array([1.28, -0.42, 0.44, 1.6])
arr[:] = col[:, np.newaxis]
arr
arr[:2] = [[-1.37], [0.509]]
arr


Out[36]:
array([[-1.37 , -1.37 , -1.37 ],
       [ 0.509,  0.509,  0.509],
       [ 0.44 ,  0.44 ,  0.44 ],
       [ 1.6  ,  1.6  ,  1.6  ]])

Advanced ufunc usage

Ufunc instance methods


In [37]:
arr = np.arange(10)
np.add.reduce(arr)
arr.sum()


Out[37]:
45

In [38]:
np.random.seed(12346)

In [39]:
arr = randn(5, 5)
arr[::2].sort(1) # sort a few rows
arr[:, :-1] < arr[:, 1:]
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)


Out[39]:
array([ True, False,  True, False,  True], dtype=bool)

In [40]:
arr = np.arange(15).reshape((3, 5))
np.add.accumulate(arr, axis=1)


Out[40]:
array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]])

In [41]:
arr = np.arange(3).repeat([1, 2, 2])
arr
np.multiply.outer(arr, np.arange(5))


Out[41]:
array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])

In [42]:
result = np.subtract.outer(randn(3, 4), randn(5))
result.shape


Out[42]:
(3, 4, 5)

In [43]:
arr = np.arange(10)
np.add.reduceat(arr, [0, 5, 8])


Out[43]:
array([10, 18, 17])

In [44]:
arr = np.multiply.outer(np.arange(4), np.arange(5))
arr
np.add.reduceat(arr, [0, 2, 4], axis=1)


Out[44]:
array([[ 0,  0,  0],
       [ 1,  5,  4],
       [ 2, 10,  8],
       [ 3, 15, 12]])

Custom ufuncs


In [45]:
def add_elements(x, y):
    return x + y
add_them = np.frompyfunc(add_elements, 2, 1)
add_them(np.arange(8), np.arange(8))


Out[45]:
array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)

In [46]:
add_them = np.vectorize(add_elements, otypes=[np.float64])
add_them(np.arange(8), np.arange(8))


Out[46]:
array([  0.,   2.,   4.,   6.,   8.,  10.,  12.,  14.])

In [47]:
arr = randn(10000)
%timeit add_them(arr, arr)
%timeit np.add(arr, arr)


1000 loops, best of 3: 1.7 ms per loop
100000 loops, best of 3: 4.68 µs per loop

Structured and record arrays


In [48]:
dtype = [('x', np.float64), ('y', np.int32)]
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr


Out[48]:
array([(1.5, 6), (3.141592653589793, -2)], 
      dtype=[('x', '<f8'), ('y', '<i4')])

In [49]:
sarr[0]
sarr[0]['y']


Out[49]:
6

In [50]:
sarr['x']


Out[50]:
array([ 1.5   ,  3.1416])

Nested dtypes and multidimensional fields


In [51]:
dtype = [('x', np.int64, 3), ('y', np.int32)]
arr = np.zeros(4, dtype=dtype)
arr


Out[51]:
array([([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0)], 
      dtype=[('x', '<i8', (3,)), ('y', '<i4')])

In [52]:
arr[0]['x']


Out[52]:
array([0, 0, 0])

In [53]:
arr['x']


Out[53]:
array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [54]:
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]
data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)
data['x']
data['y']
data['x']['a']


Out[54]:
array([ 1.,  3.])

Why use structured arrays?

Structured array manipulations: numpy.lib.recfunctions

More about sorting


In [55]:
arr = randn(6)
arr.sort()
arr


Out[55]:
array([-1.082 ,  0.3759,  0.8014,  1.1397,  1.2888,  1.8413])

In [56]:
arr = randn(3, 5)
arr
arr[:, 0].sort()  # Sort first column values in-place
arr


Out[56]:
array([[-1.0111, -1.4711,  0.8705, -0.0847, -1.1329],
       [-0.3318, -0.3436,  2.1714,  0.1234, -0.0189],
       [ 0.1773,  0.7424,  0.8548,  1.038 , -0.329 ]])

In [57]:
arr = randn(5)
arr
np.sort(arr)
arr


Out[57]:
array([-1.1181, -0.2415, -2.0051,  0.7379, -1.0614])

In [58]:
arr = randn(3, 5)
arr
arr.sort(axis=1)
arr


Out[58]:
array([[-0.2682, -0.1872,  0.5955,  0.9111,  1.3389],
       [-0.5168, -0.3215, -0.1989,  1.0054,  1.1925],
       [-1.7638, -0.2222, -0.2171,  0.3969,  0.6071]])

In [59]:
arr[:, ::-1]


Out[59]:
array([[ 1.3389,  0.9111,  0.5955, -0.1872, -0.2682],
       [ 1.1925,  1.0054, -0.1989, -0.3215, -0.5168],
       [ 0.6071,  0.3969, -0.2171, -0.2222, -1.7638]])

Indirect sorts: argsort and lexsort


In [60]:
values = np.array([5, 0, 1, 3, 2])
indexer = values.argsort()
indexer
values[indexer]


Out[60]:
array([0, 1, 2, 3, 5])

In [61]:
arr = randn(3, 5)
arr[0] = values
arr
arr[:, arr[0].argsort()]


Out[61]:
array([[ 0.    ,  1.    ,  2.    ,  3.    ,  5.    ],
       [-0.1378,  2.1777,  0.8356, -0.4728, -0.3636],
       [ 0.2316,  0.728 ,  1.9956, -1.3918, -0.2089]])

In [62]:
first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])
sorter = np.lexsort((first_name, last_name))
zip(last_name[sorter], first_name[sorter])


Out[62]:
[('Arnold', 'Jane'),
 ('Arnold', 'Steve'),
 ('Jones', 'Bill'),
 ('Jones', 'Bob'),
 ('Walters', 'Barbara')]

Alternate sort algorithms


In [63]:
values = np.array(['2:first', '2:second', '1:first', '1:second', '1:third'])
key = np.array([2, 2, 1, 1, 1])
indexer = key.argsort(kind='mergesort')
indexer
values.take(indexer)


Out[63]:
array(['1:first', '1:second', '1:third', '2:first', '2:second'], 
      dtype='|S8')

numpy.searchsorted: Finding elements in a sorted array


In [64]:
arr = np.array([0, 1, 7, 12, 15])
arr.searchsorted(9)


Out[64]:
3

In [65]:
arr.searchsorted([0, 8, 11, 16])


Out[65]:
array([0, 3, 3, 5])

In [66]:
arr = np.array([0, 0, 0, 1, 1, 1, 1])
arr.searchsorted([0, 1])
arr.searchsorted([0, 1], side='right')


Out[66]:
array([3, 7])

In [67]:
data = np.floor(np.random.uniform(0, 10000, size=50))
bins = np.array([0, 100, 1000, 5000, 10000])
data


Out[67]:
array([ 8304.,  4181.,  9352.,  4907.,  3250.,  8546.,  2673.,  6152.,
        2774.,  5130.,  9553.,  4997.,  1794.,  9688.,   426.,  1612.,
         651.,  8653.,  1695.,  4764.,  1052.,  4836.,  8020.,  3479.,
        1513.,  5872.,  8992.,  7656.,  4764.,  5383.,  2319.,  4280.,
        4150.,  8601.,  3946.,  9904.,  7286.,  9969.,  6032.,  4574.,
        8480.,  4298.,  2708.,  7358.,  6439.,  7916.,  3899.,  9182.,
         871.,  7973.])

In [68]:
labels = bins.searchsorted(data)
labels


Out[68]:
array([4, 3, 4, 3, 3, 4, 3, 4, 3, 4, 4, 3, 3, 4, 2, 3, 2, 4, 3, 3, 3, 3, 4,
       3, 3, 4, 4, 4, 3, 4, 3, 3, 3, 4, 3, 4, 4, 4, 4, 3, 4, 3, 3, 4, 4, 4,
       3, 4, 2, 4])

In [69]:
Series(data).groupby(labels).mean()


Out[69]:
2     649.3333
3    3411.5217
4    7935.0417
dtype: float64

In [70]:
np.digitize(data, bins)


Out[70]:
array([4, 3, 4, 3, 3, 4, 3, 4, 3, 4, 4, 3, 3, 4, 2, 3, 2, 4, 3, 3, 3, 3, 4,
       3, 3, 4, 4, 4, 3, 4, 3, 3, 3, 4, 3, 4, 4, 4, 4, 3, 4, 3, 3, 4, 4, 4,
       3, 4, 2, 4])

NumPy matrix class


In [71]:
X =  np.array([[ 8.82768214,  3.82222409, -1.14276475,  2.04411587],
               [ 3.82222409,  6.75272284,  0.83909108,  2.08293758],
               [-1.14276475,  0.83909108,  5.01690521,  0.79573241],
               [ 2.04411587,  2.08293758,  0.79573241,  6.24095859]])
X[:, 0]  # one-dimensional
y = X[:, :1]  # two-dimensional by slicing
X
y


Out[71]:
array([[ 8.8277],
       [ 3.8222],
       [-1.1428],
       [ 2.0441]])

In [72]:
np.dot(y.T, np.dot(X, y))


Out[72]:
array([[ 1195.468]])

In [73]:
Xm = np.matrix(X)
ym = Xm[:, 0]
Xm
ym
ym.T * Xm * ym


Out[73]:
matrix([[ 1195.468]])

In [74]:
Xm.I * X


Out[74]:
matrix([[ 1.,  0., -0.,  0.],
        [-0.,  1.,  0., -0.],
        [ 0.,  0.,  1.,  0.],
        [ 0.,  0.,  0.,  1.]])

Advanced array input and output

Memory-mapped files


In [75]:
mmap = np.memmap('mymmap', dtype='float64', mode='w+', shape=(10000, 10000))
mmap


Out[75]:
memmap([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])

In [76]:
section = mmap[:5]

In [77]:
section[:] = np.random.randn(5, 10000)
mmap.flush()
mmap
del mmap

In [78]:
mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))
mmap


Out[78]:
memmap([[-0.1614, -0.1768,  0.422 , ..., -0.2195, -0.1256, -0.4012],
       [ 0.4898, -2.2219, -0.7684, ..., -2.3517, -1.0782,  1.3208],
       [-0.6875,  1.6901, -0.7444, ..., -1.4218, -0.0509,  1.2224],
       ..., 
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ]])

In [79]:
%xdel mmap
!rm mymmap

HDF5 and other array storage options

Performance tips

The importance of contiguous memory


In [80]:
arr_c = np.ones((1000, 1000), order='C')
arr_f = np.ones((1000, 1000), order='F')
arr_c.flags
arr_f.flags
arr_f.flags.f_contiguous


Out[80]:
True

In [81]:
%timeit arr_c.sum(1)
%timeit arr_f.sum(1)


1000 loops, best of 3: 565 µs per loop
1000 loops, best of 3: 597 µs per loop

In [82]:
arr_f.copy('C').flags


Out[82]:
  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False

In [83]:
arr_c[:50].flags.contiguous
arr_c[:, :50].flags


Out[83]:
  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False

In [84]:
%xdel arr_c
%xdel arr_f
%cd ..


/home/phillip/Documents/code/py/pandas-book/rev_539000

Other speed options: Cython, f2py, C

from numpy cimport ndarray, float64_t

def sum_elements(ndarray[float64_t] arr):
    cdef Py_ssize_t i, n = len(arr)
    cdef float64_t result = 0

    for i in range(n):
        result += arr[i]

    return result