Advanced NumPy



In [1]:

    
from __future__ import division
from numpy.random import randn
from pandas import Series
import numpy as np
np.set_printoptions(precision=4)
import sys; sys.path.append('book_scripts')
%cd book_scripts









    



/home/phillip/Documents/code/py/pandas-book/rev_539000/book_scripts

ndarray object internals

NumPy dtype hierarchy



In [2]:

    
ints = np.ones(10, dtype=np.uint16)
floats = np.ones(10, dtype=np.float32)
np.issubdtype(ints.dtype, np.integer)
np.issubdtype(floats.dtype, np.floating)









    Out[2]:





True



In [3]:

    
np.float64.mro()









    Out[3]:





[numpy.float64,
 numpy.floating,
 numpy.inexact,
 numpy.number,
 numpy.generic,
 float,
 object]

Advanced array manipulation

Reshaping arrays



In [4]:

    
arr = np.arange(8)
arr
arr.reshape((4, 2))









    Out[4]:





array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7]])



In [5]:

    
arr.reshape((4, 2)).reshape((2, 4))









    Out[5]:





array([[0, 1, 2, 3],
       [4, 5, 6, 7]])



In [6]:

    
arr = np.arange(15)
arr.reshape((5, -1))









    Out[6]:





array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])



In [7]:

    
other_arr = np.ones((3, 5))
other_arr.shape
arr.reshape(other_arr.shape)









    Out[7]:





array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])



In [8]:

    
arr = np.arange(15).reshape((5, 3))
arr
arr.ravel()









    Out[8]:





array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])



In [9]:

    
arr.flatten()









    Out[9]:





array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

C vs. Fortran order



In [10]:

    
arr = np.arange(12).reshape((3, 4))
arr
arr.ravel()
arr.ravel('F')









    Out[10]:





array([ 0,  4,  8,  1,  5,  9,  2,  6, 10,  3,  7, 11])

Concatenating and splitting arrays



In [11]:

    
arr1 = np.array([[1, 2, 3], [4, 5, 6]])
arr2 = np.array([[7, 8, 9], [10, 11, 12]])
np.concatenate([arr1, arr2], axis=0)
np.concatenate([arr1, arr2], axis=1)









    Out[11]:





array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])



In [12]:

    
np.vstack((arr1, arr2))
np.hstack((arr1, arr2))









    Out[12]:





array([[ 1,  2,  3,  7,  8,  9],
       [ 4,  5,  6, 10, 11, 12]])



In [13]:

    
from numpy.random import randn
arr = randn(5, 2)
arr
first, second, third = np.split(arr, [1, 3])
first
second
third









    Out[13]:





array([[ 0.0865, -0.0964],
       [ 1.7154,  0.3276]])

Stacking helpers:



In [14]:

    
arr = np.arange(6)
arr1 = arr.reshape((3, 2))
arr2 = randn(3, 2)
np.r_[arr1, arr2]
np.c_[np.r_[arr1, arr2], arr]









    Out[14]:





array([[ 0.    ,  1.    ,  0.    ],
       [ 2.    ,  3.    ,  1.    ],
       [ 4.    ,  5.    ,  2.    ],
       [-0.477 ,  1.153 ,  3.    ],
       [ 0.0919, -0.3852,  4.    ],
       [-1.891 , -1.4744,  5.    ]])



In [15]:

    
np.c_[1:6, -10:-5]









    Out[15]:





array([[  1, -10],
       [  2,  -9],
       [  3,  -8],
       [  4,  -7],
       [  5,  -6]])

Repeating elements: tile and repeat



In [16]:

    
arr = np.arange(3)
arr.repeat(3)









    Out[16]:





array([0, 0, 0, 1, 1, 1, 2, 2, 2])



In [17]:

    
arr.repeat([2, 3, 4])









    Out[17]:





array([0, 0, 1, 1, 1, 2, 2, 2, 2])



In [18]:

    
arr = randn(2, 2)
arr
arr.repeat(2, axis=0)









    Out[18]:





array([[ 0.8373, -0.0382],
       [ 0.8373, -0.0382],
       [-2.3026, -3.1157],
       [-2.3026, -3.1157]])



In [19]:

    
arr.repeat([2, 3], axis=0)
arr.repeat([2, 3], axis=1)









    Out[19]:





array([[ 0.8373,  0.8373, -0.0382, -0.0382, -0.0382],
       [-2.3026, -2.3026, -3.1157, -3.1157, -3.1157]])



In [20]:

    
arr
np.tile(arr, 2)









    Out[20]:





array([[ 0.8373, -0.0382,  0.8373, -0.0382],
       [-2.3026, -3.1157, -2.3026, -3.1157]])



In [21]:

    
arr
np.tile(arr, (2, 1))
np.tile(arr, (3, 2))









    Out[21]:





array([[ 0.8373, -0.0382,  0.8373, -0.0382],
       [-2.3026, -3.1157, -2.3026, -3.1157],
       [ 0.8373, -0.0382,  0.8373, -0.0382],
       [-2.3026, -3.1157, -2.3026, -3.1157],
       [ 0.8373, -0.0382,  0.8373, -0.0382],
       [-2.3026, -3.1157, -2.3026, -3.1157]])

Fancy indexing equivalents: take and put



In [22]:

    
arr = np.arange(10) * 100
inds = [7, 1, 2, 6]
arr[inds]









    Out[22]:





array([700, 100, 200, 600])



In [23]:

    
arr.take(inds)
arr.put(inds, 42)
arr
arr.put(inds, [40, 41, 42, 43])
arr









    Out[23]:





array([  0,  41,  42, 300, 400, 500,  43,  40, 800, 900])



In [24]:

    
inds = [2, 0, 2, 1]
arr = randn(2, 4)
arr
arr.take(inds, axis=1)









    Out[24]:





array([[ 0.7526, -0.5752,  0.7526, -0.9173],
       [ 0.5017,  0.8759,  0.5017, -0.4772]])

Broadcasting



In [25]:

    
arr = np.arange(5)
arr
arr * 4









    Out[25]:





array([ 0,  4,  8, 12, 16])



In [26]:

    
arr = randn(4, 3)
arr.mean(0)
demeaned = arr - arr.mean(0)
demeaned
demeaned.mean(0)









    Out[26]:





array([ 0.,  0., -0.])



In [27]:

    
arr
row_means = arr.mean(1)
row_means.reshape((4, 1))
demeaned = arr - row_means.reshape((4, 1))
demeaned.mean(1)









    Out[27]:





array([ 0., -0.,  0., -0.])

Broadcasting over other axes



In [28]:

    
arr - arr.mean(1)









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-28-7b87b85a20b2> in <module>()
----> 1 arr - arr.mean(1)

ValueError: operands could not be broadcast together with shapes (4,3) (4,)



In [29]:

    
arr - arr.mean(1).reshape((4, 1))









    Out[29]:





array([[-1.6838, -0.3736,  2.0574],
       [ 1.513 , -0.6511, -0.8619],
       [ 0.4619,  0.3708, -0.8327],
       [ 0.0346, -0.6972,  0.6626]])



In [30]:

    
arr = np.zeros((4, 4))
arr_3d = arr[:, np.newaxis, :]
arr_3d.shape









    Out[30]:





(4, 1, 4)



In [31]:

    
arr_1d = np.random.normal(size=3)
arr_1d[:, np.newaxis]
arr_1d[np.newaxis, :]









    Out[31]:





array([[ 0.6034,  0.4693,  0.6303]])



In [32]:

    
arr = randn(3, 4, 5)
depth_means = arr.mean(2)
depth_means
demeaned = arr - depth_means[:, :, np.newaxis]
demeaned.mean(2)









    Out[32]:





array([[ 0.,  0., -0.,  0.],
       [ 0.,  0.,  0., -0.],
       [ 0., -0., -0., -0.]])



In [34]:

    
def demean_axis(arr, axis=0):
    means = arr.mean(axis)

    # This generalized things like [:, :, np.newaxis] to N dimensions
    indexer = [slice(None)] * arr.ndim
    indexer[axis] = np.newaxis
    return arr - means[indexer]

Setting array values by broadcasting



In [35]:

    
arr = np.zeros((4, 3))
arr[:] = 5
arr









    Out[35]:





array([[ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.],
       [ 5.,  5.,  5.]])



In [36]:

    
col = np.array([1.28, -0.42, 0.44, 1.6])
arr[:] = col[:, np.newaxis]
arr
arr[:2] = [[-1.37], [0.509]]
arr









    Out[36]:





array([[-1.37 , -1.37 , -1.37 ],
       [ 0.509,  0.509,  0.509],
       [ 0.44 ,  0.44 ,  0.44 ],
       [ 1.6  ,  1.6  ,  1.6  ]])

Advanced ufunc usage

Ufunc instance methods



In [37]:

    
arr = np.arange(10)
np.add.reduce(arr)
arr.sum()









    Out[37]:





45



In [38]:

    
np.random.seed(12346)



In [39]:

    
arr = randn(5, 5)
arr[::2].sort(1) # sort a few rows
arr[:, :-1] < arr[:, 1:]
np.logical_and.reduce(arr[:, :-1] < arr[:, 1:], axis=1)









    Out[39]:





array([ True, False,  True, False,  True], dtype=bool)



In [40]:

    
arr = np.arange(15).reshape((3, 5))
np.add.accumulate(arr, axis=1)









    Out[40]:





array([[ 0,  1,  3,  6, 10],
       [ 5, 11, 18, 26, 35],
       [10, 21, 33, 46, 60]])



In [41]:

    
arr = np.arange(3).repeat([1, 2, 2])
arr
np.multiply.outer(arr, np.arange(5))









    Out[41]:





array([[0, 0, 0, 0, 0],
       [0, 1, 2, 3, 4],
       [0, 1, 2, 3, 4],
       [0, 2, 4, 6, 8],
       [0, 2, 4, 6, 8]])



In [42]:

    
result = np.subtract.outer(randn(3, 4), randn(5))
result.shape









    Out[42]:





(3, 4, 5)



In [43]:

    
arr = np.arange(10)
np.add.reduceat(arr, [0, 5, 8])









    Out[43]:





array([10, 18, 17])



In [44]:

    
arr = np.multiply.outer(np.arange(4), np.arange(5))
arr
np.add.reduceat(arr, [0, 2, 4], axis=1)









    Out[44]:





array([[ 0,  0,  0],
       [ 1,  5,  4],
       [ 2, 10,  8],
       [ 3, 15, 12]])

Custom ufuncs



In [45]:

    
def add_elements(x, y):
    return x + y
add_them = np.frompyfunc(add_elements, 2, 1)
add_them(np.arange(8), np.arange(8))









    Out[45]:





array([0, 2, 4, 6, 8, 10, 12, 14], dtype=object)



In [46]:

    
add_them = np.vectorize(add_elements, otypes=[np.float64])
add_them(np.arange(8), np.arange(8))









    Out[46]:





array([  0.,   2.,   4.,   6.,   8.,  10.,  12.,  14.])



In [47]:

    
arr = randn(10000)
%timeit add_them(arr, arr)
%timeit np.add(arr, arr)









    



1000 loops, best of 3: 1.7 ms per loop
100000 loops, best of 3: 4.68 µs per loop

Structured and record arrays



In [48]:

    
dtype = [('x', np.float64), ('y', np.int32)]
sarr = np.array([(1.5, 6), (np.pi, -2)], dtype=dtype)
sarr









    Out[48]:





array([(1.5, 6), (3.141592653589793, -2)], 
      dtype=[('x', '<f8'), ('y', '<i4')])



In [49]:

    
sarr[0]
sarr[0]['y']









    Out[49]:





6



In [50]:

    
sarr['x']









    Out[50]:





array([ 1.5   ,  3.1416])

Nested dtypes and multidimensional fields



In [51]:

    
dtype = [('x', np.int64, 3), ('y', np.int32)]
arr = np.zeros(4, dtype=dtype)
arr









    Out[51]:





array([([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0), ([0, 0, 0], 0)], 
      dtype=[('x', '<i8', (3,)), ('y', '<i4')])



In [52]:

    
arr[0]['x']









    Out[52]:





array([0, 0, 0])



In [53]:

    
arr['x']









    Out[53]:





array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])



In [54]:

    
dtype = [('x', [('a', 'f8'), ('b', 'f4')]), ('y', np.int32)]
data = np.array([((1, 2), 5), ((3, 4), 6)], dtype=dtype)
data['x']
data['y']
data['x']['a']









    Out[54]:





array([ 1.,  3.])

Why use structured arrays?

Structured array manipulations: numpy.lib.recfunctions

More about sorting



In [55]:

    
arr = randn(6)
arr.sort()
arr









    Out[55]:





array([-1.082 ,  0.3759,  0.8014,  1.1397,  1.2888,  1.8413])



In [56]:

    
arr = randn(3, 5)
arr
arr[:, 0].sort()  # Sort first column values in-place
arr









    Out[56]:





array([[-1.0111, -1.4711,  0.8705, -0.0847, -1.1329],
       [-0.3318, -0.3436,  2.1714,  0.1234, -0.0189],
       [ 0.1773,  0.7424,  0.8548,  1.038 , -0.329 ]])



In [57]:

    
arr = randn(5)
arr
np.sort(arr)
arr









    Out[57]:





array([-1.1181, -0.2415, -2.0051,  0.7379, -1.0614])



In [58]:

    
arr = randn(3, 5)
arr
arr.sort(axis=1)
arr









    Out[58]:





array([[-0.2682, -0.1872,  0.5955,  0.9111,  1.3389],
       [-0.5168, -0.3215, -0.1989,  1.0054,  1.1925],
       [-1.7638, -0.2222, -0.2171,  0.3969,  0.6071]])



In [59]:

    
arr[:, ::-1]









    Out[59]:





array([[ 1.3389,  0.9111,  0.5955, -0.1872, -0.2682],
       [ 1.1925,  1.0054, -0.1989, -0.3215, -0.5168],
       [ 0.6071,  0.3969, -0.2171, -0.2222, -1.7638]])

Indirect sorts: argsort and lexsort



In [60]:

    
values = np.array([5, 0, 1, 3, 2])
indexer = values.argsort()
indexer
values[indexer]









    Out[60]:





array([0, 1, 2, 3, 5])



In [61]:

    
arr = randn(3, 5)
arr[0] = values
arr
arr[:, arr[0].argsort()]









    Out[61]:





array([[ 0.    ,  1.    ,  2.    ,  3.    ,  5.    ],
       [-0.1378,  2.1777,  0.8356, -0.4728, -0.3636],
       [ 0.2316,  0.728 ,  1.9956, -1.3918, -0.2089]])



In [62]:

    
first_name = np.array(['Bob', 'Jane', 'Steve', 'Bill', 'Barbara'])
last_name = np.array(['Jones', 'Arnold', 'Arnold', 'Jones', 'Walters'])
sorter = np.lexsort((first_name, last_name))
zip(last_name[sorter], first_name[sorter])









    Out[62]:





[('Arnold', 'Jane'),
 ('Arnold', 'Steve'),
 ('Jones', 'Bill'),
 ('Jones', 'Bob'),
 ('Walters', 'Barbara')]

Alternate sort algorithms



In [63]:

    
values = np.array(['2:first', '2:second', '1:first', '1:second', '1:third'])
key = np.array([2, 2, 1, 1, 1])
indexer = key.argsort(kind='mergesort')
indexer
values.take(indexer)









    Out[63]:





array(['1:first', '1:second', '1:third', '2:first', '2:second'], 
      dtype='|S8')

numpy.searchsorted: Finding elements in a sorted array



In [64]:

    
arr = np.array([0, 1, 7, 12, 15])
arr.searchsorted(9)









    Out[64]:





3



In [65]:

    
arr.searchsorted([0, 8, 11, 16])









    Out[65]:





array([0, 3, 3, 5])



In [66]:

    
arr = np.array([0, 0, 0, 1, 1, 1, 1])
arr.searchsorted([0, 1])
arr.searchsorted([0, 1], side='right')









    Out[66]:





array([3, 7])



In [67]:

    
data = np.floor(np.random.uniform(0, 10000, size=50))
bins = np.array([0, 100, 1000, 5000, 10000])
data









    Out[67]:





array([ 8304.,  4181.,  9352.,  4907.,  3250.,  8546.,  2673.,  6152.,
        2774.,  5130.,  9553.,  4997.,  1794.,  9688.,   426.,  1612.,
         651.,  8653.,  1695.,  4764.,  1052.,  4836.,  8020.,  3479.,
        1513.,  5872.,  8992.,  7656.,  4764.,  5383.,  2319.,  4280.,
        4150.,  8601.,  3946.,  9904.,  7286.,  9969.,  6032.,  4574.,
        8480.,  4298.,  2708.,  7358.,  6439.,  7916.,  3899.,  9182.,
         871.,  7973.])



In [68]:

    
labels = bins.searchsorted(data)
labels









    Out[68]:





array([4, 3, 4, 3, 3, 4, 3, 4, 3, 4, 4, 3, 3, 4, 2, 3, 2, 4, 3, 3, 3, 3, 4,
       3, 3, 4, 4, 4, 3, 4, 3, 3, 3, 4, 3, 4, 4, 4, 4, 3, 4, 3, 3, 4, 4, 4,
       3, 4, 2, 4])



In [69]:

    
Series(data).groupby(labels).mean()









    Out[69]:





2     649.3333
3    3411.5217
4    7935.0417
dtype: float64



In [70]:

    
np.digitize(data, bins)









    Out[70]:





array([4, 3, 4, 3, 3, 4, 3, 4, 3, 4, 4, 3, 3, 4, 2, 3, 2, 4, 3, 3, 3, 3, 4,
       3, 3, 4, 4, 4, 3, 4, 3, 3, 3, 4, 3, 4, 4, 4, 4, 3, 4, 3, 3, 4, 4, 4,
       3, 4, 2, 4])

NumPy matrix class



In [71]:

    
X =  np.array([[ 8.82768214,  3.82222409, -1.14276475,  2.04411587],
               [ 3.82222409,  6.75272284,  0.83909108,  2.08293758],
               [-1.14276475,  0.83909108,  5.01690521,  0.79573241],
               [ 2.04411587,  2.08293758,  0.79573241,  6.24095859]])
X[:, 0]  # one-dimensional
y = X[:, :1]  # two-dimensional by slicing
X
y









    Out[71]:





array([[ 8.8277],
       [ 3.8222],
       [-1.1428],
       [ 2.0441]])



In [72]:

    
np.dot(y.T, np.dot(X, y))









    Out[72]:





array([[ 1195.468]])



In [73]:

    
Xm = np.matrix(X)
ym = Xm[:, 0]
Xm
ym
ym.T * Xm * ym









    Out[73]:





matrix([[ 1195.468]])



In [74]:

    
Xm.I * X









    Out[74]:





matrix([[ 1.,  0., -0.,  0.],
        [-0.,  1.,  0., -0.],
        [ 0.,  0.,  1.,  0.],
        [ 0.,  0.,  0.,  1.]])

Advanced array input and output

Memory-mapped files



In [75]:

    
mmap = np.memmap('mymmap', dtype='float64', mode='w+', shape=(10000, 10000))
mmap









    Out[75]:





memmap([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]])



In [76]:

    
section = mmap[:5]



In [77]:

    
section[:] = np.random.randn(5, 10000)
mmap.flush()
mmap
del mmap



In [78]:

    
mmap = np.memmap('mymmap', dtype='float64', shape=(10000, 10000))
mmap









    Out[78]:





memmap([[-0.1614, -0.1768,  0.422 , ..., -0.2195, -0.1256, -0.4012],
       [ 0.4898, -2.2219, -0.7684, ..., -2.3517, -1.0782,  1.3208],
       [-0.6875,  1.6901, -0.7444, ..., -1.4218, -0.0509,  1.2224],
       ..., 
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ]])



In [79]:

    
%xdel mmap
!rm mymmap

HDF5 and other array storage options

Performance tips

The importance of contiguous memory



In [80]:

    
arr_c = np.ones((1000, 1000), order='C')
arr_f = np.ones((1000, 1000), order='F')
arr_c.flags
arr_f.flags
arr_f.flags.f_contiguous









    Out[80]:





True



In [81]:

    
%timeit arr_c.sum(1)
%timeit arr_f.sum(1)









    



1000 loops, best of 3: 565 µs per loop
1000 loops, best of 3: 597 µs per loop



In [82]:

    
arr_f.copy('C').flags









    Out[82]:





  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False



In [83]:

    
arr_c[:50].flags.contiguous
arr_c[:, :50].flags









    Out[83]:





  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  UPDATEIFCOPY : False



In [84]:

    
%xdel arr_c
%xdel arr_f
%cd ..









    



/home/phillip/Documents/code/py/pandas-book/rev_539000

Other speed options: Cython, f2py, C

from numpy cimport ndarray, float64_t

def sum_elements(ndarray[float64_t] arr):
    cdef Py_ssize_t i, n = len(arr)
    cdef float64_t result = 0

    for i in range(n):
        result += arr[i]

    return result