In [1]:
import numpy as np
import pandas as pd
In [2]:
# assert_almost_equal lives in the public numpy.testing namespace.
# The old numpy.testing.utils path was only a deprecated shim and is not
# importable on recent NumPy releases.
from numpy.testing import assert_almost_equal
In [3]:
# Benchmark size: ten million rows, three float columns.
rows = 10 ** 7
# Plain ndarray of uniform random samples, drawn directly in its final (rows, 3) shape.
arr = np.random.uniform(size=(rows, 3))
# DataFrame built from that array, with labelled columns.
df = pd.DataFrame(arr, columns=list('xyz'))
# Record-array version of the DataFrame (a `numpy.recarray`; the index becomes a field too).
rec = df.to_records()
In [4]:
# Peek at the first five rows of the frame.
df.head(n=5)
Out[4]:
In [5]:
# Show the dtype pandas assigned to each column.
df.dtypes
Out[5]:
numpy.ndarray
In [6]:
# Time the sum over column index 2 of the raw ndarray.
%timeit arr[:, 2].sum()
# Keep one result as the reference value for the equality checks in later cells.
arrsum = arr[:, 2].sum()
pandas.DataFrame
In [13]:
# Time the sum of column 'z' through pandas, using attribute-style column access.
%timeit df.z.sum()
# Store the pandas result; later cells reassign pdattsum before it is compared with arrsum.
pdattsum = df.z.sum()
In [14]:
# Time the sum on the ndarray obtained from column 'z' via .values.
%timeit df.z.values.sum()
pdattsum = df.z.values.sum()
In [15]:
%timeit df.z.values.sum()
pdattsum = df.values['z'].sum()
In [ ]:
# Cross-check: the pandas-derived sum must match the ndarray sum to 7 decimals (the default).
assert_almost_equal(arrsum, pdattsum, decimal=7)
pandas.DataFrame
In [ ]:
# Time the column sum using bracket-style (label) column access.
%timeit df['z'].sum()
pdnstyle = df['z'].sum()
numpy.recarray
In [ ]:
# Time the sum over field 'z' of the record array.
%timeit rec['z'].sum()
reccolnames = rec['z'].sum()
pandas.DataFrame
with object dtype, expected to be slow
In [ ]:
# Re-store column 'z' with the generic Python-object dtype for the next benchmark.
df['z'] = df['z'].astype(object)
In [ ]:
# Show the per-column dtypes again, after the cast of 'z' in the previous cell.
df.dtypes
In [ ]:
# Time the column sum once more, now that 'z' has been cast to object dtype.
%timeit df['z'].sum()
objectSum = df['z'].sum()
I would have expected pandas.DataFrame.sum
to be more competitive with numpy.ndarray.sum
when the dtype of the DataFrame column is specified.
List-comprehension-style iteration (a generator expression) over a numpy.ndarray
In [ ]:
# Time element-by-element summation: built-in sum over a generator walking column 2.
%timeit sum(i for i in arr[:, 2])
itersumnumpy = sum(i for i in arr[:, 2])
In [ ]:
# The element-wise sum only has to agree with the vectorised sum to 5 decimals.
assert_almost_equal(itersumnumpy, arrsum, 5)
List comprehension style sum on a list: Again expected to be slow
In [ ]:
# Materialise column 2 as a plain Python list of floats.
# tolist has to be *called*: without the parentheses `l` would be bound to the
# method object itself, which cannot be iterated by the cells below.
l = arr[:, 2].tolist()
In [ ]:
# Time built-in sum over a generator iterating `l` (assigned in the previous cell).
%timeit sum(i for i in l)
listsum = sum(i for i in l)
In [ ]:
# The list-based sum must agree with the ndarray reference to 5 decimals.
assert_almost_equal(listsum, arrsum, decimal=5)
In [ ]:
# Time built-in sum over a generator iterating the DataFrame column 'z'
# (cast to object dtype in an earlier cell).
%timeit sum(i for i in df['z'])
pandasitersum = sum(i for i in df['z'])
In [ ]:
# Build a tuple from `l` so the same iteration benchmark can be run on a tuple.
t = tuple(l)
In [ ]:
# Time built-in sum over a generator iterating the tuple `t`.
%timeit sum(i for i in t)
tuplesum = sum(i for i in t)
In [ ]:
# The sum obtained by iterating the DataFrame column must match the ndarray reference to 5 decimals.
assert_almost_equal(pandasitersum, arrsum, decimal=5)
In [ ]:
# The tuple-based sum must match the ndarray reference to 5 decimals.
assert_almost_equal(tuplesum, arrsum, decimal=5)
So for a DataFrame column with object dtype, doing array operations like sum (admittedly silly) is about as good as doing this with a list comprehension. But iterating through the DataFrame rows using a list-comprehension style is much worse.
In [17]:
# Time the DataFrame.values attribute access on its own (no reduction is applied).
# NOTE(review): 'z' was cast to object in an earlier cell, so this presumably has to
# assemble a mixed-dtype array on each call — confirm that is what is meant to be measured.
%timeit df.values
In [ ]: