In [1]:
import pandas as pd
import numpy as np
# Benchmark fixture: 1000 rows of integration bounds (A, B) and step counts (N).
# A dedicated, seeded generator keeps the data identical on every
# "Restart & Run All" without touching NumPy's global random state.
rng = np.random.RandomState(0)
N_ROWS = 1000
df = pd.DataFrame({
    'A': rng.randn(N_ROWS),               # passed as the lower bound `x`
    'B': rng.randn(N_ROWS),               # passed as the upper bound `y`
    'N': rng.randint(100, 1000, N_ROWS),  # number of steps, in [100, 1000)
    'x': 'x',                             # constant string column, broadcast to every row
})
df.head(10)
Out[1]:
In [2]:
# Native (pure-Python) functions
def f(x):
    """Return ``x * (x - 1)``."""
    shifted = x - 1
    return x * shifted
def intergate_f(x, y, N):
    """Approximate the integral of ``f`` from ``x`` to ``y`` with a left Riemann sum.

    The span ``y - x`` is divided into ``N`` equal steps and ``f`` is sampled
    at ``x + i * step`` for ``i = 0 .. N - 1``; the sum of the samples is
    then scaled by the step width.
    """
    step = (y - x) / N
    total = 0
    for k in range(N):
        total += f(x + k * step)
    return total * step
%timeit df.apply(lambda x: intergate_f(x['A'], x['B'], x['N']), axis=1)
In [3]:
# 加载 IPython Cython 语法扩展
%load_ext cythonmagic
In [4]:
%%cython
# Compile the native Python functions with Cython (no type declarations yet)
def f_plain(x):
    """Return ``x * (x - 1)``."""
    shifted = x - 1
    return x * shifted
def intergate_f_plain(x, y, N):
    """Left Riemann sum of ``f_plain`` from ``x`` to ``y`` using ``N`` equal steps.

    ``f_plain`` is sampled at ``x + i * step`` for ``i = 0 .. N - 1`` and the
    accumulated samples are multiplied by the step width.
    """
    step = (y - x) / N
    total = 0
    for k in range(N):
        total += f_plain(x + k * step)
    return total * step
In [5]:
%timeit df.apply(lambda x: intergate_f_plain(x['A'], x['B'], x['N']), axis=1)
# 用 cython 编译后快了 1/8
Add C type declarations
In [6]:
%%cython
# Declare C data types in Cython
cdef double f_typed(double x) except? -2:
    """Return ``x * (x - 1)`` computed on C doubles.

    ``except? -2`` declares -2 as the return value that makes the caller
    check whether a Python exception is pending.
    """
    cdef double shifted = x - 1
    return x * shifted
cpdef double intergate_f_typed(double x, double y, int N):
    """Left Riemann sum of ``f_typed`` from ``x`` to ``y`` using ``N`` equal steps.

    The loop index, step width and accumulator are all declared as C types.
    """
    cdef int k
    cdef double step = (y - x) / N
    cdef double total = 0
    for k in range(N):
        total += f_typed(x + k * step)
    return total * step
In [7]:
%timeit df.apply(lambda x: intergate_f_typed(x['A'], x['B'], x['N']), axis=1)
# 用 cython 指定类型后性能大幅提升 x5
In [8]:
%%cython
# Implement the pandas.apply operation in Cython
cimport numpy as np
import numpy as np
cdef double f_typed(double x) except? -2:
    """Return ``x * (x - 1)`` for a C double ``x``.

    A return value of -2 makes the caller check for a pending exception
    (``except? -2``).
    """
    cdef double x_minus_one = x - 1
    return x * x_minus_one
cpdef double intergate_f_typed(double x, double y, int N):
    """Sum ``f_typed(x + i * width)`` for ``i = 0 .. N - 1`` and scale by ``width``.

    ``width`` is ``(y - x) / N``, so the result is a left Riemann sum of
    ``f_typed`` from ``x`` to ``y``.
    """
    cdef int idx
    cdef double width = (y - x) / N
    cdef double acc = 0
    for idx in range(N):
        acc += f_typed(x + idx * width)
    return acc * width
cpdef np.ndarray[double] apply_intergate_f(np.ndarray[double] col_a, np.ndarray[double] col_b, np.ndarray col_n):
    """Apply ``intergate_f_typed`` element-wise over three parallel arrays.

    For every index ``r`` in ``col_n`` the result holds
    ``intergate_f_typed(col_a[r], col_b[r], col_n[r])``. The output is a
    newly allocated array with one entry per element of ``col_n``.
    """
    cdef Py_ssize_t row, count = len(col_n)
    cdef np.ndarray[double] out = np.empty(count)
    for row in range(count):
        out[row] = intergate_f_typed(col_a[row], col_b[row], col_n[row])
    return out
In [9]:
# Time the Cython loop called directly on each column's NumPy array (`.values`)
# instead of going through DataFrame.apply.
%timeit apply_intergate_f(df['A'].values, df['B'].values, df['N'].values)
# Author's measurement: implementing the pandas.apply step in Cython gives x60 performance
In [10]:
%%cython
# More advanced Cython techniques
cimport cython
cimport numpy as np
import numpy as np
cdef double f_typed(double x) except? -2:
    """Evaluate ``x * (x - 1)`` on a C double.

    Declared ``except? -2``: when -2 is returned the caller checks whether
    an exception was raised.
    """
    cdef double below = x - 1
    return x * below
cpdef double intergate_f_typed(double x, double y, int N):
    """Left Riemann sum of ``f_typed`` from ``x`` to ``y`` over ``N`` equal steps.

    Samples are taken at ``x + j * delta`` for ``j = 0 .. N - 1`` with
    ``delta = (y - x) / N``; the summed samples are multiplied by ``delta``.
    """
    cdef int j
    cdef double delta = (y - x) / N
    cdef double running = 0
    for j in range(N):
        running += f_typed(x + j * delta)
    return running * delta
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef np.ndarray[double] apply_intergate_f_wrap(np.ndarray[double] col_a, np.ndarray[double] col_b, np.ndarray[Py_ssize_t] col_n):
    """Apply ``intergate_f_typed`` element-wise with index checks disabled.

    For every index ``i`` in ``col_n`` the result holds
    ``intergate_f_typed(col_a[i], col_b[i], col_n[i])``.

    Raises:
        ValueError: if ``col_a`` or ``col_b`` has fewer elements than ``col_n``.
    """
    cdef Py_ssize_t i, n = len(col_n)
    cdef np.ndarray[double] res
    # Bounds checking is switched off for this function, so an input shorter
    # than col_n would be read past its end instead of raising IndexError.
    # Validate once up front; longer inputs remain accepted as before.
    if len(col_a) < n or len(col_b) < n:
        raise ValueError("col_a and col_b must be at least as long as col_n")
    res = np.empty(n)
    # Reuse the length computed above rather than calling len() again.
    for i in range(n):
        res[i] = intergate_f_typed(col_a[i], col_b[i], col_n[i])
    return res
In [11]:
# Time the variant compiled with boundscheck/wraparound disabled, again on the raw NumPy arrays.
%timeit apply_intergate_f_wrap(df['A'].values, df['B'].values, df['N'].values)
# Author's observation: no significant further performance gain
In [11]: