In [1]:
%load_ext cython
Batches of data are collected from field instruments. These instruments capture the date in three separate columns: day, month and year.
Data is processed in Pandas, but currently it is slow to convert the three columns into datetimes.
In [2]:
import numpy as np
import pandas as pd
def make_sample_data(size):
    """Build a DataFrame of random dates split into year/month/day columns.

    Parameters
    ----------
    size : int or float
        Number of rows to generate. It is passed through ``int()``, so
        values such as ``1e5`` are accepted.

    Returns
    -------
    pandas.DataFrame
        Integer columns ``year`` (1980 - 2015), ``month`` (1 - 12) and
        ``day`` (1 - 28), in that order.
    """
    n = int(size)
    d = dict(
        # Years: 1980 - 2015
        year=np.random.randint(1980, 2016, n),
        # Months 1 - 12
        month=np.random.randint(1, 13, n),
        # Day number: 1 - 28 (a day that exists in every month).
        # randint's upper bound is exclusive, so 29 is required for day 28
        # to actually be generated.
        day=np.random.randint(1, 29, n),
    )
    return pd.DataFrame(d)
In [3]:
df = make_sample_data(5)
df
Out[3]:
Let's see the Python code first:
In [4]:
import datetime
def create_datetime_py(year, month, day):
    """Return a naive ``datetime`` at midnight of the given calendar date."""
    # Spell out the time-of-day fields by name instead of by position.
    midnight = dict(hour=0, minute=0, second=0, microsecond=0, tzinfo=None)
    return datetime.datetime(year, month, day, **midnight)
Pandas has an apply() method that, when called with axis=1, runs your function once for every row of the DataFrame.
You must provide a function that receives a row, and your function must return a value. All the output values get put into a new Pandas Series.
In [5]:
# Refer to fields by name! Very cool 👍
df.apply(lambda x : create_datetime_py(
x['year'], x['month'], x['day']), axis=1)
Out[5]:
Note: the type is "datetime64[ns]".
Awkward to type that all out each time. Let's make a convenient function.
In [6]:
def make_datetime_py(df):
    """Combine the 'year', 'month' and 'day' columns of `df` row by row.

    Each row is handed to `create_datetime_py`; the per-row results are
    gathered by `DataFrame.apply(..., axis=1)` and returned as-is.
    """
    def row_to_datetime(row):
        # Fields are looked up by column name on each row.
        return create_datetime_py(row['year'], row['month'], row['day'])

    return df.apply(row_to_datetime, axis=1)
Then we can just call it like so:
In [7]:
make_datetime_py(df)
Out[7]:
With lots of data, the conversion to a datetime column takes a very long time! Let's try a bunch of data:
In [8]:
df_big = make_sample_data(100000)
%timeit make_datetime_py(df_big)
In [9]:
%%cython
# cython: boundscheck = False
# cython: wraparound = False
from cpython.datetime cimport (
import_datetime, datetime_new, datetime, timedelta)
from pandas import Timestamp
import_datetime()
cpdef convert_arrays_ts(
        long[:] year, long[:] month, long[:] day,
        long long[:] out):
    """ Convert parallel year/month/day arrays via pandas `Timestamp`.

    For each index a `datetime` at midnight is built and
    `Timestamp(dt).value` is stored into `out[i]`.
    Result goes into `out`; nothing is returned.

    Raises ValueError if the four arrays do not have the same length.
    """
    # Py_ssize_t is the native index type for memoryviews; a C int could
    # truncate the length of a very large array.
    cdef Py_ssize_t i, n = year.shape[0]
    cdef datetime dt

    # boundscheck is switched off for this cell, so a shorter `month`,
    # `day` or `out` would be read/written past its end without any error.
    if month.shape[0] != n or day.shape[0] != n or out.shape[0] != n:
        raise ValueError(
            "year, month, day and out must all have the same length")

    for i in range(n):
        dt = <datetime>datetime_new(
            year[i], month[i], day[i], 0, 0, 0, 0, None)
        # Goes through a Python-level Timestamp object on every iteration.
        out[i] = Timestamp(dt).value
In [10]:
def make_datetime_cy(df, method):
    """Build a datetime64[ns] Series from `df` using a low-level converter.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'year', 'month' and 'day' columns.
    method : callable
        Called as ``method(year, month, day, out)`` with the three column
        arrays and an int64 output array that it must fill in place.
        The filled integers are reinterpreted as datetime64[ns].

    Returns
    -------
    pandas.Series
        datetime64[ns] values carrying the same index as `df`.
    """
    # Allocate the output buffer ourselves rather than writing through
    # `Series.values`: that array is not guaranteed to be a writable view
    # of the Series' data (e.g. under pandas copy-on-write).
    out = np.zeros(len(df), dtype='int64')
    method(df['year'].values, df['month'].values, df['day'].values, out)
    # Reinterpret the int64 buffer as datetime64[ns] and keep df's index so
    # the result lines up with the input rows, like make_datetime_py does.
    return pd.Series(out.view('datetime64[ns]'), index=df.index)
In [11]:
# Test it out
make_datetime_cy(df, convert_arrays_ts)
Out[11]:
In [12]:
df_big = make_sample_data(100000)
%timeit make_datetime_py(df_big)
%timeit make_datetime_cy(df_big, convert_arrays_ts)
XX / XX
In [13]:
%%cython -a
# cython: boundscheck = False
# cython: wraparound = False
from cpython.datetime cimport (
import_datetime, datetime_new, datetime, timedelta,
timedelta_seconds, timedelta_days)
import_datetime() # <-- Pretty important
cpdef convert_arrays_dt(long[:] year, long[:] month, long[:] day,
                        long long[:] out):
    """ Convert parallel year/month/day arrays using C-level datetime math.

    For each index a `datetime` at midnight is built, the epoch
    (1970-01-01) is subtracted, and the whole-day difference is scaled to
    nanoseconds and stored into `out[i]`.
    Result goes into `out`; nothing is returned.

    Raises ValueError if the four arrays do not have the same length.
    """
    # Py_ssize_t is the native index type for memoryviews; a C int could
    # truncate the length of a very large array.
    cdef Py_ssize_t i, n = year.shape[0]
    cdef datetime dt, epoch = datetime_new(1970, 1, 1, 0, 0, 0, 0, None)
    cdef timedelta td
    # 64-bit on every platform: days * 86400 exceeds 32 bits for dates
    # after January 2038, and the nanosecond product always needs 64 bits.
    cdef long long days

    # boundscheck is switched off for this cell, so a shorter `month`,
    # `day` or `out` would be read/written past its end without any error.
    if month.shape[0] != n or day.shape[0] != n or out.shape[0] != n:
        raise ValueError(
            "year, month, day and out must all have the same length")

    for i in range(n):
        dt = <datetime>datetime_new(
            year[i], month[i], day[i], 0, 0, 0, 0, None)
        td = <timedelta>(dt - epoch)
        # Both datetimes are at midnight, so only the days field is used.
        # Widen to 64 bits *before* multiplying.
        days = timedelta_days(td)
        out[i] = days * 86400 * 1000000000  # Nanoseconds, remember?
Out[13]:
In [14]:
make_datetime_cy(df, convert_arrays_dt)
Out[14]:
In [15]:
df_big = make_sample_data(100000)
%timeit make_datetime_py(df_big)
%timeit make_datetime_cy(df_big, convert_arrays_ts)
%timeit make_datetime_cy(df_big, convert_arrays_dt)
XX / XX
In [16]:
%%cython -a
# cython: boundscheck = False
# cython: wraparound = False
from libc.time cimport mktime, tm, timezone
cdef inline long long to_unix(long year, long month, long day):
    """ Seconds since the Unix epoch for midnight of the given date.

    month: 1 - 12, day: 1 - 31
    Result is in UTC: the date is run through mktime() as local standard
    time and the `timezone` offset is then subtracted again.
    """
    cdef tm tms
    cdef long long local_secs
    tms.tm_year = year - 1900  # years since 1900 !!
    tms.tm_mon = month - 1     # 0 to 11 !!
    tms.tm_mday = day          # 1 - 31
    tms.tm_hour = 0
    tms.tm_min = 0
    tms.tm_sec = 0
    # mktime() reads tm_isdst; left uninitialised it holds stack garbage
    # and can shift the result by an hour.  Force "standard time" so that
    # subtracting `timezone` (the standard-time offset) cancels exactly.
    tms.tm_isdst = 0
    # Call mktime() in its own statement: it performs the tzset()
    # initialisation, so `timezone` must only be read afterwards.
    local_secs = mktime(&tms)
    return local_secs - timezone

cpdef convert_arrays_libc(
        long[:] year, long[:] month, long[:] day,
        long long[:] out):
    """ Convert parallel year/month/day arrays using libc's mktime().

    Each date is turned into Unix seconds by `to_unix` and scaled to
    nanoseconds. Result goes into `out`; nothing is returned.

    Raises ValueError if the four arrays do not have the same length.
    """
    # Py_ssize_t is the native index type for memoryviews; a C int could
    # truncate the length of a very large array.
    cdef Py_ssize_t i, n = year.shape[0]
    # 64-bit on every platform so the nanosecond product cannot overflow
    # where C long is only 32 bits wide.
    cdef long long unix_secs

    # boundscheck is switched off for this cell, so a shorter `month`,
    # `day` or `out` would be read/written past its end without any error.
    if month.shape[0] != n or day.shape[0] != n or out.shape[0] != n:
        raise ValueError(
            "year, month, day and out must all have the same length")

    for i in range(n):
        unix_secs = to_unix(year[i], month[i], day[i])
        out[i] = unix_secs * 1000000000
Out[16]:
In [17]:
make_datetime_cy(df, convert_arrays_libc)
Out[17]:
In [18]:
df_big = make_sample_data(100000)
%timeit make_datetime_py(df_big)
%timeit make_datetime_cy(df_big, convert_arrays_dt)
%timeit make_datetime_cy(df_big, convert_arrays_ts)
%timeit make_datetime_cy(df_big, convert_arrays_libc)
In [ ]: