In [1]:
import timeit
from astropy.io import ascii
import pandas
import numpy as np
from astropy.table import Table, Column
from tempfile import NamedTemporaryFile
import random
import string
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def make_table(table, size=10000, n_floats=10, n_ints=0, n_strs=0, float_format=None, str_val=None):
    """Generate a synthetic table and write it out in astropy's 'ascii' format.

    Columns are created in the order floats, ints, strings and are named
    ``f<i>``, ``i<i>`` and ``s<i>`` respectively.

    Parameters
    ----------
    table : str or file-like
        Destination handed unchanged to ``Table.write(table, format='ascii')``.
    size : int
        Number of rows in every column.
    n_floats : int
        Number of float columns, drawn with ``np.random.uniform(low=1, high=10)``.
    n_ints : int
        Number of integer columns, drawn with
        ``np.random.randint(low=-9999999, high=9999999)``.
    n_strs : int
        Number of string columns.
    float_format : str or None
        When not None, assigned to ``format`` on every column whose name
        starts with ``'f'`` before writing.
    str_val : str or None
        Content of the string columns. ``None`` means ``"abcde12345"``;
        the special value ``'random'`` fills each cell with 10 random ASCII
        letters; any other value is repeated in every cell.
    """
    if str_val is None:
        str_val = "abcde12345"

    cols = []
    # `range` (rather than the Python-2-only `xrange`) keeps this runnable on
    # both Python 2 and Python 3; the counts here are tiny so laziness is moot.
    for i in range(n_floats):
        dat = np.random.uniform(low=1, high=10, size=size)
        cols.append(Column(dat, name='f{}'.format(i)))
    for i in range(n_ints):
        dat = np.random.randint(low=-9999999, high=9999999, size=size)
        cols.append(Column(dat, name='i{}'.format(i)))
    for i in range(n_strs):
        if str_val == 'random':
            # `string.ascii_letters` exists on Python 2 and 3, whereas
            # `string.letters` was removed in Python 3.
            dat = np.array([''.join(random.choice(string.ascii_letters) for _ in range(10))
                            for _ in range(size)])
        else:
            dat = np.repeat(str_val, size)
        cols.append(Column(dat, name='s{}'.format(i)))
    t = Table(cols)

    if float_format is not None:
        # Only the float columns (named 'f...') get the explicit output format.
        for col in t.columns.values():
            if col.name.startswith('f'):
                col.format = float_format

    t.write(table, format='ascii')

In [3]:
def plot_case(n_floats=10, n_ints=0, n_strs=0, float_format=None, str_val=None, genfromtxt=True):
    """Time several table readers on generated files and plot time vs. row count.

    For each row count a fresh table is written to ``'float.txt'`` (relative
    path; the file is not removed afterwards) via ``make_table`` and then read
    back with:

    * ``ascii.read`` with ``use_fast_reader=False``,
    * ``ascii.read`` with default reader settings,
    * ``ascii.read`` with ``use_fast_converter=True`` (only if ``n_floats > 0``),
    * ``pandas.read_csv``,
    * ``np.genfromtxt`` (only if ``genfromtxt`` is true).

    Each timing is the best of ``timeit.repeat`` divided by the number of calls
    per repeat. The curves are drawn on a log-log plot through ``pyplot`` and
    speed ratios for the largest row count are printed.

    Parameters
    ----------
    n_floats, n_ints, n_strs, float_format, str_val
        Forwarded to ``make_table``.
    genfromtxt : bool
        Whether to time and plot ``np.genfromtxt`` as well.
    """
    # The timeit snippets pick up the file name with
    # ``from __main__ import table1``, hence the module-level name.
    global table1
    row_counts = (100, 200, 500, 1000, 2000, 5000, 10000, 20000, 50000)  # include 50000 for publish run
    loop_counts = (10, 10, 5, 2, 1, 1, 1, 1, 1)
    repeat_counts = (3, 3, 3, 3, 3, 3, 3, 2, 1)
    with_floats = n_floats > 0
    ascii_setup = 'from __main__ import ascii, table1'

    def best_per_call(stmt, setup, number, repeat):
        """Fastest repeat, normalised to the time of a single call."""
        runs = timeit.repeat(stmt, setup=setup, number=number, repeat=repeat)
        return min(runs) / number

    times_slow, times_fast, times_fast_converter = [], [], []
    times_pandas, times_genfromtxt = [], []

    table1 = 'float.txt'
    for n_row, number, repeat in zip(row_counts, loop_counts, repeat_counts):
        make_table(table1, n_row, n_floats, n_ints, n_strs, float_format, str_val)

        times_slow.append(best_per_call(
            "ascii.read(table1, use_fast_reader=False, format='basic', guess=False)",
            ascii_setup, number, repeat))
        times_fast.append(best_per_call(
            "ascii.read(table1, format='basic', guess=False)",
            ascii_setup, number, repeat))
        if with_floats:
            times_fast_converter.append(best_per_call(
                "ascii.read(table1, format='basic', guess=False, use_fast_converter=True)",
                ascii_setup, number, repeat))
        times_pandas.append(best_per_call(
            "pandas.read_csv(table1, sep=' ', header=0)",
            'from __main__ import table1, pandas', number, repeat))
        if genfromtxt:
            times_genfromtxt.append(best_per_call(
                "np.genfromtxt(table1, names=True)",
                'from __main__ import table1, np', number, repeat))

    # (timings, line format, legend label, extra plot kwargs) in drawing order.
    curves = [
        (times_slow, '-ob', 'io.ascii Python', {}),
        (times_fast, '-or', 'io.ascii Fast-c', {}),
    ]
    if with_floats:
        curves.append((times_fast_converter, '-oy', 'Fast converter', {}))
    curves.append((times_pandas, '-oc', 'Pandas', {}))
    if genfromtxt:
        curves.append((times_genfromtxt, '-om', 'np.genfromtxt', {'alpha': 0.5}))
    for timings, line_fmt, label, extra in curves:
        plt.loglog(row_counts, timings, line_fmt, label=label, **extra)

    plt.grid()
    plt.legend(loc='best')
    plt.title('n_floats={} n_ints={} n_strs={} float_format={}'.format(n_floats, n_ints, n_strs, float_format))
    plt.xlabel('Number of rows')
    plt.ylabel('Time (sec)')

    # Ratios are reported for the largest table only (last entry of each list).
    slow_last = times_slow[-1]
    fast_last = times_fast[-1]
    pandas_last = times_pandas[-1]
    print('Fast-C to Python speed ratio: {:.2f} : 1'.format(slow_last / fast_last))
    if with_floats:
        converter_last = times_fast_converter[-1]
        print('Fast-C with converter to Fast-C speed ratio: {:.2f} : 1'.format(fast_last / converter_last))
        print('Pandas to Fast-C with converter speed ratio: {:.2f} : 1'.format(converter_last / pandas_last))
    else:
        print('Pandas to Fast-C speed ratio: {:.2f} : 1'.format(fast_last / pandas_last))

In [4]:
# Case: 10 float columns only, no explicit float output format.
plot_case(n_floats=10, n_ints=0, n_strs=0, float_format=None)


Fast-C to Python speed ratio: 3.44 : 1
Fast-C with converter to Fast-C speed ratio: 1.79 : 1
Pandas to Fast-C with converter speed ratio: 0.91 : 1

In [5]:
# Case: mixed table with 10 float, 10 int and 10 string columns, no explicit float output format.
plot_case(n_floats=10, n_ints=10, n_strs=10, float_format=None)


Fast-C to Python speed ratio: 4.15 : 1
Fast-C with converter to Fast-C speed ratio: 1.29 : 1
Pandas to Fast-C with converter speed ratio: 1.09 : 1

In [6]:
# Case: mixed table (10 float, 10 int, 10 string columns) with floats written using '%.4f'.
plot_case(n_floats=10, n_ints=10, n_strs=10, float_format='%.4f')


Fast-C to Python speed ratio: 4.95 : 1
Fast-C with converter to Fast-C speed ratio: 1.10 : 1
Pandas to Fast-C with converter speed ratio: 1.22 : 1

In [7]:
# Case: 10 float columns only, written using '%.4f'.
plot_case(n_floats=10, n_ints=0, n_strs=0, float_format='%.4f')


Fast-C to Python speed ratio: 4.74 : 1
Fast-C with converter to Fast-C speed ratio: 2.18 : 1
Pandas to Fast-C with converter speed ratio: 0.95 : 1

In [8]:
# Case: 10 string columns only; str_val is left at None, so every cell is "abcde12345".
plot_case(n_floats=0, n_ints=0, n_strs=10)


Fast-C to Python speed ratio: 4.15 : 1
Pandas to Fast-C speed ratio: 1.50 : 1

In [9]:
# Case: 10 string columns only, every cell being the quoted text 'asdf asdfa' (contains a space).
# The np.genfromtxt timing is skipped here (genfromtxt=False) - presumably because
# it cannot parse quoted fields containing the delimiter; TODO confirm.
plot_case(n_floats=0, n_ints=0, n_strs=10, str_val="'asdf asdfa'", genfromtxt=False)


Fast-C to Python speed ratio: 4.32 : 1
Pandas to Fast-C speed ratio: 1.62 : 1

In [10]:
# Case: 10 string columns only; str_val="random" makes each cell 10 randomly chosen letters.
plot_case(n_floats=0, n_ints=0, n_strs=10, str_val="random")


Fast-C to Python speed ratio: 3.94 : 1
Pandas to Fast-C speed ratio: 0.66 : 1

In [11]:
# Case: 10 integer columns only.
# (A stray leading '%' was removed: it made IPython treat the line as a
# line magic instead of a normal function call.)
plot_case(n_floats=0, n_ints=10, n_strs=0)


Fast-C to Python speed ratio: 9.43 : 1
Pandas to Fast-C speed ratio: 1.03 : 1