In [1]:
import timeit
from astropy.io import ascii
import pandas
import numpy as np
from astropy.table import Table, Column
from cStringIO import StringIO
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def make_table(size=10000, n_floats=10, n_ints=0, n_strs=0, float_format=None, str_val=None):
    if str_val is None:
        str_val = "abcde12345"
    cols = []
    for i in xrange(n_floats):
        dat = np.random.uniform(low=1, high=10, size=size)
        cols.append(Column(dat, name='f{}'.format(i)))
    for i in xrange(n_ints):
        dat = np.random.randint(low=-9999999, high=9999999, size=size)
        cols.append(Column(dat, name='i{}'.format(i)))
    for i in xrange(n_strs):
        dat = np.repeat(str_val, size)
        cols.append(Column(dat, name='s{}'.format(i)))
    t = Table(cols)

    if float_format is not None:
        for col in t.columns.values():
            if col.name.startswith('f'):
                col.format = float_format

    return t

In [3]:
t = make_table(5, float_format='%.4f')
print(t)


  f0     f1     f2     f3     f4     f5     f6     f7     f8     f9  
------ ------ ------ ------ ------ ------ ------ ------ ------ ------
3.8303 8.4571 7.8665 3.8209 6.3194 3.1620 8.5229 3.3333 7.8304 8.6393
3.3609 7.6057 6.3219 1.4221 3.0528 2.6562 1.1607 1.6113 1.8004 6.0744
2.6694 4.9182 8.9608 4.3329 4.6248 5.0090 6.0787 9.0370 6.8887 6.1026
3.1817 4.0416 5.8638 1.9688 8.8393 6.3279 6.6718 9.1173 3.8640 6.1094
9.1954 8.5942 8.1698 6.5603 5.6420 5.0458 6.7071 7.8081 6.8218 3.1500

In [4]:
out = StringIO()
ascii.write(t, out, use_fast_writer=True)
print out.getvalue()


f0 f1 f2 f3 f4 f5 f6 f7 f8 f9
3.8303 8.4571 7.8665 3.8209 6.3194 3.1620 8.5229 3.3333 7.8304 8.6393
3.3609 7.6057 6.3219 1.4221 3.0528 2.6562 1.1607 1.6113 1.8004 6.0744
2.6694 4.9182 8.9608 4.3329 4.6248 5.0090 6.0787 9.0370 6.8887 6.1026
3.1817 4.0416 5.8638 1.9688 8.8393 6.3279 6.6718 9.1173 3.8640 6.1094
9.1954 8.5942 8.1698 6.5603 5.6420 5.0458 6.7071 7.8081 6.8218 3.1500


In [5]:
out = StringIO()
pandas_table = pandas.DataFrame(np.array(t))
pandas_table.to_csv(out, float_format='%.4f')
print out.getvalue()


,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9
0,3.8303,8.4571,7.8665,3.8209,6.3194,3.1620,8.5229,3.3333,7.8304,8.6393
1,3.3609,7.6057,6.3219,1.4221,3.0528,2.6562,1.1607,1.6113,1.8004,6.0744
2,2.6694,4.9182,8.9608,4.3329,4.6248,5.0090,6.0787,9.0370,6.8887,6.1026
3,3.1817,4.0416,5.8638,1.9688,8.8393,6.3279,6.6718,9.1173,3.8640,6.1094
4,9.1954,8.5942,8.1698,6.5603,5.6420,5.0458,6.7071,7.8081,6.8218,3.1500


In [6]:
def plot_case(n_floats=10, n_ints=0, n_strs=0, float_format=None, str_val=None, strip=True):
    global table, np_table, pandas_table, flt_format, strip_whitespace
    strip_whitespace=strip
    flt_format = float_format
    n_rows = (100, 200, 500, 1000, 2000, 5000, 10000, 20000)  # include 50000 for publish run
    numbers = (10, 10,   5,     2,    1,    1,     1,     1)
    repeats = (3,   3,    3,    3,    3,    3,     3,     2)
    times_slow = []
    times_fast = []
    times_pandas = []
    for n_row, number, repeat in zip(n_rows, numbers, repeats):
        table = make_table(n_row, n_floats, n_ints, n_strs, float_format)
        np_table = np.array(table)
        pandas_table = pandas.DataFrame(np_table)
        t = timeit.repeat("out = StringIO(); ascii.write(table, out, use_fast_writer=False, strip_whitespace=strip_whitespace)", 
                   setup='from __main__ import ascii, table, StringIO, strip_whitespace', number=number, repeat=repeat)
        times_slow.append(min(t) / number)
        t = timeit.repeat("out = StringIO(); ascii.write(table, out, use_fast_writer=True, strip_whitespace=strip_whitespace)", 
                   setup='from __main__ import ascii, table, StringIO, strip_whitespace', number=number, repeat=repeat)
        times_fast.append(min(t) / number)
        t = timeit.repeat("out = StringIO(); pandas_table.to_csv(out, float_format=flt_format)", 
                   setup='from __main__ import pandas_table, pandas, StringIO, flt_format', number=number, repeat=repeat)
        times_pandas.append(min(t) / number)
    plt.loglog(n_rows, times_slow, '-ob', label='io.ascii Python')
    plt.loglog(n_rows, times_fast, '-or', label='io.ascii Fast-c')
    plt.loglog(n_rows, times_pandas, '-oc', label='Pandas')
    plt.grid()
    plt.legend(loc='best')
    plt.title('n_floats={} n_ints={} n_strs={} float_format={}'.format(n_floats, n_ints, n_strs, float_format))
    plt.xlabel('Number of rows')
    plt.ylabel('Time (sec)')
    print('Fast-C to Python speed ratio: {:.2f} : 1'.format(times_slow[-1] / times_fast[-1]))
    print('Pandas to Fast-C speed ratio: {:.2f} : 1'.format(times_fast[-1] / times_pandas[-1]))

In [7]:
plot_case(n_floats=10, n_ints=0, n_strs=0, float_format=None)


Fast-C to Python speed ratio: 1.87 : 1
Pandas to Fast-C speed ratio: 1.06 : 1

In [8]:
plot_case(n_floats=10, n_ints=10, n_strs=10, float_format=None)


Fast-C to Python speed ratio: 3.63 : 1
Pandas to Fast-C speed ratio: 1.30 : 1

In [9]:
plot_case(n_floats=10, n_ints=10, n_strs=10, float_format='%.4f')


Fast-C to Python speed ratio: 3.91 : 1
Pandas to Fast-C speed ratio: 1.56 : 1

In [10]:
plot_case(n_floats=10, n_ints=0, n_strs=0, float_format='%.4f')


Fast-C to Python speed ratio: 1.57 : 1
Pandas to Fast-C speed ratio: 1.27 : 1

In [11]:
plot_case(n_floats=0, n_ints=0, n_strs=10)


Fast-C to Python speed ratio: 3.08 : 1
Pandas to Fast-C speed ratio: 2.01 : 1

In [12]:
plot_case(n_floats=0, n_ints=0, n_strs=10, str_val="'asdf asdfa'")


Fast-C to Python speed ratio: 2.96 : 1
Pandas to Fast-C speed ratio: 2.04 : 1

In [13]:
plot_case(n_floats=0, n_ints=0, n_strs=10, strip=False)


Fast-C to Python speed ratio: 3.60 : 1
Pandas to Fast-C speed ratio: 1.46 : 1

In [14]:
plot_case(n_floats=0, n_ints=10, n_strs=0)


Fast-C to Python speed ratio: 12.05 : 1
Pandas to Fast-C speed ratio: 1.23 : 1