In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas

data = pandas.read_csv('results_ubuntu14.04LTS_py3.4.3_anaconda2.3.0_laptop.csv', delimiter=',')
data.columns = ['function', 'size', 'cols', 'time']

In [2]:
data[:19].describe


Out[2]:
<bound method DataFrame.describe of                               function  size  cols      time
0                          simple_read     1     5  0.000729
1               naive_read_split_split     1     5  0.007550
2                  for_line_in_f_split     1     5  0.008100
3             for_line_in_f_split_with     1     5  0.008175
4                   pandas_read_csv_df     1     5  0.010363
5             pandas_read_csv_iterrows     1     5  0.468545
6             fileinput_with_firstline     1     5  0.022614
7                           csv_reader     1     5  0.017964
8                      csv_dict_reader     1     5  0.053676
9                           mmap_while     1     5  0.014912
10                            mmap_for     1     5  0.015031
11                       numpy_loadtxt     1     5  0.094875
12                    numpy_genfromtxt     1     5  0.102993
13             numpy_genfromtxt_nditer     1     5  0.112323
14        numpy_genfromtxt_range_shape     1     5  0.104734
15       for_line_in_f_readlines_split     1     5  0.008348
16  for_line_in_f_readlines_split_with     1     5  0.008335
17                        re_readlines     1     5  0.021768
18                   re_readlines_with     1     5  0.021641>

In [3]:
# Let's just make a quick extremum plot of file size vs read time

fig, ax = plt.subplots(1, 1, figsize=(16, 8))
data[data['cols'] == 200].groupby('function').plot(x='size', y='time', ax=ax)
plt.legend(data['function'], loc='upper left')
plt.ylim((0, 5))
plt.title('Read Times for 200 MB Files of Different Sizes')


Out[3]:
<matplotlib.text.Text at 0x7f732adc7b70>

In [4]:
# now let's plot # of columns vs read time

fig, ax = plt.subplots(1, 1, figsize=(16, 8))
data[data['size'] == 200].groupby('function').plot(x='cols', y='time', ax=ax)
plt.legend(data['function'], loc='upper right')
plt.ylim((0, 25))
plt.title('Read Times for 200 Column Files of Different Sizes')


Out[4]:
<matplotlib.text.Text at 0x7f732ab27470>

In [5]:
data[data['cols'] == 200][data['size'] == 200].sort('time')


/home/jstilley/anaconda3/lib/python3.4/site-packages/pandas/core/frame.py:1825: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  "DataFrame index.", UserWarning)
Out[5]:
function size cols time
551 simple_read 200 200 0.174799
566 for_line_in_f_readlines_split 200 200 0.947235
567 for_line_in_f_readlines_split_with 200 200 0.949191
554 for_line_in_f_split_with 200 200 1.054141
553 for_line_in_f_split 200 200 1.154243
557 fileinput_with_firstline 200 200 1.242460
552 naive_read_split_split 200 200 1.318512
569 re_readlines_with 200 200 1.498748
568 re_readlines 200 200 1.515388
561 mmap_for 200 200 1.558482
560 mmap_while 200 200 1.571111
555 pandas_read_csv_df 200 200 2.288987
558 csv_reader 200 200 3.296065
559 csv_dict_reader 200 200 4.306151
556 pandas_read_csv_iterrows 200 200 5.636963
562 numpy_loadtxt 200 200 6.652580
565 numpy_genfromtxt_range_shape 200 200 17.940645
563 numpy_genfromtxt 200 200 18.061994
564 numpy_genfromtxt_nditer 200 200 19.986187

In [6]:
data[data['cols'] == 5][data['size'] == 200].sort('time')


/home/jstilley/anaconda3/lib/python3.4/site-packages/pandas/core/frame.py:1825: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  "DataFrame index.", UserWarning)
Out[6]:
function size cols time
95 simple_read 200 5 0.111992
96 naive_read_split_split 200 5 1.406370
98 for_line_in_f_split_with 200 5 1.523781
97 for_line_in_f_split 200 5 1.526934
99 pandas_read_csv_df 200 5 1.662912
111 for_line_in_f_readlines_split_with 200 5 1.803978
110 for_line_in_f_readlines_split 200 5 1.890794
104 mmap_while 200 5 2.914811
105 mmap_for 200 5 2.946188
102 csv_reader 200 5 3.594512
101 fileinput_with_firstline 200 5 4.462262
113 re_readlines_with 200 5 5.212917
112 re_readlines 200 5 5.602563
103 csv_dict_reader 200 5 10.468787
106 numpy_loadtxt 200 5 18.527393
107 numpy_genfromtxt 200 5 20.213743
109 numpy_genfromtxt_range_shape 200 5 21.823151
108 numpy_genfromtxt_nditer 200 5 23.973133
100 pandas_read_csv_iterrows 200 5 91.271756

In [16]:
''' Let's ignore several of the really slow functions (like genfromtxt),
    and and the duplicates (like the extra "with" versions).
'''
FUNCTS = ['for_line_in_f_readlines_split',
          'for_line_in_f_split', 'fileinput_with_firstline', 'naive_read_split_split']
COLORS = ['#000000', '#FF00FF', '#00FF00', '#0000FF', '#00FFFF', '#FFFF00', '#00FF00', '#0000FF']
STYLES = ['-', '-.', '--', ':', 'h', '--']

# Let's just make a quick extremum plot for only the functions that are doing well.

fig, ax = plt.subplots(1, 1, figsize=(16, 8))
for i in range(len(FUNCTS)):
    data[data['cols'] == 5][data['function'] == FUNCTS[i]].plot(x='size', y='time', ax=ax,
                                                                  label=FUNCTS[i] + ' (5 cols)',
                                                                  color=COLORS[i],
                                                                  linewidth=2,
                                                                  linestyle='--')
for i in range(len(FUNCTS)):
    data[data['cols'] == 200][data['function'] == FUNCTS[i]].plot(x='size', y='time', ax=ax,
                                                                  label=FUNCTS[i] + ' (200 cols)',
                                                                  color=COLORS[i],
                                                                  linewidth=2,
                                                                  linestyle='-')
plt.xlim((0, 200))
plt.ylim((0, 2))
plt.legend(loc='upper left')
plt.title('Read Times for 5 or 200 Column Files of Different Sizes')


/home/jstilley/anaconda3/lib/python3.4/site-packages/pandas/core/frame.py:1825: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  "DataFrame index.", UserWarning)
Out[16]:
<matplotlib.text.Text at 0x7f73287224e0>

In [21]:
''' And let's take a quick look at what difference the number of columns make. '''
COLUMNS = [5, 10, 50, 100, 200]

# Let's just make a quick extremum plot for only the functions that are doing well.

fig, ax = plt.subplots(1, 1, figsize=(16, 8))
for cols in COLUMNS:
    data[data['cols'] == cols][data['function'] == 'for_line_in_f_readlines_split'].plot(x='size', y='time', ax=ax,
                                                                  label=str(cols) + ' columns',
                                                                  linewidth=2)
plt.xlim((0, 200))
plt.ylim((0, 2))
plt.legend(loc='upper left')
plt.title('Read Times 200 MB CSV Files with Different Numbers of Columns')


/home/jstilley/anaconda3/lib/python3.4/site-packages/pandas/core/frame.py:1825: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  "DataFrame index.", UserWarning)
Out[21]:
<matplotlib.text.Text at 0x7f7328321358>

In [ ]: