notebook.community

Edit and run



In [1]:

    
%matplotlib inline
from matplotlib import pyplot as plt
import pandas

data = pandas.read_csv('results_ubuntu14.04LTS_py3.4.3_anaconda2.3.0_laptop.csv', delimiter=',')
data.columns = ['function', 'size', 'cols', 'time']



In [2]:

    
data[:19].describe









    Out[2]:





<bound method DataFrame.describe of                               function  size  cols      time
0                          simple_read     1     5  0.000729
1               naive_read_split_split     1     5  0.007550
2                  for_line_in_f_split     1     5  0.008100
3             for_line_in_f_split_with     1     5  0.008175
4                   pandas_read_csv_df     1     5  0.010363
5             pandas_read_csv_iterrows     1     5  0.468545
6             fileinput_with_firstline     1     5  0.022614
7                           csv_reader     1     5  0.017964
8                      csv_dict_reader     1     5  0.053676
9                           mmap_while     1     5  0.014912
10                            mmap_for     1     5  0.015031
11                       numpy_loadtxt     1     5  0.094875
12                    numpy_genfromtxt     1     5  0.102993
13             numpy_genfromtxt_nditer     1     5  0.112323
14        numpy_genfromtxt_range_shape     1     5  0.104734
15       for_line_in_f_readlines_split     1     5  0.008348
16  for_line_in_f_readlines_split_with     1     5  0.008335
17                        re_readlines     1     5  0.021768
18                   re_readlines_with     1     5  0.021641>



In [3]:

    
# Let's just make a quick extremum plot of file size vs read time

fig, ax = plt.subplots(1, 1, figsize=(16, 8))
data[data['cols'] == 200].groupby('function').plot(x='size', y='time', ax=ax)
plt.legend(data['function'], loc='upper left')
plt.ylim((0, 5))
plt.title('Read Times for 200 MB Files of Different Sizes')









    Out[3]:





<matplotlib.text.Text at 0x7f732adc7b70>



In [4]:

    
# now let's plot # of columns vs read time

fig, ax = plt.subplots(1, 1, figsize=(16, 8))
data[data['size'] == 200].groupby('function').plot(x='cols', y='time', ax=ax)
plt.legend(data['function'], loc='upper right')
plt.ylim((0, 25))
plt.title('Read Times for 200 Column Files of Different Sizes')









    Out[4]:





<matplotlib.text.Text at 0x7f732ab27470>



In [5]:

    
data[data['cols'] == 200][data['size'] == 200].sort('time')









    



/home/jstilley/anaconda3/lib/python3.4/site-packages/pandas/core/frame.py:1825: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  "DataFrame index.", UserWarning)






    Out[5]:






  
    
      
      function
      size
      cols
      time
    
  
  
    
      551
      simple_read
      200
      200
      0.174799
    
    
      566
      for_line_in_f_readlines_split
      200
      200
      0.947235
    
    
      567
      for_line_in_f_readlines_split_with
      200
      200
      0.949191
    
    
      554
      for_line_in_f_split_with
      200
      200
      1.054141
    
    
      553
      for_line_in_f_split
      200
      200
      1.154243
    
    
      557
      fileinput_with_firstline
      200
      200
      1.242460
    
    
      552
      naive_read_split_split
      200
      200
      1.318512
    
    
      569
      re_readlines_with
      200
      200
      1.498748
    
    
      568
      re_readlines
      200
      200
      1.515388
    
    
      561
      mmap_for
      200
      200
      1.558482
    
    
      560
      mmap_while
      200
      200
      1.571111
    
    
      555
      pandas_read_csv_df
      200
      200
      2.288987
    
    
      558
      csv_reader
      200
      200
      3.296065
    
    
      559
      csv_dict_reader
      200
      200
      4.306151
    
    
      556
      pandas_read_csv_iterrows
      200
      200
      5.636963
    
    
      562
      numpy_loadtxt
      200
      200
      6.652580
    
    
      565
      numpy_genfromtxt_range_shape
      200
      200
      17.940645
    
    
      563
      numpy_genfromtxt
      200
      200
      18.061994
    
    
      564
      numpy_genfromtxt_nditer
      200
      200
      19.986187



In [6]:

    
data[data['cols'] == 5][data['size'] == 200].sort('time')









    



/home/jstilley/anaconda3/lib/python3.4/site-packages/pandas/core/frame.py:1825: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  "DataFrame index.", UserWarning)






    Out[6]:






  
    
      
      function
      size
      cols
      time
    
  
  
    
      95
      simple_read
      200
      5
      0.111992
    
    
      96
      naive_read_split_split
      200
      5
      1.406370
    
    
      98
      for_line_in_f_split_with
      200
      5
      1.523781
    
    
      97
      for_line_in_f_split
      200
      5
      1.526934
    
    
      99
      pandas_read_csv_df
      200
      5
      1.662912
    
    
      111
      for_line_in_f_readlines_split_with
      200
      5
      1.803978
    
    
      110
      for_line_in_f_readlines_split
      200
      5
      1.890794
    
    
      104
      mmap_while
      200
      5
      2.914811
    
    
      105
      mmap_for
      200
      5
      2.946188
    
    
      102
      csv_reader
      200
      5
      3.594512
    
    
      101
      fileinput_with_firstline
      200
      5
      4.462262
    
    
      113
      re_readlines_with
      200
      5
      5.212917
    
    
      112
      re_readlines
      200
      5
      5.602563
    
    
      103
      csv_dict_reader
      200
      5
      10.468787
    
    
      106
      numpy_loadtxt
      200
      5
      18.527393
    
    
      107
      numpy_genfromtxt
      200
      5
      20.213743
    
    
      109
      numpy_genfromtxt_range_shape
      200
      5
      21.823151
    
    
      108
      numpy_genfromtxt_nditer
      200
      5
      23.973133
    
    
      100
      pandas_read_csv_iterrows
      200
      5
      91.271756



In [16]:

    
''' Let's ignore several of the really slow functions (like genfromtxt),
    and and the duplicates (like the extra "with" versions).
'''
FUNCTS = ['for_line_in_f_readlines_split',
          'for_line_in_f_split', 'fileinput_with_firstline', 'naive_read_split_split']
COLORS = ['#000000', '#FF00FF', '#00FF00', '#0000FF', '#00FFFF', '#FFFF00', '#00FF00', '#0000FF']
STYLES = ['-', '-.', '--', ':', 'h', '--']

# Let's just make a quick extremum plot for only the functions that are doing well.

fig, ax = plt.subplots(1, 1, figsize=(16, 8))
for i in range(len(FUNCTS)):
    data[data['cols'] == 5][data['function'] == FUNCTS[i]].plot(x='size', y='time', ax=ax,
                                                                  label=FUNCTS[i] + ' (5 cols)',
                                                                  color=COLORS[i],
                                                                  linewidth=2,
                                                                  linestyle='--')
for i in range(len(FUNCTS)):
    data[data['cols'] == 200][data['function'] == FUNCTS[i]].plot(x='size', y='time', ax=ax,
                                                                  label=FUNCTS[i] + ' (200 cols)',
                                                                  color=COLORS[i],
                                                                  linewidth=2,
                                                                  linestyle='-')
plt.xlim((0, 200))
plt.ylim((0, 2))
plt.legend(loc='upper left')
plt.title('Read Times for 5 or 200 Column Files of Different Sizes')









    



/home/jstilley/anaconda3/lib/python3.4/site-packages/pandas/core/frame.py:1825: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  "DataFrame index.", UserWarning)






    Out[16]:





<matplotlib.text.Text at 0x7f73287224e0>



In [21]:

    
''' And let's take a quick look at what difference the number of columns make. '''
COLUMNS = [5, 10, 50, 100, 200]

# Let's just make a quick extremum plot for only the functions that are doing well.

fig, ax = plt.subplots(1, 1, figsize=(16, 8))
for cols in COLUMNS:
    data[data['cols'] == cols][data['function'] == 'for_line_in_f_readlines_split'].plot(x='size', y='time', ax=ax,
                                                                  label=str(cols) + ' columns',
                                                                  linewidth=2)
plt.xlim((0, 200))
plt.ylim((0, 2))
plt.legend(loc='upper left')
plt.title('Read Times 200 MB CSV Files with Different Numbers of Columns')









    



/home/jstilley/anaconda3/lib/python3.4/site-packages/pandas/core/frame.py:1825: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
  "DataFrame index.", UserWarning)






    Out[21]:





<matplotlib.text.Text at 0x7f7328321358>



In [ ]:

	function	size	cols	time
551	simple_read	200	200	0.174799
566	for_line_in_f_readlines_split	200	200	0.947235
567	for_line_in_f_readlines_split_with	200	200	0.949191
554	for_line_in_f_split_with	200	200	1.054141
553	for_line_in_f_split	200	200	1.154243
557	fileinput_with_firstline	200	200	1.242460
552	naive_read_split_split	200	200	1.318512
569	re_readlines_with	200	200	1.498748
568	re_readlines	200	200	1.515388
561	mmap_for	200	200	1.558482
560	mmap_while	200	200	1.571111
555	pandas_read_csv_df	200	200	2.288987
558	csv_reader	200	200	3.296065
559	csv_dict_reader	200	200	4.306151
556	pandas_read_csv_iterrows	200	200	5.636963
562	numpy_loadtxt	200	200	6.652580
565	numpy_genfromtxt_range_shape	200	200	17.940645
563	numpy_genfromtxt	200	200	18.061994
564	numpy_genfromtxt_nditer	200	200	19.986187

	function	size	cols	time
95	simple_read	200	5	0.111992
96	naive_read_split_split	200	5	1.406370
98	for_line_in_f_split_with	200	5	1.523781
97	for_line_in_f_split	200	5	1.526934
99	pandas_read_csv_df	200	5	1.662912
111	for_line_in_f_readlines_split_with	200	5	1.803978
110	for_line_in_f_readlines_split	200	5	1.890794
104	mmap_while	200	5	2.914811
105	mmap_for	200	5	2.946188
102	csv_reader	200	5	3.594512
101	fileinput_with_firstline	200	5	4.462262
113	re_readlines_with	200	5	5.212917
112	re_readlines	200	5	5.602563
103	csv_dict_reader	200	5	10.468787
106	numpy_loadtxt	200	5	18.527393
107	numpy_genfromtxt	200	5	20.213743
109	numpy_genfromtxt_range_shape	200	5	21.823151
108	numpy_genfromtxt_nditer	200	5	23.973133
100	pandas_read_csv_iterrows	200	5	91.271756