In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
import pandas
# Read the benchmark results and give the four columns the short names
# that the rest of the notebook filters and plots by.
data = pandas.read_csv(
    'results_ubuntu14.04LTS_py3.4.3_anaconda2.3.0_laptop.csv',
    sep=',',
)
data.columns = ['function', 'size', 'cols', 'time']
In [2]:
# Summary statistics for the first 19 rows.  `describe` has to be called:
# a bare `data[:19].describe` evaluates to the bound method object, so the
# cell would display the method's repr instead of the statistics.
data[:19].describe()
Out[2]:
In [3]:
# Quick overview plot: read time vs. file size for every function,
# restricted to the 200-column files.
fig, ax = plt.subplots(1, 1, figsize=(16, 8))
wide_files = data[data['cols'] == 200]
# Label every line with its own group key.  groupby() yields the groups in
# sorted key order, whereas data['function'] is in file row order, so using
# that column as the legend labels can attach the wrong name to a line.
for function_name, timings in wide_files.groupby('function'):
    timings.plot(x='size', y='time', ax=ax, label=function_name)
ax.legend(loc='upper left')
ax.set_ylim((0, 5))
# The column count is fixed at 200 here and the file size varies.
ax.set_title('Read Times for 200 Column Files of Different Sizes')
Out[3]:
In [4]:
# Now plot read time vs. number of columns for every function,
# restricted to the rows whose size is 200.
fig, ax = plt.subplots(1, 1, figsize=(16, 8))
big_files = data[data['size'] == 200]
# Label every line with its own group key.  groupby() yields the groups in
# sorted key order, whereas data['function'] is in file row order, so using
# that column as the legend labels can attach the wrong name to a line.
for function_name, timings in big_files.groupby('function'):
    timings.plot(x='cols', y='time', ax=ax, label=function_name)
ax.legend(loc='upper right')
ax.set_ylim((0, 25))
# The file size is fixed at 200 here and the column count varies.
ax.set_title('Read Times for 200 MB Files with Different Numbers of Columns')
Out[4]:
In [5]:
# Timings for the rows with 200 columns and size 200, fastest first.
# Both conditions are combined into one mask: data[a][b] applies a
# full-length mask to an already filtered frame, which makes pandas reindex
# the mask and emit a warning.  sort_values() replaces DataFrame.sort(),
# which is no longer available in current pandas releases.
data[(data['cols'] == 200) & (data['size'] == 200)].sort_values('time')
Out[5]:
In [6]:
# Timings for the rows with 5 columns and size 200, fastest first.
# Both conditions are combined into one mask: data[a][b] applies a
# full-length mask to an already filtered frame, which makes pandas reindex
# the mask and emit a warning.  sort_values() replaces DataFrame.sort(),
# which is no longer available in current pandas releases.
data[(data['cols'] == 5) & (data['size'] == 200)].sort_values('time')
Out[6]:
In [16]:
# Let's ignore several of the really slow functions (like genfromtxt),
# and the duplicates (like the extra "with" versions).
FUNCTS = ['for_line_in_f_readlines_split',
          'for_line_in_f_split', 'fileinput_with_firstline', 'naive_read_split_split']
COLORS = ['#000000', '#FF00FF', '#00FF00', '#0000FF', '#00FFFF', '#FFFF00', '#00FF00', '#0000FF']
# Not used by the plot below; kept so the name stays defined.
STYLES = ['-', '-.', '--', ':', 'h', '--']

# Read time vs. file size for the selected functions only, drawn once for the
# 5-column files (dashed lines) and once for the 200-column files (solid
# lines).  A function keeps the same color in both passes.
fig, ax = plt.subplots(1, 1, figsize=(16, 8))
for n_cols, line_style in ((5, '--'), (200, '-')):
    # zip() stops at the shorter list, so the surplus COLORS entries are ignored.
    for function_name, color in zip(FUNCTS, COLORS):
        # One combined mask: data[a][b] would apply a full-length mask to an
        # already filtered frame, which pandas has to reindex (with a warning).
        selected = data[(data['cols'] == n_cols) & (data['function'] == function_name)]
        selected.plot(x='size', y='time', ax=ax,
                      label='{} ({} cols)'.format(function_name, n_cols),
                      color=color,
                      linewidth=2,
                      linestyle=line_style)
ax.set_xlim((0, 200))
ax.set_ylim((0, 2))
ax.legend(loc='upper left')
ax.set_title('Read Times for 5 or 200 Column Files of Different Sizes')
Out[16]:
In [21]:
# And let's take a quick look at what difference the number of columns makes.
COLUMNS = [5, 10, 50, 100, 200]
# Read time vs. file size for a single function
# ('for_line_in_f_readlines_split'), one line per column count.
fig, ax = plt.subplots(1, 1, figsize=(16, 8))
for cols in COLUMNS:
    # One combined mask: data[a][b] would apply a full-length mask to an
    # already filtered frame, which pandas has to reindex (with a warning).
    selected = data[(data['cols'] == cols)
                    & (data['function'] == 'for_line_in_f_readlines_split')]
    selected.plot(x='size', y='time', ax=ax,
                  label=str(cols) + ' columns',
                  linewidth=2)
ax.set_xlim((0, 200))
ax.set_ylim((0, 2))
ax.legend(loc='upper left')
# The x-axis spans file sizes up to 200, not a single 200 MB file.
ax.set_title('Read Times for CSV Files up to 200 MB with Different Numbers of Columns')
Out[21]:
In [ ]: