Set up the Python path, import the libraries, and initialize the DataFrame that stores the results
In [99]:
# Put the parent directory first on the module search path so that the
# current development version is the one that gets imported
import sys

sys.path.insert(0, '../')
In [100]:
from copy import deepcopy
In [101]:
import raccoon as rc
import pandas as pd
In [102]:
import platform

# Report the machine, processor and OS the benchmarks run on, then the Python version
for describe in (platform.machine, platform.processor, platform.platform):
    print(describe())
print("python ", platform.python_version())
In [103]:
# Table that collects the benchmark output: one column for the raccoon timing,
# one for the pandas timing and one for their ratio. Rows are written by index label.
results = rc.DataFrame(columns=['raccoon', 'pandas', 'ratio'], sort=False)
In [104]:
def add_results(index):
    """Store the latest timings in the ``results`` table under row ``index``.

    Reads the module-level ``res_rc`` and ``res_pd`` objects produced by the
    most recent ``%timeit -o`` runs and writes their ``best`` values plus the
    raccoon/pandas ratio.

    :param index: row label to write the three values under
    """
    best_rc = res_rc.best
    best_pd = res_pd.best
    results[index, 'raccoon'] = best_rc
    results[index, 'pandas'] = best_pd
    results[index, 'ratio'] = best_rc / best_pd
In [105]:
# Record which version of each library was benchmarked in a 'version' row
for label, module in (('raccoon', rc), ('pandas', pd)):
    results['version', label] = module.__version__
print(results)
In [106]:
def init_rc():
    """Construct 10,000 empty raccoon DataFrames, one after another."""
    for _ in range(10000):
        frame = rc.DataFrame()
def init_pd():
    """Construct 10,000 empty pandas DataFrames, one after another."""
    for _ in range(10000):
        frame = pd.DataFrame()
In [107]:
# Time the raccoon version; -o makes %timeit return its result object so that
# add_results() can read the .best attribute from res_rc afterwards.
res_rc = %timeit -o init_rc()
In [108]:
# Time the pandas version the same way.
res_pd = %timeit -o init_pd()
In [109]:
# Record both timings and their ratio in the 'initialize empty' row.
add_results('initialize empty')
In [110]:
# Show the results table so far.
results.print()
In [111]:
# 100 columns named 'a0'..'a99', each holding its own list of the values 0..99
data = {'a' + str(x): list(range(100)) for x in range(100)}
In [112]:
# Time building a raccoon DataFrame directly from the prepared `data` dict;
# -o returns the timing result so add_results() can read it.
res_rc = %timeit -o df=rc.DataFrame(data=data, sort=False)
In [113]:
# Time building a pandas DataFrame from the same dict.
res_pd = %timeit -o df=pd.DataFrame(data=data)
In [114]:
# Record both timings and their ratio in the 'initialize with matrix' row.
add_results('initialize with matrix')
In [115]:
# Show the results table so far.
results.print()
In [116]:
def one_col_add_rc():
    """Set column 'a' at indexes 0..9999, one call at a time, on an initially empty raccoon DataFrame."""
    frame = rc.DataFrame()
    for value in range(10000):
        frame.set(value, 'a', value)
def one_col_add_pd():
    """Set column 'a' at labels 0..9999, one ``.at`` assignment at a time, on an initially empty pandas DataFrame."""
    frame = pd.DataFrame()
    for value in range(10000):
        frame.at[value, 'a'] = value
In [117]:
# Time the raccoon version; -o returns the timing result so add_results() can read it.
res_rc = %timeit -o one_col_add_rc()
In [118]:
# Time the pandas version the same way.
res_pd = %timeit -o one_col_add_pd()
In [119]:
# Record both timings and their ratio in the 'add rows one column' row.
add_results('add rows one column')
In [120]:
# Show the results table so far.
print(results)
In [121]:
# Column names 'a0'..'a99' and one row that maps each name to its number
columns = ['a' + str(x) for x in range(100)]
new_row = dict(zip(columns, range(100)))
def matrix_add_rc():
    """Write ``new_row`` at indexes 0..99 of a raccoon DataFrame created with only the column names."""
    frame = rc.DataFrame(columns=columns)
    for row_index in range(100):
        frame.set(indexes=row_index, values=new_row)
def matrix_add_pd():
    """Assign ``new_row`` at labels 0..99 of a pandas DataFrame created with only the column names."""
    frame = pd.DataFrame(columns=columns)
    for row_label in range(100):
        frame.loc[row_label] = new_row
In [122]:
# Time the raccoon version; -o returns the timing result so add_results() can read it.
res_rc = %timeit -o matrix_add_rc()
In [123]:
# Time the pandas version the same way.
res_pd = %timeit -o matrix_add_pd()
In [124]:
# Record both timings and their ratio in the 'add matrix' row.
add_results('add matrix')
In [125]:
# Show the results table so far.
print(results)
In [126]:
def append_rc():
    """Append 100 ten-row raccoon DataFrames, one at a time, to a ten-row starting frame.

    Every frame has columns 'a0'..'a9' holding the values 0..9. Each frame is
    built from its own deep copy of the template grid. The appended frames are
    indexed 11..20, 21..30, ..., 1001..1010.
    """
    grid = {'a' + str(x): list(range(10)) for x in range(10)}
    frame = rc.DataFrame(data=deepcopy(grid), columns=list(grid))
    for block in range(100):
        offset = (block + 1) * 10
        index = list(range(offset + 1, offset + 11))
        block_grid = deepcopy(grid)
        block_frame = rc.DataFrame(data=block_grid, columns=list(block_grid), index=index)
        frame.append(block_frame)
def append_pd():
    """Append 100 ten-row pandas DataFrames, one at a time, to a ten-row starting frame.

    Every frame has columns 'a0'..'a9' holding the values 0..9. The appended
    frames are indexed 11..20, 21..30, ..., 1001..1010.
    """
    grid = {'a' + str(x): [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] for x in range(10)}
    df = pd.DataFrame(data=grid, columns=list(grid.keys()))
    for x in range(100):
        index = [(y + 1) + (x + 1) * 10 for y in range(10)]
        new_grid = deepcopy(grid)
        new_df = pd.DataFrame(data=new_grid, columns=list(new_grid.keys()), index=index)
        # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
        # pd.concat is the replacement. Concatenating once per iteration keeps
        # the incremental pattern this benchmark measures.
        df = pd.concat([df, new_df])
In [127]:
# Time the raccoon version; -o returns the timing result so add_results() can read it.
res_rc = %timeit -o append_rc()
In [128]:
# Time the pandas version the same way.
res_pd = %timeit -o append_pd()
In [129]:
# Record both timings and their ratio in the 'append' row.
add_results('append')
In [130]:
# Show the results table so far.
print(results)
In [131]:
# Build the 1000 row X 100 col matrix used by the get tests. Index is [0...999]
col = list(range(1000))
# every column gets its own copy of the values
grid = {'a' + str(x): list(col) for x in range(100)}
df_rc = rc.DataFrame(data=grid, columns=sorted(grid))
df_pd = pd.DataFrame(data=grid, columns=sorted(grid))
In [132]:
# get cell
def rc_get_cell():
    """Read every cell of ``df_rc`` individually with ``get``, column by column."""
    for col_name in df_rc.columns:
        for row_index in df_rc.index:
            cell = df_rc.get(row_index, col_name)
def pd_get_cell():
    """Read every cell of ``df_pd`` individually with ``.at``, column by column."""
    for col_name in df_pd.columns:
        for row_label in df_pd.index:
            cell = df_pd.at[row_label, col_name]
In [133]:
# Time the raccoon version; -o returns the timing result so add_results() can read it.
res_rc = %timeit -o rc_get_cell()
In [134]:
# Time the pandas version the same way.
res_pd = %timeit -o pd_get_cell()
In [135]:
# Record both timings and their ratio in the 'get cell' row.
add_results('get cell')
In [136]:
# Show the results table so far.
print(results)
In [137]:
# get column all index
def get_column_all_rc():
    """Fetch each column of ``df_rc`` in full with ``get(columns=...)``."""
    for col_name in df_rc.columns:
        column = df_rc.get(columns=col_name)
def get_column_all_pd():
    """Fetch each column of ``df_pd`` in full with ``df_pd[name]``."""
    for col_name in df_pd.columns:
        column = df_pd[col_name]
In [138]:
# Time the raccoon version; -o returns the timing result so add_results() can read it.
res_rc = %timeit -o get_column_all_rc()
In [139]:
# Time the pandas version the same way.
res_pd = %timeit -o get_column_all_pd()
In [140]:
# Record both timings and their ratio in the 'get column all index' row.
add_results('get column all index')
In [141]:
# Show the results table so far.
print(results)
In [142]:
# get subset of the index of the column
def get_column_subset_rc():
    """For each column of ``df_rc``, fetch 100 index subsets with ``get``.

    The subsets start at 0, 10, 20, ..., 990 and each holds 9 consecutive indexes.
    """
    for col_name in df_rc.columns:
        for start in range(0, 1000, 10):
            # NOTE(review): 9 indexes per subset, while the set benchmarks use 10 - confirm this is intended
            rows = list(range(start, start + 9))
            subset = df_rc.get(indexes=rows, columns=col_name)
def get_column_subset_pd():
    """For each column of ``df_pd``, fetch 100 label subsets with ``.loc``.

    The subsets start at 0, 10, 20, ..., 990 and each holds 9 consecutive labels.
    """
    for col_name in df_pd.columns:
        for start in range(0, 1000, 10):
            # NOTE(review): 9 labels per subset, while the set benchmarks use 10 - confirm this is intended
            rows = list(range(start, start + 9))
            subset = df_pd.loc[rows, col_name]
In [143]:
# Time the raccoon version; -o returns the timing result so add_results() can read it.
res_rc = %timeit -o get_column_subset_rc()
In [144]:
# Time the pandas version the same way.
res_pd = %timeit -o get_column_subset_pd()
In [145]:
# Record both timings and their ratio in the 'get column subset index' row.
add_results('get column subset index')
In [146]:
# Show the results table so far.
print(results)
In [147]:
# get index all columns
def get_index_all_rc():
    """Fetch each index of ``df_rc`` across all columns with ``get(indexes=...)``."""
    for row_index in df_rc.index:
        record = df_rc.get(indexes=row_index)
def get_index_all_pd():
    """Fetch each row of ``df_pd`` across all columns with ``.loc``."""
    for row_label in df_pd.index:
        record = df_pd.loc[row_label]
In [148]:
# Time the raccoon version; -o returns the timing result so add_results() can read it.
res_rc = %timeit -o get_index_all_rc()
In [149]:
# Time the pandas version the same way.
res_pd = %timeit -o get_index_all_pd()
In [150]:
# Record both timings and their ratio in the 'get index all columns' row.
add_results('get index all columns')
In [151]:
# Show the results table so far.
print(results)
In [152]:
# Build a fresh 1000 row X 100 col matrix for the set tests. Index is [0...999]
col = list(range(1000))
# every column gets its own copy of the values
grid = {'a' + str(x): list(col) for x in range(100)}
df_rc = rc.DataFrame(data=grid, columns=sorted(grid))
df_pd = pd.DataFrame(data=grid, columns=sorted(grid))
In [153]:
# set cell
def rc_set_cell():
    """Overwrite every cell of ``df_rc`` with 99, one ``set`` call per cell."""
    for col_name in df_rc.columns:
        for row_index in df_rc.index:
            df_rc.set(row_index, col_name, 99)
def pd_set_cell():
    """Overwrite every cell of ``df_pd`` with 99, one ``.at`` assignment per cell."""
    for col_name in df_pd.columns:
        for row_label in df_pd.index:
            df_pd.at[row_label, col_name] = 99
In [154]:
# Time the raccoon version; -o returns the timing result so add_results() can read it.
res_rc = %timeit -o rc_set_cell()
In [155]:
# Time the pandas version the same way.
res_pd = %timeit -o pd_set_cell()
In [156]:
# Record both timings and their ratio in the 'set cell' row.
add_results('set cell')
In [157]:
# Show the results table so far.
print(results)
In [158]:
# set column all index
def set_column_all_rc():
    """Set each column of ``df_rc`` to the single value 99 with ``set(columns=..., values=...)``."""
    for col_name in df_rc.columns:
        outcome = df_rc.set(columns=col_name, values=99)
def set_column_all_pd():
    """Set each column of ``df_pd`` to the single value 99 with ``df_pd[name] = 99``."""
    for col_name in df_pd.columns:
        # chained assignment kept from the original: the local is bound as well as the column
        outcome = df_pd[col_name] = 99
In [159]:
# Time the raccoon version; -o returns the timing result so add_results() can read it.
res_rc = %timeit -o set_column_all_rc()
In [160]:
# Time the pandas version the same way.
res_pd = %timeit -o set_column_all_pd()
In [161]:
# Record both timings and their ratio in the 'set column all index' row.
add_results('set column all index')
In [162]:
# Show the results table so far.
print(results)
In [163]:
# set subset of the index of the column
def set_column_subset_rc():
    """For each column of ``df_rc``, write the values 0..9 into 100 index subsets.

    The subsets start at 0, 10, 20, ..., 990 and each holds 10 consecutive indexes.
    """
    for col_name in df_rc.columns:
        for start in range(0, 1000, 10):
            rows = list(range(start, start + 10))
            outcome = df_rc.set(indexes=rows, columns=col_name, values=list(range(10)))
def set_column_subset_pd():
    """For each column of ``df_pd``, write the values 0..9 into 100 label subsets.

    The subsets start at 0, 10, 20, ..., 990 and each holds 10 consecutive labels.
    """
    for col_name in df_pd.columns:
        for start in range(0, 1000, 10):
            rows = list(range(start, start + 10))
            # chained assignment kept from the original: the local is bound as well as the cells
            outcome = df_pd.loc[rows, col_name] = list(range(10))
In [164]:
# Time the raccoon version; -o returns the timing result so add_results() can read it.
res_rc = %timeit -o set_column_subset_rc()
In [165]:
# Time the pandas version the same way.
res_pd = %timeit -o set_column_subset_pd()
In [166]:
# Record both timings and their ratio in the 'set column subset index' row.
add_results('set column subset index')
In [167]:
# Show the results table so far.
print(results)
In [168]:
# One row of values for the set tests: each column name maps to itself
row = {name: name for name in grid}
In [169]:
# set index all columns
def set_index_all_rc():
    """Write ``row`` across all columns at each index of ``df_rc``, one ``set`` call per index."""
    for row_index in df_rc.index:
        outcome = df_rc.set(indexes=row_index, values=row)
def set_index_all_pd():
    """Assign ``row`` across all columns at each label of ``df_pd``, one ``.loc`` assignment per label."""
    for row_label in df_pd.index:
        # chained assignment kept from the original: the local is bound as well as the row
        outcome = df_pd.loc[row_label] = row
In [170]:
# Time the raccoon version; -o returns the timing result so add_results() can read it.
res_rc = %timeit -o set_index_all_rc()
In [171]:
# Time the pandas version the same way.
res_pd = %timeit -o set_index_all_pd()
In [172]:
# Record both timings and their ratio in the 'set index all columns' row.
add_results('set index all columns')
In [173]:
# Show the results table so far.
print(results)
In [174]:
# Build 1000x100 dataframes from `grid` whose index runs backwards: 999, 998, ..., 0
rev = list(range(999, -1, -1))
df_rc = rc.DataFrame(data=grid, index=rev)
df_pd = pd.DataFrame(data=grid, index=rev)
In [175]:
# Time sort_index() on the reverse-indexed raccoon frame; -o returns the
# timing result so add_results() can read it.
res_rc = %timeit -o df_rc.sort_index()
In [176]:
# Time sort_index() on the reverse-indexed pandas frame.
res_pd = %timeit -o df_pd.sort_index()
In [177]:
# Record both timings and their ratio in the 'sort index' row.
add_results('sort index')
In [178]:
# Show the results table so far.
print(results)
In [179]:
# Build a fresh 1000 row X 100 col matrix for the iteration test. Index is [0...999]
col = list(range(1000))
# every column gets its own copy of the values
grid = {'a' + str(x): list(col) for x in range(100)}
df_rc = rc.DataFrame(data=grid, columns=sorted(grid))
df_pd = pd.DataFrame(data=grid, columns=sorted(grid))
In [180]:
# iterate over the rows
def iter_rc():
    """Walk all rows of ``df_rc`` with ``iterrows()``, binding each row to a local."""
    for record in df_rc.iterrows():
        current = record
def iter_pd():
    """Walk all rows of ``df_pd`` with ``itertuples()``, binding each row to a local."""
    for record in df_pd.itertuples():
        current = record
In [181]:
# Time the raccoon version; -o returns the timing result so add_results() can read it.
res_rc = %timeit -o iter_rc()
In [182]:
# Time the pandas version the same way.
res_pd = %timeit -o iter_pd()
In [183]:
# Record both timings and their ratio in the 'iterate rows' row.
add_results('iterate rows')
In [184]:
# Show the results table so far.
print(results)
In [185]:
# First create a 500 row X 100 col matrix for the test. Index is the odd
# numbers [1, 3, 5, 7, ..., 999], so that setting the even indexes afterwards
# inserts new rows in between the existing ones.
col = list(range(1, 1000, 2))
grid = {'a' + str(x): col[:] for x in range(100)}
# The odd numbers must be passed as the index explicitly: without index= the
# frames get the default index and the odd numbers end up only in the data.
# Each frame receives its own copy so that neither can alias `col`.
df_rc = rc.DataFrame(data=grid, columns=sorted(grid.keys()), index=list(col), sort=True)
df_pd = pd.DataFrame(data=grid, columns=sorted(grid.keys()), index=list(col))
In [186]:
# One row of values for the insert test: each column name maps to itself
row = {name: name for name in grid}
In [187]:
# set index all columns
def insert_rows_rc():
    """Write ``row`` into ``df_rc`` at the even indexes 0, 2, ..., 998, one ``set`` call per index."""
    for even_index in range(0, 999, 2):
        outcome = df_rc.set(indexes=even_index, values=row)
def insert_rows_pd():
    """Assign ``row`` into ``df_pd`` at the even labels 0, 2, ..., 998, one ``.loc`` assignment per label."""
    for even_label in range(0, 999, 2):
        # chained assignment kept from the original: the local is bound as well as the row
        outcome = df_pd.loc[even_label] = row
In [188]:
# Time the raccoon version; -o returns the timing result so add_results() can read it.
res_rc = %timeit -o insert_rows_rc()
In [189]:
# Time the pandas version the same way.
res_pd = %timeit -o insert_rows_pd()
In [190]:
# Record both timings and their ratio in the 'insert rows' row.
add_results('insert rows')
In [191]:
# Show the results table so far.
print(results)
In [192]:
# 10,000 one-minute timestamps starting 2010-01-01 09:30:00, used as row labels
dates = pd.date_range('2010-01-01 09:30:00', periods=10000, freq='1min')
# The same row of values is written at every timestamp
data_row = {
    'open': 100,
    'high': 101,
    'low': 99,
    'close': 100.5,
    'volume': 999,
}
def time_series_rc():
    """Fill a raccoon DataFrame by calling ``set_row`` with ``data_row`` once per entry of ``dates``.

    The frame is created with the five price/volume columns, an index named
    'datetime' and ``sort=True``.
    """
    series = rc.DataFrame(columns=['open', 'high', 'low', 'close', 'volume'], index_name='datetime', sort=True)
    for stamp in dates:
        series.set_row(stamp, data_row)
def time_series_pd():
    """Fill a pandas DataFrame by assigning ``data_row`` with ``.loc`` once per entry of ``dates``.

    The frame is created with the five price/volume columns and no rows.
    """
    series = pd.DataFrame(columns=['open', 'high', 'low', 'close', 'volume'])
    for stamp in dates:
        series.loc[stamp] = data_row
In [193]:
# Time the raccoon version; -o returns the timing result so add_results() can read it.
res_rc = %timeit -o time_series_rc()
In [194]:
# Time the pandas version the same way.
res_pd = %timeit -o time_series_pd()
In [195]:
# Record both timings and their ratio in the 'time series' row.
add_results('time series')
In [196]:
# Show the final results table.
print(results)
In [ ]: