This notebook shows a simple example of profiling alternative methods of concatenating two pandas DataFrames.
In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import cProfile
from pstatsviewer import StatsViewer
from qgrid import nbinstall
# NOTE(review): presumably installs qgrid's front-end assets so its widgets can
# render in this notebook -- confirm against the qgrid documentation.
nbinstall()
In [2]:
# Build two 5000 x 8 frames of random floats. Both share the column labels
# 'A'..'H'; df1 is indexed 0..4999 and df2 continues at 5000..9999, so the two
# indexes never overlap.
df1 = pd.DataFrame(
    np.random.randn(5000, 8),
    index=range(5000),
    columns=list('ABCDEFGH'),
)
df2 = pd.DataFrame(
    np.random.randn(5000, 8),
    index=range(5000, 10000),
    columns=list('ABCDEFGH'),
)
# Last expression of the cell: preview the first five rows of df1.
df1.head(5)
Out[2]:
In [3]:
from qgrid import show_grid
In [4]:
def concat_naive():
    """
    Concatenate df1 and df2 with pd.concat, 500 times in a row.

    Each concatenated frame is discarded and nothing is returned.
    """
    for _ in range(500):
        frames = [df1, df2]
        pd.concat(frames)
# Profile the pd.concat-based loop and dump the raw stats to 'naive.stats'.
cProfile.run('concat_naive()', filename='naive.stats')
In [5]:
# Wrap the 'naive.stats' profile dump in a StatsViewer.
slow = StatsViewer("naive.stats")
# NOTE(review): presumably renders the profile entries as a table -- confirm
# against the pstatsviewer documentation.
slow.table()
In [6]:
# NOTE(review): presumably plots the same profile data that `slow` wraps --
# confirm against the pstatsviewer documentation.
slow.chart()
In [7]:
def concat_fast():
    """
    Build the combined frame from raw numpy arrays rather than pd.concat.

    The values of df1 and df2 are stacked with np.vstack, their index values
    are joined with np.hstack, and df1's columns are reused for the result.
    This is repeated 500 times; each frame is discarded and nothing is
    returned.
    """
    for _ in range(500):
        stacked_values = np.vstack([df1.values, df2.values])
        stacked_index = np.hstack([df1.index.values, df2.index.values])
        pd.DataFrame(
            stacked_values,
            columns=df1.columns,
            index=stacked_index,
        )
# Profile the numpy-based loop, dumping the raw stats to 'fast.stats', then
# wrap that file in a StatsViewer.
cProfile.run('concat_fast()', filename='fast.stats')
fast = StatsViewer("fast.stats")
In [9]:
# NOTE(review): presumably shows both profiles side by side, with the suffixes
# labelling which run each column came from -- confirm against pstatsviewer.
slow.compare_table(fast, lsuffix="_slow", rsuffix="_fast")
In [11]:
# NOTE(review): 'tottime' presumably selects the metric to chart and 25 the
# number of entries shown -- confirm against the pstatsviewer documentation.
slow.compare_chart(fast, 'tottime', 25)
DataFrame
In [ ]: