Here, we are only looking of ways to flatten 2D-lists, i.e., lists of sublists. Flattening of lists with more than 2 dimensions will be tested separately.
The desired behavior of the different approaches shall be:
[[1],[2,3,4],[5],[6]] -> [1, 2, 3, 4, 5, 6] [[1],[2,[3],4],[5],[6]] -> [1, 2, [3], 4, 5, 6]
In [1]:
def append_in_for_loop(list_2d):
flattened = []
for sublist in list_2d:
for item in sublist:
flattened.append(item)
return flattened
In [2]:
def extend_in_for_loop(list_2d):
flattened = []
for sublist in list_2d:
flattened.extend(sublist)
return flattened
In [3]:
def list_comprehension(list_2d):
return [item for sublist in list_2d for item in sublist]
In [4]:
def flatten_via_sum(list_2d):
return sum(list_2d, [])
In [5]:
import itertools
In [6]:
def itertools_chain_from_iterable(list_2d):
return list(itertools.chain.from_iterable(list_2d))
In [7]:
def itertools_chain(list_2d):
return list(itertools.chain(*list_2d))
In [8]:
import functools
import operator
In [9]:
def functools_reduce_lambda(list_2d):
return functools.reduce(lambda x,y: x+y, list_2d)
In [10]:
def functools_reduce_operator(list_2d):
return functools.reduce(operator.add, map(list, list_2d))
In [11]:
def functools_reduce_list_add(list_2d):
return functools.reduce(list.__add__, (list(sub) for sub in list_2d))
In [12]:
funcs = [
append_in_for_loop, list_comprehension, flatten_via_sum,
itertools_chain_from_iterable, itertools_chain,
functools_reduce_lambda, extend_in_for_loop,
functools_reduce_operator, functools_reduce_list_add
]
In [13]:
for f in funcs:
assert (f([[1],[2,3,4],[5],[6]]) == [1, 2, 3, 4, 5, 6]),\
'%s failed test 1'% f.__name__
assert (f([[1],[2,[3],4],[5],[6]]) == [1, 2, [3], 4, 5, 6]),\
'%s failed test 2'% f.__name__
print('ok')
In [14]:
import timeit
import random
import copy
import numpy as np
random.seed(12345)
def make_copy(l):
return copy.deepcopy(l)
func_names = [f.__name__ for f in funcs]
orders_n = [10**n for n in range(1, 5)]
timings = {f:[] for f in func_names}
for n in orders_n:
l = [[i for i in range(random.randint(1,10))] for j in range(n)]
for f in func_names:
timings[f].append(min(timeit.Timer('%s(make_copy(l))' %f,
'from __main__ import %s, l, make_copy' %f)
.repeat(repeat=3, number=10)))
In [15]:
%matplotlib inline
In [19]:
import platform
import multiprocessing
def print_sysinfo():
print('\nPython version :', platform.python_version())
print('compiler :', platform.python_compiler())
print('\nsystem :', platform.system())
print('release :', platform.release())
print('machine :', platform.machine())
print('processor :', platform.processor())
print('CPU count :', multiprocessing.cpu_count())
print('interpreter:', platform.architecture()[0])
print('\n\n')
In [20]:
import matplotlib.pyplot as plt
def plot():
labels_fast = {'append_in_for_loop':'.append() in for-loop',
'extend_in_for_loop':'.extend() in for-loop',
'list_comprehension': 'List comprehension',
'itertools_chain_from_iterable': 'list(itertools.chain.from_iterable(list_2d))',
'itertools_chain': 'list(itertools.chain(*list_2d))',
}
labels_slow = {'functools_reduce_lambda': 'functools.reduce(lambda x,y: x+y, list_2d)',
'functools_reduce_operator': 'functools.reduce(operator.add, map(list, list_2d))',
'flatten_via_sum': 'sum(list_2d, [])',
'functools_reduce_list_add': 'functools.reduce(list.__add__, (list(sub) for sub in list_2d))'
}
plt.rcParams.update({'font.size': 9})
fig = plt.figure(figsize=(10,8))
plt.title('Performance of different methods for flattening a list of sublists', fontsize=18)
reds = ['#F87217', '#F75D59', '#FF0000', '#C24641']
for lb,col in zip(sorted(labels_slow.keys()), reds):
plt.plot(orders_n, [t**2 for t in timings[lb]], alpha=0.3, label=labels_slow[lb],
marker='o', lw=2, c=col, markersize=8)
blues = ['#000080', '#2B60DE', '#736AFF', '#659EC7', '#46C7C7']
for lb,col in zip(sorted(labels_fast.keys()), blues):
plt.plot(orders_n, [t**2 for t in timings[lb]], alpha=0.3, label=labels_fast[lb],
marker='^', lw=2, c=col, markersize=10)
max_perf = max( s/c for s,c in zip(timings['flatten_via_sum'],
timings['list_comprehension']) )
ftext = 'A list comprehension \n >>[item for sublist '\
'in list_2d for item in sublist]\n'\
'is {:.2f}x faster for flattening a 2D list than \n >>sum(list_2d, [])\nat n = 10000'\
.format(max_perf)
plt.figtext(.4,.15, ftext, fontsize=13, ha='left')
plt.legend(loc='upper left')
plt.grid()
plt.ylabel('Time per computation for 10 loops (in seconds)', fontsize=12)
plt.xlabel('Number of sublists', fontsize=12)
plt.xscale('log')
plt.yscale('log')
plt.show()
In [21]:
print_sysinfo()
plot()
The line plot above indicates that we have 2 different classes of list flattening methods: the first tier is taking linear time (blue shades), where the four other approaches (in shades of red) which have quadratic growth rates and therefore are significantly slower.
Of course, the sum(list_2d, [])
has a very short syntax and might be just convenient to use it for list flattening, but the reason why it is so slow is that it will create a new "intermediate" list for every element (Note that all operations in that are shown in red shades in the plot above follow this same principle).
I would like to suggest to you the nice article "How Not to Flatten a List of Lists in Python" by Mathieu Larose, who described this mechanism very nicely.
Looking at the line plot above, it is actually really hard to tell which of the methods with linear growth rate $O(n)$ (blue shades) is the fastest.
Let us plot timings for $n=10^6$ for the fastest methods only.
In [26]:
import timeit
import random
import copy
import numpy as np
random.seed(12345)
def make_copy(l):
return copy.deepcopy(l)
labels_fast = {
'append_in_for_loop':'.append() in for-loop',
'extend_in_for_loop':'.extend() in for-loop',
'list_comprehension': 'List comprehension',
'itertools_chain_from_iterable': 'list(itertools.chain.from_iterable(list_2d))',
'itertools_chain': 'list(itertools.chain(*list_2d))',
}
fast_names = [f for f in labels_fast.keys()]
fast_times = []
l = [[i for i in range(random.randint(1,10))] for j in range(10**5)]
for f in fast_names:
fast_times.append(min(timeit.Timer('%s(make_copy(l))' %f,
'from __main__ import %s, l, make_copy' %f)
.repeat(repeat=3, number=10)))
In [29]:
plt.rcParams.update({'font.size': 12})
fast_labels = [labels_fast[l] for l in fast_names]
x_pos = np.arange(len(fast_labels))
plt.figure(figsize=(8,7))
plt.bar(x_pos, fast_times, align='center', alpha=0.5)
plt.xticks(x_pos, fast_labels, rotation=90)
plt.axhline(y=min(fast_times), linestyle='--', color='black')
plt.ylabel('Time per computation for 10 loops in seconds for n=10000')
plt.title('List flattening methods with linear growth rates')
plt.ylim(8,9)
plt.show()
In [ ]: