For this benchmark, we will be testing different approaches to determine if a string is a number.
Note that methods like `.isdigit()` and `.isnumeric()` only evaluate whether a string consists of digit characters (i.e., looks like a non-negative integer); e.g., both `'1.2'.isdigit()` and `'1.2'.isnumeric()` return `False`.
In [1]:
def is_number_tryexcept(s):
    """Return True if ``float(s)`` succeeds, False if it raises ValueError."""
    try:
        float(s)
    except ValueError:
        # float() rejected the text, so it is not a number in this sense.
        return False
    else:
        return True
In [2]:
from re import match as re_match
def is_number_regex(s):
    """Return True if the string is a number.

    A string counts as a number when it has the form ``digits.digits``
    (checked with a regular expression) or, failing that, when
    ``s.isdigit()`` is true. Forms such as ``'.1234'`` or ``'1e50'`` match
    neither test and therefore yield False.
    """
    # Raw string literal: "\d" and "\." are regex escapes, not Python string
    # escapes; without the r-prefix they trigger an invalid-escape warning.
    if re_match(r"^\d+?\.\d+?$", s) is None:
        # Not of the form digits.digits -- fall back to the plain integer check.
        return s.isdigit()
    return True
In [3]:
from re import compile as re_compile
# Pattern for "digits.digits", compiled once and reused by every call below.
# Raw string literal: "\d" and "\." are regex escapes, not Python string
# escapes; without the r-prefix they trigger an invalid-escape warning.
comp = re_compile(r"^\d+?\.\d+?$")


def compiled_regex(s):
    """Return True if the string is a number.

    A string counts as a number when it matches the precompiled
    ``digits.digits`` pattern ``comp`` or, failing that, when
    ``s.isdigit()`` is true.
    """
    if comp.match(s) is None:
        # Not of the form digits.digits -- fall back to the plain integer check.
        return s.isdigit()
    return True
In [4]:
def is_number_repl_isdigit(s):
    """Return True if the string is all digits once its first '.' is removed."""
    # Drop at most one decimal point; a second '.' stays and fails isdigit().
    without_first_dot = s.replace('.', '', 1)
    return without_first_dot.isdigit()
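Quick note on why I am importing `re.match` as `re_match`: binding the function directly to its own name avoids the attribute lookup on the `re` module at every call, which decreases the lookup overhead.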
In [5]:
import re
%timeit re_match("^\d+?\.\d+?$", '1.2345')
%timeit re.match("^\d+?\.\d+?$", '1.2345')
In [6]:
# Sample inputs for the sanity check of every candidate function.
a_float = '1.1234'
inv_float = '1.12.34'
no_number = 'abc123'
an_int = '12345'

# The candidate implementations under test.
funcs = [
    is_number_tryexcept,
    is_number_regex,
    compiled_regex,
    is_number_repl_isdigit,
]

# (input, expected verdict) pairs, checked in this order for every function.
expectations = (
    (a_float, True),
    (inv_float, False),
    (no_number, False),
    (an_int, True),
)

for f in funcs:
    for sample, expected in expectations:
        assert f(sample) == expected, 'Error in %s(%s)' % (f.__name__, sample)
print('ok')
In [7]:
# Report which candidates reject a float written without a leading digit.
a_float = '.1234'
print('Float notation ".1234" is not supported by:')
for f in funcs:
    if f(a_float):
        # Accepted by this function -- nothing to report.
        continue
    print('\t -', f.__name__)
In [8]:
# Report which candidates reject numbers written in scientific notation.
scientific1 = '1.000000e+50'
scientific2 = '1e50'

# (headline, sample) pairs; each sample is run through every function.
checks = (
    ('Scientific notation "1.000000e+50" is not supported by:', scientific1),
    ('Scientific notation "1e50" is not supported by:', scientific2),
)
for header, sample in checks:
    print(header)
    for f in funcs:
        if not f(sample):
            print('\t -', f.__name__)
In [9]:
import timeit
test_cases = ['1.12345', '1.12.345', 'abc12345', '12345']

# One list of timings per function name, filled in test_cases order.
times_n = {f.__name__: [] for f in funcs}

# `t` has to stay a module-level name: the timeit setup string imports it
# (together with the function under test) from __main__.
for t in test_cases:
    for f in funcs:
        func_name = f.__name__
        timer = timeit.Timer('%s(t)' % func_name,
                             'from __main__ import %s, t' % func_name)
        # Keep the best of three runs of one million calls each.
        best = min(timer.repeat(repeat=3, number=1000000))
        times_n[func_name].append(best)
In [10]:
%matplotlib inline
In [11]:
import platform
import multiprocessing
def print_sysinfo():
    """Print Python, compiler, OS, hardware and interpreter details to stdout."""
    # (label, getter) pairs; each getter is called right before its line is
    # printed, so the lines come out one by one in this order.
    rows = (
        ('\nPython version :', platform.python_version),
        ('compiler :', platform.python_compiler),
        ('\nsystem :', platform.system),
        ('release :', platform.release),
        ('machine :', platform.machine),
        ('processor :', platform.processor),
        ('CPU count :', multiprocessing.cpu_count),
        ('interpreter:', lambda: platform.architecture()[0]),
    )
    for label, getter in rows:
        print(label, getter())
    print('\n\n')
In [31]:
from numpy import arange
import matplotlib.pyplot as plt
def plot():
    """Show a grouped bar chart of the benchmark timings.

    Reads the module-level ``times_n`` (function name -> list of timings)
    and ``test_cases``. Draws one group of bars per test case with one bar
    per benchmarked function, then displays the figure.
    """
    # (key into times_n, legend text, bar colour) per benchmarked function.
    series = [
        ('is_number_tryexcept', 'Try-except method', 'c'),
        ('is_number_regex', 'Regular expression', 'b'),
        ('compiled_regex', 'Compiled regular expression', 'g'),
        ('is_number_repl_isdigit', 'replace-isdigit method', 'r'),
    ]
    x_labels = [
        'float: "1.12345"',
        'invalid float: "1.12.345"',
        'no number: "abc12345"',
        'integer: "12345"',
    ]

    plt.rcParams.update({'font.size': 12})

    ind = arange(len(test_cases))  # the x locations for the groups
    width = 0.2
    fig = plt.figure(figsize=(12, 8))
    ax = fig.add_subplot(111)

    # Shift each function's bars by one bar width so they sit side by side.
    for slot, (key, legend, colour) in enumerate(series):
        ax.bar(ind + slot * width,
               times_n[key],
               width,
               alpha=0.5,
               color=colour,
               label=legend)

    plt.grid()
    ax.set_ylabel('time in microseconds')
    ax.set_title('Methods for determening if a string is a number')
    ax.set_xticks(ind + width)
    ax.set_xticklabels(x_labels)
    plt.legend(loc='upper right', fontsize=13)
    plt.xlim([-0.2, 3.8])
    plt.ylim([0, 2])
    plt.show()
In [32]:
print_sysinfo()
plot()
The try-except approach appears to be slower than the replace-isdigit method for cases where the string is not a number - executing the except-block seems to be very costly. However, we have to consider that it also works for special cases like scientific notation (e.g., "1.000000e+50" or "1e50"), which the replace-isdigit method rejects.
So it really depends on the data set which method to choose - the more thorough "try-except" or the faster "replace-isdigit".