In [1]:
import string
from functools import partial
In [2]:
# ASCII is a 7-bit code, so it has 2**7 == 128 code points.
N_ASCII_CHARACTERS = 2 ** 7
In [3]:
string.printable
Out[3]:
`string.printable` includes whitespace control characters that I don't want, such as '\x0b' (vertical tab), so I build my own set of good characters.
In [6]:
# Printable characters among the first N_ASCII_CHARACTERS code points,
# according to str.isprintable(); displayed as (count, sorted string).
s = set(filter(str.isprintable, map(chr, range(N_ASCII_CHARACTERS))))
len(s), ''.join(sorted(s))
Out[6]:
In [5]:
# Members of string.printable that fall between ' ' and '~' inclusive,
# in string.printable's own order; displayed as (count, string).
s = ''.join(filter(lambda ch: ' ' <= ch <= '~', string.printable))
len(s), s
Out[5]:
In [6]:
# Characters to keep: ' ' through '~' inclusive, plus the tab character.
good_characters = {'\t'}.union(map(chr, range(ord(' '), ord('~') + 1)))
In [7]:
len(good_characters), good_characters
Out[7]:
In [8]:
# File that the %timeit cells open and feed to pass_good_characters.
# The path is relative, so it is resolved against the kernel's working directory.
filename = '20150223-cohpy-memoization.ipynb'
In [9]:
def pass_good_characters(lines):
    """Yield each line with everything but ' '..'~' and tab removed.

    Variant: range test on ord() against literal code points 31 and 127.

    Characters outside the kept set are dropped, which includes a line's
    trailing newline, so the yielded strings carry no line terminator.

    Args:
        lines: Iterable of strings.

    Yields:
        One filtered string per input line.
    """
    for text in lines:
        kept = (
            ch for ch in text
            if 31 < ord(ch) < 127 or ch == '\t'
        )
        yield ''.join(kept)
In [10]:
%timeit list(pass_good_characters(open(filename)))
In [11]:
def pass_good_characters(lines):
    """Yield each line with everything but ' '..'~' and tab removed.

    Variant: exclusive range test on ord(), with the bounds written as
    ord(' ') - 1 and ord('~') + 1 instead of numeric literals.

    Characters outside the kept set are dropped, which includes a line's
    trailing newline.

    Args:
        lines: Iterable of strings.

    Yields:
        One filtered string per input line.
    """
    for text in lines:
        kept = (
            ch for ch in text
            if ord(' ') - 1 < ord(ch) < ord('~') + 1 or ch == '\t'
        )
        yield ''.join(kept)
In [12]:
%timeit list(pass_good_characters(open(filename)))
In [13]:
def pass_good_characters(lines):
    """Yield each line with everything but ' '..'~' and tab removed.

    Variant: inclusive range test ord(' ') <= ord(c) <= ord('~').

    Characters outside the kept set are dropped, which includes a line's
    trailing newline.

    Args:
        lines: Iterable of strings.

    Yields:
        One filtered string per input line.
    """
    for text in lines:
        kept = (
            ch for ch in text
            if ord(' ') <= ord(ch) <= ord('~') or ch == '\t'
        )
        yield ''.join(kept)
In [14]:
%timeit list(pass_good_characters(open(filename)))
In [15]:
def pass_good_characters(lines):
    """Yield each line with everything but ' '..'~' and tab removed.

    Variant: compares the one-character strings directly
    (' ' <= c <= '~') without calling ord().

    Characters outside the kept set are dropped, which includes a line's
    trailing newline.

    Args:
        lines: Iterable of strings.

    Yields:
        One filtered string per input line.
    """
    for text in lines:
        kept = (
            ch for ch in text
            if ' ' <= ch <= '~' or ch == '\t'
        )
        yield ''.join(kept)
In [16]:
%timeit list(pass_good_characters(open(filename)))
In [17]:
def pass_good_characters(lines):
    """Yield each line reduced to printable characters up to '~', plus tab.

    Variant: upper bound c <= '~' combined with str.isprintable(); the tab
    is admitted explicitly because isprintable() rejects it.

    Characters that fail the test are dropped, which includes a line's
    trailing newline.

    Args:
        lines: Iterable of strings.

    Yields:
        One filtered string per input line.
    """
    for text in lines:
        kept = (
            ch for ch in text
            if ch <= '~' and (ch.isprintable() or ch == '\t')
        )
        yield ''.join(kept)
In [18]:
%timeit list(pass_good_characters(open(filename)))
In [19]:
def pass_good_characters(lines):
    """Yield each line reduced to printable characters up to '~', plus tab.

    Variant: filter() with a lambda predicate (created anew for every
    line) instead of a generator expression.

    Characters that fail the test are dropped, which includes a line's
    trailing newline.

    Args:
        lines: Iterable of strings.

    Yields:
        One filtered string per input line.
    """
    for text in lines:
        yield ''.join(filter(
            lambda ch: ch <= '~' and (ch.isprintable() or ch == '\t'),
            text,
        ))
In [20]:
%timeit list(pass_good_characters(open(filename)))
In [21]:
def pass_good_characters(lines):
    """Yield each line with everything but ' '..'~' and tab removed.

    Variant: membership test against a list of the allowed characters,
    built once per call.

    Characters not in the list are dropped, which includes a line's
    trailing newline.

    Args:
        lines: Iterable of strings.

    Yields:
        One filtered string per input line.
    """
    # Deliberately a list: this variant exercises list membership.
    allowed = list(map(chr, range(ord(' '), ord('~') + 1)))
    allowed.append('\t')
    for text in lines:
        yield ''.join(ch for ch in text if ch in allowed)
In [22]:
%timeit list(pass_good_characters(open(filename)))
In [23]:
def pass_good_characters(lines):
    """Yield each line with everything but ' '..'~' and tab removed.

    Variant: membership test against a single string holding the allowed
    characters, built once per call.

    Characters not in the string are dropped, which includes a line's
    trailing newline.

    Args:
        lines: Iterable of strings.

    Yields:
        One filtered string per input line.
    """
    # Deliberately a str: this variant exercises substring membership.
    allowed = ''.join(map(chr, range(ord(' '), ord('~') + 1))) + '\t'
    for text in lines:
        yield ''.join(ch for ch in text if ch in allowed)
In [24]:
%timeit list(pass_good_characters(open(filename)))
In [25]:
def pass_good_characters(lines):
    """Yield each line with everything but ' '..'~' and tab removed.

    Variant: membership test against a set of the allowed characters,
    built once per call.

    Characters not in the set are dropped, which includes a line's
    trailing newline.

    Args:
        lines: Iterable of strings.

    Yields:
        One filtered string per input line.
    """
    allowed = set(map(chr, range(ord(' '), ord('~') + 1)))
    allowed.add('\t')
    for text in lines:
        yield ''.join(ch for ch in text if ch in allowed)
In [26]:
%timeit list(pass_good_characters(open(filename)))
In [27]:
def pass_good_characters(lines):
    """Yield each line with everything but ' '..'~' and tab removed.

    Variant: set membership, with the per-line loop expressed as a
    generator expression delegated to through ``yield from``.

    Characters not in the set are dropped, which includes a line's
    trailing newline.

    Args:
        lines: Iterable of strings.

    Yields:
        One filtered string per input line.
    """
    allowed = {'\t'}.union(map(chr, range(ord(' '), ord('~') + 1)))
    filtered_lines = (
        ''.join(ch for ch in text if ch in allowed)
        for text in lines
    )
    yield from filtered_lines
In [28]:
%timeit list(pass_good_characters(open(filename)))
In [29]:
def pass_good_characters(lines):
    """Yield each line reduced to printable low code points, plus tab.

    Variant: the allowed set is every character below N_ASCII_CHARACTERS
    for which str.isprintable() is true, with the tab added afterwards.

    Characters not in the set are dropped, which includes a line's
    trailing newline.

    Args:
        lines: Iterable of strings.

    Yields:
        One filtered string per input line.
    """
    allowed = set(filter(str.isprintable, map(chr, range(N_ASCII_CHARACTERS))))
    allowed.add('\t')
    for text in lines:
        yield ''.join(ch for ch in text if ch in allowed)
In [30]:
%timeit list(pass_good_characters(open(filename)))
In [31]:
def pass_good_characters(lines):
    """Yield each line reduced to printable low code points, plus tab.

    Variant: the allowed set is built in one pass over the code points
    below N_ASCII_CHARACTERS, admitting a character when it is printable
    or is the tab.

    Characters not in the set are dropped, which includes a line's
    trailing newline.

    Args:
        lines: Iterable of strings.

    Yields:
        One filtered string per input line.
    """
    allowed = set()
    for code in range(N_ASCII_CHARACTERS):
        candidate = chr(code)
        if candidate.isprintable() or candidate == '\t':
            allowed.add(candidate)
    for text in lines:
        yield ''.join(ch for ch in text if ch in allowed)
In [32]:
%timeit list(pass_good_characters(open(filename)))
In [33]:
def pass_good_characters(lines):
    """Yield each line with everything but ' '..'~' and tab removed.

    Variant: set membership applied through filter() with a lambda
    (created anew for every line) instead of a generator expression.

    Characters not in the set are dropped, which includes a line's
    trailing newline.

    Args:
        lines: Iterable of strings.

    Yields:
        One filtered string per input line.
    """
    allowed = set(chr(code) for code in range(ord(' '), ord('~') + 1))
    allowed |= {'\t'}
    for text in lines:
        yield ''.join(filter(lambda ch: ch in allowed, text))
In [34]:
%timeit list(pass_good_characters(open(filename)))
In [35]:
class MyStringIO:
    """Minimal in-memory text stream: sequential reads, line iteration, appends.

    Attributes:
        s: The whole buffer as one string.
        i: Read position, an index into ``s`` kept within ``0..len(s)``.
    """

    def __init__(self, s=''):
        """Start with buffer ``s`` and the read position at its beginning."""
        self.s = s
        self.i = 0

    def __iter__(self):
        """Return self; the object is its own line iterator."""
        return self

    def __next__(self):
        """Return the next line, including its trailing newline if it has one.

        Raises:
            StopIteration: When the read position is at the end of the buffer.
        """
        if self.i >= len(self.s):
            raise StopIteration
        # Locate the end of the line with one search rather than issuing a
        # read(1) call for every character.
        newline_at = self.s.find('\n', self.i)
        line_end = len(self.s) if newline_at < 0 else newline_at + 1
        return self.read(line_end - self.i)

    def read(self, n=-1):
        """Return up to ``n`` characters from the read position and advance it.

        Args:
            n: Maximum number of characters to return. ``None`` or a negative
                value means "everything that is left", like the ``io`` streams.

        Returns:
            The characters read; ``''`` once the end of the buffer is reached.
        """
        if n is None or n < 0:
            # Without this, a negative n would slice the wrong span and move
            # the read position backwards.
            n = len(self.s) - self.i
        chunk = self.s[self.i:self.i + n]
        # Clamp so that reading past the end leaves the position at the end.
        self.i = min(self.i + n, len(self.s))
        return chunk

    def write(self, s):
        """Append ``s`` to the end of the buffer; the read position is unchanged."""
        self.s += s
In [36]:
# Sample input for the stream demos: two newline-terminated lines. '\1' and
# '\200' are octal escapes for U+0001 and U+0080, which pass_good_characters
# drops; the '\t' next to them is in its kept set.
s = 'hello\nwo\1\200\trld\n'
In [37]:
# Build a stream over the sample text, append one more line, and display the object.
f = MyStringIO(s)
f.write('peas\n')
f
Out[37]:
In [38]:
# Show every line the stream yields, numbered from 0, as its repr.
for i, line in enumerate(f):
    print('{} {!r}'.format(i, line))
In [39]:
# Build a fresh stream with the same content: iterating a MyStringIO advances
# its read position, so a stream that has been looped over yields nothing more.
f = MyStringIO(s)
f.write('peas\n')
f
Out[39]:
In [40]:
# Show every line of the stream after filtering, numbered from 0, as its repr.
for i, line in enumerate(pass_good_characters(f)):
    print('{} {!r}'.format(i, line))
In [41]:
# Print str.isprintable() for code points 0 .. 2*N_ASCII_CHARACTERS - 1:
# the ASCII range plus an equally sized range above it.
for i, c in enumerate(map(chr, range(2 * N_ASCII_CHARACTERS))):
    print(i, repr(c), c.isprintable())
In [42]:
string.digits
Out[42]:
In [43]:
string.ascii_letters
Out[43]:
In [44]:
string.punctuation
Out[44]:
In [45]:
# The same character set assembled from the string module's constants:
# tab, space, digits, letters and punctuation. Displayed as its size.
s = set(''.join([
    '\t',
    ' ',
    string.digits,
    string.ascii_letters,
    string.punctuation,
]))
len(s)
Out[45]: