In [1]:
    
import string
from functools import partial
    
In [2]:
    
# 7-bit ASCII has 2**7 == 128 code points (0 through 127).
N_ASCII_CHARACTERS = 2 ** 7
    
In [3]:
    
string.printable
    
    Out[3]:
string.printable includes characters that are not actually printable and that I don't want, such as '\x0b' (vertical tab), so I build my own set of good characters.
In [6]:
    
# ASCII characters that str.isprintable() accepts; displayed as (count, sorted text).
s = set(filter(str.isprintable, map(chr, range(N_ASCII_CHARACTERS))))
len(s), ''.join(sorted(s))
    
    Out[6]:
In [5]:
    
# Alternative: keep only the members of string.printable that lie between
# ' ' and '~' (inclusive), in string.printable's own order.
s = ''.join(filter(lambda ch: ' ' <= ch <= '~', string.printable))
len(s), s
    
    Out[5]:
In [6]:
    
# Allowed characters: every code point from ' ' through '~', plus the tab.
good_characters = set(map(chr, range(ord(' '), ord('~') + 1)))
good_characters.add('\t')
    
In [7]:
    
len(good_characters), good_characters
    
    Out[7]:
In [8]:
    
# Input file for the %timeit runs below. The path is relative, so it is
# resolved against the kernel's current working directory.
filename = '20150223-cohpy-memoization.ipynb'
    
In [9]:
    
def pass_good_characters(lines):
    """Yield each line with everything but printable ASCII and tab removed.

    Benchmark variant: compares ord() of each character against the integer
    literals 31 and 127, i.e. one below ' ' (32) and one above '~' (126).
    Newlines are outside that range, so line endings are dropped as well.
    """
    for text in lines:
        yield ''.join(ch for ch in text if 31 < ord(ch) < 127 or ch == '\t')
    
In [10]:
    
%timeit list(pass_good_characters(open(filename)))
    
    
In [11]:
    
def pass_good_characters(lines):
    """Yield each line with everything but printable ASCII and tab removed.

    Benchmark variant: same exclusive-bound comparison on ord(), but the
    bounds are computed as ord(' ')-1 and ord('~')+1 for every character
    instead of being written as integer literals.
    Newlines are outside that range, so line endings are dropped as well.
    """
    for text in lines:
        yield ''.join(
            ch
            for ch in text
            if ord(' ')-1 < ord(ch) < ord('~')+1 or ch == '\t'
        )
    
In [12]:
    
%timeit list(pass_good_characters(open(filename)))
    
    
In [13]:
    
def pass_good_characters(lines):
    """Yield each line with everything but printable ASCII and tab removed.

    Benchmark variant: inclusive comparison of ord() values,
    ord(' ') <= ord(c) <= ord('~'), evaluated for every character.
    Newlines are outside that range, so line endings are dropped as well.
    """
    for text in lines:
        yield ''.join(
            ch
            for ch in text
            if ord(' ') <= ord(ch) <= ord('~') or ch == '\t'
        )
    
In [14]:
    
%timeit list(pass_good_characters(open(filename)))
    
    
In [15]:
    
def pass_good_characters(lines):
    """Yield each line with everything but printable ASCII and tab removed.

    Benchmark variant: compares the one-character strings directly
    (' ' <= c <= '~') without calling ord().
    Newlines are outside that range, so line endings are dropped as well.
    """
    for text in lines:
        yield ''.join(ch for ch in text if ' ' <= ch <= '~' or ch == '\t')
    
In [16]:
    
%timeit list(pass_good_characters(open(filename)))
    
    
In [17]:
    
def pass_good_characters(lines):
    """Yield each line with everything but printable ASCII and tab removed.

    Benchmark variant: an upper-bound string comparison (c <= '~') combined
    with str.isprintable(), with tab allowed explicitly.
    Newlines are not printable, so line endings are dropped as well.
    """
    for text in lines:
        yield ''.join(
            ch
            for ch in text
            if ch <= '~' and (ch.isprintable() or ch == '\t')
        )
    
In [18]:
    
%timeit list(pass_good_characters(open(filename)))
    
    
In [19]:
    
def pass_good_characters(lines):
    """Yield each line with everything but printable ASCII and tab removed.

    Benchmark variant: the isprintable() test is applied through filter()
    with a lambda (created anew for each line) instead of a generator
    expression.
    Newlines are not printable, so line endings are dropped as well.
    """
    for text in lines:
        yield ''.join(
            filter(
                lambda ch: ch <= '~' and (ch.isprintable() or ch == '\t'),
                text,
            )
        )
    
In [20]:
    
%timeit list(pass_good_characters(open(filename)))
    
    
In [21]:
    
def pass_good_characters(lines):
    """Yield each line with everything but printable ASCII and tab removed.

    Benchmark variant: membership test against a *list* of the allowed
    characters, built once per call.
    Newlines are not in the list, so line endings are dropped as well.
    """
    allowed = list(map(chr, range(ord(' '), ord('~') + 1))) + ['\t']
    for text in lines:
        yield ''.join(ch for ch in text if ch in allowed)
    
In [22]:
    
%timeit list(pass_good_characters(open(filename)))
    
    
In [23]:
    
def pass_good_characters(lines):
    """Yield each line with everything but printable ASCII and tab removed.

    Benchmark variant: membership test against a single *str* holding the
    allowed characters, built once per call.
    Newlines are not in that string, so line endings are dropped as well.
    """
    allowed = ''.join(map(chr, range(ord(' '), ord('~') + 1))) + '\t'
    for text in lines:
        yield ''.join(ch for ch in text if ch in allowed)
    
In [24]:
    
%timeit list(pass_good_characters(open(filename)))
    
    
In [25]:
    
def pass_good_characters(lines):
    """Yield each line with everything but printable ASCII and tab removed.

    Benchmark variant: membership test against a *set* of the allowed
    characters, built once per call.
    Newlines are not in the set, so line endings are dropped as well.
    """
    allowed = set(map(chr, range(ord(' '), ord('~') + 1))) | {'\t'}
    for text in lines:
        yield ''.join(ch for ch in text if ch in allowed)
    
In [26]:
    
%timeit list(pass_good_characters(open(filename)))
    
    
In [27]:
    
def pass_good_characters(lines):
    """Yield each line with everything but printable ASCII and tab removed.

    Benchmark variant: set membership as before, but the per-line loop is a
    generator expression delegated to with ``yield from`` rather than an
    explicit ``for`` statement.
    Newlines are not in the set, so line endings are dropped as well.
    """
    allowed = set(map(chr, range(ord(' '), ord('~') + 1))) | {'\t'}
    yield from (
        ''.join(ch for ch in text if ch in allowed)
        for text in lines
    )
    
In [28]:
    
%timeit list(pass_good_characters(open(filename)))
    
    
In [29]:
    
def pass_good_characters(lines):
    """Yield each line reduced to the characters in the allowed set.

    Benchmark variant: the allowed set is derived with str.isprintable()
    over the first N_ASCII_CHARACTERS code points, and tab is unioned in
    afterwards. Membership testing per character is unchanged.
    """
    allowed = set(
        filter(str.isprintable, map(chr, range(N_ASCII_CHARACTERS)))
    ) | {'\t'}
    for text in lines:
        yield ''.join(ch for ch in text if ch in allowed)
    
In [30]:
    
%timeit list(pass_good_characters(open(filename)))
    
    
In [31]:
    
def pass_good_characters(lines):
    """Yield each line reduced to the characters in the allowed set.

    Benchmark variant: the allowed set is derived with str.isprintable()
    over the first N_ASCII_CHARACTERS code points, with tab admitted inside
    the comprehension's condition rather than by a set union.
    """
    allowed = {
        ch
        for ch in map(chr, range(N_ASCII_CHARACTERS))
        if ch.isprintable() or ch == '\t'
    }
    for text in lines:
        yield ''.join(ch for ch in text if ch in allowed)
    
In [32]:
    
%timeit list(pass_good_characters(open(filename)))
    
    
In [33]:
    
def pass_good_characters(lines):
    """Yield each line with everything but printable ASCII and tab removed.

    Benchmark variant: set membership applied through filter() with a
    lambda (created anew for each line) instead of a generator expression.
    Newlines are not in the set, so line endings are dropped as well.
    """
    allowed = set(map(chr, range(ord(' '), ord('~') + 1))) | {'\t'}
    for text in lines:
        yield ''.join(filter(lambda ch: ch in allowed, text))
    
In [34]:
    
%timeit list(pass_good_characters(open(filename)))
    
    
In [35]:
    
class MyStringIO:
    """Minimal in-memory text stream for exercising line-based filters.

    The whole text lives in ``self.s`` and ``self.i`` is the read cursor.
    Iterating the object yields lines, each keeping its trailing newline
    when it has one. ``write`` appends to the text and leaves the read
    cursor where it is.
    """

    def __init__(self, s=''):
        self.s = s  # everything written so far
        self.i = 0  # index of the next character read() will return

    def __repr__(self):
        # Show the contents so that displaying an instance in a notebook
        # cell is informative.
        return '{}({!r})'.format(type(self).__name__, self.s)

    def __iter__(self):
        return self

    def __next__(self):
        """Return the next line, reading one character at a time.

        Raises:
            StopIteration: when no characters remain at the cursor.
        """
        chars = []
        # iter(callable, sentinel) keeps calling read(1) until it returns ''.
        for c in iter(partial(self.read, 1), ''):
            chars.append(c)
            if c == '\n':
                break
        if not chars:
            raise StopIteration
        return ''.join(chars)

    def read(self, n=-1):
        """Return up to ``n`` characters from the cursor and advance past them.

        A negative ``n`` or ``None`` reads everything that remains.
        Previously a negative ``n`` sliced from the end of the text and
        moved the cursor backwards. Returns '' once the cursor is at the
        end of the text.
        """
        if n is None or n < 0:
            n = len(self.s) - self.i
        start = self.i
        # Clamp so the cursor never runs past the end of the text.
        self.i = min(start + n, len(self.s))
        return self.s[start:self.i]

    def write(self, s):
        """Append ``s`` to the stored text; the read cursor is not moved."""
        self.s += s
    
In [36]:
    
# Sample text: '\1' and '\200' are octal escapes for U+0001 and U+0080, two
# characters outside the ' '..'~' range; the tab ('\t') is one the filters keep.
s = 'hello\nwo\1\200\trld\n'
    
In [37]:
    
# Build a stream from the sample text, then append one more line via write().
f = MyStringIO(s)
f.write('peas\n')
f
    
    Out[37]:
In [38]:
    
# Iterate the unfiltered stream; repr() makes the control characters visible.
for i, line in enumerate(f):
    print(i, repr(line))
    
    
In [39]:
    
# Rebuild the stream: iterating the earlier instance left its read cursor at
# the end of the text, so it would yield nothing more.
f = MyStringIO(s)
f.write('peas\n')
f
    
    Out[39]:
In [40]:
    
# Same loop, but through the filter. pass_good_characters is whichever
# definition was executed most recently in the kernel.
for i, line in enumerate(pass_good_characters(f)):
    print(i, repr(line))
    
    
In [41]:
    
# Show str.isprintable() for code points 0 through 2 * N_ASCII_CHARACTERS - 1,
# i.e. the ASCII range followed by an equally sized range above it.
for i in range(2 * N_ASCII_CHARACTERS):
    c = chr(i)
    print(i, repr(c), c.isprintable())
    
    
In [42]:
    
string.digits
    
    Out[42]:
In [43]:
    
string.ascii_letters
    
    Out[43]:
In [44]:
    
string.punctuation
    
    Out[44]:
In [45]:
    
# The same allowed set assembled from the string module's named constants:
# tab and space plus digits, ASCII letters and punctuation.
s = set('\t ').union(string.digits, string.ascii_letters, string.punctuation)
len(s)
    
    Out[45]: