In [1]:
import string
from functools import partial

In [2]:
# Size of the ASCII code space: 1 << 7 == 128 code points (0 through 127).
N_ASCII_CHARACTERS = 1 << 7

In [3]:
string.printable


Out[3]:
'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

`string.printable` includes whitespace control characters that I don't want, such as '\x0b' (vertical tab), so I build my own set of good characters.


In [6]:
# Collect every ASCII code point that str.isprintable() accepts.
s = {chr(c) for c in range(0, N_ASCII_CHARACTERS) if chr(c).isprintable()}
# Display the count and the members in code-point order.
len(s), ''.join(sorted(s))


Out[6]:
(95,
 ' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~')

In [5]:
# Alternative: keep only the string.printable characters that lie between
# space and tilde (inclusive); the result keeps string.printable's ordering.
s = ''.join(c for c in string.printable if ord(' ') <= ord(c) <= ord('~'))
len(s), s


Out[5]:
(95,
 '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ ')

In [6]:
# The characters to keep: space through tilde (inclusive), plus tab.
good_characters = {chr(c) for c in range(ord(' '), ord('~')+1)} | {'\t'}

In [7]:
len(good_characters), good_characters


Out[7]:
(96,
 {'\t',
  ' ',
  '!',
  '"',
  '#',
  '$',
  '%',
  '&',
  "'",
  '(',
  ')',
  '*',
  '+',
  ',',
  '-',
  '.',
  '/',
  '0',
  '1',
  '2',
  '3',
  '4',
  '5',
  '6',
  '7',
  '8',
  '9',
  ':',
  ';',
  '<',
  '=',
  '>',
  '?',
  '@',
  'A',
  'B',
  'C',
  'D',
  'E',
  'F',
  'G',
  'H',
  'I',
  'J',
  'K',
  'L',
  'M',
  'N',
  'O',
  'P',
  'Q',
  'R',
  'S',
  'T',
  'U',
  'V',
  'W',
  'X',
  'Y',
  'Z',
  '[',
  '\\',
  ']',
  '^',
  '_',
  '`',
  'a',
  'b',
  'c',
  'd',
  'e',
  'f',
  'g',
  'h',
  'i',
  'j',
  'k',
  'l',
  'm',
  'n',
  'o',
  'p',
  'q',
  'r',
  's',
  't',
  'u',
  'v',
  'w',
  'x',
  'y',
  'z',
  '{',
  '|',
  '}',
  '~'})

In [8]:
filename = '20150223-cohpy-memoization.ipynb'

In [9]:
def pass_good_characters(lines):
    """Yield each line with only printable ASCII and tab characters kept.

    A character survives when its code point is strictly between 31 and
    127 (i.e. space through tilde) or when it is a tab.  Everything else,
    including the trailing newline, is dropped.  This variant compares
    ord(c) against hard-coded integer bounds.
    """
    for text in lines:
        kept = (ch for ch in text if 31 < ord(ch) < 127 or ch == '\t')
        yield ''.join(kept)

In [10]:
%timeit list(pass_good_characters(open(filename)))


100 loops, best of 3: 8.78 ms per loop

In [11]:
def pass_good_characters(lines):
    """Yield each line with only printable ASCII and tab characters kept.

    Same filter as the integer-bound version, but the exclusive bounds are
    spelled as ord(' ')-1 and ord('~')+1 and are evaluated for every
    character.  Newlines and all other characters are dropped.
    """
    for text in lines:
        kept = (
            ch for ch in text
            if ord(' ')-1 < ord(ch) < ord('~')+1 or ch == '\t'
        )
        yield ''.join(kept)

In [12]:
%timeit list(pass_good_characters(open(filename)))


100 loops, best of 3: 14.8 ms per loop

In [13]:
def pass_good_characters(lines):
    """Yield each line with only printable ASCII and tab characters kept.

    This variant uses inclusive bounds on the code point:
    ord(' ') <= ord(c) <= ord('~').  Tab is kept as a special case;
    newlines and all other characters are dropped.
    """
    for text in lines:
        kept = (
            ch for ch in text
            if ord(' ') <= ord(ch) <= ord('~') or ch == '\t'
        )
        yield ''.join(kept)

In [14]:
%timeit list(pass_good_characters(open(filename)))


100 loops, best of 3: 13.1 ms per loop

In [15]:
def pass_good_characters(lines):
    """Yield each line with only printable ASCII and tab characters kept.

    This variant compares the one-character strings directly
    (' ' <= c <= '~') instead of going through ord().  Tab is kept as a
    special case; newlines and all other characters are dropped.
    """
    for text in lines:
        kept = (ch for ch in text if ' ' <= ch <= '~' or ch == '\t')
        yield ''.join(kept)

In [16]:
%timeit list(pass_good_characters(open(filename)))


100 loops, best of 3: 7.6 ms per loop

In [17]:
def pass_good_characters(lines):
    """Yield each line with only printable ASCII and tab characters kept.

    This variant first rejects anything above '~', then keeps the
    character if str.isprintable() accepts it or if it is a tab.
    Newlines and all other characters are dropped.
    """
    for text in lines:
        kept = (
            ch for ch in text
            if ch <= '~' and (ch.isprintable() or ch == '\t')
        )
        yield ''.join(kept)

In [18]:
%timeit list(pass_good_characters(open(filename)))


100 loops, best of 3: 8.68 ms per loop

In [19]:
def pass_good_characters(lines):
    """Yield each line with only printable ASCII and tab characters kept.

    Same predicate as the str.isprintable() generator version, but applied
    through filter() with a lambda that is created anew for each line.
    """
    for text in lines:
        kept = filter(
            lambda ch: ch <= '~' and (ch.isprintable() or ch == '\t'),
            text)
        yield ''.join(kept)

In [20]:
%timeit list(pass_good_characters(open(filename)))


100 loops, best of 3: 10.2 ms per loop

In [21]:
def pass_good_characters(lines):
    """Yield each line with only printable ASCII and tab characters kept.

    This variant builds the allowed characters once as a *list* (space
    through tilde, then tab) and tests each character for membership in
    that list.
    """
    allowed = list(map(chr, range(ord(' '), ord('~') + 1)))
    allowed.append('\t')
    for text in lines:
        kept = (ch for ch in text if ch in allowed)
        yield ''.join(kept)

In [22]:
%timeit list(pass_good_characters(open(filename)))


10 loops, best of 3: 37.4 ms per loop

In [23]:
def pass_good_characters(lines):
    """Yield each line with only printable ASCII and tab characters kept.

    This variant builds the allowed characters once as a single *string*
    (space through tilde, then tab) and tests each character with the
    string ``in`` operator.
    """
    allowed = ''.join(map(chr, range(ord(' '), ord('~') + 1))) + '\t'
    for text in lines:
        kept = (ch for ch in text if ch in allowed)
        yield ''.join(kept)

In [24]:
%timeit list(pass_good_characters(open(filename)))


100 loops, best of 3: 6.95 ms per loop

In [25]:
def pass_good_characters(lines):
    """Yield each line with only printable ASCII and tab characters kept.

    This variant builds the allowed characters once as a *set* (space
    through tilde, plus tab) and tests each character for set membership.
    """
    allowed = set(map(chr, range(ord(' '), ord('~') + 1)))
    allowed.add('\t')
    for text in lines:
        kept = (ch for ch in text if ch in allowed)
        yield ''.join(kept)

In [26]:
%timeit list(pass_good_characters(open(filename)))


100 loops, best of 3: 6.23 ms per loop

In [27]:
def pass_good_characters(lines):
    """Yield each line with only printable ASCII and tab characters kept.

    Set-membership filter as in the plain set version, but the per-line
    results are produced by a generator expression that is delegated to
    with ``yield from`` instead of an explicit for loop.
    """
    allowed = set(map(chr, range(ord(' '), ord('~') + 1))) | {'\t'}
    cleaned = (
        ''.join(ch for ch in text if ch in allowed)
        for text in lines
    )
    yield from cleaned

In [28]:
%timeit list(pass_good_characters(open(filename)))


100 loops, best of 3: 6.19 ms per loop

In [29]:
def pass_good_characters(lines):
    """Yield each line with only printable ASCII and tab characters kept.

    The allowed set is derived from str.isprintable() over the first
    N_ASCII_CHARACTERS code points, with tab added afterwards; each
    character is then tested for set membership.
    """
    ascii_chars = map(chr, range(N_ASCII_CHARACTERS))
    allowed = set(filter(str.isprintable, ascii_chars))
    allowed.add('\t')
    for text in lines:
        kept = (ch for ch in text if ch in allowed)
        yield ''.join(kept)

In [30]:
%timeit list(pass_good_characters(open(filename)))


100 loops, best of 3: 6.1 ms per loop

In [31]:
def pass_good_characters(lines):
    """Yield each line with only printable ASCII and tab characters kept.

    The allowed set is built in a single pass over the first
    N_ASCII_CHARACTERS code points, admitting a character when
    str.isprintable() accepts it or when it is a tab; each input
    character is then tested for set membership.
    """
    allowed = set()
    for code in range(N_ASCII_CHARACTERS):
        candidate = chr(code)
        if candidate.isprintable() or candidate == '\t':
            allowed.add(candidate)
    for text in lines:
        kept = (ch for ch in text if ch in allowed)
        yield ''.join(kept)

In [32]:
%timeit list(pass_good_characters(open(filename)))


100 loops, best of 3: 6 ms per loop

In [33]:
def pass_good_characters(lines):
    """Yield each line with only printable ASCII and tab characters kept.

    Set-membership filter applied through filter() with a lambda (created
    anew for each line) rather than a generator expression.
    """
    allowed = set(map(chr, range(ord(' '), ord('~') + 1)))
    allowed.add('\t')
    for text in lines:
        kept = filter(lambda ch: ch in allowed, text)
        yield ''.join(kept)

In [34]:
%timeit list(pass_good_characters(open(filename)))


100 loops, best of 3: 7.69 ms per loop

In [35]:
class MyStringIO():
    """Minimal in-memory text stream used to exercise line-based filters.

    The whole content lives in ``self.s`` and ``self.i`` is the read
    cursor.  Iterating yields one line at a time (each including its
    trailing newline when present).  ``write`` always appends to the end
    of the content and never moves the read cursor.
    """

    def __init__(self, s=''):
        self.s = s  # full text content
        self.i = 0  # index of the next unread character

    def __iter__(self):
        """Return self; the object is its own line iterator."""
        return self

    def __next__(self):
        """Return the next line, or raise StopIteration when nothing is left."""
        start = self.i
        if start >= len(self.s):
            raise StopIteration
        # A line ends just after the next newline, or at the end of the text
        # when the final line has no trailing newline.
        newline_at = self.s.find('\n', start)
        stop = len(self.s) if newline_at < 0 else newline_at + 1
        self.i = stop
        return self.s[start:stop]

    def read(self, n=-1):
        """Return up to ``n`` characters from the cursor and advance past them.

        A negative ``n`` or ``None`` reads everything that remains (the
        convention used by io.StringIO.read).  Returns '' once the cursor
        is at the end of the content.
        """
        remaining = len(self.s) - self.i
        # Clamp so the cursor can never move before the start or past the
        # end; a raw negative n would otherwise corrupt the cursor.
        if n is None or n < 0 or n > remaining:
            n = remaining
        chunk = self.s[self.i:self.i + n]
        self.i += n
        return chunk

    def write(self, s):
        """Append ``s`` to the end of the content; the cursor is unchanged."""
        self.s += s

In [36]:
s = 'hello\nwo\1\200\trld\n'

In [37]:
f = MyStringIO(s)
f.write('peas\n')
f


Out[37]:
<__main__.MyStringIO at 0xb215176c>

In [38]:
for i, line in enumerate(f):
    print(i, repr(line))


0 'hello\n'
1 'wo\x01\x80\trld\n'
2 'peas\n'

In [39]:
f = MyStringIO(s)
f.write('peas\n')
f


Out[39]:
<__main__.MyStringIO at 0xb214b46c>

In [40]:
for i, line in enumerate(pass_good_characters(f)):
    print(i, repr(line))


0 'hello'
1 'wo\trld'
2 'peas'

In [41]:
# Print str.isprintable() for every code point in
# range(2 * N_ASCII_CHARACTERS): the ASCII range plus an equally sized
# range of code points directly above it.
for i in range(2 * N_ASCII_CHARACTERS):
    c = chr(i)
    print(i, repr(c), c.isprintable())


0 '\x00' False
1 '\x01' False
2 '\x02' False
3 '\x03' False
4 '\x04' False
5 '\x05' False
6 '\x06' False
7 '\x07' False
8 '\x08' False
9 '\t' False
10 '\n' False
11 '\x0b' False
12 '\x0c' False
13 '\r' False
14 '\x0e' False
15 '\x0f' False
16 '\x10' False
17 '\x11' False
18 '\x12' False
19 '\x13' False
20 '\x14' False
21 '\x15' False
22 '\x16' False
23 '\x17' False
24 '\x18' False
25 '\x19' False
26 '\x1a' False
27 '\x1b' False
28 '\x1c' False
29 '\x1d' False
30 '\x1e' False
31 '\x1f' False
32 ' ' True
33 '!' True
34 '"' True
35 '#' True
36 '$' True
37 '%' True
38 '&' True
39 "'" True
40 '(' True
41 ')' True
42 '*' True
43 '+' True
44 ',' True
45 '-' True
46 '.' True
47 '/' True
48 '0' True
49 '1' True
50 '2' True
51 '3' True
52 '4' True
53 '5' True
54 '6' True
55 '7' True
56 '8' True
57 '9' True
58 ':' True
59 ';' True
60 '<' True
61 '=' True
62 '>' True
63 '?' True
64 '@' True
65 'A' True
66 'B' True
67 'C' True
68 'D' True
69 'E' True
70 'F' True
71 'G' True
72 'H' True
73 'I' True
74 'J' True
75 'K' True
76 'L' True
77 'M' True
78 'N' True
79 'O' True
80 'P' True
81 'Q' True
82 'R' True
83 'S' True
84 'T' True
85 'U' True
86 'V' True
87 'W' True
88 'X' True
89 'Y' True
90 'Z' True
91 '[' True
92 '\\' True
93 ']' True
94 '^' True
95 '_' True
96 '`' True
97 'a' True
98 'b' True
99 'c' True
100 'd' True
101 'e' True
102 'f' True
103 'g' True
104 'h' True
105 'i' True
106 'j' True
107 'k' True
108 'l' True
109 'm' True
110 'n' True
111 'o' True
112 'p' True
113 'q' True
114 'r' True
115 's' True
116 't' True
117 'u' True
118 'v' True
119 'w' True
120 'x' True
121 'y' True
122 'z' True
123 '{' True
124 '|' True
125 '}' True
126 '~' True
127 '\x7f' False
128 '\x80' False
129 '\x81' False
130 '\x82' False
131 '\x83' False
132 '\x84' False
133 '\x85' False
134 '\x86' False
135 '\x87' False
136 '\x88' False
137 '\x89' False
138 '\x8a' False
139 '\x8b' False
140 '\x8c' False
141 '\x8d' False
142 '\x8e' False
143 '\x8f' False
144 '\x90' False
145 '\x91' False
146 '\x92' False
147 '\x93' False
148 '\x94' False
149 '\x95' False
150 '\x96' False
151 '\x97' False
152 '\x98' False
153 '\x99' False
154 '\x9a' False
155 '\x9b' False
156 '\x9c' False
157 '\x9d' False
158 '\x9e' False
159 '\x9f' False
160 '\xa0' False
161 '¡' True
162 '¢' True
163 '£' True
164 '¤' True
165 '¥' True
166 '¦' True
167 '§' True
168 '¨' True
169 '©' True
170 'ª' True
171 '«' True
172 '¬' True
173 '\xad' False
174 '®' True
175 '¯' True
176 '°' True
177 '±' True
178 '²' True
179 '³' True
180 '´' True
181 'µ' True
182 '¶' True
183 '·' True
184 '¸' True
185 '¹' True
186 'º' True
187 '»' True
188 '¼' True
189 '½' True
190 '¾' True
191 '¿' True
192 'À' True
193 'Á' True
194 'Â' True
195 'Ã' True
196 'Ä' True
197 'Å' True
198 'Æ' True
199 'Ç' True
200 'È' True
201 'É' True
202 'Ê' True
203 'Ë' True
204 'Ì' True
205 'Í' True
206 'Î' True
207 'Ï' True
208 'Ð' True
209 'Ñ' True
210 'Ò' True
211 'Ó' True
212 'Ô' True
213 'Õ' True
214 'Ö' True
215 '×' True
216 'Ø' True
217 'Ù' True
218 'Ú' True
219 'Û' True
220 'Ü' True
221 'Ý' True
222 'Þ' True
223 'ß' True
224 'à' True
225 'á' True
226 'â' True
227 'ã' True
228 'ä' True
229 'å' True
230 'æ' True
231 'ç' True
232 'è' True
233 'é' True
234 'ê' True
235 'ë' True
236 'ì' True
237 'í' True
238 'î' True
239 'ï' True
240 'ð' True
241 'ñ' True
242 'ò' True
243 'ó' True
244 'ô' True
245 'õ' True
246 'ö' True
247 '÷' True
248 'ø' True
249 'ù' True
250 'ú' True
251 'û' True
252 'ü' True
253 'ý' True
254 'þ' True
255 'ÿ' True

In [42]:
string.digits


Out[42]:
'0123456789'

In [43]:
string.ascii_letters


Out[43]:
'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'

In [44]:
string.punctuation


Out[44]:
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [45]:
# Build a set of good characters from the string module's constants:
# tab, space, digits, ASCII letters and punctuation.
s = set('\t' + ' ' + string.digits + string.ascii_letters + string.punctuation)
# Display how many distinct characters that gives.
len(s)


Out[45]:
96