In [ ]:
import os
import os.path
import glob
In [ ]:
from os.path import isdir

# Scratch directory used by the cells below.
basedir = "/tmp/course"
if not isdir(basedir):
    # First run: create the directory (and any missing parents).
    # `os` is imported in the first cell of the notebook.
    os.makedirs(basedir)
Als das Kind Kind war, ging es mit hängenden Armen, wollte der Bach sei ein Fluß, der Fluß sei ein Strom, und diese Pfütze das Meer. Als das Kind Kind war, wußte es nicht, daß es Kind war, alles war ihm beseelt, und alle Seelen waren eins. |
characters were bytes, and strings were lists of bytes." |
When the child was a child, it walked with its arms swinging, wanted the brook to be a river, the river to be a torrent, and this puddle to be the sea. When the child was a child, it didn’t know that it was a child, everything was soulful, and all souls were one. |
Encoding is a map from typographical characters to byte sequences.
Decoding is the inverse map, from byte sequences back to characters.
char -> | utf8 | cp1252 | ascii |
---|---|---|---|
y -> | [121] | [121] | [121] |
z -> | [122] | [122] | [122] |
{ -> | [123] | [123] | [123] |
¢ -> | [194, 162] | [162] | - |
£ -> | [194, 163] | [163] | - |
¤ -> | [194, 164] | [164] | - |
¥ -> | [194, 165] | [165] | - |
Ɓ -> | [198, 129] | - | - |
Ƃ -> | [198, 130] | - | - |
ƃ -> | [198, 131] | - | - |
In [ ]:
# Py3 doesn't need the 'u' prefix before the string; with the prefix
# the literal is a text (unicode) string on Py2 as well.
the_string = u"S\u00fcd"  # "Süd" (Sued); \u00fc is the code point of "ü"
print(the_string)
In [ ]:
# The same text, the_string ("Süd"), can be encoded into different...
in_utf8 = the_string.encode('utf-8')
in_win = the_string.encode('cp1252')
# ...byte sequences: the results of encode() are bytes, not text.
# isinstance() is the idiomatic type check (it also accepts subclasses).
assert isinstance(in_utf8, bytes)
assert isinstance(in_win, bytes)
In [ ]:
# Show the raw byte sequences side by side: first the UTF-8 encoding,
# then the cp1252 one. repr() makes the individual bytes visible.
for encoded in (in_utf8, in_win):
    print(repr(encoded))
In [ ]:
# Decoding bytes with a different map than the one used to encode them
# gives sad results: the UTF-8 bytes of "ü" (0xC3 0xBC) are two separate
# characters in cp1252, so this prints 'SÃ¼d' instead of 'Süd'.
misdecoded = in_utf8.decode('cp1252')
print(misdecoded)
In [ ]:
# Filenames are actually binary data, so
# we should be careful when our scripts read them,
# e.g. from a vfat filesystem.
# To make Py2 encoding-aware we must enable these __future__ features
# (unicode_literals turns unprefixed string literals into text):
from __future__ import unicode_literals, print_function
# Create 3 Windows-encoded filenames in basedir
# using the function provided by the course module.
from course import create_espana
create_espana(basedir)
In [ ]:
# Just list the newly created files and check that their names
# are not shown correctly (unless we are on Windows :D).
# The leading "!" runs the line as a shell command from IPython;
# {basedir} is replaced with the value of the Python variable.
!dir {basedir}
In [ ]:
from glob import glob as ls
# glob expands wildcards like ls does.
# To avoid an encoding issue like the following one...
files = ls("/tmp/course/*.txt")
# UnicodeDecodeError: 'ascii' codec can't decode
#  byte 0xe9 in position 5:
#  ordinal not in range(128)
# NOTE(review): the original note said "remember ñ in cp1252", but in
#  cp1252 the byte 0xe9 is 'é' ('ñ' is 0xf1) — confirm which file name
#  produced this message.
In [ ]:
# ...we must explicitly pass the pattern as bytes, prefixing the literal with "b".
files = ls(b"/tmp/course/*.txt")
# The file names are then shown as bytes.
print(files)
In [ ]:
# Exercise: don't run this cell!
# Which outcome do you expect from the following instruction?
# Hint: consider the type of the separator '\n' and the type of the
# items in `files` (see the previous cell).
print('\n'.join(files))
In [ ]: