The codecs module provides stream and file interfaces for transcoding data. It is most commonly used to work with Unicode text, but other encodings are also available for other purposes.
In [1]:
import binascii
def to_hex(t, nbytes):
    """Return the hex digits of t as space-separated groups.

    Each group holds the hex representation of nbytes input bytes
    (two hex digits per byte); a shorter remainder, if any, becomes
    the last group. The result is a bytes object.
    """
    digits = binascii.hexlify(t)
    width = 2 * nbytes
    groups = [
        digits[offset:offset + width]
        for offset in range(0, len(digits), width)
    ]
    return b' '.join(groups)
# Group the hex output one byte (two hex digits) at a time.
one_byte_groups = to_hex(b'abcdef', 1)
print(one_byte_groups)
In [2]:
# Group the hex output two bytes (four hex digits) at a time.
two_byte_groups = to_hex(b'abcdef', 2)
print(two_byte_groups)
In [3]:
import unicodedata
text = 'français'
print('Raw : {!r}'.format(text))

# List every character with its Unicode name; the character itself is
# passed as the default, so it is shown when no name is defined.
for c in text:
    char_name = unicodedata.name(c, c)
    print(' {!r}: {}'.format(c, char_name))

# Compare the byte sequences produced by the two encodings: UTF-8 is
# grouped one byte at a time, UTF-16 two bytes at a time.
utf8_hex = to_hex(text.encode('utf-8'), 1)
utf16_hex = to_hex(text.encode('utf-16'), 2)
print('UTF-8 : {!r}'.format(utf8_hex))
print('UTF-16: {!r}'.format(utf16_hex))
In [4]:
# Round-trip the text through UTF-8, reporting each stage as it is
# produced: str -> bytes -> str.
text = 'français'
print('Original :', repr(text))

encoded = text.encode('utf-8')
print('Encoded :', to_hex(encoded, 1), type(encoded))

decoded = encoded.decode('utf-8')
print('Decoded :', repr(decoded), type(decoded))
In [5]:
import codecs
# Names of the byte-order-marker constants to look up on the codecs
# module.
BOM_TYPES = [
    'BOM', 'BOM_BE', 'BOM_LE',
    'BOM_UTF8',
    'BOM_UTF16', 'BOM_UTF16_BE', 'BOM_UTF16_LE',
    'BOM_UTF32', 'BOM_UTF32_BE', 'BOM_UTF32_LE',
]

# Print each marker's bytes as hex, grouped two bytes at a time.
for name in BOM_TYPES:
    marker = getattr(codecs, name)
    print('{:12} : {}'.format(name, to_hex(marker, 2)))
In [6]:
import codecs
# Choose whichever UTF-16 byte order is NOT the native one, judged by
# which explicit BOM constant the generic BOM_UTF16 matches.
native_is_big_endian = codecs.BOM_UTF16 == codecs.BOM_UTF16_BE
if native_is_big_endian:
    bom, encoding = codecs.BOM_UTF16_LE, 'utf_16_le'
else:
    bom, encoding = codecs.BOM_UTF16_BE, 'utf_16_be'

print('Native order :', to_hex(codecs.BOM_UTF16, 2))
print('Selected order:', to_hex(bom, 2))

# Encode the text with the explicitly selected byte order.
encoded_text = 'français'.encode(encoding)
print('{:14}: {}'.format(encoding, to_hex(encoded_text, 2)))

# Because the byte order was named explicitly in the encoding, the
# encoded text carries no byte-order marker of its own; write the
# selected marker by hand, followed by the encoded bytes.
with open('nonnative-encoded.txt', mode='wb') as f:
    f.write(bom)
    f.write(encoded_text)
In [7]:
# Dump the raw bytes first so the leading byte-order marker is visible.
with open('nonnative-encoded.txt', mode='rb') as f:
    raw_bytes = f.read()
print('Raw :', to_hex(raw_bytes, 2))

# Re-open the file as text and let the generic 'utf-16' codec detect
# the byte order from the BOM at the start of the data. The built-in
# open() is used because codecs.open() is deprecated as of Python 3.14;
# newline='' disables newline translation, matching how codecs.open()
# handled the underlying file.
with open('nonnative-encoded.txt',
          mode='r',
          encoding='utf-16',
          newline='',
          ) as f:
    decoded_text = f.read()
print('Decoded:', repr(decoded_text))
In [9]:
import codecs
import sys
text = 'français'

try:
    # Save the data, encoded as ASCII. No ``errors`` argument is
    # passed, so the default 'strict' error handling applies and the
    # non-ASCII character raises UnicodeEncodeError on write.
    # The built-in open() replaces codecs.open(), which is deprecated
    # as of Python 3.14; newline='' disables newline translation,
    # matching how codecs.open() handled the underlying file.
    with open('encode_error.txt', 'w', encoding='ascii', newline='') as f:
        f.write(text)
except UnicodeEncodeError as err:
    print('ERROR:', err)
else:
    # If there was no error writing to the file,
    # show what it contains.
    with open('encode_error.txt', 'rb') as f:
        print('File contents: {!r}'.format(f.read()))