Character Issues


In [1]:
s = 'café'
len(s)


Out[1]:
4

In [3]:
b = s.encode('utf8')
b


Out[3]:
b'caf\xc3\xa9'

In [4]:
len(b)


Out[4]:
5

In [5]:
b.decode('utf8')


Out[5]:
'café'

Byte Essentials


In [6]:
cafe = bytes('café', encoding='utf_8')
cafe


Out[6]:
b'caf\xc3\xa9'

In [7]:
cafe[0]


Out[7]:
99

In [8]:
cafe[:1]


Out[8]:
b'c'

In [12]:
cafe_arr = bytearray(cafe)
cafe_arr


Out[12]:
bytearray(b'caf\xc3\xa9')

In [13]:
cafe_arr[-1:]


Out[13]:
bytearray(b'\xa9')

In [17]:
bytes.fromhex('31 4B CE A9')


Out[17]:
b'1K\xce\xa9'

In [19]:
import array
numbers = array.array('h', [-2, -1, 0, 1, 2])
octets = bytes(numbers)
octets


Out[19]:
b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'

Structs and Memory Views


In [7]:
import struct

In [8]:
fmt = '<3s3sHH'
with open('b_globe.gif', 'rb') as fp:
    img = memoryview(fp.read())

In [9]:
header = img[:10]
header


Out[9]:
<memory at 0x00000056D227C1C8>

In [10]:
bytes(header)


Out[10]:
b'GIF89a\x10\x00\x10\x00'

In [11]:
struct.unpack(fmt, header)


Out[11]:
(b'GIF', b'89a', 16, 16)

In [12]:
del header
del img

Basic Encoders/Decoders


In [13]:
for codec in ['latin_1', 'utf_8', 'utf_16']:
    print(codec, 'El Niño'.encode(codec), sep='\t')


latin_1	b'El Ni\xf1o'
utf_8	b'El Ni\xc3\xb1o'
utf_16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'

Understanding Encode/Decode Problems

Coping with UnicodeEncodeError


In [15]:
city = 'São Paulo'
city.encode('utf_8')


Out[15]:
b'S\xc3\xa3o Paulo'

In [19]:
city.encode('utf_16')


Out[19]:
b'\xff\xfeS\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00'

In [20]:
city.encode('iso8859_1')


Out[20]:
b'S\xe3o Paulo'

In [21]:
city.encode('cp437')


---------------------------------------------------------------------------
UnicodeEncodeError                        Traceback (most recent call last)
<ipython-input-21-768485688c3d> in <module>()
----> 1 city.encode('cp437')

c:\users\langestrst01\appdata\local\continuum\anaconda3\envs\fluentpy\lib\encodings\cp437.py in encode(self, input, errors)
     10 
     11     def encode(self,input,errors='strict'):
---> 12         return codecs.charmap_encode(input,errors,encoding_map)
     13 
     14     def decode(self,input,errors='strict'):

UnicodeEncodeError: 'charmap' codec can't encode character '\xe3' in position 1: character maps to <undefined>

In [22]:
city.encode('cp437', errors='ignore')


Out[22]:
b'So Paulo'

In [23]:
city.encode('cp437', errors='replace')


Out[23]:
b'S?o Paulo'

In [24]:
city.encode('cp437', errors='xmlcharrefreplace')


Out[24]:
b'S&#227;o Paulo'

Coping with UnicodeDecodeError


In [25]:
octets = b'Montr\xe9al'
octets.decode('cp1252')


Out[25]:
'Montréal'

In [26]:
octets.decode('iso8859_7')


Out[26]:
'Montrιal'

In [27]:
octets.decode('koi8_r')


Out[27]:
'MontrИal'

In [28]:
octets.decode('utf_8')


---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-28-f3a91f0d51e5> in <module>()
----> 1 octets.decode('utf_8')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 5: invalid continuation byte

In [29]:
octets.decode('utf_8', errors='replace')


Out[29]:
'Montr�al'

BOM: A Useful Gremlin


In [31]:
u16 = 'El Niño'.encode('utf_16')
u16


Out[31]:
b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'

In [32]:
list(u16)


Out[32]:
[255, 254, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]

In [33]:
u16le = 'El Niño'.encode('utf_16le')
list(u16le)


Out[33]:
[69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]

In [34]:
u16be = 'El Niño'.encode('utf_16be')
list(u16be)


Out[34]:
[0, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111]

Handling Text Files


In [35]:
open('cafe.txt', 'w', encoding='utf_8').write('café')


Out[35]:
4

In [36]:
open('cafe.txt').read()


Out[36]:
'café'

In [37]:
fp = open('cafe.txt', 'w', encoding='utf_8')
fp


Out[37]:
<_io.TextIOWrapper name='cafe.txt' mode='w' encoding='utf_8'>

In [38]:
fp.write('café')


Out[38]:
4

In [39]:
fp.close()

In [40]:
import os

In [41]:
os.stat('cafe.txt').st_size


Out[41]:
5

In [42]:
fp2 = open('cafe.txt')
fp2


Out[42]:
<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='cp1252'>

In [43]:
fp2.read()


Out[43]:
'café'

In [44]:
fp3 = open('cafe.txt', encoding='utf_8')
fp3


Out[44]:
<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='utf_8'>

In [45]:
fp3.read()


Out[45]:
'café'

In [46]:
fp4 = open('cafe.txt', 'rb')

In [47]:
fp4


Out[47]:
<_io.BufferedReader name='cafe.txt'>

In [48]:
fp4.read()


Out[48]:
b'caf\xc3\xa9'

Encoding Defaults: A Madhouse


In [49]:
import sys, locale

In [58]:
expressions = """
    locale.getpreferredencoding()
    type(my_file)
    my_file.encoding
    sys.stdout.isatty()
    sys.stdout.encoding
    sys.stdin.isatty()
    sys.stdin.encoding
    sys.stderr.isatty()
    sys.stderr.encoding
    sys.getdefaultencoding()
    sys.getfilesystemencoding()
    """

In [52]:
my_file = open('dummy', 'w')

In [60]:
for expression in expressions.split():
    value = eval(expression)
    print(expression.rjust(30), '->', repr(value))


 locale.getpreferredencoding() -> 'cp1252'
                 type(my_file) -> <class '_io.TextIOWrapper'>
              my_file.encoding -> 'cp1252'
           sys.stdout.isatty() -> False
           sys.stdout.encoding -> 'UTF-8'
            sys.stdin.isatty() -> False
            sys.stdin.encoding -> 'cp1252'
           sys.stderr.isatty() -> False
           sys.stderr.encoding -> 'UTF-8'
      sys.getdefaultencoding() -> 'utf-8'
   sys.getfilesystemencoding() -> 'mbcs'

In [62]:
value = eval('locale.getpreferredencoding()')

In [64]:
repr(value)


Out[64]:
"'cp1252'"

Normalizing Unicode for Saner Comparisons


In [65]:
s1 = 'café'
s2 = 'cafe\u0301'
s1, s2


Out[65]:
('café', 'café')

In [66]:
len(s1), len(s2)


Out[66]:
(4, 5)

In [67]:
s1 == s2


Out[67]:
False

In [71]:
from unicodedata import normalize

In [73]:
len(normalize('NFC', s1)), len(normalize('NFC', s2))


Out[73]:
(4, 4)

In [74]:
len(normalize('NFD', s1)), len(normalize('NFD', s2))


Out[74]:
(5, 5)

In [75]:
normalize('NFC', s1) == normalize('NFC', s2)


Out[75]:
True

In [76]:
normalize('NFD', s1) == normalize('NFD', s2)


Out[76]:
True

In [81]:
from unicodedata import normalize, name

In [82]:
ohm = '\u2126'

In [85]:
name(ohm)


Out[85]:
'OHM SIGN'

In [86]:
ohm_c = normalize('NFC', ohm)

In [88]:
name(ohm_c)


Out[88]:
'GREEK CAPITAL LETTER OMEGA'

In [89]:
ohm == ohm_c


Out[89]:
False

In [90]:
normalize('NFC', ohm) == normalize('NFC', ohm_c)


Out[90]:
True

In [91]:
from unicodedata import normalize, name

In [92]:
half = '½'
normalize('NFKC', half)


Out[92]:
'1⁄2'

In [94]:
four_squared = '4²'
normalize('NFKC', four_squared)


Out[94]:
'42'

In [95]:
micro = 'µ'
micro_kc = normalize('NFKC', micro)

In [96]:
micro, micro_kc


Out[96]:
('µ', 'μ')

In [97]:
ord(micro), ord(micro_kc)


Out[97]:
(181, 956)

In [98]:
name(micro), name(micro_kc)


Out[98]:
('MICRO SIGN', 'GREEK SMALL LETTER MU')

Case Folding


In [12]:
from unicodedata import name
micro = 'µ'
name(micro)


Out[12]:
'MICRO SIGN'

In [14]:
micro_cf = micro.casefold()
name(micro_cf)


Out[14]:
'GREEK SMALL LETTER MU'

In [16]:
micro, micro_cf


Out[16]:
('µ', 'μ')

In [17]:
eszett = 'ß'
name(eszett)


Out[17]:
'LATIN SMALL LETTER SHARP S'

In [19]:
eszett_cf = eszett.casefold()
eszett, eszett_cf


Out[19]:
('ß', 'ss')

In [26]:
name(eszett_cf[1])


Out[26]:
'LATIN SMALL LETTER S'

Utility Functions for Normalized Text Matching


In [34]:
s1 = 'café'
s2 = 'cafe\u0301'
s1 == s2


Out[34]:
False

In [35]:
from unicodedata import normalize

def nfc_equal(str1, str2):
    """Return True when str1 and str2 are canonically equivalent.

    Both arguments are NFC-normalized before comparison, so composed
    and decomposed spellings of the same text compare equal.
    """
    left = normalize('NFC', str1)
    right = normalize('NFC', str2)
    return left == right

def fold_equal(str1, str2):
    """Case-insensitive comparison of NFC-normalized strings.

    Uses str.casefold(), so e.g. 'ß' matches 'ss' and 'µ' (micro sign)
    matches 'μ' (Greek mu), in addition to plain upper/lower matches.
    """
    def canon(s):
        # Normalize first, then casefold, so decomposed input folds correctly.
        return normalize('NFC', s).casefold()
    return canon(str1) == canon(str2)

In [36]:
nfc_equal(s1, s2)


Out[36]:
True

In [38]:
nfc_equal('A','a')


Out[38]:
False

In [39]:
s3 = 'Straße'
s4 = 'strasse'
s3 == s4


Out[39]:
False

In [40]:
nfc_equal(s3, s4)


Out[40]:
False

In [41]:
fold_equal(s3, s4)


Out[41]:
True

In [42]:
fold_equal(s1, s2)


Out[42]:
True

In [43]:
fold_equal('A','a')


Out[43]:
True

Extreme "normalization": Taking Out Diacritics


In [44]:
import unicodedata
import string

def shave_marks(txt):
    """Strip all combining (diacritic) marks from txt.

    Decomposes to NFD so accents become separate combining characters,
    drops every combining character, then recomposes to NFC. Applies to
    all scripts — Greek and Cyrillic letters lose their accents too.
    """
    decomposed = unicodedata.normalize('NFD', txt)
    base_chars = [c for c in decomposed if not unicodedata.combining(c)]
    return unicodedata.normalize('NFC', ''.join(base_chars))

In [45]:
order = '“ Herr Voß: • ½ cup of Œtker ™ caffè latte • bowl of açaí.”'

In [46]:
shave_marks(order)


Out[46]:
'“ Herr Voß: • ½ cup of Œtker ™ caffe latte • bowl of acai.”'

In [47]:
Greek = 'Ζέφυρος, Zéfiro'

In [48]:
shave_marks(Greek)


Out[48]:
'Ζεφυρος, Zefiro'

In [51]:
def shave_marks_latin(txt):
    """Strip combining marks only when they follow a Latin base character.

    Unlike shave_marks, accents on Greek, Cyrillic, etc. are preserved;
    only ASCII-Latin letters (a-z, A-Z) have their diacritics removed.
    """
    decomposed = unicodedata.normalize('NFD', txt)
    prev_is_latin = False
    keepers = []
    for char in decomposed:
        if unicodedata.combining(char):
            if prev_is_latin:
                continue  # drop a mark attached to a Latin base letter
            keepers.append(char)
        else:
            keepers.append(char)
            # Remember whether this base character is an ASCII Latin letter;
            # only marks following such a base are shaved.
            prev_is_latin = char in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(keepers))

In [52]:
shave_marks_latin(Greek)


Out[52]:
'Ζέφυρος, Zefiro'

In [75]:
single_map = str.maketrans("""‚ƒ„†ˆ‹‘’“”•– —˜›""",
                           """'f"*^<''""-- -~>""")

In [55]:
multi_map = str.maketrans({
        '€': '< euro >',
        '…': '...',
        'Œ': 'OE',
        '™': '( TM)',
        'œ': 'oe',
        '‰': '< per mille >',
        '‡': '**',
    })

In [76]:
multi_map.update(single_map)

In [63]:
def dewinize(txt):
    """Replace Windows-1252 "smart" punctuation and symbols with ASCII text.

    Relies on the module-level ``multi_map`` translation table, which maps
    both single-character (quotes, dashes) and multi-character ('€', '™')
    replacements.
    """
    cleaned = txt.translate(multi_map)
    return cleaned

In [64]:
def asciize(txt):
    """Best-effort ASCII transliteration of txt.

    Pipeline: dewinize() Windows-1252 symbols, shave Latin diacritics,
    expand 'ß' to 'ss' (mirroring casefold), then apply NFKC so
    compatibility characters compose into their canonical equivalents.
    """
    text = dewinize(txt)
    text = shave_marks_latin(text)
    text = text.replace('ß', 'ss')
    return unicodedata.normalize('NFKC', text)

In [67]:
order = '“Herr Voß: • ½ cup of Œtker ™ caffè latte • bowl of açaí.”'

In [77]:
dewinize(order)


Out[77]:
'"Herr Voß: - ½ cup of OEtker ( TM) caffè latte - bowl of açaí."'

In [78]:
asciize(order)


Out[78]:
'"Herr Voss: - 1⁄2 cup of OEtker ( TM) caffe latte - bowl of acai."'

Sorting Unicode Text


In [80]:
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted(fruits)


Out[80]:
['acerola', 'atemoia', 'açaí', 'caju', 'cajá']

In [83]:
import locale
locale.setlocale( locale.LC_COLLATE, 'pt_BR.UTF-8')


---------------------------------------------------------------------------
Error                                     Traceback (most recent call last)
<ipython-input-83-449c374d86bc> in <module>()
      1 import locale
----> 2 locale.setlocale( locale.LC_COLLATE, 'pt_BR.UTF-8')

c:\users\langestrst01\appdata\local\continuum\anaconda3\envs\fluentpy\lib\locale.py in setlocale(category, locale)
    592         # convert to string
    593         locale = normalize(_build_localename(locale))
--> 594     return _setlocale(category, locale)
    595 
    596 def resetlocale(category=LC_ALL):

Error: unsupported locale setting

In [84]:
sorted_fruits = sorted(fruits, key=locale.strxfrm)
sorted_fruits


Out[84]:
['acerola', 'atemoia', 'açaí', 'caju', 'cajá']

Sorting with the Unicode Collation Algorithm


In [86]:
import pyuca

In [87]:
coll = pyuca.Collator()
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted_fruits = sorted(fruits, key=coll.sort_key)
sorted_fruits


Out[87]:
['açaí', 'acerola', 'atemoia', 'cajá', 'caju']

The Unicode Database


In [88]:
import unicodedata
import re

In [89]:
re_digit = re.compile(r'\d')

In [92]:
sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'

In [99]:
for char in sample:
    print('U+%04x' % ord(char),
          char.center(6),
          're_dig' if re_digit.match(char) else '-',
          'isdig' if char.isdigit() else '-',
          'isnum' if char.isnumeric() else '-',
          format(unicodedata.numeric(char), '5.2f'),
          unicodedata.name(char),
          sep='\t'
         )


U+0031	  1   	re_dig	isdig	isnum	 1.00	DIGIT ONE
U+00bc	  ¼   	-	-	isnum	 0.25	VULGAR FRACTION ONE QUARTER
U+00b2	  ²   	-	isdig	isnum	 2.00	SUPERSCRIPT TWO
U+0969	  ३   	re_dig	isdig	isnum	 3.00	DEVANAGARI DIGIT THREE
U+136b	  ፫   	-	isdig	isnum	 3.00	ETHIOPIC DIGIT THREE
U+216b	  Ⅻ   	-	-	isnum	12.00	ROMAN NUMERAL TWELVE
U+2466	  ⑦   	-	isdig	isnum	 7.00	CIRCLED DIGIT SEVEN
U+2480	  ⒀   	-	-	isnum	13.00	PARENTHESIZED NUMBER THIRTEEN
U+3285	  ㊅   	-	-	isnum	 6.00	CIRCLED IDEOGRAPH SIX

Dual-Mode str and bytes APIs

str Versus bytes in Regular Expressions


In [100]:
import re

In [101]:
re_numbers_str = re.compile(r'\d+')
re_words_str = re.compile(r'\w+')
re_numbers_bytes = re.compile(rb'\d+')
re_words_bytes = re.compile(rb'\w+')

In [103]:
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"
            " as 1729 = 1³ + 12³ = 9³ + 10³.")
text_bytes = text_str.encode('utf_8')

In [109]:
print('Text', repr(text_str), sep='\n ')
print('Numbers')
print('  str  :', re_numbers_str.findall(text_str))
print('  bytes:', re_numbers_bytes.findall(text_bytes))
print('words')
print('  str  :', re_words_str.findall(text_str))
print('  bytes:', re_words_bytes.findall(text_bytes))


Text
 'Ramanujan saw ௧௭௨௯ as 1729 = 1³ + 12³ = 9³ + 10³.'
Numbers
  str  : ['௧௭௨௯', '1729', '1', '12', '9', '10']
  bytes: [b'1729', b'1', b'12', b'9', b'10']
words
  str  : ['Ramanujan', 'saw', '௧௭௨௯', 'as', '1729', '1³', '12³', '9³', '10³']
  bytes: [b'Ramanujan', b'saw', b'as', b'1729', b'1', b'12', b'9', b'10']

str Versus bytes on os Functions


In [110]:
import os
os.listdir('.')


Out[110]:
['.DS_Store',
 '.git',
 '.ipynb_checkpoints',
 '1. The Python Data Model.ipynb',
 '2. An Array of Sequences.ipynb',
 '3. Dictionaries and Sets.ipynb',
 '4. Text versus Bytes.ipynb',
 'b_globe.gif',
 'requirements.txt',
 'zen.txt']

In [111]:
os.listdir(b'.')


c:\users\langestrst01\appdata\local\continuum\anaconda3\envs\fluentpy\lib\site-packages\ipykernel\__main__.py:1: DeprecationWarning: The Windows bytes API has been deprecated, use Unicode filenames instead
  if __name__ == '__main__':
Out[111]:
[b'.DS_Store',
 b'.git',
 b'.ipynb_checkpoints',
 b'1. The Python Data Model.ipynb',
 b'2. An Array of Sequences.ipynb',
 b'3. Dictionaries and Sets.ipynb',
 b'4. Text versus Bytes.ipynb',
 b'b_globe.gif',
 b'requirements.txt',
 b'zen.txt']

In [ ]: