Character Issues



In [1]:

    
s = 'café'
len(s)









    Out[1]:





4



In [3]:

    
b = s.encode('utf8')
b









    Out[3]:





b'caf\xc3\xa9'



In [4]:

    
len(b)









    Out[4]:





5



In [5]:

    
b.decode('utf8')









    Out[5]:





'café'

Byte Essentials



In [6]:

    
cafe = bytes('café', encoding='utf_8')
cafe









    Out[6]:





b'caf\xc3\xa9'



In [7]:

    
cafe[0]









    Out[7]:





99



In [8]:

    
cafe[:1]









    Out[8]:





b'c'



In [12]:

    
cafe_arr = bytearray(cafe)
cafe_arr









    Out[12]:





bytearray(b'caf\xc3\xa9')



In [13]:

    
cafe_arr[-1:]









    Out[13]:





bytearray(b'\xa9')



In [17]:

    
bytes.fromhex('31 4B CE A9')









    Out[17]:





b'1K\xce\xa9'



In [19]:

    
import array
# Typecode 'h' builds an array of signed short integers.
numbers = array.array('h', [-2, -1, 0, 1, 2])
# bytes() copies the raw machine bytes backing the array, so the result
# shows the in-memory representation of each item (byte order is the platform's).
octets = bytes(numbers)
octets









    Out[19]:





b'\xfe\xff\xff\xff\x00\x00\x01\x00\x02\x00'

Structs and Memory Views



In [7]:

    
import struct



In [8]:

    
# struct format: '<' = little-endian, '3s3s' = two 3-byte sequences,
# 'HH' = two unsigned 16-bit integers.
fmt = '<3s3sHH'
with open('b_globe.gif', 'rb') as fp:
    # Wrap the file contents in a memoryview so that later slices are
    # views onto the same buffer rather than copies of the bytes.
    img = memoryview(fp.read())



In [9]:

    
header = img[:10]
header









    Out[9]:





<memory at 0x00000056D227C1C8>



In [10]:

    
bytes(header)









    Out[10]:





b'GIF89a\x10\x00\x10\x00'



In [11]:

    
struct.unpack(fmt, header)









    Out[11]:





(b'GIF', b'89a', 16, 16)



In [12]:

    
del header
del img

Basic Encoders/Decoders



In [13]:

    
# Encode the same text with three codecs to compare the resulting byte sequences.
for codec in ['latin_1', 'utf_8', 'utf_16']:
    print(codec, 'El Niño'.encode(codec), sep='\t')









    



latin_1	b'El Ni\xf1o'
utf_8	b'El Ni\xc3\xb1o'
utf_16	b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'

Understanding Encode/Decode Problems

Coping with UnicodeEncodeError



In [15]:

    
city = 'São Paulo'
city.encode('utf_8')









    Out[15]:





b'S\xc3\xa3o Paulo'



In [19]:

    
city.encode('utf_16')









    Out[19]:





b'\xff\xfeS\x00\xe3\x00o\x00 \x00P\x00a\x00u\x00l\x00o\x00'



In [20]:

    
city.encode('iso8859_1')









    Out[20]:





b'S\xe3o Paulo'



In [21]:

    
city.encode('cp437')









    



---------------------------------------------------------------------------
UnicodeEncodeError                        Traceback (most recent call last)
<ipython-input-21-768485688c3d> in <module>()
----> 1 city.encode('cp437')

c:\users\langestrst01\appdata\local\continuum\anaconda3\envs\fluentpy\lib\encodings\cp437.py in encode(self, input, errors)
     10 
     11     def encode(self,input,errors='strict'):
---> 12         return codecs.charmap_encode(input,errors,encoding_map)
     13 
     14     def decode(self,input,errors='strict'):

UnicodeEncodeError: 'charmap' codec can't encode character '\xe3' in position 1: character maps to <undefined>



In [22]:

    
city.encode('cp437', errors='ignore')









    Out[22]:





b'So Paulo'



In [23]:

    
city.encode('cp437', errors='replace')









    Out[23]:





b'S?o Paulo'



In [24]:

    
city.encode('cp437', errors='xmlcharrefreplace')









    Out[24]:





b'S&#227;o Paulo'

Coping with UnicodeDecodeError



In [25]:

    
octets = b'Montr\xe9al'
octets.decode('cp1252')









    Out[25]:





'Montréal'



In [26]:

    
octets.decode('iso8859_7')









    Out[26]:





'Montrιal'



In [27]:

    
octets.decode('koi8_r')









    Out[27]:





'MontrИal'



In [28]:

    
octets.decode('utf_8')









    



---------------------------------------------------------------------------
UnicodeDecodeError                        Traceback (most recent call last)
<ipython-input-28-f3a91f0d51e5> in <module>()
----> 1 octets.decode('utf_8')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 5: invalid continuation byte



In [29]:

    
octets.decode('utf_8', errors='replace')









    Out[29]:





'Montr�al'

BOM: A Useful Gremlin



In [31]:

    
u16 = 'El Niño'.encode('utf_16')
u16









    Out[31]:





b'\xff\xfeE\x00l\x00 \x00N\x00i\x00\xf1\x00o\x00'



In [32]:

    
list(u16)









    Out[32]:





[255, 254, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]



In [33]:

    
u16le = 'El Niño'.encode('utf_16le')
list(u16le)









    Out[33]:





[69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111, 0]



In [34]:

    
u16be = 'El Niño'.encode('utf_16be')
list(u16be)









    Out[34]:





[0, 69, 0, 108, 0, 32, 0, 78, 0, 105, 0, 241, 0, 111]

Handling Text Files



In [35]:

    
# Use a context manager so the file is flushed and closed deterministically;
# the bare open(...).write(...) form leaves closing to garbage collection.
with open('cafe.txt', 'w', encoding='utf_8') as cafe_file:
    chars_written = cafe_file.write('café')
chars_written  # number of characters written (not bytes)









    Out[35]:





4



In [36]:

    
open('cafe.txt').read()









    Out[36]:





'cafÃ©'



In [37]:

    
fp = open('cafe.txt', 'w', encoding='utf_8')
fp









    Out[37]:





<_io.TextIOWrapper name='cafe.txt' mode='w' encoding='utf_8'>



In [38]:

    
fp.write('café')









    Out[38]:





4



In [39]:

    
fp.close()



In [40]:

    
import os



In [41]:

    
os.stat('cafe.txt').st_size









    Out[41]:





5



In [42]:

    
fp2 = open('cafe.txt')
fp2









    Out[42]:





<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='cp1252'>



In [43]:

    
fp2.read()









    Out[43]:





'cafÃ©'



In [44]:

    
fp3 = open('cafe.txt', encoding='utf_8')
fp3









    Out[44]:





<_io.TextIOWrapper name='cafe.txt' mode='r' encoding='utf_8'>



In [45]:

    
fp3.read()









    Out[45]:





'café'



In [46]:

    
fp4 = open('cafe.txt', 'rb')



In [47]:

    
fp4









    Out[47]:





<_io.BufferedReader name='cafe.txt'>



In [48]:

    
fp4.read()









    Out[48]:





b'caf\xc3\xa9'

Encoding Defaults: A Madhouse



In [49]:

    
import sys, locale



In [58]:

    
expressions = """
    locale.getpreferredencoding()
    type(my_file)
    my_file.encoding
    sys.stdout.isatty()
    sys.stdout.encoding
    sys.stdin.isatty()
    sys.stdin.encoding
    sys.stderr.isatty()
    sys.stderr.encoding
    sys.getdefaultencoding()
    sys.getfilesystemencoding()
    """



In [52]:

    
my_file = open('dummy', 'w')



In [60]:

    
# Evaluate every whitespace-separated expression and print it next to its value.
# Requires `locale`, `sys` and `my_file` to be defined in the kernel already.
for expression in expressions.split():
    # eval() is acceptable here only because `expressions` is a hard-coded
    # literal; never eval text that comes from outside the notebook.
    value = eval(expression)
    print(expression.rjust(30), '->', repr(value))









    



 locale.getpreferredencoding() -> 'cp1252'
                 type(my_file) -> <class '_io.TextIOWrapper'>
              my_file.encoding -> 'cp1252'
           sys.stdout.isatty() -> False
           sys.stdout.encoding -> 'UTF-8'
            sys.stdin.isatty() -> False
            sys.stdin.encoding -> 'cp1252'
           sys.stderr.isatty() -> False
           sys.stderr.encoding -> 'UTF-8'
      sys.getdefaultencoding() -> 'utf-8'
   sys.getfilesystemencoding() -> 'mbcs'



In [62]:

    
value = eval('locale.getpreferredencoding()')



In [64]:

    
repr(value)









    Out[64]:





"'cp1252'"

Normalizing Unicode for Saner Comparisons



In [65]:

    
s1 = 'café'
s2 = 'cafe\u0301'
s1, s2









    Out[65]:





('café', 'café')



In [66]:

    
len(s1), len(s2)









    Out[66]:





(4, 5)



In [67]:

    
s1 == s2









    Out[67]:





False



In [71]:

    
from unicodedata import normalize



In [73]:

    
len(normalize('NFC', s1)), len(normalize('NFC', s2))









    Out[73]:





(4, 4)



In [74]:

    
len(normalize('NFD', s1)), len(normalize('NFD', s2))









    Out[74]:





(5, 5)



In [75]:

    
normalize('NFC', s1) == normalize('NFC', s2)









    Out[75]:





True



In [76]:

    
normalize('NFD', s1) == normalize('NFD', s2)









    Out[76]:





True



In [81]:

    
from unicodedata import normalize, name



In [82]:

    
ohm = '\u2126'



In [85]:

    
name(ohm)









    Out[85]:





'OHM SIGN'



In [86]:

    
ohm_c = normalize('NFC', ohm)



In [88]:

    
name(ohm_c)









    Out[88]:





'GREEK CAPITAL LETTER OMEGA'



In [89]:

    
ohm == ohm_c









    Out[89]:





False



In [90]:

    
normalize('NFC', ohm) == normalize('NFC', ohm_c)









    Out[90]:





True



In [91]:

    
from unicodedata import normalize, name



In [92]:

    
half = '½'
normalize('NFKC', half)









    Out[92]:





'1⁄2'



In [94]:

    
four_squared = '4²'
normalize('NFKC', four_squared)









    Out[94]:





'42'



In [95]:

    
micro = 'µ'
micro_kc = normalize('NFKC', micro)



In [96]:

    
micro, micro_kc









    Out[96]:





('µ', 'μ')



In [97]:

    
ord(micro), ord(micro_kc)









    Out[97]:





(181, 956)



In [98]:

    
name(micro), name(micro_kc)









    Out[98]:





('MICRO SIGN', 'GREEK SMALL LETTER MU')

Case Folding



In [12]:

    
from unicodedata import name
micro = 'µ'
name(micro)









    Out[12]:





'MICRO SIGN'



In [14]:

    
micro_cf = micro.casefold()
name(micro_cf)









    Out[14]:





'GREEK SMALL LETTER MU'



In [16]:

    
micro, micro_cf









    Out[16]:





('µ', 'μ')



In [17]:

    
eszett = 'ß'
name(eszett)









    Out[17]:





'LATIN SMALL LETTER SHARP S'



In [19]:

    
eszett_cf = eszett.casefold()
eszett, eszett_cf









    Out[19]:





('ß', 'ss')



In [26]:

    
name(eszett_cf[1])









    Out[26]:





'LATIN SMALL LETTER S'

Utility Functions for Normalized Text Matching



In [34]:

    
s1 = 'café'
s2 = 'cafe\u0301'
s1 == s2









    Out[34]:





False



In [35]:

    
from unicodedata import normalize

def nfc_equal(str1, str2):
    """Return True if both strings are identical once NFC-normalized.

    The comparison is case-sensitive: only differences in how composed and
    decomposed characters are represented are ignored.
    """
    left, right = (normalize('NFC', text) for text in (str1, str2))
    return left == right

def fold_equal(str1, str2):
    """Return True if both strings match after NFC normalization and case folding.

    Each argument is NFC-normalized and then passed through str.casefold(),
    so the comparison is case-insensitive.
    """
    folded_first = normalize('NFC', str1).casefold()
    folded_second = normalize('NFC', str2).casefold()
    return folded_first == folded_second



In [36]:

    
nfc_equal(s1, s2)









    Out[36]:





True



In [38]:

    
nfc_equal('A','a')









    Out[38]:





False



In [39]:

    
s3 = 'Straße'
s4 = 'strasse'
s3 == s4









    Out[39]:





False



In [40]:

    
nfc_equal(s3, s4)









    Out[40]:





False



In [41]:

    
fold_equal(s3, s4)









    Out[41]:





True



In [42]:

    
fold_equal(s1, s2)









    Out[42]:





True



In [43]:

    
fold_equal('A','a')









    Out[43]:





True

Extreme "normalization": Taking Out Diacritics



In [44]:

    
import unicodedata
import string

def shave_marks(txt):
    """Return `txt` with every combining mark removed.

    The text is decomposed (NFD) so that marks become separate code points,
    characters with a non-zero combining class are dropped, and the
    remainder is recomposed (NFC).
    """
    decomposed = unicodedata.normalize('NFD', txt)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return unicodedata.normalize('NFC', ''.join(base_chars))



In [45]:

    
order = '“ Herr Voß: • ½ cup of Œtker ™ caffè latte • bowl of açaí.”'



In [46]:

    
shave_marks(order)









    Out[46]:





'“ Herr Voß: • ½ cup of Œtker ™ caffe latte • bowl of acai.”'



In [47]:

    
Greek = 'Ζέφυρος, Zéfiro'



In [48]:

    
shave_marks(Greek)









    Out[48]:





'Ζεφυρος, Zefiro'



In [49]:

    
def shave_marks_latin(txt):
    """Return `txt` with combining marks removed from ASCII-letter bases only.

    The text is decomposed (NFD) and scanned once. A combining mark is
    dropped when the most recent non-combining character was an ASCII
    letter; marks following any other base character are kept. The result
    is recomposed (NFC).
    """
    decomposed = unicodedata.normalize('NFD', txt)
    after_latin = False  # was the last base character an ASCII letter?
    kept = []
    for ch in decomposed:
        if unicodedata.combining(ch):
            # Marks never change the "current base" state.
            if not after_latin:
                kept.append(ch)
        else:
            kept.append(ch)
            after_latin = ch in string.ascii_letters
    return unicodedata.normalize('NFC', ''.join(kept))



In [ ]: