In [1]:
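# 'café' is a str of four Unicode code points; len() of a str counts code points, not bytes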
s = 'café'
len(s)
Out[1]:
In [3]:
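# encoding to UTF-8 yields five bytes: 'é' (U+00E9) is encoded as the two bytes \xc3\xa9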
b = s.encode('utf8')
b
Out[3]:
In [4]:
len(b)
Out[4]:
In [5]:
b.decode('utf8')
Out[5]:
In [6]:
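# in the bytes repr, bytes in the printable ASCII range are shown as characters, the rest as escape sequences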
cafe = bytes('café', encoding='utf_8')
cafe
Out[6]:
In [7]:
cafe[0]
Out[7]:
In [8]:
cafe[:1]
Out[8]:
In [12]:
cafe_arr = bytearray(cafe)
cafe_arr
Out[12]:
In [13]:
cafe_arr[-1:]
Out[13]:
In [17]:
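# bytes.fromhex() parses pairs of hex digits, ignoring whitespace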
bytes.fromhex('31 4B CE A9')
Out[17]:
In [19]:
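# bytes(numbers) copies the raw buffer of the array; typecode 'h' means 16-bit signed ints, so 5 items give 10 bytes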
import array
numbers = array.array('h', [-2, -1, 0, 1, 2])
octets = bytes(numbers)
octets
Out[19]:
In [7]:
import struct
In [8]:
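# struct format '<3s3sHH': little-endian, two 3-byte sequences, then two unsigned 16-bit ints
# memoryview lets us slice the header without copying (assumes a GIF file named b_globe.gif is in the working directory)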
fmt = '<3s3sHH'
with open('b_globe.gif', 'rb') as fp:
    img = memoryview(fp.read())
In [9]:
header = img[:10]
header
Out[9]:
In [10]:
bytes(header)
Out[10]:
In [11]:
struct.unpack(fmt, header)
Out[11]:
In [12]:
del header
del img
In [13]:
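# the same text encoded with three codecs produces three different byte sequences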
for codec in ['latin_1', 'utf_8', 'utf_16']:
    print(codec, 'El Niño'.encode(codec), sep='\t')
In [15]:
city = 'São Paulo'
city.encode('utf_8')
Out[15]:
In [19]:
city.encode('utf_16')
Out[19]:
In [20]:
city.encode('iso8859_1')
Out[20]:
In [21]:
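# cp437 cannot encode 'ã', so this call raises UnicodeEncodeError; the errors= argument below selects a fallback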
city.encode('cp437')
In [22]:
city.encode('cp437', errors='ignore')
Out[22]:
In [23]:
city.encode('cp437', errors='replace')
Out[23]:
In [24]:
city.encode('cp437', errors='xmlcharrefreplace')
Out[24]:
In [25]:
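# the same bytes decode to different text under different legacy codecs; only cp1252 yields the intended 'Montréal'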
octets = b'Montr\xe9al'
octets.decode('cp1252')
Out[25]:
In [26]:
octets.decode('iso8859_7')
Out[26]:
In [27]:
octets.decode('koi8_r')
Out[27]:
In [28]:
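# 0xE9 is not valid UTF-8 in this position, so decoding raises UnicodeDecodeError; errors='replace' below inserts U+FFFD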
octets.decode('utf_8')
In [29]:
octets.decode('utf_8', errors='replace')
Out[29]:
In [31]:
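# the utf_16 codec prepends a BOM announcing the byte order in use; the utf_16le and utf_16be variants below omit it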
u16 = 'El Niño'.encode('utf_16')
u16
Out[31]:
In [32]:
list(u16)
Out[32]:
In [33]:
u16le = 'El Niño'.encode('utf_16le')
list(u16le)
Out[33]:
In [34]:
u16be = 'El Niño'.encode('utf_16be')
list(u16be)
Out[34]:
In [35]:
open('cafe.txt', 'w', encoding='utf_8').write('café')
Out[35]:
In [36]:
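# no encoding argument: read() uses the locale's default codec, which may not be UTF-8, so the text can come back garbled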
open('cafe.txt').read()
Out[36]:
In [37]:
fp = open('cafe.txt', 'w', encoding='utf_8')
fp
Out[37]:
In [38]:
fp.write('café')
Out[38]:
In [39]:
fp.close()
In [40]:
import os
In [41]:
os.stat('cafe.txt').st_size
Out[41]:
In [42]:
fp2 = open('cafe.txt')
fp2
Out[42]:
In [43]:
fp2.read()
Out[43]:
In [44]:
fp3 = open('cafe.txt', encoding='utf_8')
fp3
Out[44]:
In [45]:
fp3.read()
Out[45]:
In [46]:
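# mode 'rb' opens the file in binary mode; read() returns the raw bytes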
fp4 = open('cafe.txt', 'rb')
In [47]:
fp4
Out[47]:
In [48]:
fp4.read()
Out[48]:
In [49]:
import sys, locale
In [58]:
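# expressions probing the default encodings; each is eval'ed and printed in the loop below, and the results vary by platform and by whether the streams are attached to a console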
expressions = """
locale.getpreferredencoding()
type(my_file)
my_file.encoding
sys.stdout.isatty()
sys.stdout.encoding
sys.stdin.isatty()
sys.stdin.encoding
sys.stderr.isatty()
sys.stderr.encoding
sys.getdefaultencoding()
sys.getfilesystemencoding()
"""
In [52]:
my_file = open('dummy', 'w')
In [60]:
for expression in expressions.split():
    value = eval(expression)
    print(expression.rjust(30), '->', repr(value))
In [62]:
value = eval('locale.getpreferredencoding()')
In [64]:
repr(value)
Out[64]:
In [65]:
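# same text, different code points: composed 'é' vs. 'e' followed by U+0301 COMBINING ACUTE ACCENT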
s1 = 'café'
s2 = 'cafe\u0301'
s1, s2
Out[65]:
In [66]:
len(s1), len(s2)
Out[66]:
In [67]:
s1 == s2
Out[67]:
In [71]:
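# after normalization to a common form (NFC composes, NFD decomposes), the two spellings compare equal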
from unicodedata import normalize
In [73]:
len(normalize('NFC', s1)), len(normalize('NFC', s2))
Out[73]:
In [74]:
len(normalize('NFD', s1)), len(normalize('NFD', s2))
Out[74]:
In [75]:
normalize('NFC', s1) == normalize('NFC', s2)
Out[75]:
In [76]:
normalize('NFD', s1) == normalize('NFD', s2)
Out[76]:
In [81]:
from unicodedata import normalize, name
In [82]:
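# U+2126 OHM SIGN normalizes under NFC to U+03A9 GREEK CAPITAL LETTER OMEGA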
ohm = '\u2126'
In [85]:
name(ohm)
Out[85]:
In [86]:
ohm_c = normalize('NFC', ohm)
In [88]:
name(ohm_c)
Out[88]:
In [89]:
ohm == ohm_c
Out[89]:
In [90]:
normalize('NFC', ohm) == normalize('NFC', ohm_c)
Out[90]:
In [91]:
from unicodedata import normalize, name
In [92]:
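# NFKC replaces compatibility characters ('½' becomes '1⁄2', '4²' becomes '42'), which can change meaning; use it for search and indexing, not for storage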
half = '½'
normalize('NFKC', half)
Out[92]:
In [94]:
four_squared = '4²'
normalize('NFKC', four_squared)
Out[94]:
In [95]:
micro = 'µ'
micro_kc = normalize('NFKC', micro)
In [96]:
micro, micro_kc
Out[96]:
In [97]:
ord(micro), ord(micro_kc)
Out[97]:
In [98]:
name(micro), name(micro_kc)
Out[98]:
In [12]:
from unicodedata import name
micro = 'µ'
name(micro)
Out[12]:
In [14]:
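# casefold() is like lower() but also maps characters such as µ (MICRO SIGN) and ß to their folded forms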
micro_cf = micro.casefold()
name(micro_cf)
Out[14]:
In [16]:
micro, micro_cf
Out[16]:
In [17]:
eszett = 'ß'
name(eszett)
Out[17]:
In [19]:
eszett_cf = eszett.casefold()
eszett, eszett_cf
Out[19]:
In [26]:
name(eszett_cf[1])
Out[26]:
In [34]:
s1 = 'café'
s2 = 'cafe\u0301'
s1 == s2
Out[34]:
In [35]:
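# helper predicates: compare after NFC normalization, optionally with case folding as well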
from unicodedata import normalize
def nfc_equal(str1, str2):
    return normalize('NFC', str1) == normalize('NFC', str2)

def fold_equal(str1, str2):
    return (normalize('NFC', str1).casefold() ==
            normalize('NFC', str2).casefold())
In [36]:
nfc_equal(s1, s2)
Out[36]:
In [38]:
nfc_equal('A','a')
Out[38]:
In [39]:
s3 = 'Straße'
s4 = 'strasse'
s3 == s4
Out[39]:
In [40]:
nfc_equal(s3, s4)
Out[40]:
In [41]:
fold_equal(s3, s4)
Out[41]:
In [42]:
fold_equal(s1, s2)
Out[42]:
In [43]:
fold_equal('A','a')
Out[43]:
In [44]:
import unicodedata
import string
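# decompose with NFD, drop every combining mark, then recompose with NFC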
def shave_marks(txt):
    norm_txt = unicodedata.normalize('NFD', txt)
    shaved = ''.join(c for c in norm_txt if not unicodedata.combining(c))
    return unicodedata.normalize('NFC', shaved)
In [45]:
order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
In [46]:
shave_marks(order)
Out[46]:
In [47]:
Greek = 'Ζέφυρος, Zéfiro'
In [48]:
shave_marks(Greek)
Out[48]:
In [51]:
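# keep combining marks unless they follow a Latin base character, so Greek accents survive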
def shave_marks_latin(txt):
    norm_txt = unicodedata.normalize('NFD', txt)
    latin_base = False
    keepers = []
    for c in norm_txt:
        if unicodedata.combining(c) and latin_base:
            continue
        keepers.append(c)
        if not unicodedata.combining(c):
            latin_base = c in string.ascii_letters
    shaved = ''.join(keepers)
    return unicodedata.normalize('NFC', shaved)
In [52]:
shave_marks_latin(Greek)
Out[52]:
In [75]:
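# one-char-to-one-char replacements for Windows-1252 symbols; multi_map below maps single chars to longer strings and is then merged into this table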
single_map = str.maketrans("""‚ƒ„†ˆ‹‘’“”•–—˜›""",
                           """'f"*^<''""---~>""")
In [55]:
multi_map = str.maketrans({
    '€': '<euro>',
    '…': '...',
    'Œ': 'OE',
    '™': '(TM)',
    'œ': 'oe',
    '‰': '<per mille>',
    '‡': '**',
})
In [76]:
multi_map.update(single_map)
In [63]:
def dewinize(txt):
"""Replace Win1252 symbols with ASCII chars or sequences"""
return txt.translate(multi_map)
In [64]:
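# dewinize, strip diacritics from Latin characters, spell out 'ß' as 'ss', then apply NFKC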
def asciize(txt):
    no_marks = shave_marks_latin(dewinize(txt))
    no_marks = no_marks.replace('ß', 'ss')
    return unicodedata.normalize('NFKC', no_marks)
In [67]:
order = '“Herr Voß: • ½ cup of Œtker™ caffè latte • bowl of açaí.”'
In [77]:
dewinize(order)
Out[77]:
In [78]:
asciize(order)
Out[78]:
In [80]:
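# sorted() compares code points, so accented words sort after unaccented ones, which is wrong for Portuguese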
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted(fruits)
Out[80]:
In [83]:
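# locale-aware sort keys; this requires the pt_BR.UTF-8 locale to be installed, otherwise setlocale raises an error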
import locale
locale.setlocale(locale.LC_COLLATE, 'pt_BR.UTF-8')
In [84]:
sorted_fruits = sorted(fruits, key=locale.strxfrm)
sorted_fruits
Out[84]:
In [86]:
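# pyuca is a third-party, pure-Python implementation of the Unicode Collation Algorithm (pip install pyuca); it does not depend on OS locales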
import pyuca
In [87]:
coll = pyuca.Collator()
fruits = ['caju', 'atemoia', 'cajá', 'açaí', 'acerola']
sorted_fruits = sorted(fruits, key=coll.sort_key)
sorted_fruits
Out[87]:
In [88]:
import unicodedata
import re
In [89]:
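# \d in a str pattern matches Unicode decimal digits; the loop below compares it with isdigit(), isnumeric(), and unicodedata.numeric()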
re_digit = re.compile(r'\d')
In [92]:
sample = '1\xbc\xb2\u0969\u136b\u216b\u2466\u2480\u3285'
In [99]:
for char in sample:
    print('U+%04x' % ord(char),
          char.center(6),
          're_dig' if re_digit.match(char) else '-',
          'isdig' if char.isdigit() else '-',
          'isnum' if char.isnumeric() else '-',
          format(unicodedata.numeric(char), '5.2f'),
          unicodedata.name(char),
          sep='\t')
In [100]:
import re
In [101]:
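# with str patterns \d and \w match Unicode digits and word characters; with bytes patterns they match ASCII only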
re_numbers_str = re.compile(r'\d+')
re_words_str = re.compile(r'\w+')
re_numbers_bytes = re.compile(rb'\d+')
re_words_bytes = re.compile(rb'\w+')
In [103]:
text_str = ("Ramanujan saw \u0be7\u0bed\u0be8\u0bef"
" as 1729 = 1³ + 12³ = 9³ + 10³.")
text_bytes = text_str.encode('utf_8')
In [109]:
print('Text', repr(text_str), sep='\n ')
print('Numbers')
print(' str :', re_numbers_str.findall(text_str))
print(' bytes:', re_numbers_bytes.findall(text_bytes))
print('words')
print(' str :', re_words_str.findall(text_str))
print(' bytes:', re_words_bytes.findall(text_bytes))
In [110]:
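# a str argument yields the file names as str; a bytes argument yields them as bytes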
import os
os.listdir('.')
Out[110]:
In [111]:
os.listdir(b'.')
Out[111]: