In [1]:
def analyze_characters(s):
    """Print the UTF-8 bytes and Unicode code point of each character.

    A header line is printed first, then one line per element of ``s`` in
    the form ``<char> -> <UTF-8 bytes as hex> -> <code point as hex>``.

    :param s: text to inspect; it is iterated element by element, so any
        iterable of single characters works
    :type s: unicode
    """
    # Function-scope imports keep this notebook cell runnable on its own.
    import binascii
    import unicodedata

    print(u" utf8 unicode")
    for ch in s:
        # Code point as lowercase hex, zero-padded to at least 4 digits.
        unicode_number = format(ord(ch), "04x")
        # hexlify works on Python 2 and 3; ``.encode("hex")`` is Python 2 only.
        utf8_number = binascii.hexlify(ch.encode("utf-8")).decode("ascii")
        # Combining marks get a 3-wide field instead of 2.  Asking
        # unicodedata covers every combining mark rather than only the
        # four (U+0300, U+0302, U+0303, U+0323) that used to be listed.
        width = 3 if unicodedata.combining(ch) else 2
        print(u"{0:{1}s} -> {2:>6s} -> {3:>7s}".format(
            ch, width, utf8_number, unicode_number))
In [2]:
# Decomposed Unicode: base letters followed by combining marks.
# The form of a literal is invisible in source and is easily recomposed by
# editors or copy/paste, so it is pinned explicitly with NFD here.
import unicodedata
s = unicodedata.normalize("NFD", u"cộng hòa xã hội")
analyze_characters(s)
In [3]:
# Precomposed Unicode: each accented letter is a single code point.
# The form of a literal is invisible in source and is easily changed by
# editors or copy/paste, so it is pinned explicitly with NFC here.
import unicodedata
s = unicodedata.normalize("NFC", u"cộng hòa xã hội")
analyze_characters(s)
In [4]:
import unicodedata

# Decomposed Unicode: base letters followed by combining marks.
# Pinned with NFD so the cells below always start from the decomposed form,
# whatever an editor or copy/paste did to the literal.
s = unicodedata.normalize("NFD", u"cộng hòa xã hội")
analyze_characters(s)
In [5]:
# Dump the characters of ``s`` after NFC normalization.
analyze_characters(
    s=unicodedata.normalize("NFC", s)
)
In [6]:
# Dump the characters of ``s`` after NFD normalization.
analyze_characters(
    s=unicodedata.normalize("NFD", s)
)
In [7]:
# Dump the characters of ``s`` after NFKC normalization.
# The call used to go to ``utf8_code``, a name that is not defined; the
# analysis helper used by every other cell is ``analyze_characters``.
analyze_characters(unicodedata.normalize("NFKC", s))
In [ ]:
# Dump the characters of ``s`` after NFKD normalization.
analyze_characters(
    s=unicodedata.normalize("NFKD", s)
)
In [ ]:
# Four distinct letters that all render as a "D with stroke"; the dump
# below shows that they are different code points.
text = u"".join([u"Ð", u"Đ", u"Ɖ", u"ᴆ"])
analyze_characters(text)
In [ ]:
# Canonical character -> look-alike variants that are folded onto it.
# Written with escapes and unicode literals: the glyphs are visually
# indistinguishable, and plain byte-string keys never compare equal to a
# unicode argument on Python 2.
_TCVN_VARIANTS = {
    u"\u00d0": [      # "Ð", UTF-8 c3 90
        u"\u0110",    # "Đ", UTF-8 c4 90
    ],
}

# Inverted once, at definition time: variant -> canonical character.
_TCVN_CANONICAL = {
    variant: canonical
    for canonical, variants in _TCVN_VARIANTS.items()
    for variant in variants
}


def map_character_to_tcvn(c):
    """Return the canonical replacement for ``c``, or ``c`` itself.

    NOTE(review): the table folds U+0110 onto U+00D0, exactly as the
    original table did -- confirm that U+00D0 is the intended canonical form.

    :param c: a single unicode character
    :return: the mapped character when ``c`` is a known variant, else ``c``
    """
    return _TCVN_CANONICAL.get(c, c)
def map_text_to_tcvn(text):
    """Map every character of ``text`` through ``map_character_to_tcvn``.

    :param text: unicode text; the caller is expected to have normalized
        it to NFC already
    :return: a list holding one mapped character per input character
    """
    mapped = []
    for ch in text:
        mapped.append(map_character_to_tcvn(ch))
    return mapped
def convert_to_tcvn(text):
    """Normalize ``text`` to NFC, then map it with ``map_text_to_tcvn``.

    :param text: unicode text to convert
    :return: whatever ``map_text_to_tcvn`` returns for the NFC form
    """
    return map_text_to_tcvn(unicodedata.normalize("NFC", text))
In [ ]:
# Dump the characters of ``text`` after conversion.
analyze_characters(
    s=convert_to_tcvn(text)
)
In [ ]:
# Switch the process-wide locale and print the value setlocale() returns.
# NOTE(review): "Vietnamese" looks like a Windows-style locale name; on
# other platforms this call may raise locale.Error -- confirm the target.
from locale import LC_ALL, setlocale
print setlocale(LC_ALL,"Vietnamese")
# Keep this import AFTER the setlocale() call: Python 2 documents
# string.letters as locale-dependent and updated by setlocale(), so
# importing it earlier would presumably bind the old value -- TODO confirm.
from string import letters
print letters
In [ ]: