In [1]:
    
def analyze_characters(s):
    """Print the UTF-8 bytes and Unicode code point of every character.

    A header line is printed first, then one line per character in the
    form ``<char> -> <UTF-8 bytes as hex> -> <code point as hex>``.

    :param unicode s: text to inspect (any iterable of single characters)
    :return: None; the table is written to standard output
    """
    # Function-scope imports keep this cell runnable on its own, before the
    # notebook's later ``import unicodedata`` cell has been executed.
    import binascii
    import unicodedata

    print(u"        utf8    unicode")
    for char in s:
        unicode_number = hex(ord(char))[2:].zfill(4)
        # hexlify works on Python 2 and 3, unlike the Python 2-only "hex" codec.
        utf8_number = binascii.hexlify(char.encode("utf-8")).decode("ascii")
        # Combining marks get one extra column of padding. Asking unicodedata
        # covers every combining mark, not just the four that were previously
        # hard-coded as UTF-8 hex (cc80, cc82, cc83, cca3).
        # ``u"%s" % char`` promotes a Python 2 ASCII byte string to text and
        # leaves text input unchanged.
        if unicodedata.combining(u"%s" % char):
            format_string = u"{:3s} -> {:>6s} -> {:>7s}"
        else:
            format_string = u"{:2s} -> {:>6s} -> {:>7s}"
        print(format_string.format(char, utf8_number, unicode_number))
    
In [2]:
    
# Unicode in combining (decomposed) form: diacritics are separate combining marks
s = u"cộng hòa xã hội"
analyze_characters(s)
    
    
In [3]:
    
# Unicode in precomposed form: each accented letter is a single code point
s = u"cộng hòa xã hội"
analyze_characters(s)
    
    
In [4]:
    
import unicodedata
# Unicode in combining (decomposed) form: diacritics are separate combining marks
s = u"cộng hòa xã hội"
analyze_characters(s)
    
    
In [5]:
    
# NFC: canonical decomposition followed by canonical composition.
analyze_characters(unicodedata.normalize("NFC", s))
    
    
In [6]:
    
# NFD: canonical decomposition (base letters followed by combining marks).
analyze_characters(unicodedata.normalize("NFD", s))
    
    
In [7]:
    
# NFKC: compatibility decomposition followed by canonical composition.
# Uses analyze_characters(); the old name utf8_code is not defined anywhere
# in this notebook and raised NameError on a fresh kernel.
analyze_characters(unicodedata.normalize("NFKC", s))
    
    
In [ ]:
    
# NFKD: compatibility decomposition.
analyze_characters(unicodedata.normalize("NFKD", s))
    
In [ ]:
    
# Sample of four look-alike "D with stroke" glyphs; the table printed below
# shows the UTF-8 bytes and code point of each one.
text = u"ÐĐƉᴆ"
analyze_characters(text)
    
In [ ]:
    
def map_character_to_tcvn(c):
    """Map one character to its replacement, if the table lists one.

    :param unicode c: a single character
    :return: the replacement when ``c`` is listed as a variant, otherwise
        ``c`` unchanged
    :rtype: unicode
    """
    # Replacement character -> list of variants that are rewritten to it.
    # Escapes are used because these glyphs are visually near-identical, and
    # the literals must be unicode: byte-string keys never compare equal to
    # the unicode characters this function is called with.
    inverse_mapping_table = {
        u"\u00d0": [    # UTF-8 c390
            u"\u0110",  # UTF-8 c490
        ],
    }
    # NOTE(review): this rewrites U+0110 to U+00D0, exactly as the original
    # table was written; confirm that this is the intended direction.
    mapping_table = {}
    for key, characters in inverse_mapping_table.items():
        for character in characters:
            mapping_table[character] = key
    # The earlier debug output (a call to the undefined utf8_code and two
    # prints per character) is removed; it raised NameError on every call.
    return mapping_table.get(c, c)
def map_text_to_tcvn(text):
    """Apply ``map_character_to_tcvn`` to every character of ``text``.

    :param unicode text: input text; the original note says it should
        already be in NFC-normalized form
    :return: list holding one mapped character per input character, in order
    """
    mapped = []
    for char in text:
        mapped.append(map_character_to_tcvn(char))
    return mapped
def convert_to_tcvn(text):
    """NFC-normalize ``text`` and pass it through ``map_text_to_tcvn``.

    :param unicode text: input text
    :return: the result of ``map_text_to_tcvn`` for the normalized text
    """
    normalized = unicodedata.normalize("NFC", text)
    return map_text_to_tcvn(normalized)
    
In [ ]:
    
# Inspect the characters that convert_to_tcvn() returns for the sample text.
analyze_characters(convert_to_tcvn(text))
    
In [ ]:
    
# Switch every locale category to "Vietnamese" and print the locale string
# that setlocale() reports back.
# NOTE(review): "Vietnamese" looks like a Windows-style locale name; on other
# platforms setlocale() may raise locale.Error -- confirm the target OS.
from locale import LC_ALL, setlocale
print setlocale(LC_ALL,"Vietnamese")
# Read only after setlocale(): Python 2 documents string.letters as
# locale-dependent and updated when setlocale() is called.
from string import letters
print letters
    
In [ ]: