Vietnamese Unicode

This lab intend to give a demonstration about Vietname unicode normalization problems.



In [1]:

    
def analyze_characters(s):
    """ core function: analyze characters
    print utf8 number and unicode number of each characters in text

    :param unicode s: input string
    :type s: unicode 
    """
    print u"        utf8    unicode"
    for i in s:
        unicode_number = hex(ord(i))[2:].zfill(4)
        utf8_number = i.encode("utf-8").encode("hex")
        if utf8_number in ["cc80", "cc82", "cc83", "cca3"]:
            format_string = u"{:3s} -> {:>6s} -> {:>7s}"
        else:
            format_string = u"{:2s} -> {:>6s} -> {:>7s}"
        print format_string.format(i, utf8_number, unicode_number)

Unicode tổ hợp và unicode dựng sẵn



In [2]:

    
# unicode tổ hợp
s = u"cộng hòa xã hội"
analyze_characters(s)









    



        utf8    unicode
c  ->     63 ->    0063
o  ->     6f ->    006f
̣   ->   cca3 ->    0323
̂   ->   cc82 ->    0302
n  ->     6e ->    006e
g  ->     67 ->    0067
   ->     20 ->    0020
h  ->     68 ->    0068
o  ->     6f ->    006f
̀   ->   cc80 ->    0300
a  ->     61 ->    0061
   ->     20 ->    0020
x  ->     78 ->    0078
a  ->     61 ->    0061
̃   ->   cc83 ->    0303
   ->     20 ->    0020
h  ->     68 ->    0068
o  ->     6f ->    006f
̣   ->   cca3 ->    0323
̂   ->   cc82 ->    0302
i  ->     69 ->    0069



In [3]:

    
# unicode dựng sẵn
s = u"cộng hòa xã hội"
analyze_characters(s)









    



        utf8    unicode
c  ->     63 ->    0063
ộ  -> e1bb99 ->    1ed9
n  ->     6e ->    006e
g  ->     67 ->    0067
   ->     20 ->    0020
h  ->     68 ->    0068
ò  ->   c3b2 ->    00f2
a  ->     61 ->    0061
   ->     20 ->    0020
x  ->     78 ->    0078
ã  ->   c3a3 ->    00e3
   ->     20 ->    0020
h  ->     68 ->    0068
ộ  -> e1bb99 ->    1ed9
i  ->     69 ->    0069

After normalization



In [4]:

    
import unicodedata

# unicode tổ hợp
s = u"cộng hòa xã hội"

analyze_characters(s)









    



        utf8    unicode
c  ->     63 ->    0063
o  ->     6f ->    006f
̣   ->   cca3 ->    0323
̂   ->   cc82 ->    0302
n  ->     6e ->    006e
g  ->     67 ->    0067
   ->     20 ->    0020
h  ->     68 ->    0068
o  ->     6f ->    006f
̀   ->   cc80 ->    0300
a  ->     61 ->    0061
   ->     20 ->    0020
x  ->     78 ->    0078
a  ->     61 ->    0061
̃   ->   cc83 ->    0303
   ->     20 ->    0020
h  ->     68 ->    0068
o  ->     6f ->    006f
̣   ->   cca3 ->    0323
̂   ->   cc82 ->    0302
i  ->     69 ->    0069



In [5]:

    
analyze_characters(unicodedata.normalize("NFC", s))









    



        utf8    unicode
c  ->     63 ->    0063
ộ  -> e1bb99 ->    1ed9
n  ->     6e ->    006e
g  ->     67 ->    0067
   ->     20 ->    0020
h  ->     68 ->    0068
ò  ->   c3b2 ->    00f2
a  ->     61 ->    0061
   ->     20 ->    0020
x  ->     78 ->    0078
ã  ->   c3a3 ->    00e3
   ->     20 ->    0020
h  ->     68 ->    0068
ộ  -> e1bb99 ->    1ed9
i  ->     69 ->    0069



In [6]:

    
analyze_characters(unicodedata.normalize("NFD", s))









    



        utf8    unicode
c  ->     63 ->    0063
o  ->     6f ->    006f
̣   ->   cca3 ->    0323
̂   ->   cc82 ->    0302
n  ->     6e ->    006e
g  ->     67 ->    0067
   ->     20 ->    0020
h  ->     68 ->    0068
o  ->     6f ->    006f
̀   ->   cc80 ->    0300
a  ->     61 ->    0061
   ->     20 ->    0020
x  ->     78 ->    0078
a  ->     61 ->    0061
̃   ->   cc83 ->    0303
   ->     20 ->    0020
h  ->     68 ->    0068
o  ->     6f ->    006f
̣   ->   cca3 ->    0323
̂   ->   cc82 ->    0302
i  ->     69 ->    0069



In [7]:

    
utf8_code(unicodedata.normalize("NFKC", s))









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-af32954b3bbb> in <module>()
----> 1 utf8_code(unicodedata.normalize("NFKC", s))

NameError: name 'utf8_code' is not defined



In [ ]:

    
analyze_characters(unicodedata.normalize("NFKD", s))

One symbol, many characters

Letter `Đ` has many characters in unicode

Character	UTF-Code	Unicode
Ð	C3 90	U+00D0
Đ	C4 90	U+0110
Ɖ	C6 89	U+0189
ᴆ	E1 B4 86	U+1D06



In [ ]:

    
text = u"ÐĐƉᴆ"
analyze_characters(text)



In [ ]:

    
def map_character_to_tcvn(c):
    inverse_mapping_table = {
        # c390
        "Ð": [
            "Đ" # c490
        ]
    }
    mapping_table = {}
    for key, characters in inverse_mapping_table.iteritems():
        for character in characters:
           mapping_table[character]  = key
    utf8_code(c)
    print mapping_table
    print c in mapping_table
    if c in mapping_table:
        return mapping_table[c]
    else:
        return c

def map_text_to_tcvn(text):
    """
    @param unicode text: converted to normalize nfc form
    """
    return [map_character_to_tcvn(c) for c in text]

def convert_to_tcvn(text):
    """
    @param text: unicode
    """
    text = unicodedata.normalize("NFC", text)
    text = map_text_to_tcvn(text)
    return text



In [ ]:

    
analyze_characters(convert_to_tcvn(text))

Convert to TCVN 6609



In [ ]:

    
from locale import LC_ALL, setlocale
print setlocale(LC_ALL,"Vietnamese")

from string import letters
print letters



In [ ]: