Vietnamese Unicode

This lab demonstrates Unicode normalization problems that arise with Vietnamese text.


In [1]:
def analyze_characters(s):
    """Print the UTF-8 bytes and the Unicode code point of each character.

    One line is printed per character of ``s`` in the form
    ``<char> -> <utf8 hex> -> <code point hex>``, preceded by a header line.

    :param unicode s: input string
    :type s: unicode
    :return: None; the table is written to standard output
    """
    # Local imports keep this cell runnable on its own.
    import binascii
    import unicodedata

    base_template = u"{:2s} -> {:>6s} -> {:>7s}"
    # A combining mark takes no column of its own when displayed, so it gets
    # one extra padding space to keep the arrows aligned.
    combining_template = u"{:3s} -> {:>6s} -> {:>7s}"

    print(u"        utf8    unicode")
    for character in s:
        unicode_number = format(ord(character), "04x")
        # hexlify works on Python 2 and 3, unlike str.encode("hex").
        utf8_number = binascii.hexlify(character.encode("utf-8")).decode("ascii")
        # Ask the Unicode database instead of listing individual marks, so
        # every combining character (not just four of them) is aligned.
        if unicodedata.combining(character):
            format_string = combining_template
        else:
            format_string = base_template
        print(format_string.format(character, utf8_number, unicode_number))

Unicode tổ hợp và unicode dựng sẵn


In [2]:
# Decomposed ("combining") Unicode: each base letter is followed by its
# combining marks. The code points are written as escapes so that an editor
# or tool that silently normalizes text cannot turn this into the precomposed
# form; the string displays as the Vietnamese phrase "cong hoa xa hoi" with
# its diacritics.
s = u"co\u0323\u0302ng ho\u0300a xa\u0303 ho\u0323\u0302i"
analyze_characters(s)


        utf8    unicode
c  ->     63 ->    0063
o  ->     6f ->    006f
̣   ->   cca3 ->    0323
̂   ->   cc82 ->    0302
n  ->     6e ->    006e
g  ->     67 ->    0067
   ->     20 ->    0020
h  ->     68 ->    0068
o  ->     6f ->    006f
̀   ->   cc80 ->    0300
a  ->     61 ->    0061
   ->     20 ->    0020
x  ->     78 ->    0078
a  ->     61 ->    0061
̃   ->   cc83 ->    0303
   ->     20 ->    0020
h  ->     68 ->    0068
o  ->     6f ->    006f
̣   ->   cca3 ->    0323
̂   ->   cc82 ->    0302
i  ->     69 ->    0069

In [3]:
# Precomposed Unicode: every accented letter is a single code point.
# Written as escapes so the difference from the decomposed spelling stays
# visible in the source; the string displays as the same Vietnamese phrase.
s = u"c\u1ed9ng h\u00f2a x\u00e3 h\u1ed9i"
analyze_characters(s)


        utf8    unicode
c  ->     63 ->    0063
ộ  -> e1bb99 ->    1ed9
n  ->     6e ->    006e
g  ->     67 ->    0067
   ->     20 ->    0020
h  ->     68 ->    0068
ò  ->   c3b2 ->    00f2
a  ->     61 ->    0061
   ->     20 ->    0020
x  ->     78 ->    0078
ã  ->   c3a3 ->    00e3
   ->     20 ->    0020
h  ->     68 ->    0068
ộ  -> e1bb99 ->    1ed9
i  ->     69 ->    0069

After normalization


In [4]:
import unicodedata

# Decomposed ("combining") Unicode input for the normalization examples.
# Escapes keep the exact code points even if the source text is normalized
# by an editor or tool.
s = u"co\u0323\u0302ng ho\u0300a xa\u0303 ho\u0323\u0302i"

analyze_characters(s)


        utf8    unicode
c  ->     63 ->    0063
o  ->     6f ->    006f
̣   ->   cca3 ->    0323
̂   ->   cc82 ->    0302
n  ->     6e ->    006e
g  ->     67 ->    0067
   ->     20 ->    0020
h  ->     68 ->    0068
o  ->     6f ->    006f
̀   ->   cc80 ->    0300
a  ->     61 ->    0061
   ->     20 ->    0020
x  ->     78 ->    0078
a  ->     61 ->    0061
̃   ->   cc83 ->    0303
   ->     20 ->    0020
h  ->     68 ->    0068
o  ->     6f ->    006f
̣   ->   cca3 ->    0323
̂   ->   cc82 ->    0302
i  ->     69 ->    0069

In [5]:
# NFC: canonical decomposition followed by canonical composition, so the
# combining sequences in `s` are merged into precomposed characters.
analyze_characters(unicodedata.normalize("NFC", s))


        utf8    unicode
c  ->     63 ->    0063
ộ  -> e1bb99 ->    1ed9
n  ->     6e ->    006e
g  ->     67 ->    0067
   ->     20 ->    0020
h  ->     68 ->    0068
ò  ->   c3b2 ->    00f2
a  ->     61 ->    0061
   ->     20 ->    0020
x  ->     78 ->    0078
ã  ->   c3a3 ->    00e3
   ->     20 ->    0020
h  ->     68 ->    0068
ộ  -> e1bb99 ->    1ed9
i  ->     69 ->    0069

In [6]:
# NFD: canonical decomposition. `s` is already decomposed, so the characters
# printed here match the un-normalized input.
analyze_characters(unicodedata.normalize("NFD", s))


        utf8    unicode
c  ->     63 ->    0063
o  ->     6f ->    006f
̣   ->   cca3 ->    0323
̂   ->   cc82 ->    0302
n  ->     6e ->    006e
g  ->     67 ->    0067
   ->     20 ->    0020
h  ->     68 ->    0068
o  ->     6f ->    006f
̀   ->   cc80 ->    0300
a  ->     61 ->    0061
   ->     20 ->    0020
x  ->     78 ->    0078
a  ->     61 ->    0061
̃   ->   cc83 ->    0303
   ->     20 ->    0020
h  ->     68 ->    0068
o  ->     6f ->    006f
̣   ->   cca3 ->    0323
̂   ->   cc82 ->    0302
i  ->     69 ->    0069

In [7]:
# NFKC: compatibility decomposition followed by canonical composition.
# The helper defined in this notebook is analyze_characters; the previous
# call to the undefined name utf8_code raised NameError.
analyze_characters(unicodedata.normalize("NFKC", s))

In [ ]:
# NFKD: compatibility decomposition; print the resulting characters.
analyze_characters(unicodedata.normalize("NFKD", s))

One symbol, many characters

Several distinct Unicode characters look like the letter `Đ`:

Character UTF-8 Unicode
Ð C3 90 U+00D0
Đ C4 90 U+0110
Ɖ C6 89 U+0189
ᴆ E1 B4 86 U+1D06

In [ ]:
# Four look-alike characters: U+00D0, U+0110, U+0189 and U+1D06.
# They are spelled as escapes because the glyphs are nearly indistinguishable
# in source form.
text = u"\u00d0\u0110\u0189\u1d06"
analyze_characters(text)

In [ ]:
def map_character_to_tcvn(c):
    """Return the replacement registered for ``c``, or ``c`` itself.

    :param unicode c: a single character
    :return: the mapped character when ``c`` is a registered look-alike,
        otherwise ``c`` unchanged
    :rtype: unicode
    """
    # Replacement character -> look-alike characters that are rewritten to it.
    # The entries must be unicode literals: with byte-string keys a unicode
    # ``c`` never compares equal in Python 2, so no character was ever mapped.
    # NOTE(review): U+0110 is LATIN CAPITAL LETTER D WITH STROKE and U+00D0 is
    # LATIN CAPITAL LETTER ETH; confirm that rewriting U+0110 to U+00D0 (and
    # not the reverse) is the intended direction.
    inverse_mapping_table = {
        u"\u00d0": [      # UTF-8 c390
            u"\u0110",    # UTF-8 c490
        ],
    }
    # Invert the table so a look-alike can be looked up directly.
    mapping_table = {}
    for key, characters in inverse_mapping_table.items():
        for character in characters:
            mapping_table[character] = key
    return mapping_table.get(c, c)

def map_text_to_tcvn(text):
    """Apply ``map_character_to_tcvn`` to every character of ``text``.

    :param unicode text: input text; callers are expected to pass it in
        NFC-normalized form
    :return: the mapped characters, one list item per input character,
        in their original order
    :rtype: list
    """
    mapped_characters = []
    for character in text:
        mapped_characters.append(map_character_to_tcvn(character))
    return mapped_characters

def convert_to_tcvn(text):
    """Normalize ``text`` to NFC and run it through ``map_text_to_tcvn``.

    :param unicode text: input text
    :return: the result of ``map_text_to_tcvn`` for the NFC-normalized text
    """
    return map_text_to_tcvn(unicodedata.normalize("NFC", text))

In [ ]:
# Normalize `text` to NFC, remap its characters, and print the code points
# of the result.
analyze_characters(convert_to_tcvn(text))

Convert to TCVN 6909


In [ ]:
from locale import LC_ALL, setlocale
# Switch every locale category to "Vietnamese" and print the value that
# setlocale returns.
# NOTE(review): locale names are platform-specific; "Vietnamese" looks like a
# Windows-style name, and setlocale raises locale.Error when the requested
# locale is unavailable. Confirm the target platform.
print setlocale(LC_ALL,"Vietnamese")

# NOTE(review): string.letters exists only in Python 2, where its value is
# documented as locale-dependent; presumably this cell is meant to show the
# letters of the locale selected above. Verify.
from string import letters
print letters

In [ ]: