In [2]:
import codecs

In [3]:
# Process Unihan Readings
Unihan = {}

# Codepoint (e.g. 'U+3405') to python character
def cp2chr(str):
    return unichr(int(str[2:], 16))


f = codecs.open('lib/Unihan_Readings.txt', encoding='utf-8')
for line in f:
    if line[0]!='#':
        l = line.rstrip('\n').split("\t")
        # Try for characters representable in 'narrow' build
        try:
            c = cp2chr(l[0])
            if c not in Unihan:
                Unihan[c] = {}
            Unihan[c][l[1]] = l[2]
        except:
            pass

        
# Process decomposition list
data = {} 

f = codecs.open('lib/cjk-decomp-0.4.0.txt', encoding='utf-8')
for line in f:
    l = line.rstrip('\n').split(':')
    s = l[1]
    s = s[s.find("(")+1:s.find(")")].split(',')
    data[l[0]] = s

In [6]:
# Create final decomposition list     
replace = {
    u'⺙': u'攵',
    u'⺆': u'冂',
    u'⺁': u'厂',
    u'卄': u'艹',
    u'\u31d0': u'一',
    u'\u31d4': u'丶',
    u'\u31d3': u'丿',
    u'\u31d1': u'丨',
    u'\u31df': u'乚',
    u'\u31e0': u'乙',
}

equivalent = {
    u'⻊': u'足',
    u'⺮': u'竹',
    u'⺌': u'小',
    u'⺍': u'小',
    u'⺤': u'爪',
    u'⺊': u'卜',
    u'⺈': u'刀',
    u'讠': u'言',
    u'亻': u'人',
    
}

fail = {}
        
def getDefinition(c):
    if c in replace:
        c = replace[c] 
    res = c
    
    if c in equivalent:
        c = equivalent[c]
        res += u'|' + c
        
    if c not in Unihan:
        if c not in fail:
            fail[c] = 1
        else:
            fail[c] += 1
        return c
    
    ucd = Unihan[c]
    if u'kMandarin' in ucd:
        res += u' ({})'.format(ucd[u'kMandarin'])
    if u'kDefinition' in ucd:
        res += u' ' + ucd[u'kDefinition']
    return res


# Recursively replace the number entries by characters
def replaceNumbers(L):
    res = []
    for l in L:
        if len(l) > 1:
            res += replaceNumbers(data[l])
        else:
            res += [ l ]
    return res

deComp = {}

for k, v in data.iteritems():
    if len(k) == 1:
        D = replaceNumbers(v)
        res = []
        for d in D:
            res += [ getDefinition(d) ]
        deComp[k] = res

# Write result file
fnew = codecs.open('dist/characterDecompositions.txt', 'w', encoding='utf-8')
for key, val in deComp.iteritems():
    line = u'{}: {}\n'.format(key, ' / '.join(val))
    fnew.write( line )

fnew.close()

In [5]:
import operator
s = sorted(fail.items(), key=operator.itemgetter(1), reverse=True)
for k in s:
    print k[0], k[1]
s


㇒ 158
㇗ 70
㇆ 63
㇉ 49
龹 36
 35
龴 33
⺶ 32
㇚ 26
⺀ 20
龸 19
龰 17
㇎ 17
㇏ 16
⺕ 15
⺺ 15
㇖ 14
龺 14
龶 12
龷 10
㇂ 10
⺹ 10
㇛ 10
㑒 9
㇇ 8
㇕ 8
㳟 8
㇝ 7
⺼ 6
㇜ 5
龵 4
⺧ 3
⺳ 3
㇀ 3
㇅ 3
㣊 3
㇣ 3
⺪ 2
⺻ 2
㇈ 2
㇋ 2
㇌ 2
㇙ 2
⻭ 2
㖈 1
⻖ 1
㇊ 1
䘖 1
㇢ 1
㢤 1
㦳 1
⺱ 1
㇁ 1
㇃ 1
⻍ 1
㽔 1
Out[5]:
[(u'\u31d2', 158),
 (u'\u31d7', 70),
 (u'\u31c6', 63),
 (u'\u31c9', 49),
 (u'\u9fb9', 36),
 (u'', 35),
 (u'\u9fb4', 33),
 (u'\u2eb6', 32),
 (u'\u31da', 26),
 (u'\u2e80', 20),
 (u'\u9fb8', 19),
 (u'\u9fb0', 17),
 (u'\u31ce', 17),
 (u'\u31cf', 16),
 (u'\u2e95', 15),
 (u'\u2eba', 15),
 (u'\u31d6', 14),
 (u'\u9fba', 14),
 (u'\u9fb6', 12),
 (u'\u9fb7', 10),
 (u'\u31c2', 10),
 (u'\u2eb9', 10),
 (u'\u31db', 10),
 (u'\u3452', 9),
 (u'\u31c7', 8),
 (u'\u31d5', 8),
 (u'\u3cdf', 8),
 (u'\u31dd', 7),
 (u'\u2ebc', 6),
 (u'\u31dc', 5),
 (u'\u9fb5', 4),
 (u'\u2ea7', 3),
 (u'\u2eb3', 3),
 (u'\u31c0', 3),
 (u'\u31c5', 3),
 (u'\u38ca', 3),
 (u'\u31e3', 3),
 (u'\u2eaa', 2),
 (u'\u2ebb', 2),
 (u'\u31c8', 2),
 (u'\u31cb', 2),
 (u'\u31cc', 2),
 (u'\u31d9', 2),
 (u'\u2eed', 2),
 (u'\u3588', 1),
 (u'\u2ed6', 1),
 (u'\u31ca', 1),
 (u'\u4616', 1),
 (u'\u31e2', 1),
 (u'\u38a4', 1),
 (u'\u39b3', 1),
 (u'\u2eb1', 1),
 (u'\u31c1', 1),
 (u'\u31c3', 1),
 (u'\u2ecd', 1),
 (u'\u3f54', 1)]

In [ ]: