In [2]:
import codecs
In [3]:
# Process Unihan Readings
# Unihan maps a character to a dict of {field name: field value}; it is filled
# from the readings file below and later queried for the u'kMandarin' and
# u'kDefinition' fields by getDefinition.
Unihan = {}
# Codepoint (e.g. 'U+3405') to python character
def cp2chr(str):
    """Convert a code point label such as 'U+3405' to the character it names.

    The first two characters of the label (the 'U+' prefix) are dropped and
    the remainder is parsed as a hexadecimal number.

    Args:
        str: Code point label, e.g. 'U+3405'. The parameter name shadows the
            builtin but is kept so existing callers are unaffected.

    Returns:
        The unicode character for that code point.

    Raises:
        ValueError: If the hexadecimal part cannot be parsed, or if the
            interpreter cannot represent the code point (for example a code
            point above U+FFFF on a 'narrow' Python 2 build).
    """
    try:
        to_char = unichr  # Python 2: plain chr() only covers 0-255
    except NameError:
        to_char = chr  # Python 3: unichr() no longer exists, chr() replaces it
    return to_char(int(str[2:], 16))
# Fill Unihan from the tab-separated readings file. Each data line is used as
# <code point label> TAB <field name> TAB <field value>.
# The context manager closes the file even if parsing raises.
with codecs.open('lib/Unihan_Readings.txt', encoding='utf-8') as f:
    for line in f:
        # Lines starting with '#' are comments
        if line[0] == '#':
            continue
        fields = line.rstrip('\n').split("\t")
        # Blank or truncated lines do not carry all three fields; skip them
        # explicitly instead of relying on an exception being swallowed.
        if len(fields) < 3:
            continue
        # Try for characters representable in 'narrow' build
        try:
            char = cp2chr(fields[0])
        except ValueError:
            # The code point cannot be turned into a character here (or the
            # label is not valid hex): leave it out. Any other exception is a
            # real error and is allowed to propagate.
            continue
        Unihan.setdefault(char, {})[fields[1]] = fields[2]
# Process decomposition list
# data maps the text before the first ':' of each line to the list of
# comma-separated components found between the parentheses that follow it.
# Anything between the ':' and the opening parenthesis is ignored.
data = {}
# The context manager closes the file even if parsing raises.
with codecs.open('lib/cjk-decomp-0.4.0.txt', encoding='utf-8') as f:
    for line in f:
        key, colon, spec = line.rstrip('\n').partition(':')
        opening = spec.find("(")
        closing = spec.find(")")
        # Skip blank or malformed lines (no ':' or no '(...)' group) instead
        # of failing with an IndexError or slicing at position -1.
        if not colon or opening < 0 or closing < opening:
            continue
        components = spec[opening + 1:closing]
        # An empty group "()" means "no components"; ''.split(',') would
        # otherwise yield [''] and inject an empty-string component downstream.
        data[key] = components.split(',') if components else []
In [6]:
# Create final decomposition list

# Component glyphs that getDefinition swaps for another character before doing
# anything else; only the replacement character appears in the output.
replace = {
u'⺙': u'攵',
u'⺆': u'冂',
u'⺁': u'厂',
u'卄': u'艹',
# The escaped keys below are code points in the Unicode "CJK Strokes" block,
# mapped to the ordinary character used for the same stroke.
u'\u31d0': u'一',
u'\u31d4': u'丶',
u'\u31d3': u'丿',
u'\u31d1': u'丨',
u'\u31df': u'乚',
u'\u31e0': u'乙',
}
# Component glyphs whose Unihan data is looked up under another character.
# Unlike `replace`, getDefinition keeps the glyph in the output and appends the
# looked-up character after a '|' (e.g. u'亻|人').
equivalent = {
u'⻊': u'足',
u'⺮': u'竹',
u'⺌': u'小',
u'⺍': u'小',
u'⺤': u'爪',
u'⺊': u'卜',
u'⺈': u'刀',
u'讠': u'言',
u'亻': u'人',
}
# Characters that had no Unihan entry, mapped to how often they were requested
fail = {}


def getDefinition(c):
    """Build the display text for one decomposition component.

    The component is first substituted through `replace`. If the result has
    an entry in `equivalent`, the Unihan lookup uses that equivalent and the
    label becomes u'<component>|<equivalent>'.

    Args:
        c: The component character.

    Returns:
        The label, followed by u' (<kMandarin>)' and u' <kDefinition>' for
        whichever of those fields the Unihan entry has. If the looked-up
        character has no Unihan entry, the looked-up character alone is
        returned (without any u'<component>|' prefix) and its counter in
        `fail` is incremented.
    """
    lookup = replace[c] if c in replace else c
    label = lookup
    if lookup in equivalent:
        lookup = equivalent[lookup]
        label = label + u'|' + lookup

    if lookup not in Unihan:
        # Remember the miss so it can be reported later
        fail[lookup] = fail.get(lookup, 0) + 1
        return lookup

    entry = Unihan[lookup]
    if u'kMandarin' in entry:
        label = label + u' ({})'.format(entry[u'kMandarin'])
    if u'kDefinition' in entry:
        label = label + u' ' + entry[u'kDefinition']
    return label
# Recursively replace the number entries by characters
def replaceNumbers(L):
    """Flatten a component list so that only single-character items remain.

    Any item longer than one character is treated as a key into `data` and is
    replaced, recursively, by the flattened components stored under that key.

    Args:
        L: Iterable of component strings.

    Returns:
        A new list of the expanded components, in their original order.

    Raises:
        KeyError: If an item longer than one character is not a key of `data`.
    """
    flattened = []
    for component in L:
        if len(component) <= 1:
            flattened.append(component)
            continue
        # Multi-character item: expand it through its own decomposition
        flattened.extend(replaceNumbers(data[component]))
    return flattened
# deComp maps each single-character key of `data` to the list of display
# strings (one per fully expanded component) produced by getDefinition.
deComp = {}
# .items() instead of .iteritems(): same pairs, and it also exists on Python 3
for char, components in data.items():
    # Keys longer than one character are only referenced from other entries
    # (they are expanded by replaceNumbers) and get no output entry themselves.
    if len(char) == 1:
        deComp[char] = [getDefinition(part) for part in replaceNumbers(components)]
# Write result file
# One line per character: u'<character>: <component> / <component> / ...'.
# The context manager flushes and closes the file even if a write raises.
with codecs.open('dist/characterDecompositions.txt', 'w', encoding='utf-8') as fnew:
    # .items() instead of .iteritems(): same pairs, and it also exists on Python 3
    for key, val in deComp.items():
        line = u'{}: {}\n'.format(key, ' / '.join(val))
        fnew.write(line)
In [5]:
import operator
# Report the characters that had no Unihan entry, most frequently requested first
s = sorted(fail.items(), key=operator.itemgetter(1), reverse=True)
for char, count in s:
    # A single pre-formatted argument prints the same text whether `print` is
    # the Python 2 statement or the Python 3 function.
    print(u'{} {}'.format(char, count))
# Bare expression: shows the sorted (character, count) list as the cell output
s
Out[5]:
In [ ]: