In [14]:
import codecs
import json
from cjklib.reading import ReadingFactory
# Reading converter; used further down to convert the pinyin of each CEDICT classifier.
rf = ReadingFactory()

# Load ClassifierDescriptions: one "<classifier><TAB><description>" pair per line.
ClDesc = {}
# `with` guarantees the handle is closed even if a line fails to parse.
with codecs.open('dist/classifierDescriptions.txt', encoding='utf-8') as f:
    for lineno, line in enumerate(f, 1):
        line = line.rstrip('\r\n')
        if not line.strip():
            # Blank lines (e.g. a trailing empty line) carry no data.
            continue
        d = line.split('\t')
        if len(d) < 2:
            # Fail with the offending location instead of a bare IndexError.
            raise ValueError(
                "dist/classifierDescriptions.txt:%d: expected "
                "'<classifier><TAB><description>'" % lineno)
        # Columns beyond the second are ignored.
        ClDesc[d[0]] = d[1]
# Process Unihan Readings
Unihan = {}

# `unichr` only exists on Python 2; on Python 3 the built-in `chr` does the same job.
try:
    _code_to_char = unichr
except NameError:
    _code_to_char = chr


# Codepoint (e.g. 'U+3405') to python character
def cp2chr(str):
    """Return the character for a codepoint string such as 'U+3405'.

    The first two characters (the 'U+' prefix) are skipped without being
    checked; the remainder is parsed as a hexadecimal number.

    Raises:
        ValueError: if the remainder is not valid hexadecimal, or if the
            codepoint cannot be represented by the interpreter (e.g. values
            above U+FFFF on a 'narrow' Python 2 build).
    """
    # NOTE: the parameter name shadows the built-in `str`; it is kept so that
    # existing callers (including keyword callers) continue to work.
    return _code_to_char(int(str[2:], 16))
# Fill Unihan as {character: {field name: value}} from the tab-separated
# "<codepoint>\t<field>\t<value>" lines of the readings file.
with codecs.open('lib/Unihan_Readings.txt', encoding='utf-8') as f:
    for line in f:
        if line.startswith('#'):
            # Comment line.
            continue
        fields = line.rstrip('\r\n').split("\t")
        if len(fields) < 3:
            # Blank or incomplete line: nothing to record.
            continue
        # Try for characters representable in 'narrow' build
        try:
            c = cp2chr(fields[0])
        except ValueError:
            # Codepoint is malformed or not representable on this interpreter;
            # skip it. Only ValueError is caught so that genuine programming
            # errors are no longer silently swallowed.
            continue
        Unihan.setdefault(c, {})[fields[1]] = fields[2]
# Load CEDICT
# Cl collects one [headword, classifier, pinyin, definition] row per "CL:" segment.
# fail maps each classifier for which no definition was found to the number of
# times it was encountered.
Cl = []
fail = {}
with codecs.open('lib/cedict_1_0_ts_utf-8_mdbg.txt', encoding='utf-8') as f:
    for line in f:
        if "CL:" not in line:
            continue
        line = line.rstrip('\n')
        # The headword is the second space-delimited token of the entry
        # (presumably the simplified form -- confirm against the CEDICT format).
        d1 = line.find(" ") + 1
        d2 = line[d1:].find(" ") + d1
        simp = line[d1:d2]
        for l in line.split("/"):
            if not l.startswith("CL:"):
                continue
            l = l[3:]
            # Only the first classifier of a comma-separated list is kept.
            # NOTE(review): any further classifiers in the list are dropped --
            # confirm this is intended.
            kom = l.find(",")
            if kom != -1:
                l = l[:kom]
            lb = l.find("[")
            if lb == -1:
                # No bracketed pinyin: keep the whole text as the classifier
                # instead of slicing off its last character.
                cl = l
                py = u''
            else:
                cl = l[:lb]
                py = l[lb+1 : l.find("]")]
                py = rf.convert(py, 'Pinyin', 'Pinyin', sourceOptions={'toneMarkType': 'numbers', 'yVowel': 'u:'})
            # For "a|b" forms keep the part after the bar.
            sep = cl.find("|")
            if sep != -1:
                cl = cl[sep+1:]
            # Add Definition
            if cl in ClDesc:
                defn = ClDesc[cl]
            elif cl in Unihan and u'kDefinition' in Unihan[cl]:
                # Characters that have readings but no kDefinition entry fall
                # through to the failure branch instead of raising KeyError.
                defn = Unihan[cl][u'kDefinition']
            else:
                # Count every miss exactly once (the first miss counts as 1).
                fail[cl] = fail.get(cl, 0) + 1
                defn = ''
            Cl.append([simp, cl, py, defn])
# Write tab-separated lists: the first file receives every row of Cl, the
# second receives every row whose classifier (column 1) is not u'个'.
for out_path, keep_row in (
        ('dist/classifiersAll.txt', lambda row: True),
        ('dist/classifiers.txt', lambda row: row[1] != u'个'),
):
    with codecs.open(out_path, 'w', encoding='utf-8') as out:
        for row in Cl:
            if keep_row(row):
                out.write("\t".join(row)+"\n")
In [11]:
for k,v in fail.iteritems():
print k,v
In [ ]: