In [14]:
import codecs
import json

from cjklib.reading import ReadingFactory
rf = ReadingFactory()


# Load ClassifierDescriptions
f = codecs.open('dist/classifierDescriptions.txt', encoding='utf-8')
ClDesc = {}

for line in f:    
    d = line.rstrip('\n').split('\t')
    ClDesc[d[0]] = d[1]
    
    

# Process Unihan Readings
Unihan = {}

# Codepoint (e.g. 'U+3405') to python character
def cp2chr(str):
    return unichr(int(str[2:], 16))


f = codecs.open('lib/Unihan_Readings.txt', encoding='utf-8')
for line in f:
    if line[0]!='#':
        l = line.rstrip('\n').split("\t")
        # Try for characters representable in 'narrow' build
        try:
            c = cp2chr(l[0])
            if c not in Unihan:
                Unihan[c] = {}
            Unihan[c][l[1]] = l[2]
        except:
            pass

        
    
    
# Load CEDICT
f = codecs.open('lib/cedict_1_0_ts_utf-8_mdbg.txt', encoding='utf-8')
Cl = []

fail = {}

for line in f:
    if line.find("CL:") == -1:
        continue
        
    line = line.rstrip('\n')
    
    d1 = line.find(" ")+1
    d2 = line[d1:].find(" ") + d1
    simp = line[d1:d2]
    
        
    data = line.split("/")
    for l in data:
        if l[0:3] == "CL:":
            l = l[3:]
            
            kom = l.find(",")
            if kom != -1:
                l = l[:kom]
            
            cl = l[:l.find("[")]
            py = l[l.find("[")+1 : l.find("]")]
            py = rf.convert(py, 'Pinyin', 'Pinyin', sourceOptions={'toneMarkType': 'numbers', 'yVowel': 'u:'})
            
            
            sep = cl.find("|")
            if sep != -1:
                cl = cl[sep+1:]
                
            # Add Definition
            if cl in ClDesc:
                defn = ClDesc[cl]
            elif cl in Unihan:
                defn = Unihan[cl][u'kDefinition']
            else:
                if cl not in fail:
                    fail[cl] = 1
                fail[cl] += 1
                defn = ''
            
            Cl += [ [simp, cl, py, defn] ]
   

# Write tab-seperated list
with codecs.open('dist/classifiersAll.txt', 'w', encoding='utf-8') as f:
    for k in Cl:
        f.write("\t".join(k)+"\n")
        

with codecs.open('dist/classifiers.txt', 'w', encoding='utf-8') as f:
    for k in Cl:
        if k[1] != u'个':
            f.write("\t".join(k)+"\n")

In [11]:
for k,v in fail.iteritems():
    print k,v


碗 3
户 3
堆 2
袋 2
线 2
服 2
双 11
行 2
盒 3
滴 4
列 4
帖 2
队 2

In [ ]: