In [2]:
import sys
import unicodedata
In [3]:
# Largest code point this interpreter supports; used as the scan bound below.
sys.maxunicode
Out[3]:
In [4]:
# Version of the Unicode database bundled with this Python's unicodedata module.
unicodedata.unidata_version
Out[4]:
In [5]:
def python_named_chars(start=0, stop=None):
    """Yield ``(char, name)`` for each code point that ``unicodedata`` can name.

    Args:
        start: First code point to examine (inclusive). Defaults to 0.
        stop: Code point to stop before (exclusive). Defaults to
            ``sys.maxunicode + 1`` so the highest code point is examined too.

    Yields:
        ``(char, name)`` tuples in ascending code point order. Code points
        for which ``unicodedata.name`` has no name are skipped.
    """
    if stop is None:
        # range() excludes its end value; +1 makes sys.maxunicode itself visited.
        stop = sys.maxunicode + 1
    for code in range(start, stop):
        char = chr(code)
        # Passing a default avoids raising/catching ValueError for unnamed chars.
        name = unicodedata.name(char, None)
        if name is not None:
            yield char, name
In [6]:
# Materialize the generator so the (char, name) pairs can be indexed and reused.
l_py = [*python_named_chars()]
In [7]:
# Number of (char, name) pairs collected, i.e. how many code points have a name.
len(l_py)
Out[7]:
In [8]:
# First pair collected: the lowest code point that has a name.
l_py[0]
Out[8]:
In [9]:
# Peek at both ends of the list: the first five and the last five pairs.
l_py[:5], l_py[-5:]
Out[9]:
In [10]:
# Keep only the names (dropping the characters) as a set for set algebra later.
set_py = set(name for _, name in l_py)
In [11]:
import collections

# Tally every word appearing in the Python-known names; hyphens are treated
# as word separators so hyphenated names contribute each of their parts.
words = collections.Counter(
    part
    for _, name in l_py
    for part in name.replace('-', ' ').split()
)
# Number of distinct words.
len(words)
Out[11]:
In [12]:
# Ten most frequent words, count first (right-aligned, width 6).
for entry in words.most_common(10):
    word, count = entry
    print(f'{count:6d} {word}')
In [21]:
# Words that occur more than once, still ordered from most to least common.
mc = [pair for pair in words.most_common() if pair[1] > 1]
len(mc)
Out[21]:
In [26]:
# Entry one hundredth of the way into mc (mc is ordered by descending count).
mc[len(mc)//100]
Out[26]:
In [10]:
# Count the lines of UnicodeData.txt. The with-block closes the file as soon
# as counting is done instead of leaving the handle open, and summing over
# the iterator avoids building a list of every line just to take its length.
with open('UnicodeData.txt') as ucd_file:
    ucd_line_count = sum(1 for _ in ucd_file)
ucd_line_count
Out[10]:
In [11]:
import ucd # local module
In [12]:
# Materialize everything the local ucd parser produces and count the records.
# NOTE(review): presumably one record per UnicodeData.txt entry — confirm in ucd.py.
l_ucd = list(ucd.parser())
len(l_ucd)
Out[12]:
In [13]:
# Peek at the first five and last five parsed records.
l_ucd[:5], l_ucd[-5:]
Out[13]:
In [14]:
# Collect the name attribute of every parsed record into a set.
set_ucd = set(rec.name for rec in l_ucd)
In [15]:
# True only if set_py is a proper superset of set_ucd.
set_py > set_ucd
Out[15]:
In [16]:
# True only if set_ucd is a proper superset of set_py.
set_ucd > set_py
Out[16]:
In [17]:
# Names present in the UCD set but absent from the Python set, sorted.
ucd_only = sorted(set_ucd.difference(set_py))
len(ucd_only)
Out[17]:
In [18]:
# First and last seven names (in sorted order) found only in the UCD set.
ucd_only[:7], ucd_only[-7:]
Out[18]:
In [19]:
# Names present in the Python set but absent from the UCD set, sorted.
py_only = sorted(set_py.difference(set_ucd))
len(py_only)
Out[19]:
In [20]:
# First and last seven names (in sorted order) found only in the Python set.
py_only[:7], py_only[-7:]
Out[20]:
In [21]:
import collections

# Tally the words of the Python-only names, ignoring any name that contains
# 'CJK UNIFIED IDEOGRAPH'; hyphens count as word separators.
words = collections.Counter(
    part
    for name in py_only
    if 'CJK UNIFIED IDEOGRAPH' not in name
    for part in name.replace('-', ' ').split()
)
# Number of distinct words.
len(words)
Out[21]:
In [22]:
# Ten most frequent words, count first (right-aligned, width 6).
for entry in words.most_common(10):
    word, count = entry
    print(f'{count:6d} {word}')
In [32]:
# Tally the words of every UCD name, visiting the names in sorted order;
# hyphens count as word separators.
words = collections.Counter(
    part
    for name in sorted(set_ucd)
    for part in name.replace('-', ' ').split()
)
# Number of distinct words.
len(words)
Out[32]:
In [33]:
# Hundred most frequent words, count first (right-aligned, width 6).
for entry in words.most_common(100):
    word, count = entry
    print(f'{count:6d} {word}')
In [25]:
# Longest word in the tally; on a tie max() returns the first one it encounters.
max(words, key=len)
Out[25]:
In [26]:
# (count, word) pairs for the one-character words, ordered by ascending count
# and then by word.
singles = [(count, word) for word, count in words.items() if len(word) == 1]
singles.sort()
In [27]:
# How many distinct one-character words there are.
len(singles)
Out[27]:
In [28]:
# List the one-character words from highest to lowest count.
for count, word in singles[::-1]:
    print(f'{count:6d} {word}')
In [29]:
# Words that occur exactly once, in sorted order.
unique = [word for word, count in words.items() if count == 1]
unique.sort()
In [30]:
# How many words occur exactly once.
len(unique)
Out[30]:
In [31]:
# First and last fifty of the once-only words, in sorted order.
unique[:50], unique[-50:]
Out[31]: