Analysis of Unicode Character Names

Character data from Python unicodedata module


In [2]:
import sys
import unicodedata

In [3]:
sys.maxunicode


Out[3]:
1114111

In [4]:
unicodedata.unidata_version


Out[4]:
'9.0.0'

In [5]:
def python_named_chars():
    """Yield ``(char, name)`` for every code point that ``unicodedata`` can name.

    Code points are visited in ascending order from 0 through
    ``sys.maxunicode`` inclusive. Code points for which
    ``unicodedata.name`` has no name are skipped.

    Yields:
        tuple[str, str]: the one-character string and its Unicode name.
    """
    # sys.maxunicode is itself a valid code point, so the range must include it.
    for code in range(sys.maxunicode + 1):
        char = chr(code)
        # Passing a default avoids raising ValueError for unnamed code points.
        name = unicodedata.name(char, None)
        if name is not None:
            yield char, name

In [6]:
# Materialise the generator once so the pairs can be indexed and re-used below.
l_py = [*python_named_chars()]

In [7]:
len(l_py)


Out[7]:
122047

In [8]:
l_py[0]


Out[8]:
(' ', 'SPACE')

In [9]:
l_py[:5], l_py[-5:]


Out[9]:
([(' ', 'SPACE'),
  ('!', 'EXCLAMATION MARK'),
  ('"', 'QUOTATION MARK'),
  ('#', 'NUMBER SIGN'),
  ('$', 'DOLLAR SIGN')],
 [('󠇫', 'VARIATION SELECTOR-252'),
  ('󠇬', 'VARIATION SELECTOR-253'),
  ('󠇭', 'VARIATION SELECTOR-254'),
  ('󠇮', 'VARIATION SELECTOR-255'),
  ('󠇯', 'VARIATION SELECTOR-256')])

In [10]:
# Distinct character names known to the unicodedata module.
set_py = set(pair[1] for pair in l_py)

In [11]:
import collections

# Tally every word over all names from unicodedata; a hyphen is treated
# as a word separator, so 'SELECTOR-252' contributes 'SELECTOR' and '252'.
words = collections.Counter(
    part
    for _char, char_name in l_py
    for part in char_name.replace('-', ' ').split()
)

len(words)


Out[11]:
102743

In [12]:
# Ten most frequent words: count right-aligned in six columns, then the word.
for entry in words.most_common(10):
    word, count = entry
    print(f'{count:6d} {word}')


 81593 CJK
 81533 IDEOGRAPH
 80428 UNIFIED
 13393 SYLLABLE
 11735 HANGUL
  9280 LETTER
  3042 SIGN
  2630 WITH
  2557 SMALL
  1887 CAPITAL

In [21]:
# Keep only the (word, count) pairs whose word occurs more than once,
# preserving the most-common-first ordering.
mc = [entry for entry in words.most_common() if entry[1] > 1]
len(mc)


Out[21]:
4295

In [26]:
mc[len(mc)//100]


Out[26]:
('CYRILLIC', 444)

Character data from UnicodeData.txt


In [10]:
# Count the raw lines of UnicodeData.txt. The context manager closes the
# file handle as soon as counting is done instead of leaving it open, and
# the lines are counted lazily rather than loaded into a list first.
with open('UnicodeData.txt') as ucd_file:
    ucd_line_count = sum(1 for _line in ucd_file)

ucd_line_count


Out[10]:
31618

In [11]:
import ucd  # local module

In [12]:
# Collect every record produced by the local ucd parser, then show how many there are.
l_ucd = [record for record in ucd.parser()]
len(l_ucd)


Out[12]:
31523

In [13]:
l_ucd[:5], l_ucd[-5:]


Out[13]:
([NameRecord(code=32, name='SPACE', old_name='', words=['SPACE']),
  NameRecord(code=33, name='EXCLAMATION MARK', old_name='', words=['EXCLAMATION', 'MARK']),
  NameRecord(code=34, name='QUOTATION MARK', old_name='', words=['MARK', 'QUOTATION']),
  NameRecord(code=35, name='NUMBER SIGN', old_name='', words=['NUMBER', 'SIGN']),
  NameRecord(code=36, name='DOLLAR SIGN', old_name='', words=['DOLLAR', 'SIGN'])],
 [NameRecord(code=917995, name='VARIATION SELECTOR-252', old_name='', words=['252', 'SELECTOR', 'VARIATION']),
  NameRecord(code=917996, name='VARIATION SELECTOR-253', old_name='', words=['253', 'SELECTOR', 'VARIATION']),
  NameRecord(code=917997, name='VARIATION SELECTOR-254', old_name='', words=['254', 'SELECTOR', 'VARIATION']),
  NameRecord(code=917998, name='VARIATION SELECTOR-255', old_name='', words=['255', 'SELECTOR', 'VARIATION']),
  NameRecord(code=917999, name='VARIATION SELECTOR-256', old_name='', words=['256', 'SELECTOR', 'VARIATION'])])

In [14]:
# Distinct character names found in the parsed records.
set_ucd = set(record.name for record in l_ucd)

Difference between names from unicodedata module and UnicodeData.txt

Note: UnicodeData.txt does not contain algorithmically derived names such as 'CJK UNIFIED IDEOGRAPH-20004'.


In [15]:
set_py > set_ucd


Out[15]:
False

In [16]:
set_ucd > set_py


Out[16]:
False

In [17]:
# Names present in the parsed records but unknown to unicodedata, in sorted order.
ucd_only = sorted(set_ucd.difference(set_py))
len(ucd_only)


Out[17]:
1024

In [18]:
ucd_only[:7], ucd_only[-7:]


Out[18]:
(['ADULT',
  'BEARDED PERSON',
  'BENGALI ABBREVIATION SIGN',
  'BENGALI LETTER VEDIC ANUSVARA',
  'BILLED CAP',
  'BITCOIN SIGN',
  'BOPOMOFO LETTER O WITH DOT ABOVE'],
 ['ZANABAZAR SQUARE VOWEL SIGN O',
  'ZANABAZAR SQUARE VOWEL SIGN OE',
  'ZANABAZAR SQUARE VOWEL SIGN REVERSED I',
  'ZANABAZAR SQUARE VOWEL SIGN U',
  'ZANABAZAR SQUARE VOWEL SIGN UE',
  'ZEBRA FACE',
  'ZOMBIE'])

In [19]:
# Names known to unicodedata but absent from the parsed records, in sorted order.
py_only = sorted(set_py.difference(set_ucd))
len(py_only)


Out[19]:
91548

In [20]:
py_only[:7], py_only[-7:]


Out[20]:
(['CJK UNIFIED IDEOGRAPH-20000',
  'CJK UNIFIED IDEOGRAPH-20001',
  'CJK UNIFIED IDEOGRAPH-20002',
  'CJK UNIFIED IDEOGRAPH-20003',
  'CJK UNIFIED IDEOGRAPH-20004',
  'CJK UNIFIED IDEOGRAPH-20005',
  'CJK UNIFIED IDEOGRAPH-20006'],
 ['HANGUL SYLLABLE YUNG',
  'HANGUL SYLLABLE YUNH',
  'HANGUL SYLLABLE YUNJ',
  'HANGUL SYLLABLE YUP',
  'HANGUL SYLLABLE YUS',
  'HANGUL SYLLABLE YUSS',
  'HANGUL SYLLABLE YUT'])

In [21]:
import collections

# Tally the words of the unicodedata-only names, skipping any name that
# contains 'CJK UNIFIED IDEOGRAPH'; hyphens count as word separators.
words = collections.Counter(
    part
    for char_name in py_only
    if 'CJK UNIFIED IDEOGRAPH' not in char_name
    for part in char_name.replace('-', ' ').split()
)

len(words)


Out[21]:
11174

In [22]:
# Ten most frequent words of the current tally, count first.
top_ten = words.most_common(10)
for word, count in top_ten:
    print(f'{count:6d} {word}')


 11172 HANGUL
 11172 SYLLABLE
     1 A
     1 AB
     1 ABS
     1 AC
     1 AD
     1 AE
     1 AEB
     1 AEBS

In [32]:
# Tally the words of every name in set_ucd. The names are visited in sorted
# order so that words with equal counts keep a stable relative order.
words = collections.Counter(
    part
    for char_name in sorted(set_ucd)
    for part in char_name.replace('-', ' ').split()
)

len(words)


Out[32]:
12465

In [33]:
# Hundred most frequent words, count right-aligned in six columns.
top_hundred = words.most_common(100)
for word, count in top_hundred:
    print(f'{count:6d} {word}')


  9719 LETTER
  3109 SIGN
  2658 WITH
  2558 SMALL
  2221 SYLLABLE
  1887 CAPITAL
  1654 HIEROGLYPH
  1492 LATIN
  1284 ARABIC
  1248 YI
  1234 CUNEIFORM
  1217 CJK
  1205 SYMBOL
  1157 IDEOGRAPH
  1152 MATHEMATICAL
  1071 EGYPTIAN
  1014 COMPATIBILITY
   807 FORM
   801 A
   795 DIGIT
   756 TANGUT
   755 COMPONENT
   719 VOWEL
   710 CANADIAN
   710 SYLLABICS
   672 SIGNWRITING
   664 TIMES
   657 BAMUM
   584 BOLD
   583 ANATOLIAN
   580 AND
   576 ARROW
   569 PHASE
   563 HANGUL
   552 LINEAR
   530 GREEK
   516 LIGATURE
   516 MUSICAL
   495 ETHIOPIC
   491 CHARACTER
   464 E
   455 FOR
   445 COMBINING
   444 CYRILLIC
   443 DOUBLE
   439 ABOVE
   429 ITALIC
   422 OLD
   418 SQUARE
   404 LEFT
   403 NUMBER
   401 RIGHT
   397 NUSHU
   393 U
   387 SERIF
   385 RADICAL
   385 SANS
   378 CIRCLED
   371 DOTS
   351 MARK
   349 FINAL
   344 O
   344 B
   344 TAI
   310 I
   300 VAI
   299 TWO
   294 HAND
   292 BLACK
   286 ONE
   285 HENTAIGANA
   280 BELOW
   279 DOT
   260 SELECTOR
   260 VARIATION
   257 PATTERN
   256 BRAILLE
   255 THREE
   253 WHITE
   249 MODIFIER
   246 BYZANTINE
   243 VERTICAL
   236 ISOLATED
   226 STROKE
   226 KATAKANA
   223 MYANMAR
   222 HEAVY
   217 OF
   214 KANGXI
   213 D
   213 MENDE
   213 KIKAKUI
   207 INITIAL
   207 TIBETAN
   206 TO
   204 FOUR
   202 MEEM
   202 C
   195 KA
   195 UP

In [25]:
max(words, key=len)


Out[25]:
'SYLLABLE'

In [26]:
# (count, word) pairs for the one-character words, ordered by ascending count.
singles = [(count, word) for word, count in words.items() if len(word) == 1]
singles.sort()

In [27]:
len(singles)


Out[27]:
0

In [28]:
# Walk the sorted pairs backwards so the highest counts are printed first.
for count, word in singles[::-1]:
    print(f'{count:6d} {word}')

In [29]:
# Words that occur exactly once, in alphabetical order.
unique = [word for word, count in words.items() if count == 1]
unique.sort()

In [30]:
len(unique)


Out[30]:
0

In [31]:
unique[:50], unique[-50:]


Out[31]:
([], [])