Why? Isn't UTF-8 good enough as-is?
Apparently not. I'm looking at you, Amazon Mechanical Turk. MTurk supports only a subset of UTF-8, a subset that excludes characters longer than 3 bytes. How convenient.
Python has no concept of runes, as Golang calls them, so to process a Unicode string character-by-character, we must convert to bytes and consult the UTF-8 specification.
Not particularly fun, but it works.
In [1]:
import os
import csv
import unicodedata
import re
In [9]:
def bytes_in_sequence(byte):
    '''
    Return the length (1-6) of the UTF-8 sequence announced by a lead byte.

    `byte` is the integer value of the byte. The answer is determined by the
    highest lead-byte threshold that `byte` reaches; any value below
    0b11000000 (ASCII bytes as well as continuation bytes) reports 1.
    '''
    # (smallest lead-byte value, sequence length), checked from longest to shortest
    lead_thresholds = (
        (0b11111100, 6),
        (0b11111000, 5),
        (0b11110000, 4),
        (0b11100000, 3),
        (0b11000000, 2),
    )
    for threshold, length in lead_thresholds:
        if byte >= threshold:
            return length
    return 1
In [3]:
def bitfilter(char_iter, maxbytes):
    '''
    Yield the bytes of a UTF-8 stream, dropping every sequence whose
    encoded length exceeds `maxbytes`.

    `char_iter` yields single-byte strings (what iterating a Python 2 `str`
    produces); each item is passed through `ord()`. When a lead byte
    announces a sequence longer than `maxbytes`, that byte and the rest of
    its sequence are consumed without being yielded. Every other byte is
    yielded unchanged, including the continuation bytes of sequences that
    are kept (they are reported as length 1 by `bytes_in_sequence`).

    From http://en.wikipedia.org/wiki/UTF-8
    bits of code point ; bytes in sequence ; byte 1
    7 1 0xxxxxxx
    11 2 110xxxxx
    16 3 1110xxxx
    21 4 11110xxx
    26 5 111110xx
    31 6 1111110x
    '''
    # iter() is a no-op on an iterator and lets plain iterables be passed too
    byte_iter = iter(char_iter)
    # A for loop ends the generator cleanly on exhaustion, instead of relying
    # on StopIteration escaping from an explicit .next() call (PEP 479).
    for char in byte_iter:
        seq_length = bytes_in_sequence(ord(char))
        if seq_length > maxbytes:
            # don't presume that unicode characters have a max 32-bit (4-byte) length
            for _ in range(1, seq_length):
                # the default keeps a truncated trailing sequence from raising
                next(byte_iter, None)
        else:
            yield char
In [10]:
def simplify(text_unicode, maxbytes=3):
    '''
    Remove every character whose UTF-8 encoding is longer than `maxbytes`.

    Input should be unicode, output will be unicode,
    but it will translate to UTF-8 in the process.

    `maxbytes` defaults to 3, the longest sequence Amazon Mechanical Turk
    accepts according to the notes at the top of this file, so callers
    that omit it get the MTurk-safe subset.
    '''
    text_bytes = text_unicode.encode('utf-8')
    # bitfilter yields the surviving bytes one at a time; rejoin them before decoding
    filtered_bytes_iter = bitfilter(iter(text_bytes), maxbytes)
    return b''.join(filtered_bytes_iter).decode('utf-8')
In [6]:
# File read by the cells below, and a name for the filtered copy.
# NOTE(review): output_filepath is not referenced by any cell visible here.
input_filepath, output_filepath = (
    'fancy_emoji.txt',
    'impoverished_emoji.txt',
)
In [12]:
# Read the raw bytes inside a context manager so the handle is closed
# promptly, then decode them to unicode for simplify().
with open(input_filepath) as input_file:
    input_file_contents = input_file.read()
input_file_contents_u = input_file_contents.decode('utf8')
In [13]:
# MTurk excludes characters longer than 3 bytes (see the notes at the top),
# so keep only sequences of at most 3 bytes, then re-encode for writing.
simplified_contents_u = simplify(input_file_contents_u, 3)
simplified_contents = simplified_contents_u.encode('utf8')
In [115]:
# Write one CSV row per line of the simplified text, numbered from 1.
# NOTE(review): csv_filepath is not assigned in any cell visible here;
# confirm it is defined before this cell runs.
with open(csv_filepath, 'wb') as fp:
    writer = csv.DictWriter(fp, fieldnames=('dev-1k-id', 'text'))
    writer.writeheader()
    writer.writerows(
        {'dev-1k-id': index, 'text': text}
        for index, text in enumerate(simplified_contents.split('\n'), 1)
    )
In [14]:
def chars(s):
    '''
    Print every element of `s` on its own line as "<index> <repr>".
    '''
    for i, ch in enumerate(s):
        # A single parenthesized argument prints the same text under both
        # the Python 2 print statement and the Python 3 print function.
        print('%d %s' % (i, repr(ch)))
In [31]:
def string_report(s):
    '''
    Print how the unicode string `s` looks under each Unicode normalization
    form and after an encode/decode round trip through several codecs.

    One line is printed per form or codec: its name followed by the repr of
    the resulting string. Characters a codec cannot represent are dropped
    (the 'ignore' error handler) rather than raising.
    '''
    for form in ['NFC', 'NFKC', 'NFD', 'NFKD']:
        # the bare tuple expression was discarded; print it so the report is visible
        print('%s %s' % (form, repr(unicodedata.normalize(form, s))))
    for encoding in ['utf_16', 'utf_8', 'ascii', 'latin_1', 'utf_32', 'utf_8_sig']:
        # round-trip the argument itself (previously an undefined name, b4)
        round_tripped = s.encode(encoding, 'ignore').decode(encoding)
        print('%s %s' % (encoding, repr(round_tripped)))