In [ ]:
# Python 2 has two text types: `str` holds already-encoded bytes, while
# `unicode` holds abstract characters that carry no encoding.
txt = 'This is a string of bytes'  # plain literal -> byte string (str)
uni = u'This is unicode'  # u-prefixed literal -> unicode object
# Show the concrete type of each literal.
print type(txt), type(uni)
print "These are both subtypes of <type 'basestring'> in Python 2"
In [ ]:
#chardet #detect #text #encoding
# Ask chardet to guess the character encoding of a piece of text.
import chardet
# NOTE(review): `chapters` is not defined in this part of the notebook; it is
# presumably a sequence of byte strings loaded in another cell -- confirm.
print chardet.detect(chapters[0])
#unicode #encode #decode
# Encoding or decoding is always FROM unicode or TOWARDS unicode:
# decode() turns encoded bytes into unicode, encode() turns unicode into bytes.
#
#            [decode]          [encode]
#   ASCII     --->    UNICODE    --->    UTF-8
#  1 Glyph                              1 Glyph
#     =               1 Glyph              =
#  1 Byte                               1-4 Bytes
mystring = 'xxx'  # ASCII-only byte string
unicode_str = mystring.decode('ascii')  # bytes -> unicode
utf8_str = unicode_str.encode('utf-8')  # unicode -> UTF-8 bytes
# Note that this example is redundant: ASCII is a subset of UTF-8, so the
# round trip always works.
In [ ]:
#ascii
# ord() and chr() are inverses: character <-> code point.
ord('a') == 97
chr(97) == 'a'
# The UTF-8 encoding of o-acute (U+00F3) is the two bytes C3 B3. These are
# encoded bytes, not text. The previous literal spelled the first escape with
# a single hex digit, which the parser rejects; every \x escape needs exactly
# two. The b prefix (Python 2.6+) makes the "bytes" intent explicit and is a
# no-op for a Python 2 str literal.
utf8 = b'\xc3\xb3'
utf8
# Parenthesised single-argument print behaves the same as the print statement
# under Python 2.
print(utf8)
# bytes -> unicode
unic = utf8.decode('utf-8')
#to get back
newtf8 = unic.encode('utf-8')
#ascii
#utf-8
#utf16
#latin-1 == iso-8859-1
#iso----
#open or save file as unicode
import codecs
# codecs.open decodes on read (and encodes on write) with the given encoding,
# so the data handled inside the block is unicode rather than raw bytes.
# The original cell left the with-statement without a body; reading the file
# is the minimal body that demonstrates the point.
with codecs.open('text.txt', 'r', encoding='utf-8') as F:
    contents = F.read()
# Encoding sizes:
#   ascii  = 7 bits per character, 128 code points
#   utf-32 = 4 bytes (32 bits) per character; fixed width, space-inefficient
#   utf-16 = 2 or 4 bytes per character (variable width)
In [ ]:
In [ ]:
In [ ]:
In [4]:
import re
# Non-ASCII literal without a u prefix: under Python 2 this is a byte string
# (it is decoded as UTF-8 in a later cell).
x = 'ÆÐELRIC'
In [2]:
x
Out[2]:
In [27]:
# Decode the bytes in x as UTF-8 to obtain a unicode object.
y= x.decode('utf-8')
In [28]:
y
Out[28]:
In [23]:
print y
In [24]:
import re
In [26]:
# Remove the ligature from y. The pattern literal is decoded first so that
# pattern and subject are both unicode.
print re.sub('Æ'.decode('UTF-8'), '', y)
In [ ]:
# encode a unicode to str, decode a str to unicode
In [1]:
# Replacement table for common accented Latin characters, stored as
# (pattern, replacement) pairs that are applied one after another.
# this is not complete; notably missing are Hungarian diacritics
#
# Fixes relative to the earlier table:
#   * every U-with-diacritic now maps to U/u (they were mapped to 'r')
#   * capital O-slash maps to capital 'O' (was lowercase 'o')
#   * duplicate a-umlaut entries removed
#   * capital D-caron and T-caron added next to their lowercase forms
accents = [(r'Ą', r'A'), (r'ą', r'a'), (r'Č', r'C'), (r'č', r'c'), (r'Ď', r'D'), (r'ď', r'd'),
           (r'Ę', r'E'), (r'ę', r'e'),
           (r'Ě', r'E'), (r'ě', r'e'), (r'Ĺ', r'L'), (r'ĺ', r'l'), (r'Ň', r'N'), (r'ň', r'n'), (r'Ŕ', r'R'),
           (r'ŕ', r'r'), (r'Ř', r'R'), (r'ř', r'r'), (r'Ť', r'T'), (r'ť', r't'), (r'Ů', r'U'), (r'ů', r'u'),
           (r'Ž', r'Z'),
           (r'ž', r'z'), (r'Á', r'A'), (r'á', r'a'), (r'Â', r'A'), (r'â', r'a'), (r'Ø', r'O'), (r'õ', r'o'),
           (r'À', r'A'), (r'à', r'a'), (r'Ä', r'A'), (r'ä', r'a'), (r'Ç', r'C'), (r'ç', r'c'), (r'É', r'E'),
           (r'é', r'e'), (r'Ê', r'E'), (r'ê', r'e'), (r'È', r'E'), (r'è', r'e'), (r'Ë', r'E'), (r'ë', r'e'),
           (r'Í', r'I'), (r'í', r'i'), (r'Î', r'I'), (r'î', r'i'), (r'Ì', r'I'), (r'ì', r'i'), (r'Ï', r'I'),
           (r'ï', r'i'), (r'Ñ', r'N'), (r'ñ', r'n'), (r'Ó', r'O'), (r'ó', r'o'), (r'Ô', r'O'), (r'ô', r'o'),
           (r'Ò', r'O'), (r'ò', r'o'), (r'Ö', r'O'), (r'ö', r'o'), (r'ø', r'o'), (r'Õ', r'O'),
           (r'Ú', r'U'), (r'ú', r'u'), (r'Û', r'U'), (r'û', r'u'), (r'Ù', r'U'), (r'ù', r'u'), (r'Ü', r'U'),
           (r'ü', r'u'), (r'Ý', r'Y'), (r'ý', r'y'), (r'Š', r'S'), (r'š', r's'), (r'ÿ', r'y'), (r'Ÿ', r'Y'),
           (r'Å', r'A'), (r'å', r'a'), (r'Ã', r'A'), (r'ã', r'a'),
           (r'Æ', r'AE'), (r'æ', r'ae'), (r'Œ', r'Oe'), (r'œ', r'oe'), (r'ß', r'ss')]
# remove common #accents / #diacritics
# list is incomplete; notably missing are Hungarian diacritics
def remove_accents(txt, pairs=None):
    """Return ``txt`` with accented characters replaced by plain ones.

    Args:
        txt: Text to clean up.
        pairs: Optional iterable of ``(pattern, replacement)`` tuples that
            are applied in order with ``re.sub``. When omitted, the
            module-level ``accents`` table is used (looked up at call time).

    Returns:
        The text after every substitution has been applied.
    """
    # Kept as a local import so the function does not depend on a
    # module-level ``import re`` having been executed first.
    import re
    if pairs is None:
        pairs = accents
    for pattern, replacement in pairs:
        txt = re.sub(pattern, replacement, txt)
    return txt
examples = ['Résumé', 'encyclopædia']
for example in examples:
print remove_accents(example)
# to change a file
# Read the whole input file, strip accents, and write the result to a second
# file. Both files are opened with `with` so their handles are closed
# deterministically instead of being left to the garbage collector.
accent_change_path_in = "C:/Users/David/Documents/IPython Notebooks/GitHub/Baby_names_US_IPython/lists/bible in prog.txt"
accent_change_path_out = "C:/Users/David/Documents/IPython Notebooks/GitHub/Baby_names_US_IPython/lists/bible in prog_noacc.txt"
with open(accent_change_path_in, 'r') as fh_in:
    in_file = fh_in.read()
out_file = remove_accents(in_file)
with open(accent_change_path_out, 'w+') as fh_out:
    fh_out.write(out_file)
In [5]:
for accent in accents:
x = re.sub(accent[0], accent[1], x)
print x