In [ ]:
# Python 2 has two text types: `str` holds raw bytes, while `unicode` holds
# abstract code points with no byte encoding attached.
txt = 'This is a string of bytes'
uni = u'This is unicode'
# Show the concrete type of each literal on a single line.
print("%s %s" % (type(txt), type(uni)))
print("These are both subtypes of <type 'basestring'> in Python 2")

In [ ]:
#chardet #detect #text #encoding
import chardet
# NOTE(review): `chapters` is not defined in the visible cells -- presumably a
# list of byte strings loaded earlier; confirm before running on a fresh kernel.
print(chardet.detect(chapters[0]))

#unicode #encode #decode

# Unicode is the hub: decode() always goes TOWARDS unicode (bytes -> unicode),
# encode() always goes FROM unicode (unicode -> bytes).

#       [decode]      [encode]
# ASCII  --->  UNICODE  --->  UTF-8
# 1 glyph                    1 glyph
#   =          1 glyph          =
# 1 byte                     1-4 bytes

mystring = 'xxx'
unicode_str = mystring.decode('ascii')
utf8_str = unicode_str.encode('utf-8')

# The round trip above is redundant: ASCII is a subset of UTF-8, so ASCII-only
# bytes are already valid UTF-8 and the conversion always succeeds.

In [ ]:
#ascii
# ord() and chr() convert between a one-character string and its code point.
# Asserted rather than left as bare expressions, which display nothing when
# they are not the last line of a cell.
assert ord('a') == 97
assert chr(97) == 'a'


# UTF-8 *bytes* for o-acute (U+00F3): encoded data, not text.
# Every \x escape takes exactly two hex digits, so the sequence is \xc3\xb3
# ('\xc\xb3' is rejected as an invalid \x escape).
# The b prefix is a no-op in Python 2.6+ and documents that these are bytes.
utf8 = b'\xc3\xb3'
print(repr(utf8))  # the escaped byte values
print(utf8)        # Python 2 writes the raw bytes; a UTF-8 terminal renders the glyph

# bytes -> unicode
unic = utf8.decode('utf-8')
#to get back
newtf8 = unic.encode('utf-8')
# Decoding and re-encoding with the same codec is lossless.
assert newtf8 == utf8

# encodings worth knowing:
#ascii
#utf-8
#utf16
#latin-1 == iso-8859-1
#iso----

#open or save file as unicode
import codecs
# codecs.open decodes while reading (and encodes while writing), so the
# program only ever handles unicode objects.
# Assumes a UTF-8 encoded 'text.txt' exists in the working directory.
with codecs.open('text.txt', 'r', encoding='utf-8') as F:
    file_text = F.read()

# Storage needed per character:
# ascii  = 7 bits, 128 code points
# utf 32 = 4 bytes (32 bits), fixed width, space-inefficient
# utf 16 = 2 or 4 bytes, variable width

In [ ]:


In [ ]:


In [ ]:


In [4]:
import re
# Python 2 byte string: as the recorded repr of `x` shows, the literal is
# stored as the UTF-8 bytes of the typed characters, not as unicode text.
x = 'ÆÐELRIC'

In [2]:
# Bare expression: the notebook displays the repr, i.e. the escaped UTF-8 bytes.
x


Out[2]:
'\xc3\x86\xc3\x90ELRIC'

In [27]:
# bytes -> unicode: interpret the bytes of `x` as UTF-8 to get a unicode object
y= x.decode('utf-8')

In [28]:
# Bare expression: the repr of the unicode object shows one escape per character.
y


Out[28]:
u'\xc6\xd0ELRIC'

In [23]:
# print shows the glyphs themselves instead of the escaped repr
print y


ÆÐELRIC

In [24]:
import re

In [26]:
# The pattern is decoded to unicode first so that it is compared with the
# characters of the unicode string `y` rather than with raw UTF-8 bytes.
print(re.sub('Æ'.decode('UTF-8'), '', y))


ÐELRIC

In [ ]:
# encode: unicode -> str (bytes); decode: str (bytes) -> unicode

In [1]:
# (accented character, ASCII replacement) pairs, grouped by base letter.
# Not exhaustive: e.g. the Polish letters ć, ł, ń, ś, ź, ż are still missing.
# Every pattern is a distinct single character and every replacement is plain
# ASCII, so the order of the pairs does not affect the result.
accents = [
    # A
    ('À', 'A'), ('Á', 'A'), ('Â', 'A'), ('Ã', 'A'), ('Ä', 'A'), ('Å', 'A'), ('Ą', 'A'),
    ('à', 'a'), ('á', 'a'), ('â', 'a'), ('ã', 'a'), ('ä', 'a'), ('å', 'a'), ('ą', 'a'),
    # C
    ('Ç', 'C'), ('Č', 'C'), ('ç', 'c'), ('č', 'c'),
    # D
    ('Ď', 'D'), ('ď', 'd'),
    # E
    ('È', 'E'), ('É', 'E'), ('Ê', 'E'), ('Ë', 'E'), ('Ę', 'E'), ('Ě', 'E'),
    ('è', 'e'), ('é', 'e'), ('ê', 'e'), ('ë', 'e'), ('ę', 'e'), ('ě', 'e'),
    # I
    ('Ì', 'I'), ('Í', 'I'), ('Î', 'I'), ('Ï', 'I'),
    ('ì', 'i'), ('í', 'i'), ('î', 'i'), ('ï', 'i'),
    # L
    ('Ĺ', 'L'), ('ĺ', 'l'),
    # N
    ('Ñ', 'N'), ('Ň', 'N'), ('ñ', 'n'), ('ň', 'n'),
    # O (including the Hungarian double acute)
    ('Ò', 'O'), ('Ó', 'O'), ('Ô', 'O'), ('Õ', 'O'), ('Ö', 'O'), ('Ø', 'O'), ('Ő', 'O'),
    ('ò', 'o'), ('ó', 'o'), ('ô', 'o'), ('õ', 'o'), ('ö', 'o'), ('ø', 'o'), ('ő', 'o'),
    # R
    ('Ŕ', 'R'), ('Ř', 'R'), ('ŕ', 'r'), ('ř', 'r'),
    # S
    ('Š', 'S'), ('š', 's'),
    # T
    ('Ť', 'T'), ('ť', 't'),
    # U (including the Hungarian double acute)
    ('Ù', 'U'), ('Ú', 'U'), ('Û', 'U'), ('Ü', 'U'), ('Ů', 'U'), ('Ű', 'U'),
    ('ù', 'u'), ('ú', 'u'), ('û', 'u'), ('ü', 'u'), ('ů', 'u'), ('ű', 'u'),
    # Y
    ('Ý', 'Y'), ('Ÿ', 'Y'), ('ý', 'y'), ('ÿ', 'y'),
    # Z
    ('Ž', 'Z'), ('ž', 'z'),
    # ligatures and sharp s expand to two letters
    ('Æ', 'AE'), ('æ', 'ae'), ('Œ', 'Oe'), ('œ', 'oe'), ('ß', 'ss'),
]

# remove common #accents / #diacritics

def remove_accents(txt):
    """Return `txt` with every character listed in `accents` replaced by its
    ASCII equivalent.

    Parameters
    ----------
    txt : str or unicode
        Text to clean. Under Python 2 a byte string is expected to be UTF-8
        encoded, matching the encoding of the literals in `accents`.

    Returns
    -------
    Same type as `txt`
        The text with the listed characters replaced; characters that are not
        in `accents` are left untouched.
    """
    pairs = accents
    text_type = type(u'')  # `unicode` on Python 2, `str` on Python 3
    if accents and isinstance(txt, text_type) and not isinstance(accents[0][0], text_type):
        # Python 2 only: the table holds UTF-8 byte strings, which never match
        # the characters of a unicode text, so decode the table first.
        pairs = [(accented.decode('utf-8'), plain.decode('utf-8'))
                 for accented, plain in accents]
    for accented, plain in pairs:
        # Fixed strings: plain replace is enough, no regular expression needed.
        txt = txt.replace(accented, plain)
    return txt
        
# Quick visual check on two sample words
examples = ['Résumé', 'encyclopædia']
for example in examples:
    print(remove_accents(example))
    
# to change a file
# NOTE: absolute, machine-specific paths -- adjust before running elsewhere.
accent_change_path_in = "C:/Users/David/Documents/IPython Notebooks/GitHub/Baby_names_US_IPython/lists/bible in prog.txt"
accent_change_path_out = "C:/Users/David/Documents/IPython Notebooks/GitHub/Baby_names_US_IPython/lists/bible in prog_noacc.txt"
# `with` closes each handle (and flushes the output) even if an error occurs;
# bare open(...).read() / open(...).write() calls never close the files.
with open(accent_change_path_in, 'r') as source_handle:
    in_file = source_handle.read()    # whole input file as one string
out_file = remove_accents(in_file)    # same text with the accents replaced
with open(accent_change_path_out, 'w+') as target_handle:
    target_handle.write(out_file)

In [5]:
# Apply every replacement pair from `accents` to `x` via the shared helper,
# rebind `x` to the cleaned text and show it.
x = remove_accents(x)
print(x)


AeÐELRIC
Unicode UTF-8 HTML Letter Upper Lower Description U+00C1 \xc3\x81 Á Á A a LATIN CAPITAL LETTER A WITH ACUTE U+00C2 \xc3\x82 Â Â A a LATIN CAPITAL LETTER A WITH CIRCUMFLEX U+00C3 \xc3\x83 Ã Ã A a LATIN CAPITAL LETTER A WITH TILDE U+00C4 \xc3\x84 Ä Ä A a LATIN CAPITAL LETTER A WITH DIAERESIS U+00C5 \xc3\x85 Å Å A a LATIN CAPITAL LETTER A WITH RING ABOVE U+00C6 \xc3\x86 Æ Æ AE ae LATIN CAPITAL LETTER AE U+00C7 \xc3\x87 Ç Ç C c LATIN CAPITAL LETTER C WITH CEDILLA U+00C8 \xc3\x88 È È E e LATIN CAPITAL LETTER E WITH GRAVE U+00C9 \xc3\x89 É É E e LATIN CAPITAL LETTER E WITH ACUTE U+00CA \xc3\x8a Ê Ê E e LATIN CAPITAL LETTER E WITH CIRCUMFLEX U+00CB \xc3\x8b Ë Ë E e LATIN CAPITAL LETTER E WITH DIAERESIS U+00CC \xc3\x8c Ì Ì I i LATIN CAPITAL LETTER I WITH GRAVE U+00CD \xc3\x8d Í Í I i LATIN CAPITAL LETTER I WITH ACUTE U+00CE \xc3\x8e Î Î I i LATIN CAPITAL LETTER I WITH CIRCUMFLEX U+00CF \xc3\x8f Ï Ï I i LATIN CAPITAL LETTER I WITH DIAERESIS U+00D0 \xc3\x90 Ð Ð TH th LATIN CAPITAL LETTER ETH U+00D1 \xc3\x91 Ñ Ñ N n LATIN CAPITAL LETTER N WITH TILDE U+00D2 \xc3\x92 Ò Ò O o LATIN CAPITAL LETTER O WITH GRAVE U+00D3 \xc3\x93 Ó Ó O o LATIN CAPITAL LETTER O WITH ACUTE U+00D4 \xc3\x94 Ô Ô O o LATIN CAPITAL LETTER O WITH CIRCUMFLEX U+00D5 \xc3\x95 Õ Õ O o LATIN CAPITAL LETTER O WITH TILDE U+00D6 \xc3\x96 Ö Ö O o LATIN CAPITAL LETTER O WITH DIAERESIS U+00D8 \xc3\x98 Ø Ø O o LATIN CAPITAL LETTER O WITH STROKE U+00D9 \xc3\x99 Ù Ù U u LATIN CAPITAL LETTER U WITH GRAVE U+00DA \xc3\x9a Ú Ú U u LATIN CAPITAL LETTER U WITH ACUTE U+00DB \xc3\x9b Û Û U u LATIN CAPITAL LETTER U WITH CIRCUMFLEX U+00DC \xc3\x9c Ü Ü U u LATIN CAPITAL LETTER U WITH DIAERESIS U+00DD \xc3\x9d Ý Ý Y y LATIN CAPITAL LETTER Y WITH ACUTE U+00DE \xc3\x9e Þ Þ TH th LATIN CAPITAL LETTER THORN U+00DF \xc3\x9f ß ß SS ss LATIN SMALL LETTER SHARP S U+00E0 \xc3\xa0 à à a a LATIN SMALL LETTER A WITH GRAVE U+00E1 \xc3\xa1 á á a a LATIN SMALL LETTER A WITH ACUTE U+00E2 \xc3\xa2 â â a a LATIN SMALL LETTER A WITH CIRCUMFLEX U+00E3 
\xc3\xa3 ã ã a a LATIN SMALL LETTER A WITH TILDE U+00E4 \xc3\xa4 ä ä a a LATIN SMALL LETTER A WITH DIAERESIS U+00E5 \xc3\xa5 å å a a LATIN SMALL LETTER A WITH RING ABOVE U+00E6 \xc3\xa6 æ æ ae ae LATIN SMALL LETTER AE U+00E7 \xc3\xa7 ç ç c c LATIN SMALL LETTER C WITH CEDILLA U+00E8 \xc3\xa8 è è e e LATIN SMALL LETTER E WITH GRAVE U+00E9 \xc3\xa9 é é e e LATIN SMALL LETTER E WITH ACUTE U+00EA \xc3\xaa ê ê e e LATIN SMALL LETTER E WITH CIRCUMFLEX U+00EB \xc3\xab ë ë e e LATIN SMALL LETTER E WITH DIAERESIS U+00EC \xc3\xac ì ì i i LATIN SMALL LETTER I WITH GRAVE U+00ED \xc3\xad í í i i LATIN SMALL LETTER I WITH ACUTE U+00EE \xc3\xae î î i i LATIN SMALL LETTER I WITH CIRCUMFLEX U+00EF \xc3\xaf ï ï i i LATIN SMALL LETTER I WITH DIAERESIS U+00F0 \xc3\xb0 ð ð th th LATIN SMALL LETTER ETH U+00F1 \xc3\xb1 ñ ñ n n LATIN SMALL LETTER N WITH TILDE U+00F2 \xc3\xb2 ò ò o o LATIN SMALL LETTER O WITH GRAVE U+00F3 \xc3\xb3 ó ó o o LATIN SMALL LETTER O WITH ACUTE U+00F4 \xc3\xb4 ô ô o o LATIN SMALL LETTER O WITH CIRCUMFLEX U+00F5 \xc3\xb5 õ õ o o LATIN SMALL LETTER O WITH TILDE U+00F6 \xc3\xb6 ö ö o o LATIN SMALL LETTER O WITH DIAERESIS U+00F8 \xc3\xb8 ø ø o o LATIN SMALL LETTER O WITH STROKE U+00F9 \xc3\xb9 ù ù u u LATIN SMALL LETTER U WITH GRAVE U+00FA \xc3\xba ú ú u u LATIN SMALL LETTER U WITH ACUTE U+00FB \xc3\xbb û û u u LATIN SMALL LETTER U WITH CIRCUMFLEX U+00FC \xc3\xbc ü ü u u LATIN SMALL LETTER U WITH DIAERESIS U+00FD \xc3\xbd ý ý y y LATIN SMALL LETTER Y WITH ACUTE U+00FE \xc3\xbe þ þ th th LATIN SMALL LETTER THORN U+00FF \xc3\xbf ÿ ÿ y y LATIN SMALL LETTER Y WITH DIAERESIS
LATIN EXTENDED A Unicode character UTF-8 Equiv U+0100 Ā \xc4\x80 A U+0101 ā \xc4\x81 a U+0102 Ă \xc4\x82 A U+0103 ă \xc4\x83 a U+0104 Ą \xc4\x84 A U+0105 ą \xc4\x85 a U+0106 Ć \xc4\x86 C U+0107 ć \xc4\x87 c U+0108 Ĉ \xc4\x88 C U+0109 ĉ \xc4\x89 c U+010A Ċ \xc4\x8a C U+010B ċ \xc4\x8b c U+010C Č \xc4\x8c C U+010D č \xc4\x8d c U+010E Ď \xc4\x8e D U+010F ď \xc4\x8f d U+0110 Đ \xc4\x90 D U+0111 đ \xc4\x91 d U+0112 Ē \xc4\x92 E U+0113 ē \xc4\x93 e U+0114 Ĕ \xc4\x94 E U+0115 ĕ \xc4\x95 e U+0116 Ė \xc4\x96 E U+0117 ė \xc4\x97 e U+0118 Ę \xc4\x98 E U+0119 ę \xc4\x99 e U+011A Ě \xc4\x9a E U+011B ě \xc4\x9b e U+011C Ĝ \xc4\x9c G U+011D ĝ \xc4\x9d g U+011E Ğ \xc4\x9e G U+011F ğ \xc4\x9f g U+0120 Ġ \xc4\xa0 G U+0121 ġ \xc4\xa1 g U+0122 Ģ \xc4\xa2 G U+0123 ģ \xc4\xa3 g U+0124 Ĥ \xc4\xa4 H U+0125 ĥ \xc4\xa5 h U+0126 Ħ \xc4\xa6 H U+0127 ħ \xc4\xa7 h U+0128 Ĩ \xc4\xa8 I U+0129 ĩ \xc4\xa9 i U+012A Ī \xc4\xaa I U+012B ī \xc4\xab i U+012C Ĭ \xc4\xac I U+012D ĭ \xc4\xad i U+012E Į \xc4\xae I U+012F į \xc4\xaf i U+0130 İ \xc4\xb0 I U+0131 ı \xc4\xb1 i U+0132 IJ \xc4\xb2 IJ U+0133 ij \xc4\xb3 ij U+0134 Ĵ \xc4\xb4 J U+0135 ĵ \xc4\xb5 j U+0136 Ķ \xc4\xb6 K U+0137 ķ \xc4\xb7 k U+0138 ĸ \xc4\xb8 k U+0139 Ĺ \xc4\xb9 L U+013A ĺ \xc4\xba l U+013B Ļ \xc4\xbb L U+013C ļ \xc4\xbc l U+013D Ľ \xc4\xbd L U+013E ľ \xc4\xbe l U+013F Ŀ \xc4\xbf L U+0140 ŀ \xc5\x80 l U+0141 Ł \xc5\x81 L U+0142 ł \xc5\x82 l U+0143 Ń \xc5\x83 N U+0144 ń \xc5\x84 n U+0145 Ņ \xc5\x85 N U+0146 ņ \xc5\x86 n U+0147 Ň \xc5\x87 N U+0148 ň \xc5\x88 n U+0149 ʼn \xc5\x89 n U+014A Ŋ \xc5\x8a N U+014B ŋ \xc5\x8b n U+014C Ō \xc5\x8c O U+014D ō \xc5\x8d o U+014E Ŏ \xc5\x8e O U+014F ŏ \xc5\x8f o U+0150 Ő \xc5\x90 O U+0151 ő \xc5\x91 o U+0152 Œ \xc5\x92 OE U+0153 œ \xc5\x93 oe U+0154 Ŕ \xc5\x94 R U+0155 ŕ \xc5\x95 r U+0156 Ŗ \xc5\x96 R U+0157 ŗ \xc5\x97 r U+0158 Ř \xc5\x98 R U+0159 ř \xc5\x99 r U+015A Ś \xc5\x9a S U+015B ś \xc5\x9b s U+015C Ŝ \xc5\x9c S U+015D ŝ \xc5\x9d s U+015E Ş \xc5\x9e S U+015F ş \xc5\x9f s U+0160 Š \xc5\xa0 S 
U+0161 š \xc5\xa1 s U+0162 Ţ \xc5\xa2 T U+0163 ţ \xc5\xa3 t U+0164 Ť \xc5\xa4 T U+0165 ť \xc5\xa5 t U+0166 Ŧ \xc5\xa6 T U+0167 ŧ \xc5\xa7 t U+0168 Ũ \xc5\xa8 U U+0169 ũ \xc5\xa9 u U+016A Ū \xc5\xaa U U+016B ū \xc5\xab u U+016C Ŭ \xc5\xac U U+016D ŭ \xc5\xad u U+016E Ů \xc5\xae U U+016F ů \xc5\xaf u U+0170 Ű \xc5\xb0 U U+0171 ű \xc5\xb1 u U+0172 Ų \xc5\xb2 U U+0173 ų \xc5\xb3 u U+0174 Ŵ \xc5\xb4 W U+0175 ŵ \xc5\xb5 w U+0176 Ŷ \xc5\xb6 Y U+0177 ŷ \xc5\xb7 y U+0178 Ÿ \xc5\xb8 Y U+0179 Ź \xc5\xb9 Z U+017A ź \xc5\xba z U+017B Ż \xc5\xbb Z U+017C ż \xc5\xbc z U+017D Ž \xc5\xbd Z U+017E ž \xc5\xbe z U+017F ſ \xc5\xbf U+0180 ƀ \xc6\x80 b U+0181 Ɓ \xc6\x81 B U+0182 Ƃ \xc6\x82 b U+0183 ƃ \xc6\x83 b U+0184 Ƅ \xc6\x84 b U+0185 ƅ \xc6\x85 b U+0186 Ɔ \xc6\x86 C U+0187 Ƈ \xc6\x87 C U+0188 ƈ \xc6\x88 c U+0189 Ɖ \xc6\x89 D U+018A Ɗ \xc6\x8a D U+018B Ƌ \xc6\x8b d U+018C ƌ \xc6\x8c d U+018D ƍ \xc6\x8d d U+018E Ǝ \xc6\x8e E U+018F Ə \xc6\x8f e U+0190 Ɛ \xc6\x90 e U+0191 Ƒ \xc6\x91 F U+0192 ƒ \xc6\x92 f U+0193 Ɠ \xc6\x93 G U+0194 Ɣ \xc6\x94 g U+0195 ƕ \xc6\x95 h U+0196 Ɩ \xc6\x96 i U+0197 Ɨ \xc6\x97 i U+0198 Ƙ \xc6\x98 K U+0199 ƙ \xc6\x99 k U+019A ƚ \xc6\x9a l U+019B ƛ \xc6\x9b l U+019C Ɯ \xc6\x9c M U+019D Ɲ \xc6\x9d N U+019E ƞ \xc6\x9e n U+019F Ɵ \xc6\x9f TH U+01A0 Ơ \xc6\xa0 O U+01A1 ơ \xc6\xa1 o U+01A2 Ƣ \xc6\xa2 O U+01A3 ƣ \xc6\xa3 o U+01A4 Ƥ \xc6\xa4 O U+01A5 ƥ \xc6\xa5 o U+01A6 Ʀ \xc6\xa6 R U+01A7 Ƨ \xc6\xa7 S U+01A8 ƨ \xc6\xa8 s U+01A9 Ʃ \xc6\xa9 S U+01AA ƪ \xc6\xaa s U+01AB ƫ \xc6\xab t U+01AC Ƭ \xc6\xac T U+01AD ƭ \xc6\xad t U+01AE Ʈ \xc6\xae T U+01AF Ư \xc6\xaf U U+01B0 ư \xc6\xb0 u U+01B1 Ʊ \xc6\xb1 U U+01B2 Ʋ \xc6\xb2 u U+01B3 Ƴ \xc6\xb3 Y U+01B4 ƴ \xc6\xb4 y U+01B5 Ƶ \xc6\xb5 Z U+01B6 ƶ \xc6\xb6 z U+01B7 Ʒ \xc6\xb7 z U+01B8 Ƹ \xc6\xb8 z U+01B9 ƹ \xc6\xb9 z U+01BA ƺ \xc6\xba z U+01BB ƻ \xc6\xbb 2 U+01BC Ƽ \xc6\xbc 5 U+01BD ƽ \xc6\xbd 5 U+01BE ƾ \xc6\xbe ? U+01BF ƿ \xc6\xbf th U+01C0 ǀ \xc7\x80 | U+01C1 ǁ \xc7\x81 || U+01C2 ǂ \xc7\x82 | U+01C3 ǃ \xc7\x83 ! 
U+01C4 DŽ \xc7\x84 DZ U+01C5 Dž \xc7\x85 Dz U+01C6 dž \xc7\x86 dz U+01C7 LJ \xc7\x87 LJ U+01C8 Lj \xc7\x88 Lj U+01C9 lj \xc7\x89 lj U+01CA NJ \xc7\x8a NJ U+01CB Nj \xc7\x8b Nj U+01CC nj \xc7\x8c nj U+01CD Ǎ \xc7\x8d A U+01CE ǎ \xc7\x8e a U+01CF Ǐ \xc7\x8f I U+01D0 ǐ \xc7\x90 i U+01D1 Ǒ \xc7\x91 O U+01D2 ǒ \xc7\x92 o U+01D3 Ǔ \xc7\x93 U U+01D4 ǔ \xc7\x94 u U+01D5 Ǖ \xc7\x95 U U+01D6 ǖ \xc7\x96 u U+01D7 Ǘ \xc7\x97 U U+01D8 ǘ \xc7\x98 u U+01D9 Ǚ \xc7\x99 U U+01DA ǚ \xc7\x9a u U+01DB Ǜ \xc7\x9b U U+01DC ǜ \xc7\x9c u U+01DD ǝ \xc7\x9d e U+01DE Ǟ \xc7\x9e A U+01DF ǟ \xc7\x9f a U+01E0 Ǡ \xc7\xa0 A U+01E1 ǡ \xc7\xa1 a U+01E2 Ǣ \xc7\xa2 AE U+01E3 ǣ \xc7\xa3 ae U+01E4 Ǥ \xc7\xa4 G U+01E5 ǥ \xc7\xa5 g U+01E6 Ǧ \xc7\xa6 G U+01E7 ǧ \xc7\xa7 g U+01E8 Ǩ \xc7\xa8 K U+01E9 ǩ \xc7\xa9 k U+01EA Ǫ \xc7\xaa Q U+01EB ǫ \xc7\xab q U+01EC Ǭ \xc7\xac Q U+01ED ǭ \xc7\xad q U+01EE Ǯ \xc7\xae z U+01EF ǯ \xc7\xaf z U+01F0 ǰ \xc7\xb0 j U+01F1 DZ \xc7\xb1 DZ U+01F2 Dz \xc7\xb2 Dz U+01F3 dz \xc7\xb3 dz U+01F4 Ǵ \xc7\xb4 G U+01F5 ǵ \xc7\xb5 g U+01F6 Ƕ \xc7\xb6 H U+01F7 Ƿ \xc7\xb7 TH U+01F8 Ǹ \xc7\xb8 N U+01F9 ǹ \xc7\xb9 n U+01FA Ǻ \xc7\xba A U+01FB ǻ \xc7\xbb a U+01FC Ǽ \xc7\xbc AE U+01FD ǽ \xc7\xbd ae U+01FE Ǿ \xc7\xbe O U+01FF ǿ \xc7\xbf o