In [5]:
import itertools
import os
import re
import string

In [ ]:
# Paths of the raw tweet dumps to clean: every entry of ../tweets whose name
# ends in 'txt'. os.listdir() returns names in arbitrary order, so the listing
# is sorted to keep the order of the cleaned output reproducible between runs.
tweet_files = ['../tweets/' + name
               for name in sorted(os.listdir('../tweets'))
               if name.endswith('txt')]
print(tweet_files)

In [6]:
# Characters a cleaned tweet may contain: every printable ASCII character
# except uppercase letters, the backtick and whitespace, plus the plain space
# and the newline, which are added back explicitly. Kept as a sorted list.
alphabet = sorted(
    (set(string.printable)
     - set(string.ascii_uppercase + string.whitespace + '`'))
    | set(' \n')
)
print('%s %d' % (alphabet, len(alphabet)))


['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~'] 69

In [ ]:
# A run of two or more whitespace characters.
multiple_spaces = re.compile(r'\s\s+')

# A named HTML entity written with lowercase letters only, e.g. '&amp;'.
# Numeric entities such as '&#39;' are not matched by this pattern.
html_entity = re.compile(r'&[a-z]+;')

# ASCII stand-in for each HTML entity that is translated, grouped by theme.
html_replacements = dict([
    # Markup characters and spacing.
    ('&amp;', '&'),
    ('&lt;', '<'),
    ('&gt;', '>'),
    ('&nbsp;', ' '),
    ('&br;', ' '),
    # Quotation marks.
    ('&quot;', '"'),
    ('&ldquo;', '"'),
    ('&rdquo;', '"'),
    ('&laquo;', '"'),
    ('&raquo;', '"'),
    ('&lsquo;', "'"),
    ('&rsquo;', "'"),
    ('&apos;', "'"),
    ('&grave;', "'"),
    # Dashes, dots and bullets.
    ('&mdash;', '-'),
    ('&ndash;', '-'),
    ('&minus;', '-'),
    ('&hellip;', '...'),
    ('&middot;', '.'),
    ('&bull;', '*'),
    # Symbols, currencies and arrows.
    ('&times;', 'x'),
    ('&reg;', '(r)'),
    ('&copy;', '(c)'),
    ('&pound;', 'pound'),
    ('&euro;', 'euro'),
    ('&cent;', 'cent'),
    ('&larr;', '<-'),
    ('&rarr;', '->'),
])

# Entities found in the input that html_replacements has no entry for.
bad = set()

def load_tweets(f, out):
    """Clean the tweets found in one dump file and write them to ``out``.

    Only lines starting with 'W' are read as tweet text. A tweet is discarded
    when it is the placeholder 'No Post Title', when it contains an HTML
    entity missing from ``html_replacements`` (every such entity is added to
    the module-level ``bad`` set), or when, after cleaning, it still contains
    a character outside ``alphabet``. Cleaning decodes the known HTML
    entities, lowercases the text, collapses runs of two or more whitespace
    characters into one space and turns backticks into apostrophes.

    Args:
        f: path of the tweet file to read.
        out: object with a ``write`` method; receives one cleaned tweet per
            call, terminated by a newline.

    Returns:
        None. A one-line summary of used/discarded counts is printed.
    """
    # Set lookup instead of scanning the alphabet list for every character.
    allowed = set(alphabet)
    used, discarded = 0, 0

    def decode(match):
        # Translate one matched entity through the replacement table.
        return html_replacements[match.group(0)]

    # The with-block closes the input file even if cleaning raises.
    with open(f) as source:
        for line in source:
            # Select only the actual text of the tweet.
            if not line.startswith('W'):
                continue
            line = line[1:].strip()
            # Skip empty posts.
            if line == 'No Post Title':
                discarded += 1
                continue
            # Reject tweets with entities we cannot translate, recording all
            # of them rather than a single arbitrary one.
            unknown = [entity for entity in set(html_entity.findall(line))
                       if entity not in html_replacements]
            if unknown:
                bad.update(unknown)
                discarded += 1
                continue
            # Decode in a single pass so that text produced by one
            # replacement (e.g. '&amp;lt;' -> '&lt;') is never decoded again,
            # whatever order the entities appear in.
            line = html_entity.sub(decode, line)
            line = line.lower()
            line = multiple_spaces.sub(' ', line)
            line = line.replace('`', "'")
            if not all(c in allowed for c in line):
                discarded += 1
                continue
            out.write(line + '\n')
            used += 1
    print("%s: Used %d tweets. Discarded %d tweets." % (f, used, discarded))

def load_all_tweets(fs):
    """Clean every tweet file in ``fs`` and return all cleaned tweets.

    load_tweets() takes an output object and returns nothing, so its writes
    are gathered in an in-memory sink and split back into individual tweets.

    Args:
        fs: iterable of tweet file paths.

    Returns:
        A flat list with one string per cleaned tweet, in the order the files
        were given, without the trailing newline.
    """
    class _Collector(object):
        # Minimal write-only stand-in for an output file.
        def __init__(self):
            self.chunks = []

        def write(self, text):
            self.chunks.append(text)

    sink = _Collector()
    for f in fs:
        load_tweets(f, sink)
    # Every tweet is written newline-terminated, so the final split element
    # is always the empty string after the last newline; drop it.
    return ''.join(sink.chunks).split('\n')[:-1]

# Clean every tweet file into one combined output file. The with-block closes
# the file even if load_tweets() raises part-way through.
with open('twitter_cleaned.txt', 'w') as clean:
    for f in tweet_files:
        load_tweets(f, clean)

# Entities met in the input that html_replacements has no translation for;
# sorted so the report is stable between runs.
print('Untranslated HTML entities: ' + str(sorted(bad)))

In [ ]: