In [5]:
import itertools
import os
import re
import string
In [ ]:
# Paths of the raw tweet dumps; keep only the plain-text (.txt) files.
# List comprehensions materialize real lists (under Python 3 the original
# map/filter objects are lazy and would not print their contents), and the
# parenthesized print() is valid in both Python 2 and 3.
tweet_files = ['../tweets/' + name for name in os.listdir('../tweets')]
tweet_files = [path for path in tweet_files if path.endswith('txt')]
print(tweet_files)
In [6]:
# Character whitelist used to validate cleaned tweets: printable ASCII
# minus uppercase letters, whitespace and the backtick, with a single
# space and newline explicitly re-admitted.
alphabet = set(string.printable) - set(string.ascii_uppercase) - set(string.whitespace) - {'`'}
alphabet = sorted(alphabet | {' ', '\n'})
# Formatted print() works under both Python 2 and 3; the original
# `print x, y` statement is Python-2-only and produces the same output.
print('%s %d' % (alphabet, len(alphabet)))
In [ ]:
# Collapse any run of two or more whitespace characters.
multiple_spaces = re.compile(r'\s\s+')
# Lower-case named HTML entities such as &amp; or &gt;.
html_entity = re.compile(r'&[a-z]+;')
# Map each recognized entity to an ASCII replacement.
# NOTE(review): in the original source these keys had been rendered to
# their decoded characters (e.g. '>' instead of '&gt;'), which could
# never match the &[a-z]+; pattern above and even broke the syntax
# ('&apos;' had become a bare ''').  The entity names are reconstructed
# here from their replacement values — confirm against the raw data.
html_replacements = {
    '&gt;'     : '>',
    '&lt;'     : '<',
    '&nbsp;'   : ' ',
    '&amp;'    : '&',
    '&quot;'   : '"',
    '&lsquo;'  : "'",
    '&rsquo;'  : "'",
    '&apos;'   : "'",
    '&laquo;'  : '"',
    '&raquo;'  : '"',
    '&mdash;'  : '-',
    '&ndash;'  : '-',
    '&reg;'    : '(r)',
    '&copy;'   : '(c)',
    '&pound;'  : 'pound',
    '&euro;'   : 'euro',
    '&bull;'   : '*',
    '&middot;' : '.',
    '&ldquo;'  : '"',
    '&rdquo;'  : '"',
    '&hellip;' : '...',
    '&grave;'  : "'",
    '&br;'     : ' ',
    '&times;'  : 'x',
    '&larr;'   : '<-',
    '&rarr;'   : '->',
    '&minus;'  : '-',
    '&cent;'   : 'cent'
}
# Entities seen in the data but missing from html_replacements.
bad = set()
def load_tweets(f, out):
    """Clean one raw tweet dump and append the surviving tweets to out.

    f   -- path of a raw tweet file; only lines starting with 'W' (the
           tweet-text records) are considered.
    out -- writable file-like object receiving one cleaned tweet per line.

    A tweet is discarded when it is the placeholder 'No Post Title',
    contains an HTML entity with no known replacement, or still contains
    a character outside the module-level `alphabet` after cleaning.
    Unknown entities are collected in the module-level `bad` set.
    Prints a per-file summary of used/discarded counts.
    """
    fname = f
    discarded, used = 0, 0
    # `with` guarantees the file is closed even if cleaning raises; the
    # original opened it bare (rebinding `f`, shadowing the path) and
    # leaked the handle on any exception.
    with open(f) as infile:
        for line in infile:
            # Select only the actual text of the tweet.
            if line[0] != 'W':
                continue
            line = line[1:].strip()
            # Skip empty posts.
            if line == 'No Post Title':
                discarded += 1
                continue
            # Decode HTML entities; discard the tweet on the first
            # entity we have no replacement for.  (set.add is already
            # idempotent, so no membership check is needed first.)
            unrecognized = False
            for entity in set(re.findall(html_entity, line)):
                if entity not in html_replacements:
                    bad.add(entity)
                    unrecognized = True
                    break
                line = line.replace(entity, html_replacements[entity])
            if unrecognized:
                discarded += 1
                continue
            line = line.lower()
            line = re.sub(multiple_spaces, ' ', line)
            line = line.replace('`', "'")
            # Reject anything that still falls outside the whitelist.
            if not all(c in alphabet for c in line):
                discarded += 1
                continue
            out.write(line + '\n')
            used += 1
    print("%s: Used %d tweets. Discarded %d tweets." % (fname, used, discarded))
def load_all_tweets(fs, out=None):
    """Clean every tweet file in fs, writing results via load_tweets.

    NOTE(review): the original body,
        list(itertools.chain.from_iterable(map(load_tweets, fs))),
    could never run: load_tweets takes two arguments but map supplied
    one (TypeError), and load_tweets returns None, which
    chain.from_iterable cannot iterate.  This helper also appears
    unused -- the script below drives load_tweets directly.  `out` is
    the writable destination forwarded to load_tweets.
    """
    for f in fs:
        load_tweets(f, out)
# Clean every tweet dump into a single output file.  `with` ensures the
# output is flushed and closed even if a dump fails mid-way; the
# original also bound load_tweets' None return to an unused variable.
with open('twitter_cleaned.txt', 'w') as clean:
    for f in tweet_files:
        load_tweets(f, clean)
print('Untranslated HTML entities: ' + str(list(bad)))
In [ ]: