In [13]:
import re
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
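# Note: word_tokenize needs NLTK's 'punkt' tokenizer models and stopwords needs
# the 'stopwords' corpus; if they're missing, run nltk.download('punkt') and
# nltk.download('stopwords') once before using this class.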
class PreProcessTweets:
    def __init__(self):
        self._stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER', 'URL'])

    def processTweets(self, list_of_tweets):
        # The list of tweets is a list of dictionaries, each with the keys "text" and "label"
        processedTweets = []
        # This will be a list of tuples. Each tuple is a tweet: a list of words plus its label
        for tweet in list_of_tweets:
            processedTweets.append((self._processTweet(tweet["text"]), tweet["label"]))
        return processedTweets

    def _processTweet(self, tweet):
        # 1. Convert to lower case
        tweet = tweet.lower()
        # 2. Replace links with the word URL
        tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
        # 3. Replace @username with "AT_USER"
        tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
        # 4. Replace #word with word
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
        # You can do further cleanup as well if you like, for example collapsing
        # repetitions of characters, so "huuuuungry" becomes "hungry".
        # We'll leave that as an exercise for you on regular expressions
        # (a possible sketch appears after this cell).
        # This tokenizes the tweet into a list of words
        tweet = word_tokenize(tweet)
        # Return this list minus any stopwords
        return [word for word in tweet if word not in self._stopwords]
tweetProcessor=PreProcessTweets()
ppTrainingData=tweetProcessor.processTweets(trainingData)
ppTestData=tweetProcessor.processTweets(testData)
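For a quick sanity check, here is a minimal sketch of the input format processTweets expects; the sample tweets and labels below are made up for illustration:

sampleData = [{"text": "I loooove this phone! http://example.com", "label": "positive"},
              {"text": "@support my order never arrived", "label": "negative"}]
ppSample = tweetProcessor.processTweets(sampleData)
# Each element is a (word_list, label) tuple with stopwords, punctuation,
# and the AT_USER/URL placeholders stripped out, e.g.:
# (['loooove', 'phone'], 'positive')
print(ppSample)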
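And one possible take on the character-repetition exercise mentioned in _processTweet. The regex here is an assumption, not part of the original pipeline, and it deliberately collapses runs of three or more identical characters down to two rather than one, so legitimate doubles like "hello" survive:

def collapseRepeats(text):
    # Hypothetical helper (not in the original code): shrink runs of 3+
    # identical characters to two, so "huuuuungry" becomes "huungry"
    return re.sub(r'(.)\1{2,}', r'\1\1', text)

print(collapseRepeats("huuuuungry"))  # huungry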