In [13]:
import re
from nltk.tokenize import word_tokenize
from string import punctuation 
from nltk.corpus import stopwords 


class PreProcessTweets:
    def __init__(self):
        self._stopwords=set(stopwords.words('english')+list(punctuation)+['AT_USER','URL'])
        
    def processTweets(self,list_of_tweets):
        # The list of tweets is a list of dictionaries, each with the keys "text" and "label"
        processedTweets=[]
        # This list will be a list of tuples. Each tuple is a tweet which is a list of words and its label
        for tweet in list_of_tweets:
            processedTweets.append((self._processTweet(tweet["text"]),tweet["label"]))
        return processedTweets
    
    def _processTweet(self,tweet):
        # 1. Convert to lower case
        tweet=tweet.lower()
        # 2. Replace links with the word URL 
        tweet=re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))','URL',tweet)
        # 3. Replace @username with "AT_USER"
        tweet=re.sub(r'@[^\s]+','AT_USER',tweet)
        # 4. Replace #word with word 
        tweet=re.sub(r'#([^\s]+)',r'\1',tweet)
        # You can do further cleanup as well if you like, e.g. replace
        # repetitions of characters, changing "huuuuungry" to "hungry".
        # We'll leave that as an exercise (one regex sketch follows the cell output below)
        tweet=word_tokenize(tweet)
        # This tokenizes the tweet into a list of words 
        # Let's now return this list minus any stopwords 
        return [word for word in tweet if word not in self._stopwords]
    
tweetProcessor=PreProcessTweets()
ppTrainingData=tweetProcessor.processTweets(trainingData)
ppTestData=tweetProcessor.processTweets(testData)


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-13-62c0db54c2df> in <module>()
     35 
     36 tweetProcessor=PreProcessTweets()
---> 37 ppTrainingData=tweetProcessor.processTweets(trainingData)
     38 ppTestData=tweetProcessor.processTweets(testData)

<ipython-input-13-62c0db54c2df> in processTweets(self, list_of_tweets)
     14         # This list will be a list of tuples. Each tuple is a tweet which is a list of words and its label
     15         for tweet in list_of_tweets:
---> 16             processedTweets.append((self._processTweet(tweet["text"]),tweet["label"]))
     17         return processedTweets
     18 

TypeError: string indices must be integers
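
The TypeError above means that each tweet reaching processTweets is a plain string rather than a dictionary, so tweet["text"] ends up indexing a string with a string key. The class expects a list of dictionaries carrying "text" and "label" keys, as the comment in processTweets notes, so check how trainingData and testData are built in the earlier cells. A minimal sketch of the expected shape, with hypothetical sample tweets and labels:

# Hypothetical example data; the real trainingData/testData come from earlier cells
trainingData = [
    {"text": "I love this phone, the battery lasts all day", "label": "positive"},
    {"text": "worst customer service I have ever had", "label": "negative"},
]

tweetProcessor = PreProcessTweets()
ppTrainingData = tweetProcessor.processTweets(trainingData)
# ppTrainingData is a list of (word_list, label) tuples, e.g.
# (['love', 'phone', 'battery', 'lasts', 'day'], 'positive')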
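
For the character-repetition cleanup left as an exercise in _processTweet, one possible sketch (an assumption, not part of the original pipeline) is a backreference regex that collapses a run of three or more identical characters to a single one:

import re

def collapse_repeats(tweet):
    # Collapse runs of 3+ identical characters, e.g. "huuuuungry" -> "hungry".
    # Legitimate double letters ("cool", "sleep") are untouched.
    return re.sub(r'(.)\1{2,}', r'\1', tweet)

A call like collapse_repeats(tweet) would slot in just before the word_tokenize step in _processTweet.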