In [8]:
import re
from string import punctuation

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


class PreProcessTweets:
    def __init__(self):
        nltk.download('stopwords')
        nltk.download('punkt')  # word_tokenize needs the punkt tokenizer models
        self._stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER', 'URL'])

    def processTweets(self, list_of_tweets):
        # Each tweet is a dictionary with the keys "text" and "label"
        processedTweets = []
        for tweet in list_of_tweets:
            processedTweets.append((self._processTweet(tweet["text"]), tweet["label"]))
        return processedTweets

    def _processTweet(self, tweet):
        tweet = tweet.lower()
        # Replace URLs and @-mentions with placeholder tokens; strip '#' from hashtags
        tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
        tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
        tweet = word_tokenize(tweet)
        return [word for word in tweet if word not in self._stopwords]
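As a quick sanity check, the preprocessor can be run on a single made-up tweet (the input text here is hypothetical; only the class above is assumed). Note that the AT_USER and URL placeholders are themselves in the stopword set, so they are stripped from the final token list:

print(PreProcessTweets()._processTweet("Loving the new #NLTK release @nltk_org http://nltk.org"))
# -> ['loving', 'new', 'nltk', 'release']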
import csv

# The CSV is assumed to hold "text" and "label" columns (an assumption: the
# original passed the bare file path to processTweets, which expects a list
# of tweet dictionaries)
with open("/home/dennis/Desktop/sanders-twitter-0.2/tweetDataFile.csv") as f:
    trainingData = list(csv.DictReader(f))
tweetProcessor = PreProcessTweets()
ppTrainingData = tweetProcessor.processTweets(trainingData)
ppTestData = tweetProcessor.processTweets(testData)  # testData: held-out tweets, loaded the same way
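After preprocessing, each entry of ppTrainingData (and ppTestData) is a (token list, label) pair, for example:

# ppTrainingData[0] might be (['loving', 'new', 'nltk', 'release'], 'positive')
# (hypothetical tokens and label name; the actual values depend on the CSV contents)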
# Naive Bayes classifier
# First build a vocabulary: every distinct word that appears in the training tweets
def buildVocabulary(ppTrainingData):
    all_words = []
    for (words, sentiment) in ppTrainingData:
        all_words.extend(words)
    # FreqDist counts word frequencies; its keys are the distinct words
    wordlist = nltk.FreqDist(all_words)
    word_features = list(wordlist.keys())
    return word_features

# Represent a tweet as binary "contain(word)" features over the vocabulary
def extract_features(tweet):
    tweet_words = set(tweet)
    features = {}
    for word in word_features:
        features['contain(%s)' % word] = (word in tweet_words)
    return features

word_features = buildVocabulary(ppTrainingData)
trainingFeatures = nltk.classify.apply_features(extract_features, ppTrainingData)
NBayesClassifier = nltk.NaiveBayesClassifier.train(trainingFeatures)
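With the classifier trained, a minimal sketch of how it might be evaluated and applied follows. nltk.classify.accuracy, classify, and show_most_informative_features are standard NLTK calls; the single-tweet input is a hypothetical example:

# Evaluate on the preprocessed, labelled test set
testFeatures = nltk.classify.apply_features(extract_features, ppTestData)
print("accuracy:", nltk.classify.accuracy(NBayesClassifier, testFeatures))

# Classify one unseen tweet end-to-end (hypothetical input text)
newTweet = tweetProcessor._processTweet("I love this phone, the battery is great!")
print(NBayesClassifier.classify(extract_features(newTweet)))

# Inspect which contain(word) features carry the most weight
NBayesClassifier.show_most_informative_features(10)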