In [30]:
import nltk
# Hand-labelled toy corpus: every entry is a (tweet text, sentiment label) pair.
pos_tweets = [(text, 'positive') for text in (
    'I love this car',
    'This view is amazing',
    'I feel great this morning',
    'I am so excited about the concert',
    'He is my best friend',
)]
neg_tweets = [(text, 'negative') for text in (
    'I do not like this car',
    'This view is horrible',
    'I feel tired this morning',
    'I am not looking forward to the concert',
    'He is my enemy',
)]
In [2]:
# Merge the positive and negative tweets into a single labelled list.
# Each text is split on whitespace; tokens shorter than three characters are
# dropped and the remaining tokens are lower-cased.
tweets = [
    ([token.lower() for token in text.split() if len(token) >= 3], label)
    for (text, label) in pos_tweets + neg_tweets
]
# Show the first two (tokens, sentiment) pairs as the cell output.
tweets[0:2]
Out[2]:
In [3]:
# Held-out evaluation set: (already-tokenised tweet, expected sentiment) pairs.
test_tweets = (
    [(tokens, 'positive') for tokens in (
        ['feel', 'happy', 'this', 'morning'],
        ['larry', 'friend'],
    )]
    + [(tokens, 'negative') for tokens in (
        ['not', 'like', 'that', 'man'],
        ['house', 'not', 'great'],
        ['your', 'song', 'annoying'],
    )]
)
In [4]:
def get_words_in_tweets(tweets):
    """Flatten the token lists of all tweets into one list.

    tweets: iterable of (tokens, sentiment) pairs; the sentiment is ignored.
    Returns every token in order of appearance, duplicates included.
    """
    return [token for (tokens, _label) in tweets for token in tokens]
def get_word_features(wordlist):
    """Return the distinct words of ``wordlist``.

    The words are the keys of an ``nltk.FreqDist`` built from ``wordlist``.
    """
    return nltk.FreqDist(wordlist).keys()
# Vocabulary used as classifier features: every distinct token that occurs in
# the training tweets.
word_features = get_word_features(
    get_words_in_tweets(tweets)
)
# Display the vocabulary as one space-separated string.
" ".join(word_features)
Out[4]:
In [5]:
def extract_features(document):
    """Turn a tokenised document into boolean presence features.

    document: iterable of tokens.
    Returns a dict with one 'contains(<word>)' key per entry of the global
    ``word_features``; the value is True when that word occurs in ``document``.
    """
    present = set(document)
    return dict(
        ('contains(%s)' % term, term in present)
        for term in word_features
    )
In [16]:
# NLTK ships a helper that applies the feature extractor to each labelled tweet.
training_set = nltk.classify.util.apply_features(
    extract_features, tweets
)
# Fit a Naive Bayes model on the resulting feature sets.
classifier = nltk.NaiveBayesClassifier.train(training_set)
In [17]:
def train(labeled_featuresets, estimator=nltk.probability.ELEProbDist):
    """Train a Naive Bayes classifier on labelled feature sets.

    labeled_featuresets: iterable of (featureset, label) pairs.
    estimator: probability-distribution class used to smooth the label and
        feature frequency counts (defaults to ``ELEProbDist``).
    Returns the trained ``nltk.NaiveBayesClassifier``.

    The work is delegated to ``NaiveBayesClassifier.train``, which builds the
    label and per-feature probability distributions from the frequency counts;
    a hand-written sketch here would need those counts to be computed first.
    """
    return nltk.NaiveBayesClassifier.train(labeled_featuresets, estimator)
In [18]:
# Classify an unseen sentence. The vocabulary was lower-cased when the training
# tweets were tokenised, and extract_features compares tokens exactly, so the
# sentence is lower-cased before splitting.
tweet_positive = 'Larry is my friend'
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(classifier.classify(extract_features(tweet_positive.lower().split())))
In [19]:
# Same sentence with a negation added (the variable name is kept from the
# previous cell even though the sentence is no longer positive).
# The sentence is lower-cased because the training vocabulary is lower-case
# and extract_features compares tokens exactly.
tweet_positive = 'Larry is not my friend'
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(classifier.classify(extract_features(tweet_positive.lower().split())))
In [20]:
def classify_tweet(tweet):
    """Return the label the global ``classifier`` assigns to ``tweet``.

    tweet: iterable of tokens; it is passed unchanged to ``extract_features``.
    """
    features = extract_features(tweet)
    return classifier.classify(features)
    # Disabled alternative kept from the original cell: nltk.word_tokenize(tweet)
# Score the classifier on test_tweets: start from a perfect score and subtract
# one for every tweet whose predicted label differs from the expected label.
total = accuracy = float(len(test_tweets))
for tweet in test_tweets:
    if classify_tweet(tweet[0]) != tweet[1]:
        accuracy -= 1
# Report the actual number of test tweets as the denominator rather than a
# fixed 20, which did not match the size of test_tweets.
print('Total accuracy: %f%% (%d/%d).' % (accuracy / total * 100, accuracy, total))
In [21]:
# List every top-level nltk attribute whose name contains 'Classifier'.
nltk_classifiers = dir(nltk)  # all attribute names; filtered in the loop below
for i in nltk_classifiers:
    if 'Classifier' in i:
        # Function-call form of print works on both Python 2 and Python 3.
        print(i)
In [25]:
from sklearn.ensemble import RandomForestClassifier
from nltk.classify.scikitlearn import SklearnClassifier
# Wrap a scikit-learn random forest so it can be trained on NLTK feature sets.
# A fixed random_state makes the forest, and therefore the predictions below,
# reproducible across runs.
classif = SklearnClassifier(RandomForestClassifier(random_state=42))
# NOTE: despite its name, svm_classifier holds the random-forest model; the
# name is kept because later cells refer to it.
svm_classifier = classif.train(training_set)
In [26]:
# Classify an unseen negative sentence with the scikit-learn backed model.
# The sentence is lower-cased because the training vocabulary is lower-case
# and extract_features compares tokens exactly.
tweet_negative2 = 'Your song is annoying'
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(svm_classifier.classify(extract_features(tweet_negative2.lower().split())))
In [31]:
import nltk
from nltk.classify.naivebayes import NaiveBayesClassifier
def get_words_in_tweets(tweets):
    """Collect the tokens of every tweet into a single flat list.

    tweets: iterable of (tokens, sentiment) pairs; only the tokens are used.
    Returns the tokens in order of appearance, duplicates included.
    """
    collected = []
    for tokens, _label in tweets:
        collected += tokens
    return collected
def get_word_features(wordlist):
    """Return the distinct words found in ``wordlist``.

    A frequency distribution is built with ``nltk.FreqDist`` and its keys
    (the distinct words) are returned.
    """
    frequencies = nltk.FreqDist(wordlist)
    return frequencies.keys()
def read_tweets(fname, t_type):
    """Read one tweet per line from ``fname`` and label each with ``t_type``.

    fname: path of a text file containing one tweet per line.
    t_type: sentiment label attached to every line (e.g. 'positive').
    Returns a list of ``[line, t_type]`` lists. Lines are returned exactly as
    read, including their trailing newline and any blank lines.
    """
    # The context manager closes the file even if reading raises part-way.
    with open(fname, 'r') as f:
        return [[line, t_type] for line in f]
def extract_features(document):
    """Build the presence-feature dict for a tokenised document.

    document: iterable of tokens.
    Returns a dict keyed 'contains(<word>)' for each word of the global
    ``word_features``; each value says whether the word is in ``document``.
    """
    seen = set(document)
    return {'contains(%s)' % term: (term in seen) for term in word_features}
def classify_tweet(tweet):
    """Return the label the global ``classifier`` assigns to a raw tweet.

    tweet: the tweet text as a single string.
    The text is tokenised with ``nltk.word_tokenize`` and each token is
    lower-cased. The training tweets in this cell are lower-cased before the
    vocabulary is built and ``extract_features`` compares tokens exactly, so a
    capitalised token would otherwise never match a feature.
    """
    tokens = [token.lower() for token in nltk.word_tokenize(tweet)]
    return classifier.classify(extract_features(tokens))
# Directory holding the tweet corpora. Edit this one constant to point the
# notebook at a different copy of the data.
DATA_DIR = '/Users/zhangyixin/Twitter-Sentimental-Analysis/'

# Load the labelled training tweets (one tweet per line in each file).
pos_tweets = read_tweets(DATA_DIR + 'happy.txt', 'positive')
neg_tweets = read_tweets(DATA_DIR + 'sad.txt', 'negative')

# Tokenise on whitespace, drop tokens shorter than three characters and
# lower-case the remaining ones.
tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    tweets.append((words_filtered, sentiment))

# Build the vocabulary and train the Naive Bayes classifier on it.
word_features = get_word_features(get_words_in_tweets(tweets))
training_set = nltk.classify.util.apply_features(extract_features, tweets)
classifier = NaiveBayesClassifier.train(training_set)

# Load the held-out tweets: positives first, then negatives.
test_tweets = read_tweets(DATA_DIR + 'happy_test.txt', 'positive')
test_tweets.extend(read_tweets(DATA_DIR + 'sad_test.txt', 'negative'))

# Start from a perfect score and subtract one per misclassified tweet.
total = accuracy = float(len(test_tweets))
for tweet in test_tweets:
    if classify_tweet(tweet[0]) != tweet[1]:
        accuracy -= 1
# Report the actual number of test tweets as the denominator rather than a
# fixed 20, which is only right when the test files hold exactly 20 lines.
print('Total accuracy: %f%% (%d/%d).' % (accuracy / total * 100, accuracy, total))
In [ ]: