Let's take a sentence from Marcel Proust's "Swann's Way" and tokenize the sentence into discrete words.
In [22]:
import nltk

# Passage to analyse. The literal is a plain triple-quoted string: it does not
# begin with a quotation-mark character, so the tokenizer is not handed an
# unbalanced opening quote that has no closing partner in the text.
sentence = """The thirst for something other than what we have…to bring something new, even if it is worse, some emotion, some sorrow; when our sensibility, which happiness has silenced like an idle harp, wants to resonate under some hand, even a rough one, and even if it might be broken by it."""

# Break the passage into tokens.
# NOTE(review): word_tokenize presumably needs NLTK's tokenizer data to be
# installed locally — confirm the environment has it before a fresh run.
tokens = nltk.word_tokenize(sentence)
In [23]:
# Display the result of nltk.word_tokenize(sentence) as the cell output.
tokens
Out[23]:
In [24]:
# Run NLTK's part-of-speech tagger over the token list.
tagged = nltk.pos_tag(tokens)
# Call NLTK's tagset help so the tag abbreviations can be looked up.
# NOTE(review): presumably this prints the full tag reference, which is a lot
# of output for one cell — confirm that is intended.
nltk.help.upenn_tagset()
In [20]:
# Display the result of nltk.pos_tag(tokens) as the cell output.
tagged
Out[20]:
In [31]:
from nltk.corpus import treebank
# Take the first parsed sentence from the treebank corpus file 'wsj_0001.mrg'.
t = treebank.parsed_sents('wsj_0001.mrg')[0]
# Display it as the cell output.
t
Out[31]:
Sentiment Analysis
In [7]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
In [8]:
# Upper bound on how many sentences are taken from each corpus category.
n_instances = 100

# Both labelled samples follow the same recipe — the first `n_instances`
# sentences of a category, each paired with that category's name — so they are
# built from one expression, 'subj' first and 'obj' second.
subj_docs, obj_docs = (
    [(sent, label) for sent in subjectivity.sents(categories=label)[:n_instances]]
    for label in ('subj', 'obj')
)

# Cell output: the size of each sample.
(len(subj_docs), len(obj_docs))
Out[8]:
In [9]:
# Inspect the first subjective example: a (sentence, 'subj') pair.
subj_docs[0]
Out[9]:
In [34]:
# Inspect the first objective example: a (sentence, 'obj') pair.
obj_docs[0]
Out[34]:
In [10]:
# Split each class into a leading training portion and a trailing test portion.
# The boundary is derived from n_instances rather than hard-coded as 80/100, so
# the split keeps tracking the sample size if n_instances is changed. With
# n_instances = 100 this is exactly the 80/20 split [:80] / [80:100].
train_fraction = 0.8
n_train = int(n_instances * train_fraction)

train_subj_docs = subj_docs[:n_train]
test_subj_docs = subj_docs[n_train:n_instances]
train_obj_docs = obj_docs[:n_train]
test_obj_docs = obj_docs[n_train:n_instances]

# Training and test sets each hold the subjective examples followed by the
# objective ones.
training_docs = train_subj_docs + train_obj_docs
testing_docs = test_subj_docs + test_obj_docs

sentim_analyzer = SentimentAnalyzer()
# Pool the words of all training documents after passing each document through
# mark_negation (presumably supplied by the star import from
# nltk.sentiment.util — verify). Only training documents are used here; the
# test documents are left out of this step.
all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
In [11]:
# Select unigram features from the pooled training words, passing min_freq=4
# as the frequency threshold.
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
# Cell output: how many unigram features were selected.
len(unigram_feats)
Out[11]:
In [12]:
# Register extract_unigram_feats on the analyzer, handing it the selected
# unigrams through the `unigrams` keyword.
# NOTE(review): re-running this cell presumably registers the extractor a
# second time — confirm, and restart the kernel if so.
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
In [13]:
# Convert the training and test documents into feature sets using the
# analyzer's apply_features.
training_set = sentim_analyzer.apply_features(training_docs)
test_set = sentim_analyzer.apply_features(testing_docs)
In [14]:
# The training routine is passed as a callable: NaiveBayesClassifier.train is
# referenced here, not called.
trainer = NaiveBayesClassifier.train
# Have the analyzer train with that routine on the training feature set and
# keep the returned classifier.
classifier = sentim_analyzer.train(trainer, training_set)
In [15]:
# Evaluate on the test feature set and print one "name: value" line per entry
# of the returned mapping, sorted by name. The print call is indented so that
# it forms the loop body.
for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
    print('{0}: {1}'.format(key, value))