In [1]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

In [3]:
# Preview the first 16 entries of NLTK's English stopword list.
stopwords.words('english')[:16]


Out[3]:
['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his']

In [9]:
# Stopword-removal demo: tokenize a short paragraph, then drop every token
# that appears in NLTK's English stopword list.
# Source text: https://en.wikipedia.org/wiki/Cadet_Nurse_Corps
para = "The program was open to all women between the ages of 17 and 35, in good health, who had graduated from an accredited high school. Successful applicants were eligible for a government subsidy, paying for tuition, books, uniforms, and a stipend. In exchange, they were required to pledge to actively serve in essential civilian or federal government services for the duration of World War II. All state nursing schools were eligible to participate in the program. However, they needed to be accredited by the accrediting agency in their state, and connected with a hospital that had been approved by the American College of Surgeons."
words = word_tokenize(para)
print(words)

# Fetch the stopword list once and keep it as a set. Evaluating
# stopwords.words('english') inside the comprehension condition would call
# into the corpus reader again for every token and scan a list each time.
english_stopwords = set(stopwords.words('english'))

# NOTE: the membership test is case-sensitive, so capitalized stopwords
# (e.g. 'The', 'In') are kept. Compare word.lower() instead if
# sentence-initial stopwords should be removed as well.
useful_words = [word for word in words if word not in english_stopwords]
print(useful_words)


['The', 'program', 'was', 'open', 'to', 'all', 'women', 'between', 'the', 'ages', 'of', '17', 'and', '35', ',', 'in', 'good', 'health', ',', 'who', 'had', 'graduated', 'from', 'an', 'accredited', 'high', 'school', '.', 'Successful', 'applicants', 'were', 'eligible', 'for', 'a', 'government', 'subsidy', ',', 'paying', 'for', 'tuition', ',', 'books', ',', 'uniforms', ',', 'and', 'a', 'stipend', '.', 'In', 'exchange', ',', 'they', 'were', 'required', 'to', 'pledge', 'to', 'actively', 'serve', 'in', 'essential', 'civilian', 'or', 'federal', 'government', 'services', 'for', 'the', 'duration', 'of', 'World', 'War', 'II', '.', 'All', 'state', 'nursing', 'schools', 'were', 'eligible', 'to', 'participate', 'in', 'the', 'program', '.', 'However', ',', 'they', 'needed', 'to', 'be', 'accredited', 'by', 'the', 'accrediting', 'agency', 'in', 'their', 'state', ',', 'and', 'connected', 'with', 'a', 'hospital', 'that', 'had', 'been', 'approved', 'by', 'the', 'American', 'College', 'of', 'Surgeons', '.']
['The', 'program', 'open', 'women', 'ages', '17', '35', ',', 'good', 'health', ',', 'graduated', 'accredited', 'high', 'school', '.', 'Successful', 'applicants', 'eligible', 'government', 'subsidy', ',', 'paying', 'tuition', ',', 'books', ',', 'uniforms', ',', 'stipend', '.', 'In', 'exchange', ',', 'required', 'pledge', 'actively', 'serve', 'essential', 'civilian', 'federal', 'government', 'services', 'duration', 'World', 'War', 'II', '.', 'All', 'state', 'nursing', 'schools', 'eligible', 'participate', 'program', '.', 'However', ',', 'needed', 'accredited', 'accrediting', 'agency', 'state', ',', 'connected', 'hospital', 'approved', 'American', 'College', 'Surgeons', '.']

In [10]:
# Peek at the tokens of the movie_reviews corpus; the displayed repr is
# truncated to the first few tokens.
movie_reviews.words()


Out[10]:
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]

In [11]:
# List the category labels the corpus is organised under.
movie_reviews.categories()


Out[11]:
['neg', 'pos']

In [12]:
# Show the first four file identifiers of the corpus.
movie_reviews.fileids()[:4]


Out[12]:
['neg/cv000_29416.txt',
 'neg/cv001_19502.txt',
 'neg/cv002_17424.txt',
 'neg/cv003_12683.txt']

In [16]:
# Tokens returned by the corpus reader when no file or category is given.
all_words = movie_reviews.words()

# Count how many times each distinct token occurs. Nothing is filtered or
# normalised first, so punctuation and stopwords are counted too.
freq_dist = nltk.FreqDist(all_words)

# Display the 20 most frequent tokens together with their counts.
freq_dist.most_common(20)


Out[16]:
[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822),
 ('s', 18513),
 ('"', 17612),
 ('it', 16107),
 ('that', 15924),
 ('-', 15595),
 (')', 11781),
 ('(', 11664),
 ('as', 11378),
 ('with', 10792),
 ('for', 9961)]

In [ ]: