In [1]:
import nltk
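Depending on which NLTK data packages are already installed, the resources used in this notebook may need a one-time download. A setup sketch (exact resource names can vary slightly between NLTK releases):

nltk.download('punkt')                       # sentence and word tokenizers
nltk.download('stopwords')                   # stop word lists
nltk.download('wordnet')                     # WordNet, used by the lemmatizer
nltk.download('omw-1.4')                     # may be needed by the lemmatizer on newer NLTK releases
nltk.download('averaged_perceptron_tagger')  # POS tagger
nltk.download('tagsets')                     # nltk.help.upenn_tagset()
nltk.download('vader_lexicon')               # VADER sentiment lexicon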
In [2]:
with open('book.txt', 'r') as file:
    text = file.readlines()
Let's take a smaller chunk from the text:
In [3]:
# using a generator expression to simplify iterating over the text structure
snippet = " ".join(block.strip() for block in text[175:200])
In [4]:
snippet
Out[4]:
In [5]:
# alternative with for-loop
other_snippet = []
for block in text[175:200]:
    other_snippet.append(block.strip())
other_snippet = " ".join(other_snippet)
In [6]:
other_snippet
Out[6]:
In [7]:
whole_text = " ".join(block.strip() for block in text)
In [8]:
whole_text[5000:7500]
Out[8]:
In [9]:
from nltk.tokenize import sent_tokenize, word_tokenize
In [10]:
str.split?
In [11]:
# you can try to separate sentences by splitting on punctuation
snippet.split('.')
Out[11]:
In [12]:
# The sentence tokenizer has some clever tricks to do a better job
sent_tokenize(snippet)
Out[12]:
In [13]:
# splitting a text into tokens based on white space
snippet.split()
Out[13]:
In [14]:
words = word_tokenize(snippet)
In [15]:
# word_tokenize treats punctuation marks as separate tokens
words
Out[15]:
In [16]:
# let's plot the frequency of occurrence of different words
nltk.FreqDist?
In [17]:
fdist = nltk.FreqDist(words)
In [18]:
fdist.plot(30)
Stop words are words that you want to filter out from your text for downstream analysis. They are typically very common words which don't contain much useful information for the task at hand. There is no universal set of stop words and some domain knowledge is helpful for deciding what you want to include when processing your text.
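As a minimal preview of the filtering pattern used in the next few cells (the example sentence and variable names here are illustrative only), note that membership tests are much faster against a set than a list:

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_set = set(stopwords.words('english'))   # set lookup is O(1) per token
tokens = word_tokenize("Sherlock Holmes and Dr Watson examined the room carefully")
content_words = [t.lower() for t in tokens if t.lower() not in stop_set]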
In [19]:
from nltk.corpus import stopwords
In [20]:
stops = stopwords.words('english')
In [21]:
stops
Out[21]:
In [22]:
filtered_words = [word.lower() for word in words if word.lower() not in stops]
In [23]:
filtered_words
Out[23]:
In [24]:
filtered_fdist = nltk.FreqDist(filtered_words)
In [25]:
filtered_fdist.plot(30)
In [26]:
import string
In [27]:
string.punctuation
Out[27]:
In [28]:
stops = stopwords.words('english') + list(string.punctuation)
In [29]:
stops
Out[29]:
In [30]:
filtered_words = [word.lower() for word in words if word.lower() not in stops]
In [31]:
filtered_fdist2 = nltk.FreqDist(filtered_words)
In [32]:
filtered_fdist2.plot(30)
In [33]:
def process_text(text):
    # break text into word tokens
    tokens = word_tokenize(text)
    # remove stopwords
    filtered_words = [token.lower() for token in tokens if token.lower() not in stops]
    # drop any remaining short tokens (stray punctuation and one- or two-letter words)
    filtered_words = [w for w in filtered_words if len(w) > 2]
    return filtered_words
In [34]:
whole_text[:110]
Out[34]:
In [35]:
len(whole_text)
Out[35]:
In [36]:
%%time
clean_text = process_text(whole_text)
In [37]:
fdist_whole_text = nltk.FreqDist(clean_text)
In [38]:
fdist_whole_text.plot(25)
In [39]:
boring_words = ['sir', 'upon', 'said', 'one']
stops += boring_words
In [40]:
%%time
cleaned_text = process_text(whole_text)
In [41]:
fdist_whole_text['holmes']
Out[41]:
In [42]:
fdist_whole_text['watson']
Out[42]:
In [43]:
from nltk.stem import PorterStemmer
In [44]:
help(nltk.stem)
In [45]:
ps = PorterStemmer()
In [46]:
print(ps.stem('Happy'))
print(ps.stem('Happiness'))
print(ps.stem('Had'))
print(ps.stem('Fishing'))
print(ps.stem('Fish'))
print(ps.stem('Fisher'))
print(ps.stem('Fishes'))
print(ps.stem('Fished'))
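Stemmers chop off suffixes with hand-written rules, so the stems are not always dictionary words. NLTK also ships the Snowball ('Porter2') and Lancaster stemmers; a quick side-by-side sketch for comparison (ps is the PorterStemmer created above):

from nltk.stem import SnowballStemmer, LancasterStemmer

snowball = SnowballStemmer('english')
lancaster = LancasterStemmer()
for w in ['Happiness', 'Fishing', 'Fisher', 'Fishes']:
    print(w, ps.stem(w), snowball.stem(w), lancaster.stem(w))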
In [47]:
words = process_text(snippet)
In [48]:
stemmed = [ps.stem(word) for word in words]
In [49]:
for w, stem in zip(words, stemmed):
    print('{} ---> {}'.format(w, stem))
In [50]:
def stem_process(text):
    # tokenize
    tokens = word_tokenize(text)
    # remove stops
    filtered_words = [token.lower() for token in tokens if token.lower() not in stops]
    filtered_words = [w for w in filtered_words if len(w) > 2]
    # stem
    stemmed_words = [ps.stem(w) for w in filtered_words]
    return stemmed_words
In [51]:
%%time
stemmed = stem_process(whole_text)
In [52]:
stemmed
Out[52]:
In [53]:
fdist_stems = nltk.FreqDist(stemmed)
In [54]:
fdist_stems.plot(30)
In [55]:
from nltk.stem import WordNetLemmatizer
In [56]:
lemmatizer = WordNetLemmatizer()
In [57]:
lemmatizer.lemmatize?
In [58]:
print(lemmatizer.lemmatize('having'))
print(lemmatizer.lemmatize('have'))
print(lemmatizer.lemmatize('had'))
print()
print(lemmatizer.lemmatize('fishing'))
print(lemmatizer.lemmatize('fish'))
print(lemmatizer.lemmatize('fisher'))
print(lemmatizer.lemmatize('fishes'))
print(lemmatizer.lemmatize('fished'))
print()
print(lemmatizer.lemmatize('am'))
print(lemmatizer.lemmatize('is'))
print(lemmatizer.lemmatize('was'))
In [59]:
# including POS for the lemmatizer can improve its output
print(lemmatizer.lemmatize('having', pos='v'))
print(lemmatizer.lemmatize('have', pos='v'))
print(lemmatizer.lemmatize('had', pos='v'))
print()
print(lemmatizer.lemmatize('fishing', pos='v'))
print(lemmatizer.lemmatize('fish', pos='v'))
print(lemmatizer.lemmatize('fisher', pos='n'))
print(lemmatizer.lemmatize('fishes', pos='v'))
print(lemmatizer.lemmatize('fished', pos='v'))
print()
print(lemmatizer.lemmatize('am', pos='v'))
print(lemmatizer.lemmatize('is', pos='v'))
print(lemmatizer.lemmatize('was', pos='v'))
In [60]:
lemmatized = [lemmatizer.lemmatize(word) for word in words]
In [61]:
for w, lemma in zip(words, lemmatized):
    print('{} ---> {}'.format(w, lemma))
In [62]:
lemmatizer.lemmatize('running', pos='v')
Out[62]:
In [63]:
def lemma_process(text):
    # tokenize
    tokens = word_tokenize(text)
    # remove stops
    filtered_words = [token.lower() for token in tokens if token.lower() not in stops]
    filtered_words = [w for w in filtered_words if len(w) > 2]
    # lemmatize
    lemmatized_words = [lemmatizer.lemmatize(w) for w in filtered_words]
    return lemmatized_words
In [64]:
%%time
lemma_text = lemma_process(whole_text)
In [65]:
lemma_fdist = nltk.FreqDist(lemma_text)
lemma_fdist.plot(30)
Part-of-speech (POS) tagging is the process of marking up a word in a text as corresponding to a particular part of speech, based on both its definition and its context.
POS tagging is tricky because some words can have more than one POS depending on the context.
"Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo."
In [66]:
nltk.help.upenn_tagset()
In [67]:
snippet
Out[67]:
In [68]:
nltk.pos_tag(word_tokenize(sent_tokenize(snippet)[1]))
Out[68]:
In [69]:
def process_POS(text):
    sentences = sent_tokenize(text)
    tagged_words = []
    for sentence in sentences:
        words = word_tokenize(sentence)
        tagged = nltk.pos_tag(words)
        tagged_words.append(tagged)
    return tagged_words
In [70]:
tagged_sentences = process_POS(snippet)
In [71]:
tagged_sentences
Out[71]:
In [72]:
sentences = []
for sentence in tagged_sentences[:5]:
    print(sentence)
    lemmas = []
    for word, pos in sentence:
        if pos == 'VBP':
            lemmas.append(lemmatizer.lemmatize(word, 'v'))
        elif pos in ['NN', 'NNS']:
            lemmas.append(lemmatizer.lemmatize(word, 'n'))
        else:
            lemmas.append(lemmatizer.lemmatize(word))
    sentences.append(lemmas)
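Rather than branching on a handful of tags as above, a common pattern is to map the first letter of the Penn Treebank tag onto WordNet's coarse POS categories before lemmatizing. A sketch of that helper (penn_to_wordnet is a hypothetical name, not an NLTK function):

from nltk.corpus import wordnet

def penn_to_wordnet(tag):
    # map a Penn Treebank tag to a WordNet POS constant, defaulting to noun
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN

# lemmatizer, nltk and the tokenizers are already defined in earlier cells
lemmas = [lemmatizer.lemmatize(word, penn_to_wordnet(pos))
          for word, pos in nltk.pos_tag(word_tokenize(sent_tokenize(snippet)[1]))]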
In [73]:
from nltk import ngrams
from collections import Counter
In [74]:
bigrams = Counter(ngrams(word_tokenize(whole_text), 2))
In [75]:
for phrase, freq in bigrams.most_common(30):
    print("{}\t{}".format(phrase, freq))
In [76]:
trigrams = Counter(ngrams(word_tokenize(whole_text), 3))
In [77]:
for phrase, freq in trigrams.most_common(30):
    print("{}\t{}".format(phrase, freq))
In [78]:
stemmed = stem_process(whole_text)
In [79]:
stemmed_bigrams = Counter(ngrams(stemmed, 2))
In [80]:
stemmed_bigrams.most_common(20)
Out[80]:
In [81]:
stemmed_trigrams = Counter(ngrams(stemmed, 3))
In [82]:
stemmed_trigrams.most_common(20)
Out[82]:
In [83]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
data_corpus = ["John likes to watch movies. Mary likes movies too.",
"John also likes to watch football games."]
X = vectorizer.fit_transform(data_corpus)
print(X.toarray())
print(vectorizer.get_feature_names())
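The raw counts can also be replaced with TF-IDF weights, which down-weight words that occur in every document; a sketch using the same toy corpus defined above:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(data_corpus)
print(X_tfidf.toarray().round(2))
print(tfidf.get_feature_names_out())  # use get_feature_names() on scikit-learn versions older than 1.0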
In [84]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
In [85]:
vader = SentimentIntensityAnalyzer()
In [86]:
text = "I dont hate movies!"
In [87]:
vader.polarity_scores(text)
Out[87]:
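polarity_scores returns negative/neutral/positive proportions plus a normalized compound score in [-1, 1]. A couple more examples for comparison, reusing the analyzer created above:

print(vader.polarity_scores("I love this movie!"))
print(vader.polarity_scores("This movie was terrible."))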
In [ ]: