In [ ]:
%%HTML
<style>
.container { width:100% }
</style>
In [ ]:
import nltk
In [ ]:
# nltk.download()  # uncomment to open the interactive NLTK downloader (run once)
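As an alternative to the interactive downloader, only the resources used in this notebook can be fetched; the identifiers below are the standard NLTK package names (a convenience sketch that only needs to run once).
In [ ]:
# Download only the corpora and models used in this notebook
nltk.download('gutenberg')
nltk.download('brown')
nltk.download('punkt')
nltk.download('stopwords')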
In [ ]:
from nltk.corpus import gutenberg
Show the file identifiers of the books included in the Gutenberg corpus.
In [ ]:
gutenberg.fileids()
Retrieve the raw text of the book "Paradise Lost" by John Milton.
In [ ]:
print(gutenberg.raw('milton-paradise.txt'))
Retrieve the book "Paradise Lost" by John Milton as a list of sentences (each sentence is itself a list of words).
In [ ]:
print(gutenberg.sents('milton-paradise.txt'))
Retrieve the book "Paradise Lost" by John Milton as a list of words.
In [ ]:
print(gutenberg.words('milton-paradise.txt'))
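A quick sanity check on the size of the corpus (a minimal sketch using only the calls shown above): count the tokens and the distinct lower-cased word types.
In [ ]:
words = gutenberg.words('milton-paradise.txt')
# Number of tokens vs. number of distinct (lower-cased) word types
print(len(words), len(set(w.lower() for w in words)))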
In [ ]:
from nltk.corpus import brown
In [ ]:
# Force the lazy corpus reader to load the Brown corpus, then show its documentation
brown.ensure_loaded()
help(brown)
In [ ]:
brown.fileids()
In [ ]:
brown.categories()
In [ ]:
brown.sents(categories='editorial')
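Categories can also be compared; the sketch below counts the sentences in two of the Brown categories listed above (the chosen categories are just an example).
In [ ]:
# Number of sentences in two sample categories
for category in ('editorial', 'fiction'):
    print(category, len(brown.sents(categories=category)))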
In [ ]:
from nltk.tokenize import sent_tokenize
In [ ]:
generic_text = 'Lorem ipsum dolor sit amet, amet minim temporibus in sit. Vel ne impedit consequat intellegebat.'
The function sent_tokenize splits a string into a list of sentences.
In [ ]:
sent_tokenize(generic_text)
In [ ]:
english_text = 'Where is the closest train station? I need to reach London.'
sent_tokenize(english_text)
In [ ]:
spanish_text = '¿Dónde está la estación más cercana? Inmediatamente me tengo que ir a Barcelona.'
sent_tokenize(spanish_text, language='spanish')
In [ ]:
from nltk.tokenize import TreebankWordTokenizer
In [ ]:
simple_text = 'This is a simple text.'
tbwt = TreebankWordTokenizer()
tbwt.tokenize(simple_text)
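The Treebank tokenizer follows the Penn Treebank conventions, so contractions are split into two tokens. The cell below is a small illustrative sketch that anticipates the text used with the regular-expression tokenizer later on.
In [ ]:
# Contractions such as "isn't" are split into "is" + "n't"
tbwt.tokenize("This isn't a simple text.")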
In [ ]:
help(TreebankWordTokenizer)
In [ ]:
from nltk.tokenize import RegexpTokenizer
In [ ]:
complex_text = "This isn't a simple text."
In [ ]:
# Keep only sequences of letters and apostrophes
ret = RegexpTokenizer(r"[a-zA-Z']+")
In [ ]:
ret.tokenize(complex_text)
In [ ]:
complex_text = "This isn't a simple text. Count 1, 2, 3 and then go!"
In [ ]:
ret.tokenize(complex_text)
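The pattern above keeps only letters and apostrophes, so the digits are discarded. A hypothetical variant that also keeps the numbers is sketched below.
In [ ]:
# Alternative pattern (illustrative): also keep digit sequences
ret_digits = RegexpTokenizer(r"[a-zA-Z0-9']+")
ret_digits.tokenize(complex_text)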
In [ ]:
from nltk.corpus import stopwords
In [ ]:
sw = set(stopwords.words('english'))
sw
In [ ]:
len(sw)
In [ ]:
complex_text = "This isn't a simple text. Count 1, 2, 3 and then go!"
tokens = ret.tokenize(complex_text)
clean_tokens = [t for t in tokens if t not in sw]
clean_tokens
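The stop word list is lower-case, so a capitalized token such as 'This' is not filtered out. A common workaround, sketched below, is to lower-case each token before the comparison.
In [ ]:
# Lower-case the tokens so that 'This' matches the stop word 'this'
clean_tokens = [t for t in tokens if t.lower() not in sw]
clean_tokens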
In [ ]:
from langdetect import detect, detect_langs
In [ ]:
# The sentence is written in English even though it mentions German;
# detection is based on the text itself and can be unreliable for very short strings
detect('This is German')
In [ ]:
detect_langs('I really love you mon doux amour!')
In [ ]:
detect('I really love you mon doux amour!')
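langdetect is probabilistic and can return different results on short or mixed-language texts. The library exposes a DetectorFactory.seed attribute that can be fixed to make the output reproducible; a minimal sketch follows.
In [ ]:
from langdetect import DetectorFactory

# Fixing the seed makes repeated calls on the same text deterministic
DetectorFactory.seed = 0
detect_langs('I really love you mon doux amour!')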
In [ ]:
from nltk.stem.snowball import SnowballStemmer
In [ ]:
ess = SnowballStemmer('english', ignore_stopwords=True)
In [ ]:
ess.stem('flies')
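Snowball stemmers exist for several languages; the class exposes the supported language names, and a stemmer for another language is built in the same way (a short sketch; the Italian sample word is arbitrary).
In [ ]:
# Languages supported by the Snowball stemmer
print(SnowballStemmer.languages)

# Same API for a different language
its = SnowballStemmer('italian', ignore_stopwords=True)
its.stem('volare')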
In [ ]:
from nltk.stem.snowball import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
In [ ]:
ps = PorterStemmer()
ps.stem('teeth')
In [ ]:
ls = LancasterStemmer()
ls.stem('teeth')
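The three stemmers can produce quite different stems for the same word; below is a small side-by-side comparison (the sample words are arbitrary).
In [ ]:
# Compare Snowball, Porter and Lancaster on a few sample words
for word in ('teeth', 'flies', 'studies'):
    print(word, ess.stem(word), ps.stem(word), ls.stem(word))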
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
In [ ]:
corpus = [
    'This is a simple test corpus',
    'A corpus is a set of text documents',
    'We want to analyze the corpus and the documents',
    'Documents can be automatically tokenized'
]
In [ ]:
cv = CountVectorizer()
In [ ]:
vectorized_corpus = cv.fit_transform(corpus)
In [ ]:
vectorized_corpus.todense()
In [ ]:
cv.vocabulary_
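The fitted vectorizer can be reused on new documents: words that are not in the learned vocabulary are simply ignored. The sample sentence below is invented for illustration.
In [ ]:
# Vectorize a new document with the fitted vocabulary (unseen words are dropped)
new_doc = ['We want a new tokenized corpus']
print(cv.transform(new_doc).todense())
print(cv.inverse_transform(cv.transform(new_doc)))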