In [ ]:
%%HTML
<style>
.container { width:100% }
</style>
In [ ]:
import nltk
In [ ]:
# nltk.download()  # uncomment to open the interactive NLTK downloader (run once)
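As an alternative to the interactive downloader, only the resources used in this notebook can be fetched; the identifiers below are the standard NLTK package names (a convenience sketch that only needs to run once).
In [ ]:
# Download only the corpora and models used in this notebook
nltk.download('gutenberg')
nltk.download('brown')
nltk.download('punkt')
nltk.download('stopwords')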
In [ ]:
from nltk.corpus import gutenberg
Show the file identifiers of the books included in the Gutenberg corpus.
In [ ]:
gutenberg.fileids()
Retrieve the raw text of the book "Paradise Lost" by John Milton.
In [ ]:
print(gutenberg.raw('milton-paradise.txt'))
Retrieve the book "Paradise Lost" by John Milton as a list of sentences (each sentence is itself a list of words).
In [ ]:
print(gutenberg.sents('milton-paradise.txt'))
Retrieve the book "Paradise Lost" by John Milton as a list of words.
In [ ]:
print(gutenberg.words('milton-paradise.txt'))
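A quick sanity check on the size of the corpus (a minimal sketch using only the calls shown above): count the tokens and the distinct lower-cased word types.
In [ ]:
words = gutenberg.words('milton-paradise.txt')
# Number of tokens vs. number of distinct (lower-cased) word types
print(len(words), len(set(w.lower() for w in words)))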
In [ ]:
from nltk.corpus import brown
In [ ]:
# Force the lazy corpus reader to load the Brown corpus, then show its documentation
brown.ensure_loaded()
help(brown)
In [ ]:
brown.fileids()
In [ ]:
brown.categories()
In [ ]:
brown.sents(categories='editorial')
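Categories can also be compared; the sketch below counts the sentences in two of the Brown categories listed above (the chosen categories are just an example).
In [ ]:
# Number of sentences in two sample categories
for category in ('editorial', 'fiction'):
    print(category, len(brown.sents(categories=category)))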
In [ ]:
from nltk.tokenize import sent_tokenize
In [ ]:
generic_text = 'Lorem ipsum dolor sit amet, amet minim temporibus in sit. Vel ne impedit consequat intellegebat.'
The function sent_tokenize splits a string into a list of sentences.
In [ ]:
sent_tokenize(generic_text)
In [ ]:
english_text = 'Where is the closest train station? I need to reach London.'
sent_tokenize(english_text)
In [ ]:
spanish_text = '¿Dónde está la estación más cercana? Inmediatamente me tengo que ir a Barcelona.'
sent_tokenize(spanish_text, language='spanish')
In [ ]:
from nltk.tokenize import TreebankWordTokenizer
In [ ]:
simple_text = 'This is a simple text.'
tbwt = TreebankWordTokenizer()
tbwt.tokenize(simple_text)
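The Treebank tokenizer follows the Penn Treebank conventions, so contractions are split into two tokens. The cell below is a small illustrative sketch that anticipates the text used with the regular-expression tokenizer later on.
In [ ]:
# Contractions such as "isn't" are split into "is" + "n't"
tbwt.tokenize("This isn't a simple text.")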
In [ ]:
help(TreebankWordTokenizer)
In [ ]:
from nltk.tokenize import RegexpTokenizer
In [ ]:
complex_text = "This isn't a simple text."
In [ ]:
# Keep only sequences of letters and apostrophes
ret = RegexpTokenizer(r"[a-zA-Z']+")
In [ ]:
ret.tokenize(complex_text)
In [ ]:
complex_text = "This isn't a simple text. Count 1, 2, 3 and then go!"
In [ ]:
ret.tokenize(complex_text)
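The pattern above keeps only letters and apostrophes, so the digits are discarded. A hypothetical variant that also keeps the numbers is sketched below.
In [ ]:
# Alternative pattern (illustrative): also keep digit sequences
ret_digits = RegexpTokenizer(r"[a-zA-Z0-9']+")
ret_digits.tokenize(complex_text)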
In [ ]:
from nltk.corpus import stopwords
In [ ]:
sw = set(stopwords.words('english'))
sw
In [ ]:
len(sw)
In [ ]:
complex_text = "This isn't a simple text. Count 1, 2, 3 and then go!"
tokens = ret.tokenize(complex_text)
clean_tokens = [t for t in tokens if t not in sw]
clean_tokens
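The stop word list is lower-case, so a capitalized token such as 'This' is not filtered out. A common workaround, sketched below, is to lower-case each token before the comparison.
In [ ]:
# Lower-case the tokens so that 'This' matches the stop word 'this'
clean_tokens = [t for t in tokens if t.lower() not in sw]
clean_tokens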
In [ ]:
from langdetect import detect, detect_langs
In [ ]:
# The sentence is written in English even though it mentions German;
# detection is based on the text itself and can be unreliable for very short strings
detect('This is German')
In [ ]:
detect_langs('I really love you mon doux amour!')
In [ ]:
detect('I really love you mon doux amour!')
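langdetect is probabilistic and can return different results on short or mixed-language texts. The library exposes a DetectorFactory.seed attribute that can be fixed to make the output reproducible; a minimal sketch follows.
In [ ]:
from langdetect import DetectorFactory

# Fixing the seed makes repeated calls on the same text deterministic
DetectorFactory.seed = 0
detect_langs('I really love you mon doux amour!')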
In [ ]:
from nltk.stem.snowball import SnowballStemmer
In [ ]:
ess = SnowballStemmer('english', ignore_stopwords=True)
In [ ]:
ess.stem('flies')
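Snowball stemmers exist for several languages; the class exposes the supported language names, and a stemmer for another language is built in the same way (a short sketch; the Italian sample word is arbitrary).
In [ ]:
# Languages supported by the Snowball stemmer
print(SnowballStemmer.languages)

# Same API for a different language
its = SnowballStemmer('italian', ignore_stopwords=True)
its.stem('volare')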
In [ ]:
from nltk.stem.snowball import PorterStemmer
from nltk.stem.lancaster import LancasterStemmer
In [ ]:
ps = PorterStemmer()
ps.stem('teeth')
In [ ]:
ls = LancasterStemmer()
ls.stem('teeth')
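The three stemmers can produce quite different stems for the same word; below is a small side-by-side comparison (the sample words are arbitrary).
In [ ]:
# Compare Snowball, Porter and Lancaster on a few sample words
for word in ('teeth', 'flies', 'studies'):
    print(word, ess.stem(word), ps.stem(word), ls.stem(word))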
In [ ]:
from sklearn.feature_extraction.text import CountVectorizer
In [ ]:
corpus = [
    'This is a simple test corpus',
    'A corpus is a set of text documents',
    'We want to analyze the corpus and the documents',
    'Documents can be automatically tokenized'
]
In [ ]:
cv = CountVectorizer()
In [ ]:
vectorized_corpus = cv.fit_transform(corpus)
In [ ]:
vectorized_corpus.todense()
In [ ]:
cv.vocabulary_
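The fitted vectorizer can be reused on new documents: words that are not in the learned vocabulary are simply ignored. The sample sentence below is invented for illustration.
In [ ]:
# Vectorize a new document with the fitted vocabulary (unseen words are dropped)
new_doc = ['We want a new tokenized corpus']
print(cv.transform(new_doc).todense())
print(cv.inverse_transform(cv.transform(new_doc)))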