notebook.community

Edit and run



In [4]:

    
import nltk



In [5]:

    
# Fetch the NLTK data non-interactively. A bare nltk.download() opens the
# interactive downloader and stalls "Restart Kernel -> Run All" until someone
# closes it by hand. The 'book' collection bundles the corpora used by
# nltk.book; presumably it also covers the tokenizer models needed by
# nltk.word_tokenize -- TODO confirm for the installed NLTK version.
nltk.download('book')









    



showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml






    Out[5]:





True



In [6]:

    
# Star import kept on purpose: it binds the sample texts text1..text9 and the
# sentences sent1..sent9 (see the banner printed on import) directly into the
# notebook namespace, and later cells refer to them by those bare names.
from nltk.book import *









    



*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908



In [7]:

    
# Print the words that text1 (Moby Dick) uses in contexts similar to "great".
# The result is printed by the method itself, so this cell has no Out[] value.
text1.similar("great")









    



good whale long vast sea whole living small other large dead mighty
same such last more much sperm noble old



In [8]:

    
from urllib import request

# Plain-text edition of "Crime and Punishment" hosted by Project Gutenberg.
url = "http://www.gutenberg.org/files/2554/2554.txt"

# Open the response as a context manager so the underlying HTTP connection is
# closed as soon as the body has been read, even if read() or decode() raises.
# The original left the connection open for the lifetime of the kernel.
with request.urlopen(url) as response:
    # The body arrives as bytes; decode it to get a regular str.
    raw = response.read().decode('utf8')

# Last expression of the cell: shows that `raw` is a str.
type(raw)









    Out[8]:





str



In [9]:

    
# Size of the downloaded book in characters (raw is a str, not bytes).
len(raw)









    Out[9]:





1176896



In [10]:

    
# Peek at the first 75 characters: the Project Gutenberg title line, including
# its trailing '\r\n' line ending.
raw[:75]









    Out[10]:





'The Project Gutenberg EBook of Crime and Punishment, by Fyodor Dostoevsky\r\n'



In [12]:

    
# Split the raw text into word and punctuation tokens (note that ',' comes out
# as its own token in the sample further down).
tokens = nltk.word_tokenize(raw)
# Last expression of the cell: shows that the tokenizer returned a list.
type(tokens)









    Out[12]:





list



In [13]:

    
# Total number of tokens produced for the whole book.
len(tokens)









    Out[13]:





254352



In [14]:

    
# Show only the first ten tokens rather than dumping the entire list.
tokens[:10]









    Out[14]:





['The',
 'Project',
 'Gutenberg',
 'EBook',
 'of',
 'Crime',
 'and',
 'Punishment',
 ',',
 'by']

http://www.cs.duke.edu/courses/spring14/compsci290/assignments/lab02.html