In [4]:
import nltk

In [5]:
# Fetch the "book" data collection non-interactively. A bare nltk.download()
# opens the interactive downloader, which stalls Restart & Run All and cannot
# run on a headless kernel. The call still evaluates to True on success.
nltk.download("book")


showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
Out[5]:
True

In [6]:
from nltk.book import *


*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908

In [7]:
# Print the words NLTK ranks as used in contexts similar to "great" in text1
# (Moby Dick, per the nltk.book loading banner). The result is printed rather
# than returned, so this cell has no Out[] value.
text1.similar("great")


good whale long vast sea whole living small other large dead mighty
same such last more much sperm noble old

In [8]:
from urllib import request

# Plain-text Project Gutenberg file 2554.
url = "http://www.gutenberg.org/files/2554/2554.txt"

# Use a context manager so the HTTP connection is closed even if the read or
# decode raises; the timeout keeps a stalled server from hanging the kernel.
with request.urlopen(url, timeout=30) as response:
    # read() returns bytes; decode them into a str for the text-processing cells.
    raw = response.read().decode('utf8')

# Displayed as the cell result to confirm the download was decoded to text.
type(raw)


Out[8]:
str

In [9]:
# Length of the downloaded text in characters (raw is a decoded str, not bytes).
len(raw)


Out[9]:
1176896

In [10]:
# Peek at the first 75 characters to check that the expected book was fetched.
raw[:75]


Out[10]:
'The Project Gutenberg EBook of Crime and Punishment, by Fyodor Dostoevsky\r\n'

In [12]:
# Split the full text into tokens with NLTK's word tokenizer.
# NOTE(review): word_tokenize presumably needs tokenizer models from nltk_data
# to be installed already - confirm the download cell provides them.
tokens = nltk.word_tokenize(raw)
# Displayed as the cell result to show what container the tokenizer returns.
type(tokens)


Out[12]:
list

In [13]:
# Number of tokens the tokenizer produced for the whole text.
len(tokens)


Out[13]:
254352

In [14]:
# Inspect the first ten tokens; punctuation such as ',' shows up as its own token.
tokens[:10]


Out[14]:
['The',
 'Project',
 'Gutenberg',
 'EBook',
 'of',
 'Crime',
 'and',
 'Punishment',
 ',',
 'by']