In [4]:
import nltk
In [5]:
# Request the "book" data collection by identifier. A bare nltk.download()
# launches the interactive downloader, which blocks a fresh
# "Restart Kernel -> Run All" until someone responds to it.
nltk.download("book")
Out[5]:
In [6]:
from nltk.book import *
In [7]:
# Ask text1 for words related to "great" through its similar() method.
# NOTE(review): text1 is not defined in this notebook; it is presumably one of
# the names brought in by the `from nltk.book import *` cell -- confirm.
text1.similar("great")
In [8]:
from urllib import request

# Remote plain-text document that the rest of the notebook works on.
url = "http://www.gutenberg.org/files/2554/2554.txt"

# Use the response as a context manager so the underlying connection is
# closed as soon as the body has been read, even if read/decode raises.
with request.urlopen(url) as response:
    # The body arrives as bytes; decode it to a str before any text processing.
    raw = response.read().decode('utf8')

# Last expression of the cell: display the type of the decoded payload.
type(raw)
Out[8]:
In [9]:
# Size of the downloaded document: raw is the str produced by decoding the
# HTTP response body, so this is a count of characters rather than bytes.
len(raw)
Out[9]:
In [10]:
# Preview the first 75 characters of the downloaded text.
raw[:75]
Out[10]:
In [12]:
# Split the downloaded text into tokens with NLTK's word_tokenize.
# NOTE(review): word_tokenize needs tokenizer data to be installed; presumably
# the earlier download cell provides it -- confirm on a fresh environment.
tokens = nltk.word_tokenize(raw)
# Last expression of the cell: display the type of the tokenizer's result.
type(tokens)
Out[12]:
In [13]:
# Number of tokens produced by the tokenizer for the whole document.
len(tokens)
Out[13]:
In [14]:
# Peek at the first ten tokens to sanity-check the tokenization.
tokens[:10]
Out[14]: