In [7]:
import nltk
%matplotlib inline

In [8]:
from urllib.request import urlopen
response = urlopen('http://python.org')
html = response.read()
print (len(html))


47381

In [9]:
tokens = [tok for tok in html.split()]
print ("Total no of tokens:" +str (len(tokens)))
print( tokens[100:130])


Total no of tokens:2859
[b'embossed,', b'speech,', b'tty"', b'/>', b'<!--[if', b'(lte', b'IE', b'8)&(!IEMobile)]>', b'<link', b'href="/static/stylesheets/no-mq.css"', b'rel="stylesheet"', b'type="text/css"', b'media="screen"', b'/>', b'<![endif]-->', b'<link', b'rel="icon"', b'type="image/x-icon"', b'href="/static/favicon.ico">', b'<link', b'rel="apple-touch-icon-precomposed"', b'sizes="144x144"', b'href="/static/apple-touch-icon-144x144-precomposed.png">', b'<link', b'rel="apple-touch-icon-precomposed"', b'sizes="114x114"', b'href="/static/apple-touch-icon-114x114-precomposed.png">', b'<link', b'rel="apple-touch-icon-precomposed"', b'sizes="72x72"']

In [10]:
clean = nltk.clean_html(html)


---------------------------------------------------------------------------
NotImplementedError                       Traceback (most recent call last)
<ipython-input-10-18a019de0075> in <module>()
----> 1 clean = nltk.clean_html(html)

/usr/local/lib/python3.4/dist-packages/nltk/util.py in clean_html(html)
    344 
    345 def clean_html(html):
--> 346     raise NotImplementedError ("To remove HTML markup, use BeautifulSoup's get_text() function")
    347 
    348 def clean_url(url):

NotImplementedError: To remove HTML markup, use BeautifulSoup's get_text() function

In [5]:
import nltk
print (nltk.__version__)


3.2.1

In [6]:
from bs4 import BeautifulSoup

In [ ]: