In [2]:
import nltk
#%matplotlib inline

In [3]:
from urllib.request import urlopen

# Download the python.org landing page as raw bytes.
# The context manager closes the HTTP connection as soon as the body has been
# read (the original left the response open), and the timeout stops the cell
# from hanging forever when the network is unreachable.
with urlopen('http://python.org', timeout=30) as response:
    html = response.read()

# Size of the downloaded page in bytes.
print(len(html))


47393

In [4]:
# Split the raw page on whitespace. `html` is bytes, so every token is a
# bytes object and still contains the HTML markup.
tokens = html.split()

# Report how many tokens were produced, then show a 30-token sample.
print("Total no of tokens:", len(tokens), sep="")
print(tokens[100:130])


Total no of tokens:2857
[b'embossed,', b'speech,', b'tty"', b'/>', b'<!--[if', b'(lte', b'IE', b'8)&(!IEMobile)]>', b'<link', b'href="/static/stylesheets/no-mq.css"', b'rel="stylesheet"', b'type="text/css"', b'media="screen"', b'/>', b'<![endif]-->', b'<link', b'rel="icon"', b'type="image/x-icon"', b'href="/static/favicon.ico">', b'<link', b'rel="apple-touch-icon-precomposed"', b'sizes="144x144"', b'href="/static/apple-touch-icon-144x144-precomposed.png">', b'<link', b'rel="apple-touch-icon-precomposed"', b'sizes="114x114"', b'href="/static/apple-touch-icon-114x114-precomposed.png">', b'<link', b'rel="apple-touch-icon-precomposed"', b'sizes="72x72"']

In [5]:
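# Disabled: nltk.clean_html() raises NotImplementedError (see the traceback
# below); NLTK's own message says to use BeautifulSoup's get_text() instead.
#clean = nltk.clean_html(html)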



NotImplementedErrorTraceback (most recent call last)
<ipython-input-5-18a019de0075> in <module>()
----> 1 clean = nltk.clean_html(html)

/usr/local/lib/python2.7/dist-packages/nltk/util.pyc in clean_html(html)
    344 
    345 def clean_html(html):
--> 346     raise NotImplementedError ("To remove HTML markup, use BeautifulSoup's get_text() function")
    347 
    348 def clean_url(url):

NotImplementedError: To remove HTML markup, use BeautifulSoup's get_text() function

In [ ]:
import nltk

# Show which NLTK release is installed in this kernel.
print(nltk.__version__)

In [ ]:
from bs4 import BeautifulSoup

In [ ]: