In [4]:
import nltk
import numpy
import urllib2

response = urllib2.urlopen('http://www.nltk.org/api/nltk.html#nltk.util.clean_html')
html = response.read()
print len(html)

print(nltk.version_info)
#clean = nltk.clean_html(html)

sentence = "At eight oclock on Thursday morning Arthur didnt feel very good."
tokens = nltk.word_tokenize(sentence)


587681
sys.version_info(major=2, minor=7, micro=6, releaselevel='final', serial=0)
---------------------------------------------------------------------------
LookupError                               Traceback (most recent call last)
<ipython-input-4-b662abd04d23> in <module>()
     11 
     12 sentence = "At eight oclock on Thursday morning Arthur didnt feel very good."
---> 13 tokens = nltk.word_tokenize(sentence)

/usr/local/lib/python2.7/dist-packages/nltk/tokenize/__init__.pyc in word_tokenize(text)
     91     along with :class:`.PunktSentenceTokenizer`).
     92     """
---> 93     return [token for sent in sent_tokenize(text)
     94             for token in _treebank_word_tokenize(sent)]
     95 

/usr/local/lib/python2.7/dist-packages/nltk/tokenize/__init__.pyc in sent_tokenize(text)
     79     (currently :class:`.PunktSentenceTokenizer`).
     80     """
---> 81     tokenizer = load('tokenizers/punkt/english.pickle')
     82     return tokenizer.tokenize(text)
     83 

/usr/local/lib/python2.7/dist-packages/nltk/data.pyc in load(resource_url, format, cache, verbose, logic_parser, fstruct_reader, encoding)
    772 
    773     # Load the resource.
--> 774     opened_resource = _open(resource_url)
    775 
    776     if format == 'raw':

/usr/local/lib/python2.7/dist-packages/nltk/data.pyc in _open(resource_url)
    886 
    887     if protocol is None or protocol.lower() == 'nltk':
--> 888         return find(path_, path + ['']).open()
    889     elif protocol.lower() == 'file':
    890         # urllib might not use mode='rb', so handle this one ourselves:

/usr/local/lib/python2.7/dist-packages/nltk/data.pyc in find(resource_name, paths)
    616     sep = '*'*70
    617     resource_not_found = '\n%s\n%s\n%s' % (sep, msg, sep)
--> 618     raise LookupError(resource_not_found)
    619 
    620 def retrieve(resource_url, filename=None, verbose=True):

LookupError: 
**********************************************************************
  Resource u'tokenizers/punkt/english.pickle' not found.  Please
  use the NLTK Downloader to obtain the resource:  >>>
  nltk.download()
  Searched in:
    - '/home/moonbury/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - u''
**********************************************************************

In [ ]: