In [4]:
import nltk
import numpy
import urllib2
response = urllib2.urlopen('http://www.nltk.org/api/nltk.html#nltk.util.clean_html')
html = response.read()
print len(html)
print(nltk.version_info)
#clean = nltk.clean_html(html)
sentence = "At eight oclock on Thursday morning Arthur didnt feel very good."
tokens = nltk.word_tokenize(sentence)
587681
sys.version_info(major=2, minor=7, micro=6, releaselevel='final', serial=0)
---------------------------------------------------------------------------
LookupError Traceback (most recent call last)
<ipython-input-4-b662abd04d23> in <module>()
11
12 sentence = "At eight oclock on Thursday morning Arthur didnt feel very good."
---> 13 tokens = nltk.word_tokenize(sentence)
/usr/local/lib/python2.7/dist-packages/nltk/tokenize/__init__.pyc in word_tokenize(text)
91 along with :class:`.PunktSentenceTokenizer`).
92 """
---> 93 return [token for sent in sent_tokenize(text)
94 for token in _treebank_word_tokenize(sent)]
95
/usr/local/lib/python2.7/dist-packages/nltk/tokenize/__init__.pyc in sent_tokenize(text)
79 (currently :class:`.PunktSentenceTokenizer`).
80 """
---> 81 tokenizer = load('tokenizers/punkt/english.pickle')
82 return tokenizer.tokenize(text)
83
/usr/local/lib/python2.7/dist-packages/nltk/data.pyc in load(resource_url, format, cache, verbose, logic_parser, fstruct_reader, encoding)
772
773 # Load the resource.
--> 774 opened_resource = _open(resource_url)
775
776 if format == 'raw':
/usr/local/lib/python2.7/dist-packages/nltk/data.pyc in _open(resource_url)
886
887 if protocol is None or protocol.lower() == 'nltk':
--> 888 return find(path_, path + ['']).open()
889 elif protocol.lower() == 'file':
890 # urllib might not use mode='rb', so handle this one ourselves:
/usr/local/lib/python2.7/dist-packages/nltk/data.pyc in find(resource_name, paths)
616 sep = '*'*70
617 resource_not_found = '\n%s\n%s\n%s' % (sep, msg, sep)
--> 618 raise LookupError(resource_not_found)
619
620 def retrieve(resource_url, filename=None, verbose=True):
LookupError:
**********************************************************************
Resource u'tokenizers/punkt/english.pickle' not found. Please
use the NLTK Downloader to obtain the resource: >>>
nltk.download()
Searched in:
- '/home/moonbury/nltk_data'
- '/usr/share/nltk_data'
- '/usr/local/share/nltk_data'
- '/usr/lib/nltk_data'
- '/usr/local/lib/nltk_data'
- u''
**********************************************************************
In [ ]:
Content source: moonbury/pythonanywhere
Similar notebooks: