LookupError                               Traceback (most recent call last)
<ipython-input-7-6e137b4be272> in <module>()
5 # Create TF-IDF of texts
6 tfidf = TfidfVectorizer(tokenizer=tokenizer, stop_words='english', max_features=max_features)
----> 7 sparse_tfidf_texts = tfidf.fit_transform(texts)
/Users/lipingzhang/anaconda/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in fit_transform(self, raw_documents, y)
1379 Tf-idf-weighted document-term matrix.
1380 """
-> 1381 X = super(TfidfVectorizer, self).fit_transform(raw_documents)
1382 self._tfidf.fit(X)
1383 # X is already a transformed view of raw_documents so
/Users/lipingzhang/anaconda/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in fit_transform(self, raw_documents, y)
867
868 vocabulary, X = self._count_vocab(raw_documents,
--> 869 self.fixed_vocabulary_)
870
871 if self.binary:
/Users/lipingzhang/anaconda/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in _count_vocab(self, raw_documents, fixed_vocab)
790 for doc in raw_documents:
791 feature_counter = {}
--> 792 for feature in analyze(doc):
793 try:
794 feature_idx = vocabulary[feature]
/Users/lipingzhang/anaconda/lib/python2.7/site-packages/sklearn/feature_extraction/text.pyc in <lambda>(doc)
264
265 return lambda doc: self._word_ngrams(
--> 266 tokenize(preprocess(self.decode(doc))), stop_words)
267
268 else:
<ipython-input-7-6e137b4be272> in tokenizer(text)
1 def tokenizer(text):
----> 2 words = nltk.word_tokenize(text)
3 return words
4
5 # Create TF-IDF of texts
/Users/lipingzhang/anaconda/lib/python2.7/site-packages/nltk/tokenize/__init__.pyc in word_tokenize(text, language, preserve_line)
128 :type preserver_line: bool
129 """
--> 130 sentences = [text] if preserve_line else sent_tokenize(text, language)
131 return [token for sent in sentences
132 for token in _treebank_word_tokenizer.tokenize(sent)]
/Users/lipingzhang/anaconda/lib/python2.7/site-packages/nltk/tokenize/__init__.pyc in sent_tokenize(text, language)
94 :param language: the model name in the Punkt corpus
95 """
---> 96 tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
97 return tokenizer.tokenize(text)
98
/Users/lipingzhang/anaconda/lib/python2.7/site-packages/nltk/data.pyc in load(resource_url, format, cache, verbose, logic_parser, fstruct_reader, encoding)
812
813 # Load the resource.
--> 814 opened_resource = _open(resource_url)
815
816 if format == 'raw':
/Users/lipingzhang/anaconda/lib/python2.7/site-packages/nltk/data.pyc in _open(resource_url)
930
931 if protocol is None or protocol.lower() == 'nltk':
--> 932 return find(path_, path + ['']).open()
933 elif protocol.lower() == 'file':
934 # urllib might not use mode='rb', so handle this one ourselves:
/Users/lipingzhang/anaconda/lib/python2.7/site-packages/nltk/data.pyc in find(resource_name, paths)
651 sep = '*' * 70
652 resource_not_found = '\n%s\n%s\n%s' % (sep, msg, sep)
--> 653 raise LookupError(resource_not_found)
654
655
LookupError:
**********************************************************************
Resource u'tokenizers/punkt/english.pickle' not found. Please
use the NLTK Downloader to obtain the resource: >>>
nltk.download()
Searched in:
- '/Users/lipingzhang/nltk_data'
- '/usr/share/nltk_data'
- '/usr/local/share/nltk_data'
- '/usr/lib/nltk_data'
- '/usr/local/lib/nltk_data'
- u''
**********************************************************************
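The traceback bottoms out in nltk.data.find: the Punkt sentence tokenizer data (tokenizers/punkt/english.pickle) is not installed in any of the nltk_data directories listed above, so nltk.word_tokenize fails inside the custom tokenizer that TfidfVectorizer calls for every document. A minimal sketch of the fix, assuming texts and max_features are already defined earlier in the notebook as in the failing cell: download the punkt resource once, then re-run the vectorization.

    import nltk
    from sklearn.feature_extraction.text import TfidfVectorizer

    # One-time download of the Punkt models used by nltk.word_tokenize
    # (by default this installs tokenizers/punkt/*.pickle under ~/nltk_data).
    nltk.download('punkt')

    def tokenizer(text):
        # Same custom tokenizer as in the failing cell: word-level tokens per document.
        return nltk.word_tokenize(text)

    # texts and max_features are assumed from the original notebook.
    tfidf = TfidfVectorizer(tokenizer=tokenizer, stop_words='english',
                            max_features=max_features)
    sparse_tfidf_texts = tfidf.fit_transform(texts)

If the data ends up in a directory that is not on the search list shown in the error, appending that directory to nltk.data.path (e.g. nltk.data.path.append('/path/to/nltk_data')) before tokenizing should let NLTK find it.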