Let's start with a simple paragraph, copied from the course description:
In [1]:
text = """
Increasingly, customers send text to interact or leave comments,
which provides a wealth of data for text mining. That’s a great
starting point for developing custom search, content recommenders,
and even AI applications.
"""
repr(text)
Out[1]:
"'\\nIncreasingly, customers send text to interact or leave comments, \\nwhich provides a wealth of data for text mining. That’s a great \\nstarting point for developing custom search, content recommenders, \\nand even AI applications.\\n'"
Notice the explicit line breaks in the text. Let's write some code to reflow the paragraph onto a single line, without any line breaks:
In [2]:
text = " ".join(map(lambda x: x.strip(), text.split("\n"))).strip()
repr(text)
Out[2]:
"'Increasingly, customers send text to interact or leave comments, which provides a wealth of data for text mining. That’s a great starting point for developing custom search, content recommenders, and even AI applications.'"
Now we can use TextBlob to split the paragraph into sentences:
In [3]:
from textblob import TextBlob
for sent in TextBlob(text).sentences:
    print("> ", sent)
The call fails with a MissingCorpusError (traceback abridged). TextBlob's sentence splitter delegates to NLTK's sent_tokenize, which needs the punkt tokenizer model:

**********************************************************************
Resource 'tokenizers/punkt/PY3/english.pickle' not found.
Please use the NLTK Downloader to obtain the resource: >>>
nltk.download()
**********************************************************************

MissingCorpusError:
Looks like you are missing some required data for this feature.
To download the necessary data, simply run
python -m textblob.download_corpora
or use the NLTK downloader to download the missing data: http://nltk.org/data.html
If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
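Before rerunning, fetch the missing data. Here is a minimal setup sketch (it assumes network access and the default nltk_data search path) that grabs both resources this notebook needs:

import nltk

# "punkt" backs sentence tokenization; "wordnet" backs the lemmas
# and synsets used further below.
for resource in ("punkt", "wordnet"):
    nltk.download(resource)

With the data in place, the cell above should print the paragraph's two sentences, each prefixed with "> ".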
Next we take a sentence and annotate it with part-of-speech (PoS) tags:
In [4]:
import textblob_aptagger as tag
sent = "Increasingly, customers send text to interact or leave comments, which provides a wealth of data for text mining."
ts = tag.PerceptronTagger().tag(sent)
print(ts)
The import fails (traceback abridged):

ImportError: No module named 'textblob_aptagger'

The averaged-perceptron tagger lives in a separate package; install it with pip install textblob-aptagger. (Later releases of NLTK bundle an equivalent tagger as nltk.tag.PerceptronTagger.)
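Until that package is installed, TextBlob's bundled default tagger produces comparable Penn Treebank annotations with no separate install. A minimal sketch:

from textblob import TextBlob

sent = "Increasingly, customers send text to interact or leave comments, which provides a wealth of data for text mining."

# .tags yields (token, Penn-Treebank-tag) pairs from the default tagger.
print(TextBlob(sent).tags)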
Given these part-of-speech annotations, we can lemmatize the nouns and verbs to get their root forms. Lemmatization also singularizes plural nouns:
In [5]:
from textblob import Word
ts = [('InterAct', 'VB'), ('comments', 'NNS'), ('provides', 'VBZ'), ('mining', 'NN')]
for lex, pos in ts:
    w = Word(lex.lower())
    # Map the Penn Treebank tag to WordNet's POS ('n' or 'v') via its first letter.
    lemma = w.lemmatize(pos[0].lower())
    print(lex, pos, lemma)
The same failure mode again, this time for the WordNet corpus (traceback abridged):

**********************************************************************
Resource 'corpora/wordnet' not found. Please use the NLTK
Downloader to obtain the resource: >>> nltk.download()
**********************************************************************

MissingCorpusError:
Looks like you are missing some required data for this feature.
To download the necessary data, simply run
python -m textblob.download_corpora
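One defensive pattern is to catch the error, fetch WordNet, and retry. A sketch, assuming MissingCorpusError is importable from textblob.exceptions (where recent releases define it):

import nltk
from textblob import Word
from textblob.exceptions import MissingCorpusError

def lemmatize_or_download(lex, wn_pos="n"):
    """Lemmatize lex, downloading the WordNet corpus on first failure."""
    try:
        return Word(lex.lower()).lemmatize(wn_pos)
    except MissingCorpusError:
        nltk.download("wordnet")
        return Word(lex.lower()).lemmatize(wn_pos)

print(lemmatize_or_download("comments", "n"))  # expected: comment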
We can also look up synonyms and definitions for each word, using synsets from WordNet:
In [6]:
from textblob.wordnet import VERB
w = Word("comments")
for synset, definition in zip(w.get_synsets(), w.define()):
    print(synset, definition)
Once more the WordNet corpus is missing, and this time the failure happens at import time: textblob.wordnet touches nltk.corpus.wordnet.synset as soon as the module loads (traceback abridged):

LookupError:
**********************************************************************
Resource 'corpora/wordnet' not found. Please use the NLTK
Downloader to obtain the resource: >>> nltk.download()
**********************************************************************
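Note that the cell imports VERB but never uses it. Once the WordNet corpus is downloaded, restricting the lookup to verb senses is a one-argument change; a minimal sketch (assuming NLTK 3.x, where each synset exposes its gloss via Synset.definition()):

from textblob import Word
from textblob.wordnet import VERB

w = Word("comments")

# Only the verb senses of "comments"; each synset carries its own gloss.
for synset in w.get_synsets(pos=VERB):
    print(synset, synset.definition())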