Exercise 03: Splitting sentences and PoS annotation

Let's start with a simple paragraph, copied from the course description:


In [1]:
text = """
Increasingly, customers send text to interact or leave comments, 
which provides a wealth of data for text mining. That’s a great 
starting point for developing custom search, content recommenders, 
and even AI applications.
"""
repr(text)


Out[1]:
"'\\nIncreasingly, customers send text to interact or leave comments, \\nwhich provides a wealth of data for text mining. That’s a great \\nstarting point for developing custom search, content recommenders, \\nand even AI applications.\\n'"

Notice the explicit line breaks in the text. Let's write some code to reflow the paragraph without any line breaks:


In [2]:
text = " ".join(map(lambda x: x.strip(), text.split("\n"))).strip()
repr(text)


Out[2]:
"'Increasingly, customers send text to interact or leave comments, which provides a wealth of data for text mining. That’s a great starting point for developing custom search, content recommenders, and even AI applications.'"

Now we can use TextBlob to split the paragraph into sentences:


In [3]:
from textblob import TextBlob

for sent in TextBlob(text).sentences:
  print("> ", sent)


**********************************************************************
  Resource 'tokenizers/punkt/PY3/english.pickle' not found.
  Please use the NLTK Downloader to obtain the resource:
  >>> nltk.download()
**********************************************************************

MissingCorpusError:
Looks like you are missing some required data for this feature.

To download the necessary data, simply run

    python -m textblob.download_corpora

or use the NLTK downloader to download the missing data: http://nltk.org/data.html
If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
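
TextBlob delegates sentence segmentation to NLTK's punkt model, which isn't installed in this environment. A minimal fix, assuming network access, is to fetch just that model through NLTK (or grab everything TextBlob uses with `python -m textblob.download_corpora`):


In [ ]:
import nltk

# fetch the punkt sentence tokenizer model into ~/nltk_data
nltk.download("punkt")


With punkt in place, re-running the cell above should print the paragraph's two sentences, one per line.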

Next we take a sentence and annotate it with part-of-speech (PoS) tags:


In [4]:
import textblob_aptagger as tag

sent = "Increasingly, customers send text to interact or leave comments, which provides a wealth of data for text mining."

ts = tag.PerceptronTagger().tag(sent)
print(ts)


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-4-96ce84526706> in <module>()
----> 1 import textblob_aptagger as tag
      2 
      3 sent = "Increasingly, customers send text to interact or leave comments, which provides a wealth of data for text mining."
      4 
      5 ts = tag.PerceptronTagger().tag(sent)

ImportError: No module named 'textblob_aptagger'
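
The averaged perceptron tagger used to ship as a separate textblob-aptagger package (installable on older setups with `pip install textblob-aptagger`), which is missing here. As a sketch of a package-free alternative, TextBlob's built-in tagger yields the same kind of (token, tag) pairs:


In [ ]:
from textblob import TextBlob

sent = "Increasingly, customers send text to interact or leave comments, which provides a wealth of data for text mining."

# the default tagger returns (token, Penn Treebank tag) tuples,
# e.g. ('customers', 'NNS'); it needs the corpora downloaded above
ts = TextBlob(sent).tags
print(ts)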

Given these part-of-speech annotations, we can lemmatize the nouns and verbs to get their root forms; this also singularizes plural nouns:


In [5]:
from textblob import Word

ts = [('InterAct', 'VB'), ('comments', 'NNS'), ('provides', 'VBZ'), ('mining', 'NN')]

for lex, pos in ts:
  w = Word(lex.lower())
  # map the Penn Treebank tag to a WordNet PoS by its first letter,
  # e.g. 'NNS' -> 'n' and 'VBZ' -> 'v'
  lemma = w.lemmatize(pos[0].lower())
  print(lex, pos, lemma)


**********************************************************************
  Resource 'corpora/wordnet' not found.  Please use the NLTK
  Downloader to obtain the resource:  >>> nltk.download()
**********************************************************************

MissingCorpusError:
Looks like you are missing some required data for this feature.

To download the necessary data, simply run

    python -m textblob.download_corpora

or use the NLTK downloader to download the missing data: http://nltk.org/data.html
If this doesn't fix the problem, file an issue at https://github.com/sloria/TextBlob/issues.
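
Lemmatization relies on NLTK's WordNet corpus, which this environment is also missing. As before, a one-line download (assuming network access) makes the cell runnable:


In [ ]:
import nltk

# WordNet backs both Word.lemmatize() and the synset lookups below
nltk.download("wordnet")


Once WordNet is installed, the cell should print lines like `comments NNS comment` and `provides VBZ provide`: plural nouns are singularized and inflected verbs reduced to their base forms.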

We can also look up synonyms and definitions for each word, using synsets from WordNet:


In [6]:
from textblob import Word
from textblob.wordnet import VERB

w = Word("comments")

# get_synsets() and define() pair up one definition per synset;
# both accept an optional PoS filter such as VERB
for synset, definition in zip(w.get_synsets(), w.define()):
  print(synset, definition)


LookupError:
**********************************************************************
  Resource 'corpora/wordnet' not found.  Please use the NLTK
  Downloader to obtain the resource:  >>> nltk.download()
**********************************************************************
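
The same `nltk.download("wordnet")` fix applies here. Once the corpus is available, and since the cell already imports VERB, here is a sketch of restricting the lookup to verb senses only:


In [ ]:
from textblob import Word
from textblob.wordnet import VERB

w = Word("comments")

# keep only the verb senses of "comments" and their glosses
for synset, definition in zip(w.get_synsets(pos=VERB), w.define(pos=VERB)):
  print(synset, definition)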

In [ ]: