In [1]:
%matplotlib inline
In [2]:
import os
import sys
# Add the parent directory to sys.path before importing yellowbrick
sys.path.append("..")
import yellowbrick as yb
In [3]:
# Sample documents keyed by a short label. Each value is a raw, untokenized
# multi-line string; the cells below tokenize, tag and colorize every entry.
texts = {
# A short nursery rhyme.
'nursery_rhyme' : '''Baa, baa, black sheep,
Have you any wool?
Yes, sir, yes, sir,
Three bags full;
One for the master,
And one for the dame,
And one for the little boy
Who lives down the lane.''',
# An encyclopedic description of algebra.
'algebra' : '''Algebra (from Arabic "al-jabr" meaning
"reunion of broken parts") is one of the
broad parts of mathematics, together with
number theory, geometry and analysis. In
its most general form, algebra is the study
of mathematical symbols and the rules for
manipulating these symbols; it is a unifying
thread of almost all of mathematics.''',
# Step-by-step recipe instructions (mostly imperative sentences).
'french_silk' : '''In a small saucepan, combine sugar and eggs
until well blended. Cook over low heat, stirring
constantly, until mixture reaches 160° and coats
the back of a metal spoon. Remove from the heat.
Stir in chocolate and vanilla until smooth. Cool
to lukewarm (90°), stirring occasionally. In a small
bowl, cream butter until light and fluffy. Add cooled
chocolate mixture; beat on high speed for 5 minutes
or until light and fluffy. In another large bowl,
beat cream until it begins to thicken. Add
confectioners' sugar; beat until stiff peaks form.
Fold into chocolate mixture. Pour into crust. Chill
for at least 6 hours before serving. Garnish with
whipped cream and chocolate curls if desired. '''
}
In [4]:
##########################################################################
# Imports
##########################################################################
from yellowbrick.text.base import TextVisualizer
##########################################################################
# PosTagVisualizer
##########################################################################
class PosTagVisualizer(TextVisualizer):
    """
    A part-of-speech tag visualizer colorizes text to enable
    the user to visualize the proportions of nouns, verbs, etc.
    and to use this information to make decisions about text
    normalization (e.g. stemming vs lemmatization) and
    vectorization.

    Parameters
    ----------
    ax : object, default: None
        Forwarded unchanged to the super class (presumably the matplotlib
        axes to draw on -- confirm against TextVisualizer).

    kwargs : dict
        Pass any additional keyword arguments to the super class.

    Attributes
    ----------
    COLORS : dict
        Maps a color name (or None for "no color") to an ANSI escape
        sequence template containing a single ``{}`` placeholder.

    TAGS : dict
        Maps a part-of-speech tag to a key of ``COLORS``.

    tagged : list of (color, token) tuples
        Set by ``transform``; not available before it has been called.

    These parameters can be influenced later on in the visualization
    process, but can and should be set as early as possible.
    """

    def __init__(self, ax=None, **kwargs):
        """
        Initializes the visualizer and builds the two lookup tables
        (color name -> ANSI template, tag -> color name) that
        ``transform`` and ``colorize`` rely on.
        """
        super(PosTagVisualizer, self).__init__(ax=ax, **kwargs)

        # TODO: hard-coding in the ANSI colormap for now.
        # Can we let the user reset the colors here?
        self.COLORS = {
            'white'      : "\033[0;37m{}\033[0m",
            'yellow'     : "\033[0;33m{}\033[0m",
            'green'      : "\033[0;32m{}\033[0m",
            'blue'       : "\033[0;34m{}\033[0m",
            'cyan'       : "\033[0;36m{}\033[0m",
            'red'        : "\033[0;31m{}\033[0m",
            'magenta'    : "\033[0;35m{}\033[0m",
            'black'      : "\033[0;30m{}\033[0m",
            'darkwhite'  : "\033[1;37m{}\033[0m",
            'darkyellow' : "\033[1;33m{}\033[0m",
            'darkgreen'  : "\033[1;32m{}\033[0m",
            'darkblue'   : "\033[1;34m{}\033[0m",
            'darkcyan'   : "\033[1;36m{}\033[0m",
            'darkred'    : "\033[1;31m{}\033[0m",
            'darkmagenta': "\033[1;35m{}\033[0m",
            'darkblack'  : "\033[1;30m{}\033[0m",
            # Fallback used for tags that have no entry in TAGS.
            None         : "\033[0;0m{}\033[0m"
        }

        self.TAGS = {
            'NN'   : 'green',
            'NNS'  : 'green',
            'NNP'  : 'green',
            'NNPS' : 'green',
            'VB'   : 'blue',
            'VBD'  : 'blue',
            'VBG'  : 'blue',
            'VBN'  : 'blue',
            'VBP'  : 'blue',
            'VBZ'  : 'blue',
            'JJ'   : 'red',
            'JJR'  : 'red',
            'JJS'  : 'red',
            'RB'   : 'cyan',
            'RBR'  : 'cyan',
            'RBS'  : 'cyan',
            'IN'   : 'darkwhite',
            'POS'  : 'darkyellow',
            # 'PRP$' used to be listed twice, which left plain 'PRP'
            # (personal pronouns) unmapped and therefore uncolored.
            'PRP'  : 'magenta',
            'PRP$' : 'magenta',
            'DT'   : 'black',
            'CC'   : 'black',
            'CD'   : 'black',
            'WDT'  : 'black',
            'WP'   : 'black',
            'WP$'  : 'black',
            'WRB'  : 'black',
            'EX'   : 'yellow',
            'FW'   : 'yellow',
            'LS'   : 'yellow',
            'MD'   : 'yellow',
            'PDT'  : 'yellow',
            'RP'   : 'yellow',
            'SYM'  : 'yellow',
            'TO'   : 'yellow',
            # Previously mapped to 'off', which is not a key of COLORS and
            # would have raised KeyError in colorize(); None selects the
            # "no color" template instead.
            'None' : None
        }

    def colorize(self, token, color):
        """
        Colorize text by wrapping it in an ANSI escape sequence.

        Parameters
        ----------
        token : str
            The text to wrap.

        color : str or None
            A key of ``self.COLORS``; None selects the "no color" template.

        Returns
        -------
        str
            ``token`` formatted into the template registered for ``color``.

        Raises
        ------
        KeyError
            If ``color`` is not a key of ``self.COLORS``.
        """
        return self.COLORS[color].format(token)

    def transform(self, tagged_tuples):
        """
        The transform method transforms the tagged input for the
        part-of-speech tagging visualization. It requires that
        documents be in the form of (token, tag) tuples, i.e. the
        order in which each tuple is unpacked below.

        Parameters
        ----------
        tagged_tuples : list of tuples
            A list of (token, tag) tuples.
            Text documents must be tokenized and tagged before being
            passed to this method.

        Notes
        -----
        Nothing is returned. The result is stored on ``self.tagged`` as a
        list of (color, token) tuples, where color is None for any tag
        that has no entry in ``self.TAGS``.
        """
        self.tagged = [
            (self.TAGS.get(tag), tok) for tok, tag in tagged_tuples
        ]
In [5]:
from nltk.corpus import wordnet as wn
from nltk import pos_tag, word_tokenize
In [6]:
# Tokenize and tag every sample text, then print it with each token wrapped
# in the ANSI color assigned to its part-of-speech tag.
for label, text in texts.items():
    tokens = word_tokenize(text)
    tagged = pos_tag(tokens)

    visualizer = PosTagVisualizer()
    visualizer.transform(tagged)

    # visualizer.tagged holds (color, token) pairs after transform()
    colored_tokens = [
        visualizer.colorize(token, color)
        for color, token in visualizer.tagged
    ]
    print(' '.join(colored_tokens))
    print('\n')
In [ ]: