spaCy Demo


In [ ]:
# Import spaCy's English analyzer and the location where data is stored
import pandas as pd
from spacy.en import English, LOCAL_DATA_DIR

In [ ]:
# What's the English object all about?
English?

In [ ]:
# Set up spaCy NLP analyzer (tokenizer, parser, NER-er, etc.)
nlp_analyzer = English(data_dir=LOCAL_DATA_DIR)

In [ ]:
# How does one interact with the analyzer object?
nlp_analyzer?

Input


In [ ]:
"""
The analyzer expects text as input -- the output contains all types of analysis
(parsing, tagging, and entity recognition can each be turned off by setting them
to False)

Here's some input text that we'll use:
"""
text = ["This is a very simple sentence.",
        "This sentence, which is moderately more complex, is still quite simple.",
        "The two preceding sentences are easy to understand, hopefully easy to parse too.",
        "These sentences will be correctly parsed and tokenized if the gods look favorably on this demo.",
        "I hope that strange words like vapidity and celerity don't confuse the analyser (nor British spellings).",
        "One would even hopes that ungrammatical sentences not effects the parsing drammatically."]
text = ' '.join(text)

"""
Let's analyze it and also get a sense of how long it takes to analyze a text of
this size
"""
%timeit nlp_analyzer(text)
analyzed_text = nlp_analyzer(text)
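
In [ ]:
"""
As mentioned above, the individual analysis steps can be switched off when only
tokenization is needed, which speeds things up. A minimal sketch -- the keyword
names (tag/parse/entity) are an assumption based on the description above, so
check the documentation for the spaCy version in use.
"""
%timeit nlp_analyzer(text, tag=False, parse=False, entity=False)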

In [ ]:
# Type "analyzed_text." and hit <TAB> to explore the attributes available on
# the analyzed document
analyzed_text.
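
In [ ]:
# If tab completion isn't handy, plain dir() gives a rough view of the public
# attributes on the analyzed document (standard Python introspection, nothing
# spaCy-specific)
[attr for attr in dir(analyzed_text) if not attr.startswith('_')]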

Sentence Recognition

.sents


In [ ]:
"""
Let's take a look at what's in the output

The output is automatically divided into its constituent sentences (the .sents
attribute), and both the sentences and the full text are composed of token objects
"""
for sent in analyzed_text.sents:
    print('{}\n'.format(sent))

In [ ]:
"""
The .sents attribute is a generator that yields an object for each recognized
sentence
"""
sent = next(analyzed_text.sents)
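
In [ ]:
"""
The generator can also be materialized into a list when random access to the
sentence spans is needed (.sents is re-read the same way further down for the
word-vector examples)
"""
sentences = list(analyzed_text.sents)
len(sentences)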

In [ ]:
"""
Each sentence is of type spacy.tokens.span.Span, which is basically just a sequence
of token objects (more on that later)

Here you can see the type of the objects
"""
type(sent)

.string or .orth_


In [ ]:
"""
To get the string representation of anything (not just a sentence object), e.g.,
the original token, the original sentence, the lemma, etc., use the .string or
.orth_ attributes
"""
sent.orth_

In [ ]:
# The .string attribute includes trailing whitespace
sent.string
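
In [ ]:
"""
Because each token's .string keeps its trailing whitespace, concatenating the
token strings should reproduce the sentence's .string exactly -- a quick sanity
check, assuming no other normalization is applied
"""
''.join(token.string for token in sent) == sent.string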

Miscellaneous String Attributes

.is_alpha, .is_oov, .is_space, .like_email, .is_title, etc.


In [ ]:
"""
Various pieces of information can be collected about each object representing a
token.
"""
lines = []
for token in sent:
    lines.append(dict(Token=token.orth_, letter=token.is_alpha, ASCII=token.is_ascii,
                      digit=token.is_digit, lower=token.is_lower, OOV=token.is_oov,
                      punct=token.is_punct, space=token.is_space, stop=token.is_stop,
                      titlecase=token.is_title, like_email=token.like_email,
                      like_number=token.like_num, like_url=token.like_url,
                      shape=token.shape_, prefix=token.prefix_, suffix=token.suffix_,
                      lowercased=token.lower_))
pd.DataFrame(lines)
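
In [ ]:
# These boolean flags make quick filtering easy, e.g., dropping stop words and
# punctuation to keep only the content-bearing tokens
[token.orth_ for token in sent if not (token.is_stop or token.is_punct)]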

.doc attribute


In [ ]:
"""
If you want the whole document that the sentence occurred in, use the .doc attribute.
"""
sent.doc

Lemmatization

.lemma_


In [ ]:
"""
A lemmatized version of the object can be accessed via the .lemma_ attribute
"""
sent.lemma_
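
In [ ]:
# The same attribute exists on individual tokens, so each original token can be
# lined up with its lemma
pd.DataFrame([dict(Token=token.orth_, lemma=token.lemma_) for token in sent])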

Parts of Speech and Tags


In [ ]:
tokens = []
for token in sent:
    tokens.append(dict(Token=token.orth_, tag=token.tag_, part_of_speech=token.pos_))
pd.DataFrame(tokens)
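
In [ ]:
# A quick frequency count of the coarse part-of-speech labels over the whole
# document (plain Python's Counter, nothing spaCy-specific beyond .pos_)
from collections import Counter
Counter(token.pos_ for token in analyzed_text)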

Parsing

.root


In [ ]:
print("sentence = {}".format(sent.orth_))
print("root of sentence = {}".format(sent.root))

.children, .dep_ attributes


In [ ]:
"""
Parse tree-related attributes can be accessed for each token, such as the
token's children and head, its dependency relationship, etc.
"""
token = sent[1]
print("sentence = {}".format(sent.orth_))
print("token: {}".format(token))
print("children: {}".format(list(token.children)))
print("head: {}".format(token.head))
print("dependency relationship: {}".format(token.dep_))

Word Representation Vectors

.repvec, .has_vector, .similarity()


In [ ]:
"""
Representing words as vectors allows for similarity calculations.
"""
last_sentence = list(analyzed_text.sents)[-1]
last_sentence.string

In [ ]:
token1 = last_sentence[5]
token1

In [ ]:
# Does this token have a vector?
token1.has_vector

In [ ]:
token1.repvec
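
In [ ]:
# .repvec behaves like a NumPy array (it's used with dot() and norm() in the
# analogy example below), so its dimensionality can be inspected directly
token1.repvec.shape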

In [ ]:
token2 = last_sentence[6]
token2

In [ ]:
# How similar are "ungrammatical" and "sentences"?
token1.similarity(token2)

In [ ]:
# How similar are two random other words?
token3 = last_sentence[8]
token3

In [ ]:
token4 = last_sentence[1]
token4

In [ ]:
token3.similarity(token4)

In [ ]:
# The similarity value for this pair is not as low as one might expect,
# but it's still less than the similarity of the previous pair
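
In [ ]:
"""
The .similarity() score should be reproducible (at least approximately) by hand
as the cosine of the angle between the two representation vectors -- the same
calculation the analogy example below spells out with dot() and norm()
"""
from numpy import dot
from numpy.linalg import norm
dot(token1.repvec, token2.repvec) / (norm(token1.repvec) * norm(token2.repvec))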

Log Probabilities and Brown Cluster IDs

.prob, .cluster


In [ ]:
for i, token in enumerate(sent):
    print("original:", token.orth_)
    print("log probability:", token.prob)
    print("Brown cluster id:", token.cluster)
    print("----------------------------------------")

NER

.ents, .ent_label, .ent_type_, .ent_iob_


In [ ]:
# Get a list of the entities directly with .ents
analyzed_text.ents

In [ ]:
# Let's print out the tokens in the example text, but only those that are
# part of an entity
for token in analyzed_text:
    if token.ent_type_ != "":
        print(token.orth_, token.ent_type_)
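
In [ ]:
"""
Token-level entity information is also available: .ent_iob_ holds each token's
inside/outside/begin tag alongside .ent_type_. For this example text most
tokens will simply be outside any entity ('O').
"""
pd.DataFrame([dict(Token=token.orth_, iob=token.ent_iob_, type=token.ent_type_)
              for token in sent])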

Can Handle Messy Data


In [ ]:
messy_data = "lol that is rly funny :) This is gr8 i rate it 8/8!!!"
analyzed_messy_data = nlp_analyzer(messy_data)
for token in analyzed_messy_data:
    print(token.orth_, token.pos_, token.lemma_)

Access to the Vocabulary


In [ ]:
# The vocabulary that the analyzer uses can be accessed directly (and it,
# along with almost every other component of the system, can be customized)
vocab = nlp_analyzer.vocab
vocab.length

In [ ]:
# If a word is in the vocabulary, its entry can be looked up and
# interacted with
vapid = vocab['vapid']

In [ ]:
vapid.similarity(vocab['senseless'])
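
In [ ]:
"""
The same vocabulary lookup plus .similarity() can be used to rank a handful of
hand-picked candidate words against 'vapid' (the candidate list is purely
illustrative, and each word is assumed to have a vector in the loaded model)
"""
candidates = ['dull', 'boring', 'exciting', 'table']
sorted(candidates, key=lambda word: vapid.similarity(vocab[word]), reverse=True)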

In [ ]:
# Credit: https://nicschrading.com/project/Intro-to-NLP-with-spaCy/
# Let's see if it can figure out this analogy
# Man is to King as Woman is to ??
from numpy import dot
from numpy.linalg import norm

# Cosine similarity
cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))

king = nlp_analyzer.vocab['king']
man = nlp_analyzer.vocab['man']
woman = nlp_analyzer.vocab['woman']

result = king.repvec - man.repvec + woman.repvec

# Gather all known words that have a vector, keeping only lowercase forms
# and excluding the query words themselves
all_words = list({w for w in nlp_analyzer.vocab
                  if w.has_vector
                     and w.orth_.islower()
                     and w.lower_ != "king"
                     and w.lower_ != "man"
                     and w.lower_ != "woman"})

# Sort by similarity to the result
all_words.sort(key=lambda w: cosine(w.repvec, result))
all_words.reverse()
print("Top 3 closest results for king - man + woman:\n")
for word in all_words[:3]:   
    print("\t{}".format(word.orth_))

In [ ]:
# Most of the methods/attributes that we've been using can also be used in
# "standalone" mode, and further attributes of the analyzer object can be
# inspected directly

In [ ]:
nlp_analyzer.like_email("mulhodm@gmail.com")

In [ ]:
nlp_analyzer.tagger.tag_names

Sources, Links to Guides

1. spaCy home page - tutorials section

2. Nic Schrading's Intro to NLP with spaCy, a fantastic guide (which I stole from a little)

End