In [ ]:
# Import spaCy's English analyzer and the location where data is stored
import pandas as pd
from spacy.en import English, LOCAL_DATA_DIR
In [ ]:
# What's the English object all about?
English?
In [ ]:
# Set up spaCy NLP analyzer (tokenizer, parser, NER-er, etc.)
nlp_analyzer = English(data_dir=LOCAL_DATA_DIR)
In [ ]:
# How does one interact with the analyzer object?
nlp_analyzer?
In [ ]:
"""
The analyzer expects text as input; the output contains all types of analysis
(parsing, tagging, and entity recognition can each be turned off by setting the
corresponding argument to False -- see the sketch in a later cell).
Here's some input text that we'll use:
"""
text = ["This is a very simple sentence.",
"This sentence, which is moderately more complex, is still quite simple.",
"The two preceding sentences are easy to understand, hopefully easy to parse too.",
"These sentences will be correctly parsed and tokenized if the gods look favorably on this demo.",
"I hope that strange words like vapidity and celerity don't confuse the analyser (nor British spellings).",
"One would even hopes that ungrammatical sentences not effects the parsing drammatically."]
text = ' '.join(text)
"""
Let's analyze it and also get a sense for how long it takes for a text of this size
to be analyzed
"""
%timeit nlp_analyzer(text)
analyzed_text = nlp_analyzer(text)
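In [ ]:
"""
As noted above, individual analysis steps can be skipped. This is a minimal sketch,
assuming the older spaCy call signature in which the keyword arguments are named
tag, parse, and entity -- disabling them speeds things up when only tokenization
is needed.
"""
%timeit nlp_analyzer(text, tag=False, parse=False, entity=False)
tokenized_only = nlp_analyzer(text, tag=False, parse=False, entity=False)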
In [ ]:
# Type "analyzed_text." and hit Tab in the notebook to explore the attributes
# and methods available on the analyzed document
analyzed_text
In [ ]:
"""
Let's take a look at what's in the output.
The output is automatically divided into its constituent sentences (the .sents
attribute), and both the sentences and the full text are composed of constituent tokens.
"""
for sent in analyzed_text.sents:
    print('{}\n'.format(sent))
In [ ]:
"""
The .sents attribute is a generator that yields the object corresponding to each
recognized sentence
"""
sent = next(analyzed_text.sents)
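In [ ]:
"""
Because .sents is a generator, it is exhausted after a single pass. A common pattern
(a small sketch, not part of the original demo) is to materialize it as a list so
individual sentences can be accessed by index or iterated over repeatedly.
"""
sentences = list(analyzed_text.sents)
len(sentences)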
In [ ]:
"""
Each sentence is of type spacy.tokens.span.Span, which is basically just a sequence
of token objects (more on that later)
Here you can see the type of the objects
"""
type(sent)
In [ ]:
"""
To get the string representation of anything (not just a sentence object), i.e., the
original token, the original sentence, the lemma, etc., use the .string or .orth_
attributes
"""
sent.orth_
In [ ]:
# The .string attribute contains whitespace
sent.string
In [ ]:
"""
Various pieces of information can be collected about each object representing a
token.
"""
lines = []
for token in sent:
    lines.append(dict(Token=token.orth_, letter=token.is_alpha, ASCII=token.is_ascii,
                      digit=token.is_digit, lower=token.is_lower, OOV=token.is_oov,
                      punct=token.is_punct, space=token.is_space, stop=token.is_stop,
                      titlecase=token.is_title, like_email=token.like_email,
                      like_number=token.like_num, like_url=token.like_url,
                      shape=token.shape_, prefix=token.prefix_, suffix=token.suffix_,
                      lowercased=token.lower_))
pd.DataFrame(lines)
In [ ]:
"""
If you want the whole document that the sentence occurred in, use the .doc attribute.
"""
sent.doc
In [ ]:
"""
A lemmatized version of the object can be accessed via the .lemma_ attribute
"""
sent.lemma_
In [ ]:
tokens = []
for token in sent:
    tokens.append(dict(Token=token.orth_, tag=token.tag_, part_of_speech=token.pos_))
pd.DataFrame(tokens)
In [ ]:
print("sentence = {}".format(sent.orth_))
print("root of sentence = {}".format(sent.root))
In [ ]:
"""
Parse tree-related attributes can be accessed for each token, such as the
children/parents of the token, the dependency relationships, etc.
"""
token = sent[1]
print("sentence = {}".format(sent.orth_))
print("token: {}".format(token))
print("children: {}".format(list(token.children)))
print("head: {}".format(token.head))
print("dependency relationship: {}".format(token.dep_))
In [ ]:
"""
Representing words as vectors allows for similarity calculations.
"""
last_sentence = list(analyzed_text.sents)[-1]
last_sentence.string
In [ ]:
token1 = last_sentence[5]
token1
In [ ]:
# Does this token have a vector?
token1.has_vector
In [ ]:
token1.repvec
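In [ ]:
"""
The vector itself is a fixed-length numpy array (named .repvec in this older spaCy
API), so its dimensionality can be checked directly -- a small sanity-check sketch.
"""
token1.repvec.shape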
In [ ]:
token2 = last_sentence[6]
token2
In [ ]:
# How similar are "ungrammatical" and "sentences"?
token1.similarity(token2)
In [ ]:
# How similar are two random other words?
token3 = last_sentence[8]
token3
In [ ]:
token4 = last_sentence[1]
token4
In [ ]:
token3.similarity(token4)
In [ ]:
# The similarity between "effects" and "would" is not as low as one might expect,
# but it's still lower than the similarity between "ungrammatical" and "sentences"
In [ ]:
# Each token also carries a unigram log probability and a Brown cluster id
for token in sent:
    print("original:", token.orth_)
    print("log probability:", token.prob)
    print("Brown cluster id:", token.cluster)
    print("----------------------------------------")
In [ ]:
# Get a list of the entities directly with .ents
analyzed_text.ents
In [ ]:
# Let's print out all of the tokens in the example text only if they
# are entities
for token in analyzed_text:
    if token.ent_type_ != "":
        print(token.orth_, token.ent_type_)
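In [ ]:
"""
Entities can also be inspected at the span level rather than token by token. This
sketch assumes each entity span exposes a .label_ attribute with the entity type,
analogous to .ent_type_ on individual tokens.
"""
for ent in analyzed_text.ents:
    print(ent.orth_, ent.label_)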
In [ ]:
# The analyzer also copes reasonably well with noisy, informal text
messy_data = "lol that is rly funny :) This is gr8 i rate it 8/8!!!"
analyzed_messy_data = nlp_analyzer(messy_data)
for token in analyzed_messy_data:
    print(token.orth_, token.pos_, token.lemma_)
In [ ]:
# The vocabulary that the analyzer uses can be accessed directly (and it,
# along with almost every other component of the system, can be customized)
vocab = nlp_analyzer.vocab
vocab.length
In [ ]:
# If there's a word that's in the vocabulary, then it can be loaded in and
# interacted with
vapid = vocab['vapid']
In [ ]:
vapid.similarity(vocab['senseless'])
In [ ]:
# Credit: https://nicschrading.com/project/Intro-to-NLP-with-spaCy/
# Let's see if it can figure out this analogy
# Man is to King as Woman is to ??
from numpy import dot
from numpy.linalg import norm
# Cosine similarity
cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
king = nlp_analyzer.vocab['king']
man = nlp_analyzer.vocab['man']
woman = nlp_analyzer.vocab['woman']
result = king.repvec - man.repvec + woman.repvec
# Gather all known words, take only the lowercased versions
all_words = list({w for w in nlp_analyzer.vocab
                  if w.has_vector
                  and w.orth_.islower()
                  and w.lower_ != "king"
                  and w.lower_ != "man"
                  and w.lower_ != "woman"})
# Sort by similarity to the result
all_words.sort(key=lambda w: cosine(w.repvec, result), reverse=True)
print("Top 3 closest results for king - man + woman:\n")
for word in all_words[:3]:
    print("\t{}".format(word.orth_))
In [ ]:
# Most of the methods/attributes that we've been using can also be used in
# "standalone" mode, directly on the analyzer object, and further attributes
# of the analyzer object can be inspected as well
In [ ]:
nlp_analyzer.like_email("mulhodm@gmail.com")
In [ ]:
# The full set of part-of-speech tag names that the tagger can assign
nlp_analyzer.tagger.tag_names