In [ ]:
# Import spaCy's English analyzer and the location where data is stored
import pandas as pd
from spacy.en import English, LOCAL_DATA_DIR
In [ ]:
# What's the English object all about?
English?
In [ ]:
# Set up spaCy NLP analyzer (tokenizer, parser, NER-er, etc.)
nlp_analyzer = English(data_dir=LOCAL_DATA_DIR)
In [ ]:
# How does one interact with the analyzer object?
nlp_analyzer?
In [ ]:
"""
The analyzer expects text as input; the output contains all types of analysis
(parsing, tagging, and entity recognition can each be turned off by setting the
corresponding argument to False -- see the sketch in a later cell).
Here's some input text that we'll use:
"""
text = ["This is a very simple sentence.",
"This sentence, which is moderately more complex, is still quite simple.",
"The two preceding sentences are easy to understand, hopefully easy to parse too.",
"These sentences will be correctly parsed and tokenized if the gods look favorably on this demo.",
"I hope that strange words like vapidity and celerity don't confuse the analyser (nor British spellings).",
"One would even hopes that ungrammatical sentences not effects the parsing drammatically."]
text = ' '.join(text)
"""
Let's analyze it and also get a sense for how long it takes for a text of this size
to be analyzed
"""
%timeit nlp_analyzer(text)
analyzed_text = nlp_analyzer(text)
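In [ ]:
"""
As noted above, individual analysis steps can be skipped. This is a minimal sketch,
assuming the older spaCy call signature in which the keyword arguments are named
tag, parse, and entity -- disabling them speeds things up when only tokenization
is needed.
"""
%timeit nlp_analyzer(text, tag=False, parse=False, entity=False)
tokenized_only = nlp_analyzer(text, tag=False, parse=False, entity=False)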
In [ ]:
# Type "analyzed_text." and hit Tab in the notebook to explore the attributes
# and methods available on the analyzed document
analyzed_text
In [ ]:
"""
Let's take a look at what's in the output.
The output is automatically divided into its constituent sentences (the .sents
attribute), and both the sentences and the full text are composed of constituent tokens.
"""
for sent in analyzed_text.sents:
    print('{}\n'.format(sent))
In [ ]:
"""
The .sents attribute is a generator that yields the object corresponding to each
recognized sentence
"""
sent = next(analyzed_text.sents)
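In [ ]:
"""
Because .sents is a generator, it is exhausted after a single pass. A common pattern
(a small sketch, not part of the original demo) is to materialize it as a list so
individual sentences can be accessed by index or iterated over repeatedly.
"""
sentences = list(analyzed_text.sents)
len(sentences)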
In [ ]:
"""
Each sentence is of type spacy.tokens.span.Span, which is basically just a sequence
of token objects (more on that later)
Here you can see the type of the objects
"""
type(sent)
In [ ]:
"""
To get the string representation of anything (not just a sentence object), i.e., the
original token, the original sentence, the lemma, etc., use the .string or .orth_
attributes
"""
sent.orth_
In [ ]:
# The .string attribute contains whitespace
sent.string
In [ ]:
"""
Various pieces of information can be collected about each object representing a
token.
"""
lines = []
for token in sent:
    lines.append(dict(Token=token.orth_, letter=token.is_alpha, ASCII=token.is_ascii,
                      digit=token.is_digit, lower=token.is_lower, OOV=token.is_oov,
                      punct=token.is_punct, space=token.is_space, stop=token.is_stop,
                      titlecase=token.is_title, like_email=token.like_email,
                      like_number=token.like_num, like_url=token.like_url,
                      shape=token.shape_, prefix=token.prefix_, suffix=token.suffix_,
                      lowercased=token.lower_))
pd.DataFrame(lines)
In [ ]:
"""
If you want the whole document that the sentence occurred in, use the .doc attribute.
"""
sent.doc
In [ ]:
"""
A lemmatized version of the object can be accessed via the .lemma_ attribute
"""
sent.lemma_
In [ ]:
tokens = []
for token in sent:
    tokens.append(dict(Token=token.orth_, tag=token.tag_, part_of_speech=token.pos_))
pd.DataFrame(tokens)
In [ ]:
print("sentence = {}".format(sent.orth_))
print("root of sentence = {}".format(sent.root))
In [ ]:
"""
Parse tree-related attributes can be accessed for each token, such as the
children/parents of the token, the dependency relationships, etc.
"""
token = sent[1]
print("sentence = {}".format(sent.orth_))
print("token: {}".format(token))
print("children: {}".format(list(token.children)))
print("head: {}".format(token.head))
print("dependency relationship: {}".format(token.dep_))
In [ ]:
"""
Representing words as vectors allows for similarity calculations.
"""
last_sentence = list(analyzed_text.sents)[-1]
last_sentence.string
In [ ]:
token1 = last_sentence[5]
token1
In [ ]:
# Does this token have a vector?
token1.has_vector
In [ ]:
token1.repvec
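In [ ]:
"""
The vector itself is a fixed-length numpy array (named .repvec in this older spaCy
API), so its dimensionality can be checked directly -- a small sanity-check sketch.
"""
token1.repvec.shape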
In [ ]:
token2 = last_sentence[6]
token2
In [ ]:
# How similar are "ungrammatical" and "sentences"?
token1.similarity(token2)
In [ ]:
# How similar are two random other words?
token3 = last_sentence[8]
token3
In [ ]:
token4 = last_sentence[1]
token4
In [ ]:
token3.similarity(token4)
In [ ]:
# The similarity between "effects" and "would" is not as low as one might expect,
# but it's still lower than the similarity between "ungrammatical" and "sentences"
In [ ]:
# Each token also carries a unigram log probability and a Brown cluster id
for token in sent:
    print("original:", token.orth_)
    print("log probability:", token.prob)
    print("Brown cluster id:", token.cluster)
    print("----------------------------------------")
In [ ]:
# Get a list of the entities directly with .ents
analyzed_text.ents
In [ ]:
# Let's print out all of the tokens in the example text only if they
# are entities
for token in analyzed_text:
    if token.ent_type_ != "":
        print(token.orth_, token.ent_type_)
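In [ ]:
"""
Entities can also be inspected at the span level rather than token by token. This
sketch assumes each entity span exposes a .label_ attribute with the entity type,
analogous to .ent_type_ on individual tokens.
"""
for ent in analyzed_text.ents:
    print(ent.orth_, ent.label_)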
In [ ]:
# The analyzer also copes reasonably well with noisy, informal text
messy_data = "lol that is rly funny :) This is gr8 i rate it 8/8!!!"
analyzed_messy_data = nlp_analyzer(messy_data)
for token in analyzed_messy_data:
    print(token.orth_, token.pos_, token.lemma_)
In [ ]:
# The vocabulary that the analyzer uses can be accessed directly (and it,
# along with almost every other component of the system, can be customized)
vocab = nlp_analyzer.vocab
vocab.length
In [ ]:
# If there's a word that's in the vocabulary, then it can be loaded in and
# interacted with
vapid = vocab['vapid']
In [ ]:
vapid.similarity(vocab['senseless'])
In [ ]:
# Credit: https://nicschrading.com/project/Intro-to-NLP-with-spaCy/
# Let's see if it can figure out this analogy
# Man is to King as Woman is to ??
from numpy import dot
from numpy.linalg import norm
# Cosine similarity
cosine = lambda v1, v2: dot(v1, v2) / (norm(v1) * norm(v2))
king = nlp_analyzer.vocab['king']
man = nlp_analyzer.vocab['man']
woman = nlp_analyzer.vocab['woman']
result = king.repvec - man.repvec + woman.repvec
# Gather all known words, take only the lowercased versions
all_words = list({w for w in nlp_analyzer.vocab
                  if w.has_vector
                  and w.orth_.islower()
                  and w.lower_ != "king"
                  and w.lower_ != "man"
                  and w.lower_ != "woman"})
# Sort by similarity to the result
all_words.sort(key=lambda w: cosine(w.repvec, result), reverse=True)
print("Top 3 closest results for king - man + woman:\n")
for word in all_words[:3]:
    print("\t{}".format(word.orth_))
In [ ]:
# Most of the methods/attributes that we've been using can also be used in
# "standalone" mode, directly on the analyzer object, and further attributes
# of the analyzer object can be inspected as well
In [ ]:
nlp_analyzer.like_email("mulhodm@gmail.com")
In [ ]:
# The full set of part-of-speech tag names that the tagger can assign
nlp_analyzer.tagger.tag_names