In [1]:
# Importing spacy
import spacy
In [2]:
# Loading the language model
# Note: the 'en' shortcut link is deprecated in recent spaCy versions; the small
# English model is loaded by its full name instead
# (install it first with: python -m spacy download en_core_web_sm)
nlp = spacy.load('en_core_web_sm')
In [3]:
# Getting the document object after parsing/processing the text
# u - represents the string is in unicode format
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')
In [4]:
# Tokenization details, see more https://spacy.io/usage/spacy-101#annotations-token
# Observations:
# - U.K. is kept as a single token, even though it contains punctuation ('.')
# - $1 is split into two tokens ($ and 1), even though there is no space between them
for token in doc:
    print(token.text)
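In [ ]:
# A small follow-up sketch (not in the original notebook): tokens can also be
# reached by index, and slicing a Doc returns a Span object
print(doc[0])    # first token: Apple
print(doc[5:7])  # a Span covering tokens 5 and 6: "U.K. startup"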
In [5]:
# Part-of-Speech tags and dependencies, see more https://spacy.io/usage/spacy-101#annotations-pos-deps
# After tokenization, spaCy assigns POS tags and dependency labels
# Linguistic annotations are available as Token attributes
# Text: The original word text.
# Lemma: The base form of the word.
# POS: The simple part-of-speech tag.
# Tag: The detailed part-of-speech tag.
# Dep: Syntactic dependency, i.e. the relation between tokens.
# Shape: The word shape – capitalisation, punctuation, digits.
# is alpha: Does the token consist of alphabetic characters?
# is stop: Is the token part of a stop list, i.e. the most common words of the language?
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)
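In [ ]:
# A hedged follow-up sketch: doc.count_by() aggregates token attributes; here it
# counts how often each coarse POS tag occurs in the Doc. The keys are hash
# values, resolved back to readable text via the vocab.
from spacy.attrs import POS
for pos_hash, count in doc.count_by(POS).items():
    print(doc.vocab[pos_hash].text, count)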
In [6]:
# Importing displacy, spaCy's built-in visualizer
from spacy import displacy
# style='ent' highlights the named entities inline in the text
displacy.render(doc, style='ent', jupyter=True)
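In [ ]:
# A minimal sketch of the other built-in visualizer style: style='dep' draws
# the dependency parse from the previous cell as an arc diagram
displacy.render(doc, style='dep', jupyter=True)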
In [7]:
# Understanding pos, tags and dep labels - using spacy.explain()
print('POS - PROPN - ', spacy.explain('PROPN'))
print('tag - VBZ - ', spacy.explain('VBZ'))
print('dep label - dobj - ', spacy.explain('dobj'))
In [8]:
# Named Entities, see more https://spacy.io/usage/spacy-101#annotations-ner
# Text: The original entity text.
# Start: Character offset of the entity's start in the Doc text.
# End: Character offset of the entity's end in the Doc text.
# Label: Entity label, i.e. type.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
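In [ ]:
# A small sketch tying two ideas together: spacy.explain() also works on
# entity labels, so each label can be printed with a human-readable description
for ent in doc.ents:
    print(ent.text, '-', ent.label_, '-', spacy.explain(ent.label_))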
In [9]:
# Word Vectors and Similarity
# spaCy is able to compare two objects, and make a prediction of "how similar they are".
tokens = nlp(u'dog cat banana')
# TODO: Similarity between dog and cat looks poor here; the small model ships no
# real word vectors, so re-check by loading the large model (en_core_web_lg).
for token1 in tokens:
    for token2 in tokens:
        print(token1, ' similarity to ', token2, ' - ', token1.similarity(token2))
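In [ ]:
# A hedged follow-up sketch: similarity is also defined on whole Doc objects,
# not just tokens (results are rough with the small model, see the TODO above)
doc1 = nlp(u'I like salty fries and hamburgers.')
doc2 = nlp(u'Fast food tastes very good.')
print(doc1, '<->', doc2, '-', doc1.similarity(doc2))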
In [ ]:
# Checking word vectors availability
tokens = nlp(u'dog cat banana sasquatch')
# TODO: Load the large model and verify the same
# Text: The original token text.
# has vector: Does the token have a vector representation?
# Vector norm: The L2 norm of the token's vector (the square root of the sum of the values squared).
# is OOV: Is the word out-of-vocabulary?
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)
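In [ ]:
# A sketch for the TODO above; assumes the large model is installed first
# (python -m spacy download en_core_web_lg). It ships real word vectors, so
# has_vector / vector_norm / is_oov give meaningful results.
nlp_lg = spacy.load('en_core_web_lg')
for token in nlp_lg(u'dog cat banana sasquatch'):
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)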
In [10]:
# Vocab, hashes and lexemes, more here https://spacy.io/usage/spacy-101#vocab
doc = nlp(u'I love coffee')
# Text: The original text of the lexeme.
# Orth: The hash value of the lexeme.
# Shape: The abstract word shape of the lexeme.
# Prefix: By default, the first letter of the word string.
# Suffix: By default, the last three letters of the word string.
# is alpha: Does the lexeme consist of alphabetic characters?
# is digit: Does the lexeme consist of digits?
# is title: Is the lexeme in titlecase?
# Lang: The language of the parent vocabulary.
for word in doc:
    lexeme = doc.vocab[word.text]
    print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
          lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)
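In [ ]:
# A minimal sketch of the hash <-> string round trip behind the vocab:
# the StringStore maps each string to a hash and back
coffee_hash = nlp.vocab.strings[u'coffee']    # string -> hash
coffee_text = nlp.vocab.strings[coffee_hash]  # hash -> string
print(coffee_hash, coffee_text)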
In [ ]:
# Serialization, see more https://spacy.io/usage/spacy-101#serialization
# TODO - Add more; a minimal sketch follows below
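In [ ]:
# A minimal serialization sketch under stated assumptions: Doc objects can be
# round-tripped through bytes, and a whole pipeline can be saved to disk.
# The path 'my_model' is just an illustrative name, not from the original.
from spacy.tokens import Doc

doc = nlp(u'I love coffee')
doc_bytes = doc.to_bytes()                       # serialize the Doc
restored = Doc(nlp.vocab).from_bytes(doc_bytes)  # deserialize into a new Doc
print(restored.text)

nlp.to_disk('my_model')          # save the pipeline to a directory
# nlp2 = spacy.load('my_model')  # and load it back later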