In [1]:
# Importing spacy
import spacy

In [2]:
# Loading the small English language model
nlp = spacy.load('en_core_web_sm')

In [3]:
# Getting the document object after parsing/processing the text
# The u prefix marks the string as a Unicode literal (optional in Python 3)
doc = nlp(u'Apple is looking at buying U.K. startup for $1 billion')

In [4]:
# Tokenization details, see more at https://spacy.io/usage/spacy-101#annotations-token
# Observations:
# 'U.K.' is kept as a single token, even though it contains punctuation ('.')
# '$1' is split into two tokens, even though there is no space between them
for token in doc:
    print(token.text)


Apple
is
looking
at
buying
U.K.
startup
for
$
1
billion
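
In [ ]:
# A small follow-up sketch: token.i gives the token's index in the Doc and token.idx its
# character offset in the original text, which makes the split points above easy to inspect.
for token in doc:
    print(token.i, token.idx, token.text)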

In [5]:
# Part-of-Speech tags and dependencies, see more at https://spacy.io/usage/spacy-101#annotations-pos-deps
# After tokenization, spaCy assigns each token a part-of-speech tag and a syntactic dependency label
# Linguistic annotations are available as Token attributes

# Text: The original word text.
# Lemma: The base form of the word.
# POS: The simple part-of-speech tag.
# Tag: The detailed part-of-speech tag.
# Dep: Syntactic dependency, i.e. the relation between tokens.
# Shape: The word shape – capitalisation, punctuation, digits.
# is alpha: Does the token consist of alphabetic characters?
# is stop: Is the token part of a stop list, i.e. the most common words of the language?

for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)


Apple apple PROPN NNP nsubj Xxxxx True False
is be VERB VBZ aux xx True True
looking look VERB VBG ROOT xxxx True False
at at ADP IN prep xx True True
buying buy VERB VBG pcomp xxxx True False
U.K. u.k. PROPN NNP compound X.X. False False
startup startup NOUN NN dobj xxxx True False
for for ADP IN prep xxx True True
$ $ SYM $ quantmod $ False False
1 1 NUM CD compound d False False
billion billion NUM CD pobj xxxx True False
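
In [ ]:
# A short sketch building on the parse above: doc.noun_chunks yields the base noun phrases,
# each with its root token, the root's dependency label and the root's head.
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)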

In [6]:
# Importing displacy, spaCy's built-in visualizer
from spacy import displacy
# style='ent' highlights the named entities in the text
displacy.render(doc, style='ent', jupyter=True)


[displaCy output: the sentence rendered with highlighted entities: Apple (ORG), U.K. (GPE), $1 billion (MONEY)]
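
In [ ]:
# A similar sketch with style='dep' draws the dependency tree instead of the entities;
# 'distance' is just an optional layout setting and can be omitted.
displacy.render(doc, style='dep', jupyter=True, options={'distance': 90})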

In [7]:
# Understanding pos, tags and dep labels - using spacy.explain()
print('POS - PROPN - ', spacy.explain('PROPN'))
print('tag - VBZ - ', spacy.explain('VBZ'))
print('dep label - dobj - ', spacy.explain('dobj'))


POS - PROPN -  proper noun
tag - VBZ -  verb, 3rd person singular present
dep label - dobj -  direct object

In [8]:
# Named Entities

# Text: The original entity text.
# Start: Index of start of entity in the Doc.
# End: Index of end of entity in the Doc.
# Label: Entity label, i.e. type.

for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)


Apple 0 5 ORG
U.K. 27 31 GPE
$1 billion 44 54 MONEY
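
In [ ]:
# A related sketch: entity annotations are also available per token via the IOB scheme,
# where ent_iob_ is B (begins an entity), I (inside), or O (outside) and ent_type_ is the label.
for token in doc:
    print(token.text, token.ent_iob_, token.ent_type_)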

In [9]:
# Word Vectors and Similarity
# spaCy can compare two objects and predict how similar they are.

tokens = nlp(u'dog cat banana')

# TODO: The dog/cat similarity score here looks low; re-check after loading the large
# en_core_web_lg model, which ships with real word vectors.

for token1 in tokens:
    for token2 in tokens:
        print(token1, ' similarity to ', token2, ' - ', token1.similarity(token2))


dog  similarity to  dog  -  1.0
dog  similarity to  cat  -  0.53906965
dog  similarity to  banana  -  0.28761008
cat  similarity to  dog  -  0.53906965
cat  similarity to  cat  -  1.0
cat  similarity to  banana  -  0.48752153
banana  similarity to  dog  -  0.28761008
banana  similarity to  cat  -  0.48752153
banana  similarity to  banana  -  1.0
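
In [ ]:
# The same idea works at the Doc level; as with the token scores above, the results
# are more meaningful with the real word vectors in the large en_core_web_lg model.
doc1 = nlp(u'I like salty fries and hamburgers.')
doc2 = nlp(u'Fast food tastes very good.')
print(doc1.similarity(doc2))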

In [ ]:
# Checking word vectors availability
tokens = nlp(u'dog cat banana sasquatch')

# TODO: Load the large model and verify the same

# Text: The original token text.
# has vector: Does the token have a vector representation?
# Vector norm: The L2 norm of the token's vector (the square root of the sum of the values squared)
# is OOV: Is the word out-of-vocabulary?

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)
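
In [ ]:
# A sketch for the TODO above, assuming the large model has been downloaded
# (python -m spacy download en_core_web_lg): it ships with real word vectors,
# so has_vector should be True for the in-vocabulary words.
nlp_lg = spacy.load('en_core_web_lg')
for token in nlp_lg(u'dog cat banana sasquatch'):
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)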

In [10]:
# Vocab, hashes and lexemes, see more at https://spacy.io/usage/spacy-101#vocab
doc = nlp(u'I love coffee')

# Text: The original text of the lexeme.
# Orth: The hash value of the lexeme.
# Shape: The abstract word shape of the lexeme.
# Prefix: By default, the first letter of the word string.
# Suffix: By default, the last three letters of the word string.
# is alpha: Does the lexeme consist of alphabetic characters?
# is digit: Does the lexeme consist of digits?

for word in doc:
    lexeme = doc.vocab[word.text]
    print(lexeme.text, lexeme.orth, lexeme.shape_, lexeme.prefix_, lexeme.suffix_,
          lexeme.is_alpha, lexeme.is_digit, lexeme.is_title, lexeme.lang_)


I 4690420944186131903 X I I True False True en
love 3702023516439754181 xxxx l ove True False False en
coffee 3197928453018144401 xxxx c fee True False False en
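
In [ ]:
# A short sketch of the hash <-> string mapping: nlp.vocab.strings works in both directions,
# so a word and its hash value can be looked up interchangeably.
coffee_hash = nlp.vocab.strings[u'coffee']
coffee_text = nlp.vocab.strings[coffee_hash]
print(coffee_hash, coffee_text)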

In [ ]:
# Serialization, see more at https://spacy.io/usage/spacy-101#serialization

# TODO - Add More
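
In [ ]:
# A minimal serialization sketch in the meantime: a Doc can be saved to bytes and restored
# into a new Doc that shares the same vocab; '/some/path' below is only a placeholder.
from spacy.tokens import Doc

doc_bytes = doc.to_bytes()
restored_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
print(restored_doc.text)

# The whole pipeline can be persisted in a similar way with nlp.to_disk('/some/path')
# and loaded back with spacy.load('/some/path').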