In [1]:
import spacy
In [2]:
# Load the English pipeline (spaCy 1.x shorthand name; newer spaCy
# versions use an explicit model name such as 'en_core_web_sm').
nlp = spacy.load('en')
In [3]:
# Sample input: two sentences separated by a newline (u-prefix kept for
# Python 2 compatibility; redundant on Python 3).
text = u"We are living in Singapore.\nIt's blazing outside today!\n"
In [4]:
# Run the whole pipeline (tokenizer, tagger, parser, NER) over the text.
doc = nlp(text)
In [5]:
# Attributes without a trailing underscore (lemma, tag, pos) are integer
# hash IDs, not human-readable strings.
for tok in doc:
    attrs = (tok.text, tok.lemma, tok.tag, tok.pos)
    print(attrs)
In [6]:
# The trailing-underscore variants give the readable string values.
# lemma means *root form*
for position in range(len(doc)):
    tok = doc[position]
    print((tok.text, tok.lemma_, tok.tag_, tok.pos_))
Tag | POS | Morphology |
---|---|---|
-LRB- | PUNCT | PunctType=brck PunctSide=ini |
-RRB- | PUNCT | PunctType=brck PunctSide=fin |
, | PUNCT | PunctType=comm |
: | PUNCT | |
. | PUNCT | PunctType=peri |
'' | PUNCT | PunctType=quot PunctSide=fin |
"" | PUNCT | PunctType=quot PunctSide=fin |
# | SYM | SymType=numbersign |
`` | PUNCT | PunctType=quot PunctSide=ini |
$ | SYM | SymType=currency |
ADD | X | |
AFX | ADJ | Hyph=yes |
BES | VERB | |
CC | CONJ | ConjType=coor |
CD | NUM | NumType=card |
DT | DET | |
EX | ADV | AdvType=ex |
FW | X | Foreign=yes |
GW | X | |
HVS | VERB | |
HYPH | PUNCT | PunctType=dash |
IN | ADP | |
JJ | ADJ | Degree=pos |
JJR | ADJ | Degree=comp |
JJS | ADJ | Degree=sup |
LS | PUNCT | NumType=ord |
MD | VERB | VerbType=mod |
NFP | PUNCT | |
NIL | | |
NN | NOUN | Number=sing |
NNP | PROPN | NounType=prop Number=sing |
NNPS | PROPN | NounType=prop Number=plur |
NNS | NOUN | Number=plur |
PDT | ADJ | AdjType=pdt PronType=prn |
POS | PART | Poss=yes |
PRP | PRON | PronType=prs |
PRP$ | ADJ | PronType=prs Poss=yes |
RB | ADV | Degree=pos |
RBR | ADV | Degree=comp |
RBS | ADV | Degree=sup |
RP | PART | |
SP | SPACE | |
SYM | SYM | |
TO | PART | PartType=inf VerbForm=inf |
UH | INTJ | |
VB | VERB | VerbForm=inf |
VBD | VERB | VerbForm=fin Tense=past |
VBG | VERB | VerbForm=part Tense=pres Aspect=prog |
VBN | VERB | VerbForm=part Tense=past Aspect=perf |
VBP | VERB | VerbForm=fin Tense=pres |
VBZ | VERB | VerbForm=fin Tense=pres Number=sing Person=3 |
WDT | ADJ | PronType=int|rel |
WP | NOUN | PronType=int|rel |
WP$ | ADJ | Poss=yes PronType=int|rel |
WRB | ADV | PronType=int|rel |
XX | X | |
Number | Tag | Description |
---|---|---|
1. | CC | Coordinating conjunction |
2. | CD | Cardinal number |
3. | DT | Determiner |
4. | EX | Existential there |
5. | FW | Foreign word |
6. | IN | Preposition or subordinating conjunction |
7. | JJ | Adjective |
8. | JJR | Adjective, comparative |
9. | JJS | Adjective, superlative |
10. | LS | List item marker |
11. | MD | Modal |
12. | NN | Noun, singular or mass |
13. | NNS | Noun, plural |
14. | NNP | Proper noun, singular |
15. | NNPS | Proper noun, plural |
16. | PDT | Predeterminer |
17. | POS | Possessive ending |
18. | PRP | Personal pronoun |
19. | PRP$ | Possessive pronoun |
20. | RB | Adverb |
21. | RBR | Adverb, comparative |
22. | RBS | Adverb, superlative |
23. | RP | Particle |
24. | SYM | Symbol |
25. | TO | to |
26. | UH | Interjection |
27. | VB | Verb, base form |
28. | VBD | Verb, past tense |
29. | VBG | Verb, gerund or present participle |
30. | VBN | Verb, past participle |
31. | VBP | Verb, non-3rd person singular present |
32. | VBZ | Verb, 3rd person singular present |
33. | WDT | Wh-determiner |
34. | WP | Wh-pronoun |
35. | WP$ | Possessive wh-pronoun |
36. | WRB | Wh-adverb |
In [7]:
# Survey of the Token API: parse one sentence and print every documented
# attribute of a single token.  Reference: https://spacy.io/docs/api/token
doc_ps = nlp("Mr.Sakamoto told us the Dragon Fruits was very yummy!")
# Inspect the third token (index 2) of the parsed sentence.
t = doc_ps[2]
print("token:",t)
print("vocab (The vocab object of the parent Doc):", t.vocab)
print("doc (The parent document.):", t.doc)
print("i (The index of the token within the parent document.):", t.i)
print("ent_type_ (Named entity type.):", t.ent_type_)
print("ent_iob_ (IOB code of named entity tag):", t.ent_iob_)
print("ent_id_ (ID of the entity the token is an instance of):", t.ent_id_)
print("lemma_ (Base form of the word, with no inflectional suffixes.):", t.lemma_)
print("lower_ (Lower-case form of the word.):", t.lower_)
print("shape_ (A transform of the word's string, to show orthographic features.):", t.shape_)
print("prefix_ (Integer ID of a length-N substring from the start of the word):", t.prefix_)
print("suffix_ (Length-N substring from the end of the word):", t.suffix_)
print("like_url (Does the word resemble a URL?):", t.like_url)
print("like_num (Does the word represent a number? ):", t.like_num)
print("like_email (Does the word resemble an email address?):", t.like_email)
print("is_oov (Is the word out-of-vocabulary?):", t.is_oov)
print("is_stop (Is the word part of a stop list?):", t.is_stop)
print("pos_ (Coarse-grained part-of-speech.):", t.pos_)
print("tag_ (Fine-grained part-of-speech.):", t.tag_)
print("dep_ (Syntactic dependency relation.):", t.dep_)
print("lang_ (Language of the parent document's vocabulary.):", t.lang_)
print("prob: (Smoothed log probability estimate of token's type.)", t.prob)
print("idx (The character offset of the token within the parent document.):", t.idx)
print("sentiment (A scalar value indicating the positivity or negativity of the token):", t.sentiment)
print("lex_id (ID of the token's lexical type.):", t.lex_id)
print("text (Verbatim text content.):", t.text)
print("text_with_ws (Text content, with trailing space character if present.):", t.text_with_ws)
print("whitespace_ (Trailing space character if present.):", t.whitespace_)
In [8]:
doc_dep = nlp(u'I like chicken rice and Laksa.')
# Each noun chunk: its text, root token, the root's dependency label,
# and the root's syntactic head.
for chunk in doc_dep.noun_chunks:
    summary = (chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)
    print(summary)
In [9]:
# Dependency relation and fine-grained tag for every token.
for current in doc_dep:
    print((current.text, current.dep_, current.tag_))
Visualization using displaCy (https://demos.explosion.ai/displacy/)
In [10]:
# For each token: dep label, child counts on each side, the head's
# original text (orth_), and the left/right children themselves.
for tok in doc_dep:
    lefts = [child.orth_ for child in tok.lefts]
    rights = [child.orth_ for child in tok.rights]
    print((tok.text, tok.dep_, tok.n_lefts, tok.n_rights, tok.head.orth_, lefts, rights))
In [11]:
# Template for pretty-printing one dependency arc: left children,
# the word with its relation type, then right children.
dependency_pattern = '{left}<---{word}[{w_type}]--->{right}\n--------'
In [12]:
# Render every token's arc with the template defined above.
for tok in doc_dep:
    rendered = dependency_pattern.format(
        word=tok.orth_,
        w_type=tok.dep_,
        left=[child.orth_ for child in tok.lefts],
        right=[child.orth_ for child in tok.rights],
    )
    print(rendered)
spaCy uses the terms head and child to describe the words connected by a single arc in the dependency tree. The term dep is used for the arc label, which describes the type of syntactic relation that connects the child to the head.
https://spacy.io/docs/usage/dependency-parse
In [13]:
# Pair each token's attributes with the same attributes of its head.
for tok in doc_dep:
    child_info = (tok.text, tok.dep_, tok.tag_, tok.pos_)
    head_info = (tok.head.text, tok.head.dep_, tok.head.tag_, tok.head.pos_)
    print(child_info, head_info)
In [14]:
# Load integer symbol IDs so the comparisons below avoid string lookups.
from spacy.symbols import nsubj, VERB
# Will collect the head verbs of all nominal subjects found in `doc`.
verbs = set()
In [15]:
for tok in doc:
    print((tok, tok.dep, tok.head, tok.head.pos))
    # A nominal subject whose head is a verb -> record that verb.
    is_subject_of_verb = tok.dep == nsubj and tok.head.pos == VERB
    if is_subject_of_verb:
        verbs.add(tok.head)
In [16]:
# Display the collected verbs (rich repr as the cell's last expression).
verbs
Out[16]:
In [17]:
from numpy import dot
from numpy.linalg import norm


def cosine(v1, v2):
    """Cosine similarity between two vectors.

    Parameters
    ----------
    v1, v2 : array-like of numbers, same length.

    Returns
    -------
    float in [-1, 1]: 1 for parallel vectors, 0 for orthogonal,
    -1 for opposite.  Division by zero occurs if either vector has
    zero magnitude (same behaviour as the original lambda).
    """
    # def instead of an assigned lambda (PEP 8 E731); formula unchanged.
    return dot(v1, v2) / (norm(v1) * norm(v2))
In [18]:
# Look up the Lexeme for the target word in the vocabulary; the lexeme
# carries the word vector used for the similarity search below.
target_word = 'Singapore'
sing = nlp.vocab[target_word]
sing
Out[18]:
In [19]:
# Gather every lowercase vocabulary entry that has a vector, excluding
# the target word itself.
def _is_candidate(lexeme):
    return (lexeme.has_vector
            and lexeme.orth_.islower()
            and lexeme.lower_ != target_word.lower())

all_words = list({w for w in nlp.vocab if _is_candidate(w)})
len(all_words)
Out[19]:
In [20]:
# Disabled: sorting the full vocabulary by similarity is slow.  When
# enabled, this ranks every word by cosine similarity to the target
# (descending) and prints the ten nearest neighbours.
#all_words.sort(key=lambda w: cosine(w.vector, sing.vector))
#all_words.reverse()
#print("Top 10 most similar words to",target_word)
#for word in all_words[:10]:
# print(word.orth_)
In [21]:
# Word-vector analogy: china - chinese + japan, expecting a vector
# close to "japanese".  (Operation order kept identical for
# bit-identical float results.)
china = nlp.vocab['china']
chinese = nlp.vocab['chinese']
japan = nlp.vocab['japan']
result = china.vector - chinese.vector + japan.vector
In [22]:
# Candidate pool: lowercase vectored words, excluding the three analogy inputs.
analogy_inputs = {"china", "chinese", "japan"}
all_words = list({w for w in nlp.vocab
                  if w.has_vector and w.orth_.islower()
                  and w.lower_ not in analogy_inputs})
In [23]:
# Sort candidates from MOST to LEAST similar to the analogy vector.
# Bug fix: the original sorted ascending and then took all_words[0],
# which is the *least* similar word; the disabled reference code above
# sorts then reverses, confirming descending order was intended.
all_words.sort(key=lambda w: cosine(w.vector, result), reverse=True)
all_words[0].orth_
Out[23]:
In [24]:
# Show the three leading analogy candidates.
for candidate in all_words[:3]:
    print(candidate.orth_)
In [25]:
example_sent = "NTUC has raised S$25 million to help workers re-skill and upgrade their skills, secretary-general Chan Chun Sing said at the May Day Rally on Monday "
parsed = nlp(example_sent)
# Print each token alongside its entity label, or a placeholder when
# the token is not part of any named entity.
for tok in parsed:
    label = tok.ent_type_ if tok.ent_type_ != "" else "(not an entity)"
    print((tok.orth_, label))
Visualization using displaCy Named Entity Visualizer (https://demos.explosion.ai/displacy-ent/)
https://spacy.io/docs/usage/entity-recognition
Type | Description |
---|---|
PERSON | People, including fictional. |
NORP | Nationalities or religious or political groups. |
FACILITY | Buildings, airports, highways, bridges, etc. |
ORG | Companies, agencies, institutions, etc. |
GPE | Countries, cities, states. |
LOC | Non-GPE locations, mountain ranges, bodies of water. |
PRODUCT | Objects, vehicles, foods, etc. (Not services.) |
EVENT | Named hurricanes, battles, wars, sports events, etc. |
WORK_OF_ART | Titles of books, songs, etc. |
LANGUAGE | Any named language. |
In [26]:
import random
from spacy.gold import GoldParse
from spacy.language import EntityRecognizer
# Training examples: (text, [(start_char, end_char, label), ...]).
# Character offsets are end-exclusive positions within the raw text.
train_data = [
    ('Who is Chaka Khan?', [(7, 17, 'PERSON')]),
    ('I like Bangkok and Buangkok.', [(7, 14, 'LOC'), (19, 27, 'LOC')])
]
# Train a fresh entity recognizer (spaCy 1.x API) on the toy examples.
nlp2 = spacy.load('en', entity=False, parser=False)
ner = EntityRecognizer(nlp2.vocab, entity_types=['PERSON', 'LOC'])
for itn in range(5):
    # Shuffle each epoch so the model does not memorise example order.
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:
        doc2 = nlp2.make_doc(raw_text)
        gold = GoldParse(doc2, entities=entity_offsets)
        # Bug fix: tag with the same pipeline that produced doc2 — the
        # original called the unrelated `nlp` object's tagger here.
        nlp2.tagger(doc2)
        ner.update(doc2, gold)
ner.model.end_training()
# Bug fix: persist the pipeline used for training (was `nlp`, which the
# training loop never touched).
nlp2.save_to_directory('./sample_ner/')
In [27]:
# Reload the saved model directory and test it on an unseen name.
nlp3 = spacy.load('en', path='./sample_ner/')
example_sent = "Who is Tai Seng Tan?"
doc3 = nlp3(example_sent)
for ent in doc3.ents:
    print(ent.label_, ent.text)
https://spacy.io/
https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
https://spacy.io/docs/usage/pos-tagging
[Installation]
pip install spacy
python -m spacy download en
In [ ]: