Reference
In [154]:
import gzip
import pickle
from os import path
from collections import defaultdict
from numpy import sign


def load_buzz(root='../data', data=['train', 'test', 'questions'], format='pklz'):
    """Load buzz data as a dictionary.

    Pass a subset of names via ``data`` to load only the pieces you need.
    """
    buzz_data = {}
    for ii in data:
        file_path = path.join(root, ii + "." + format)
        with gzip.open(file_path, "rb") as fp:
            buzz_data[ii] = pickle.load(fp)
    return buzz_data
In [155]:
questions = load_buzz()['questions']
In [156]:
print(questions[19]['question'])
print(questions[19]['answer'])
In [157]:
from nltk import word_tokenize, pos_tag

sentence = questions[19]['question']
tagged_sent = pos_tag(word_tokenize(sentence))
for word, tag in tagged_sent:
    print(word, tag)
In [164]:
# NNP: proper nouns
print([word for word, pos in tagged_sent if pos == 'NNP'])
In [165]:
# NN: common nouns (singular)
print([word for word, pos in tagged_sent if pos == 'NN'])
In [166]:
# CD: cardinal numbers
print([word for word, pos in tagged_sent if pos == 'CD'])
In [167]:
# PRP / PRP$: personal and possessive pronouns
print([word for word, pos in tagged_sent if pos == 'PRP'])
print([word for word, pos in tagged_sent if pos == 'PRP$'])
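The cells above pick tags one at a time; as a sketch, the defaultdict imported in In [154] can bucket every token by its tag in a single pass (assuming tagged_sent from In [157]):
In [ ]:
from collections import defaultdict

by_tag = defaultdict(list)
for word, pos in tagged_sent:
    by_tag[pos].append(word)

# Each tag can now be looked up directly, e.g. by_tag['NNP'] or by_tag['CD'].
print(by_tag['NNP'])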
In [168]:
from nltk import ne_chunk
print(ne_chunk(tagged_sent))
NE Type
NE Type | Examples
---|---
ORGANIZATION | Georgia-Pacific Corp., WHO |
PERSON | Eddy Bonte, President Obama |
LOCATION | Murray River, Mount Everest |
DATE | June, 2008-06-29 |
TIME | two fifty a m, 1:30 p.m. |
MONEY | 175 million Canadian Dollars, GBP 10.40 |
PERCENT | twenty pct, 18.75 % |
FACILITY | Washington Monument, Stonehenge |
GPE | South East Asia, Midlothian |
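Since ne_chunk attaches these labels to the subtrees it builds, pulling out one NE type is just a label check on each subtree. A minimal sketch (the wanted='PERSON' default is an arbitrary choice for illustration):
In [ ]:
from nltk import ne_chunk

def entities_of_type(tagged, wanted='PERSON'):
    # Keep only chunks whose label matches the requested NE type.
    return [' '.join(word for word, pos in subtree.leaves())
            for subtree in ne_chunk(tagged).subtrees()
            if subtree.label() == wanted]

entities_of_type(tagged_sent, wanted='PERSON')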
In [199]:
import nltk


def extract_entities(text, show_all=True):
    """Print NE chunks; with show_all=False, print only NE subtrees
    and cardinal numbers (CD)."""
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if show_all:
                print(chunk)
            if isinstance(chunk, nltk.tree.Tree):
                # An NE subtree: rejoin its leaves into the entity phrase.
                print(chunk.label(), ' '.join(c[0] for c in chunk.leaves()))
            elif chunk[1] == 'CD':
                print('CD', chunk[0])

extract_entities(sentence, show_all=False)
In [169]:
import dateutil.parser as parser
In [170]:
parser.parse("Sanghee 1978's", fuzzy=True).year
Out[170]:
1978
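fuzzy=True skips tokens dateutil cannot interpret, but recent dateutil versions still raise ValueError when the string contains nothing date-like at all, so a small guard helps when scanning raw question text. A sketch under that assumption (extract_year is a hypothetical helper):
In [ ]:
def extract_year(text):
    # Return the parsed year, or None if dateutil finds nothing date-like.
    try:
        return parser.parse(text, fuzzy=True).year
    except ValueError:
        return None

extract_year("Sanghee 1978's"), extract_year("no digits at all")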
In [171]:
# Note: newer NLTK releases renamed this class to StanfordNERTagger.
from nltk.tag.stanford import NERTagger
In [172]:
ner_path = '/home/sanghee/Libs/stanford-ner-2015-04-20/'
jar_path = ner_path + 'stanford-ner.jar'
trained_clfs = ['english.all.3class.distsim.crf.ser.gz',
                'english.muc.7class.distsim.crf.ser.gz',
                'english.conll.4class.distsim.crf.ser.gz']

# Tag the same sentence with each pretrained CRF model.
for clf in trained_clfs:
    print("==")
    crf_path = ner_path + 'classifiers/' + clf
    st = NERTagger(crf_path, jar_path, 'utf-8')
    for tt in st.tag(sentence.split()):
        for ii in tt:
            print(ii)
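The three models differ in label granularity (3, 7, and 4 classes), so grouping each run's output by label makes the comparison easier to read. A sketch, assuming tag() returns flat (word, label) pairs as in the loop above (group_stanford_tags is a hypothetical helper):
In [ ]:
from collections import defaultdict

def group_stanford_tags(tagger, text):
    # Bucket each token under the label the Stanford model assigned.
    grouped = defaultdict(list)
    for word, label in tagger.tag(text.split()):
        grouped[label].append(word)
    return dict(grouped)

group_stanford_tags(st, sentence)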