In [154]:
import gzip
import pickle
from os import path
from collections import defaultdict
from numpy import sign


"""
Load buzz data as a dictionary.
You can give parameter for data so that you will get what you need only.
"""
def load_buzz(root='../data', data=['train', 'test', 'questions'], format='pklz'):
    buzz_data = {}
    for ii in data:
        file_path = path.join(root, ii + "." + format)
        with gzip.open(file_path, "rb") as fp:
          buzz_data[ii] = pickle.load(fp)
        
    return buzz_data

In [155]:
# Only the questions are used in this notebook, so ask load_buzz for just that
# file instead of also unpickling the train and test sets.
questions = load_buzz(data=['questions'])['questions']

In [156]:
# Show the sample question (id 19) followed by its answer, one per line.
print(questions[19]['question'], questions[19]['answer'], sep='\n')


He was born in Normandy, but spent most of his life in Rome, where Domenichino briefly employed him.  Rejecting the Baroque, he chose to model his work after Titian and Raphael.  Under Charles LeBrun in the 1660's, the French Academy would take his ideas on Classicism as a primary reference.  FTP, name this painter of Et in Arcadia Ego and The Burial of Phocion.
nicolas poussin

In [157]:
from nltk import word_tokenize, pos_tag

# Tokenize the sample question, then POS-tag the tokens.
sentence = questions[19]['question']
tokens = word_tokenize(sentence)
tagged_sent = pos_tag(tokens)

# One "token TAG" pair per line.
for word, tag in tagged_sent:
    print("{} {}".format(word, tag))


He PRP
was VBD
born VBN
in IN
Normandy NNP
, ,
but CC
spent VBD
most RBS
of IN
his PRP$
life NN
in IN
Rome NNP
, ,
where WRB
Domenichino NNP
briefly RB
employed VBN
him PRP
. .
Rejecting NNP
the DT
Baroque NNP
, ,
he PRP
chose VBD
to TO
model VB
his PRP$
work NN
after IN
Titian NNP
and CC
Raphael NNP
. .
Under NNP
Charles NNP
LeBrun NNP
in IN
the DT
1660 CD
's POS
, ,
the DT
French JJ
Academy NNP
would MD
take VB
his PRP$
ideas NNS
on IN
Classicism NNP
as IN
a DT
primary JJ
reference NN
. .
FTP NNP
, ,
name VBD
this DT
painter NN
of IN
Et NNP
in IN
Arcadia NNP
Ego NNP
and CC
The DT
Burial NNP
of IN
Phocion NNP
. .

In [164]:
# Tokens whose POS tag is exactly 'NNP'.
print([pair[0] for pair in tagged_sent if pair[1] == 'NNP'])


['Normandy', 'Rome', 'Domenichino', 'Rejecting', 'Baroque', 'Titian', 'Raphael', 'Under', 'Charles', 'LeBrun', 'Academy', 'Classicism', 'FTP', 'Et', 'Arcadia', 'Ego', 'Burial', 'Phocion']

In [165]:
# Tokens whose POS tag is exactly 'NN'.
print([pair[0] for pair in tagged_sent if pair[1] == 'NN'])


['life', 'work', 'reference', 'painter']

In [166]:
# Tokens whose POS tag is exactly 'CD'.
print([pair[0] for pair in tagged_sent if pair[1] == 'CD'])


['1660']

In [167]:
# Tokens tagged 'PRP', then tokens tagged 'PRP$', each as its own list.
print([pair[0] for pair in tagged_sent if pair[1] == 'PRP'])
print([pair[0] for pair in tagged_sent if pair[1] == 'PRP$'])


['He', 'him', 'he']
['his', 'his', 'his']

In [168]:
from nltk import ne_chunk
# Call the ne_chunk imported by this cell: the bare name `nltk` is not bound
# at this point, so `nltk.ne_chunk` raises NameError on a fresh kernel.
print(ne_chunk(tagged_sent))


(S
  He/PRP
  was/VBD
  born/VBN
  in/IN
  (GPE Normandy/NNP)
  ,/,
  but/CC
  spent/VBD
  most/RBS
  of/IN
  his/PRP$
  life/NN
  in/IN
  (GPE Rome/NNP)
  ,/,
  where/WRB
  (PERSON Domenichino/NNP)
  briefly/RB
  employed/VBN
  him/PRP
  ./.
  Rejecting/NNP
  the/DT
  (GPE Baroque/NNP)
  ,/,
  he/PRP
  chose/VBD
  to/TO
  model/VB
  his/PRP$
  work/NN
  after/IN
  (GPE Titian/NNP)
  and/CC
  (GPE Raphael/NNP)
  ./.
  (PERSON Under/NNP Charles/NNP)
  LeBrun/NNP
  in/IN
  the/DT
  1660/CD
  's/POS
  ,/,
  the/DT
  (GPE French/JJ)
  (ORGANIZATION Academy/NNP)
  would/MD
  take/VB
  his/PRP$
  ideas/NNS
  on/IN
  Classicism/NNP
  as/IN
  a/DT
  primary/JJ
  reference/NN
  ./.
  (ORGANIZATION FTP/NNP)
  ,/,
  name/VBD
  this/DT
  painter/NN
  of/IN
  Et/NNP
  in/IN
  (GPE Arcadia/NNP)
  Ego/NNP
  and/CC
  The/DT
  (ORGANIZATION Burial/NNP)
  of/IN
  (GPE Phocion/NNP)
  ./.)

NE Type

| NE Type | Examples |
|---|---|
| ORGANIZATION | Georgia-Pacific Corp., WHO |
| PERSON | Eddy Bonte, President Obama |
| LOCATION | Murray River, Mount Everest |
| DATE | June, 2008-06-29 |
| TIME | two fifty a m, 1:30 p.m. |
| MONEY | 175 million Canadian Dollars, GBP 10.40 |
| PERCENT | twenty pct, 18.75 % |
| FACILITY | Washington Monument, Stonehenge |
| GPE | South East Asia, Midlothian |

In [199]:
import nltk

def extract_entities(text, all=True):
    """Print the named entities and 'CD'-tagged tokens found in ``text``.

    The text is split into sentences; each sentence is word-tokenized,
    POS-tagged and passed through ``nltk.ne_chunk``.

    Parameters
    ----------
    text : str
        Raw text to analyse.
    all : bool
        When true, every chunk is also printed as-is before any entity or
        'CD' line for it. (The name shadows the builtin ``all``; it is kept
        so existing keyword callers keep working.)

    Returns
    -------
    None
        Results are printed: ``<LABEL> <words>`` for each entity subtree and
        ``CD <token>`` for each plain token tagged 'CD'.
    """
    for sent in nltk.sent_tokenize(text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sent))
        for chunk in nltk.ne_chunk(tagged):
            if all:
                print(chunk)
            # Entity chunks are subtrees; everything else is a (token, tag) pair.
            # isinstance also accepts Tree subclasses, unlike an exact type check.
            if isinstance(chunk, nltk.tree.Tree):
                print(chunk.label(), ' '.join(leaf[0] for leaf in chunk.leaves()))
            elif chunk[1] == 'CD':
                print('CD', chunk[0])


extract_entities(sentence, all=False)


GPE Normandy
GPE Rome
PERSON Domenichino
GPE Baroque
GPE Titian
GPE Raphael
PERSON Charles LeBrun
CD 1660
GPE French
ORGANIZATION Academy
GPE Arcadia
ORGANIZATION Burial
GPE Phocion

In [169]:
import dateutil.parser as parser

In [170]:
# fuzzy=True lets the parser ignore the parts of the string it cannot interpret.
parsed_date = parser.parse("Sanghee 1978's", fuzzy=True)
parsed_date.year


Out[170]:
1978

In [171]:
from nltk.tag.stanford import NERTagger

In [172]:
import os

# Pre-trained Stanford NER classifiers to compare on the same sentence.
trained_clfs = ['english.all.3class.distsim.crf.ser.gz',
                'english.muc.7class.distsim.crf.ser.gz',
                'english.conll.4class.distsim.crf.ser.gz']

# Stanford NER install directory. Set the STANFORD_NER_HOME environment
# variable to point at your own copy; the fallback is the path this notebook
# was originally run with.
ner_path = os.environ.get('STANFORD_NER_HOME',
                          '/home/sanghee/Libs/stanford-ner-2015-04-20/')
# The jar does not depend on the classifier, so build its path once.
jar_path = os.path.join(ner_path, 'stanford-ner.jar')

for clf in trained_clfs:
    print("==")
    crf_path = os.path.join(ner_path, 'classifiers', clf)
    st = NERTagger(crf_path, jar_path, 'utf-8')
    # NOTE(review): str.split() leaves punctuation glued to tokens
    # (e.g. 'Normandy,'); consider a real tokenizer if tagging quality matters.
    for tt in st.tag(sentence.split()):
        for ii in tt:
            print(ii)


==
('He', 'O')
('was', 'O')
('born', 'O')
('in', 'O')
('Normandy,', 'O')
('but', 'O')
('spent', 'O')
('most', 'O')
('of', 'O')
('his', 'O')
('life', 'O')
('in', 'O')
('Rome,', 'LOCATION')
('where', 'O')
('Domenichino', 'PERSON')
('briefly', 'O')
('employed', 'O')
('him.', 'O')
('Rejecting', 'O')
('the', 'O')
('Baroque,', 'O')
('he', 'O')
('chose', 'O')
('to', 'O')
('model', 'O')
('his', 'O')
('work', 'O')
('after', 'O')
('Titian', 'PERSON')
('and', 'O')
('Raphael.', 'O')
('Under', 'O')
('Charles', 'PERSON')
('LeBrun', 'PERSON')
('in', 'O')
('the', 'O')
("1660's,", 'O')
('the', 'O')
('French', 'ORGANIZATION')
('Academy', 'ORGANIZATION')
('would', 'O')
('take', 'O')
('his', 'O')
('ideas', 'O')
('on', 'O')
('Classicism', 'O')
('as', 'O')
('a', 'O')
('primary', 'O')
('reference.', 'O')
('FTP,', 'O')
('name', 'O')
('this', 'O')
('painter', 'O')
('of', 'O')
('Et', 'O')
('in', 'O')
('Arcadia', 'LOCATION')
('Ego', 'O')
('and', 'O')
('The', 'O')
('Burial', 'O')
('of', 'O')
('Phocion.', 'O')
==
('He', 'O')
('was', 'O')
('born', 'O')
('in', 'O')
('Normandy,', 'O')
('but', 'O')
('spent', 'O')
('most', 'O')
('of', 'O')
('his', 'O')
('life', 'O')
('in', 'O')
('Rome,', 'O')
('where', 'O')
('Domenichino', 'O')
('briefly', 'O')
('employed', 'O')
('him.', 'O')
('Rejecting', 'O')
('the', 'O')
('Baroque,', 'O')
('he', 'O')
('chose', 'O')
('to', 'O')
('model', 'O')
('his', 'O')
('work', 'O')
('after', 'O')
('Titian', 'ORGANIZATION')
('and', 'O')
('Raphael.', 'O')
('Under', 'O')
('Charles', 'PERSON')
('LeBrun', 'PERSON')
('in', 'O')
('the', 'O')
("1660's,", 'O')
('the', 'O')
('French', 'O')
('Academy', 'O')
('would', 'O')
('take', 'O')
('his', 'O')
('ideas', 'O')
('on', 'O')
('Classicism', 'O')
('as', 'O')
('a', 'O')
('primary', 'O')
('reference.', 'O')
('FTP,', 'O')
('name', 'O')
('this', 'O')
('painter', 'O')
('of', 'O')
('Et', 'O')
('in', 'O')
('Arcadia', 'LOCATION')
('Ego', 'O')
('and', 'O')
('The', 'O')
('Burial', 'O')
('of', 'O')
('Phocion.', 'LOCATION')
==
('He', 'O')
('was', 'O')
('born', 'O')
('in', 'O')
('Normandy,', 'O')
('but', 'O')
('spent', 'O')
('most', 'O')
('of', 'O')
('his', 'O')
('life', 'O')
('in', 'O')
('Rome,', 'O')
('where', 'O')
('Domenichino', 'PERSON')
('briefly', 'O')
('employed', 'O')
('him.', 'O')
('Rejecting', 'O')
('the', 'O')
('Baroque,', 'O')
('he', 'O')
('chose', 'O')
('to', 'O')
('model', 'O')
('his', 'O')
('work', 'O')
('after', 'O')
('Titian', 'PERSON')
('and', 'O')
('Raphael.', 'O')
('Under', 'O')
('Charles', 'PERSON')
('LeBrun', 'PERSON')
('in', 'O')
('the', 'O')
("1660's,", 'O')
('the', 'O')
('French', 'MISC')
('Academy', 'MISC')
('would', 'O')
('take', 'O')
('his', 'O')
('ideas', 'O')
('on', 'O')
('Classicism', 'O')
('as', 'O')
('a', 'O')
('primary', 'O')
('reference.', 'O')
('FTP,', 'O')
('name', 'O')
('this', 'O')
('painter', 'O')
('of', 'O')
('Et', 'O')
('in', 'O')
('Arcadia', 'LOCATION')
('Ego', 'LOCATION')
('and', 'O')
('The', 'ORGANIZATION')
('Burial', 'ORGANIZATION')
('of', 'ORGANIZATION')
('Phocion.', 'ORGANIZATION')

In [ ]: