Reference
In [154]:
    
import gzip
import pickle
from os import path
from collections import defaultdict
from numpy import sign
"""
Load buzz data as a dictionary.
You can give parameter for data so that you will get what you need only.
"""
def load_buzz(root='../data', data=['train', 'test', 'questions'], format='pklz'):
    buzz_data = {}
    for ii in data:
        file_path = path.join(root, ii + "." + format)
        with gzip.open(file_path, "rb") as fp:
          buzz_data[ii] = pickle.load(fp)
        
    return buzz_data
    
In [155]:
    
questions = load_buzz()['questions']
    
In [156]:
    
# Inspect one sample question and its answer.
print(questions[19]['question'])
print(questions[19]['answer'])
    
    
In [157]:
    
from nltk import word_tokenize, pos_tag

# Tag every token of question 19 with its Penn Treebank POS tag.
sentence = questions[19]['question']
tagged_sent = pos_tag(word_tokenize(sentence))
for token, pos in tagged_sent:
    print(token, pos)
    
    
In [164]:
    
print([word for word,pos in tagged_sent if pos == 'NNP'])
    
    
In [165]:
    
print([word for word,pos in tagged_sent if pos == 'NN'])
    
    
In [166]:
    
print([word for word,pos in tagged_sent if pos == 'CD'])
    
    
In [167]:
    
# Personal (PRP) and possessive (PRP$) pronouns.
print([word for word,pos in tagged_sent if pos == 'PRP'])
print([word for word,pos in tagged_sent if pos == 'PRP$'])
    
    
In [168]:
    
from nltk import ne_chunk

# Chunk the POS-tagged sentence into named-entity subtrees.
# Bug fix: the original called `nltk.ne_chunk`, but the bare `nltk`
# module is only imported in a later cell, so this raised NameError
# on a fresh Restart-&-Run-All; call the imported name directly.
print(ne_chunk(tagged_sent))
    
    
Named-entity (NE) types recognized by `nltk.ne_chunk`:
| NE Type | Examples | 
|---|---|
| ORGANIZATION | Georgia-Pacific Corp., WHO | 
| PERSON | Eddy Bonte, President Obama | 
| LOCATION | Murray River, Mount Everest | 
| DATE | June, 2008-06-29 | 
| TIME | two fifty a m, 1:30 p.m. | 
| MONEY | 175 million Canadian Dollars, GBP 10.40 | 
| PERCENT | twenty pct, 18.75 % | 
| FACILITY | Washington Monument, Stonehenge | 
| GPE | South East Asia, Midlothian | 
In [199]:
    
import nltk

def extract_entities(text, all=True):
    """Print named-entity chunks (and cardinal numbers) found in ``text``.

    For every sentence, the POS-tagged tokens are chunked with
    ``nltk.ne_chunk``; entity subtrees are printed as ``LABEL words``
    and bare CD tokens as ``CD value``.  When ``all`` is true, each raw
    chunk is echoed first.  NOTE(review): the parameter shadows the
    builtin ``all``; the name is kept for backward compatibility with
    existing keyword calls.
    """
    for sent in nltk.sent_tokenize(text):
        tagged = nltk.pos_tag(nltk.word_tokenize(sent))
        for chunk in nltk.ne_chunk(tagged):
            if all:
                print(chunk)
            if type(chunk) is nltk.tree.Tree:
                label = chunk.label()
                entity = ' '.join(leaf[0] for leaf in chunk.leaves())
                print(label, entity)
            elif chunk[1] == 'CD':
                print('CD', chunk[0])

extract_entities(sentence, all=False)
    
    
In [169]:
    
import dateutil.parser as parser
    
In [170]:
    
parser.parse("Sanghee 1978's", fuzzy=True).year
    
    Out[170]:
In [171]:
    
from nltk.tag.stanford import NERTagger
    
In [172]:
    
# Compare three pre-trained Stanford NER models on the same sentence.
# NOTE(review): `NERTagger` was removed from NLTK (use
# `nltk.tag.stanford.StanfordNERTagger` on 3.2+); also avoid the
# hardcoded absolute local path — make it configurable.
trained_clfs = ['english.all.3class.distsim.crf.ser.gz',
                'english.muc.7class.distsim.crf.ser.gz',
                'english.conll.4class.distsim.crf.ser.gz']
# Hoisted out of the loop: these values never change per classifier.
ner_path = '/home/sanghee/Libs/stanford-ner-2015-04-20/'
jar_path = ner_path + 'stanford-ner.jar'
tokens = sentence.split()
for clf in trained_clfs:
    print("==")
    crf_path = ner_path + 'classifiers/' + clf
    st = NERTagger(crf_path, jar_path, 'utf-8')
    for tagged in st.tag(tokens):
        for pair in tagged:
            print(pair)
    
    
In [ ]: