Reference
In [154]:
import gzip
import pickle
from os import path
from collections import defaultdict
from numpy import sign


def load_buzz(root='../data', data=['train', 'test', 'questions'], format='pklz'):
    """Load buzz data as a dictionary.

    Pass a subset of names via ``data`` to load only the pieces you need.
    """
    buzz_data = {}
    for ii in data:
        file_path = path.join(root, ii + "." + format)
        with gzip.open(file_path, "rb") as fp:
            buzz_data[ii] = pickle.load(fp)
    return buzz_data
In [155]:
questions = load_buzz()['questions']
In [156]:
print(questions[19]['question'])
print(questions[19]['answer'])
In [157]:
from nltk import word_tokenize, pos_tag

sentence = questions[19]['question']
tagged_sent = pos_tag(word_tokenize(sentence))
for word, tag in tagged_sent:
    print(word, tag)
In [164]:
# NNP: proper nouns
print([word for word, pos in tagged_sent if pos == 'NNP'])
In [165]:
# NN: common nouns (singular)
print([word for word, pos in tagged_sent if pos == 'NN'])
In [166]:
# CD: cardinal numbers
print([word for word, pos in tagged_sent if pos == 'CD'])
In [167]:
# PRP / PRP$: personal and possessive pronouns
print([word for word, pos in tagged_sent if pos == 'PRP'])
print([word for word, pos in tagged_sent if pos == 'PRP$'])
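The cells above pick tags one at a time; as a sketch, the defaultdict imported in In [154] can bucket every token by its tag in a single pass (assuming tagged_sent from In [157]):
In [ ]:
from collections import defaultdict

by_tag = defaultdict(list)
for word, pos in tagged_sent:
    by_tag[pos].append(word)

# Each tag can now be looked up directly, e.g. by_tag['NNP'] or by_tag['CD'].
print(by_tag['NNP'])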
In [168]:
from nltk import ne_chunk
print(ne_chunk(tagged_sent))
NE Type
NE Type | Examples
---|---
ORGANIZATION | Georgia-Pacific Corp., WHO |
PERSON | Eddy Bonte, President Obama |
LOCATION | Murray River, Mount Everest |
DATE | June, 2008-06-29 |
TIME | two fifty a m, 1:30 p.m. |
MONEY | 175 million Canadian Dollars, GBP 10.40 |
PERCENT | twenty pct, 18.75 % |
FACILITY | Washington Monument, Stonehenge |
GPE | South East Asia, Midlothian |
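Since ne_chunk attaches these labels to the subtrees it builds, pulling out one NE type is just a label check on each subtree. A minimal sketch (the wanted='PERSON' default is an arbitrary choice for illustration):
In [ ]:
from nltk import ne_chunk

def entities_of_type(tagged, wanted='PERSON'):
    # Keep only chunks whose label matches the requested NE type.
    return [' '.join(word for word, pos in subtree.leaves())
            for subtree in ne_chunk(tagged).subtrees()
            if subtree.label() == wanted]

entities_of_type(tagged_sent, wanted='PERSON')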
In [199]:
import nltk


def extract_entities(text, show_all=True):
    """Print NE chunks; with show_all=False, print only NE subtrees
    and cardinal numbers (CD)."""
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if show_all:
                print(chunk)
            if isinstance(chunk, nltk.tree.Tree):
                # An NE subtree: rejoin its leaves into the entity phrase.
                print(chunk.label(), ' '.join(c[0] for c in chunk.leaves()))
            elif chunk[1] == 'CD':
                print('CD', chunk[0])

extract_entities(sentence, show_all=False)
In [169]:
import dateutil.parser as parser
In [170]:
parser.parse("Sanghee 1978's", fuzzy=True).year
Out[170]:
1978
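fuzzy=True skips tokens dateutil cannot interpret, but recent dateutil versions still raise ValueError when the string contains nothing date-like at all, so a small guard helps when scanning raw question text. A sketch under that assumption (extract_year is a hypothetical helper):
In [ ]:
def extract_year(text):
    # Return the parsed year, or None if dateutil finds nothing date-like.
    try:
        return parser.parse(text, fuzzy=True).year
    except ValueError:
        return None

extract_year("Sanghee 1978's"), extract_year("no digits at all")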
In [171]:
# Note: newer NLTK releases renamed this class to StanfordNERTagger.
from nltk.tag.stanford import NERTagger
In [172]:
ner_path = '/home/sanghee/Libs/stanford-ner-2015-04-20/'
jar_path = ner_path + 'stanford-ner.jar'
trained_clfs = ['english.all.3class.distsim.crf.ser.gz',
                'english.muc.7class.distsim.crf.ser.gz',
                'english.conll.4class.distsim.crf.ser.gz']

# Tag the same sentence with each pretrained CRF model.
for clf in trained_clfs:
    print("==")
    crf_path = ner_path + 'classifiers/' + clf
    st = NERTagger(crf_path, jar_path, 'utf-8')
    for tt in st.tag(sentence.split()):
        for ii in tt:
            print(ii)
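The three models differ in label granularity (3, 7, and 4 classes), so grouping each run's output by label makes the comparison easier to read. A sketch, assuming tag() returns flat (word, label) pairs as in the loop above (group_stanford_tags is a hypothetical helper):
In [ ]:
from collections import defaultdict

def group_stanford_tags(tagger, text):
    # Bucket each token under the label the Stanford model assigned.
    grouped = defaultdict(list)
    for word, label in tagger.tag(text.split()):
        grouped[label].append(word)
    return dict(grouped)

group_stanford_tags(st, sentence)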