The main task is to detect whether the text contains named entities, tagging each one with one of the following 7 classes: Location, Person, Organization, Money, Percent, Date, Time.
Online demo: http://nlp.stanford.edu:8080/ner/process
In [2]:
import nltk

# Read the sample from a file instead, if preferred:
# with open('sample.txt', 'r') as f:
#     sample = f.read()
sample = "in my own language. \
As a video uploader, this means you can reach \
to people all over the world, \
irrespective of language. \
[Hiroto, Bedhead] \
You can upload multiple tracks like English and French, \
and viewers can choose the track they like. \
[Toliver, Japanese Learner] \
For example, if you enjoy using YouTube in French,"

sentences = nltk.sent_tokenize(sample)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
# binary=True collapses every entity type into a single 'NE' label
chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

def extract_entity_names(t):
    """Recursively collect the tokens of every 'NE' subtree."""
    entity_names = []
    if hasattr(t, 'label') and t.label:
        if t.label() == 'NE':
            entity_names.append(' '.join(child[0] for child in t))
        else:
            for child in t:
                entity_names.extend(extract_entity_names(child))
    return entity_names

entity_names = []
for tree in chunked_sentences:
    # Print results per sentence:
    # print(extract_entity_names(tree))
    entity_names.extend(extract_entity_names(tree))

# Print all entity names:
# print(entity_names)

# Print unique entity names
print(set(entity_names))
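The binary=True flag above collapses every entity type into a single 'NE' label. Rerunning the chunking step with binary=False yields typed chunks instead; a minimal sketch (label names taken from NLTK's default chunker, and the punkt, averaged_perceptron_tagger, maxent_ne_chunker, and words resources are assumed to be downloaded via nltk.download()):
In [ ]:
# binary=False (the default) labels each chunk with an entity type
typed_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=False)
for tree in typed_sentences:
    for subtree in tree.subtrees():
        if subtree.label() in ('PERSON', 'ORGANIZATION', 'GPE', 'LOCATION'):
            print(subtree.label(), ' '.join(token for token, pos in subtree.leaves()))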
In [28]:
from nltk.tag import StanfordNERTagger

# Point the tagger at a local Stanford NER download: the first argument is
# the serialized 7-class CRF model, the second the stanford-ner jar.
st = StanfordNERTagger(
    '/vagrant/stanford-ner-2015-12-09/classifiers/english.muc.7class.distsim.crf.ser.gz',
    '/vagrant/stanford-ner-2015-12-09/stanford-ner.jar')
st.tag('Rami Eid is studying at Stony Brook University in NY'.split())
Out[28]:
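st.tag() returns one (token, tag) pair per token, so a multi-word entity such as 'Stony Brook University' comes back as three separate pairs. A minimal sketch for merging consecutive tokens that share a non-'O' tag into entity spans (note this heuristic would also merge two distinct adjacent entities of the same class):
In [ ]:
from itertools import groupby

tagged = st.tag('Rami Eid is studying at Stony Brook University in NY'.split())
for tag, group in groupby(tagged, key=lambda pair: pair[1]):
    if tag != 'O':
        print(tag, ' '.join(token for token, _ in group))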
In [27]:
sample = "in my own language. \
As a video uploader, this means you can reach\
to people all over the world,\
irrespective of language. \
[Hiroto, Bedhead]\
You can upload multiple tracks like English and French,\
and viewers can choose the track they like. \
[Toliver, Japanese Learner]\
For example, if you enjoy using YouTube in French, 1990, July"
from nltk.tokenize import word_tokenize

st.tag(word_tokenize(sample))
Out[27]:
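The '1990, July' appended to this sample exercises the DATE class of the 7-class model. A quick way to pull out just those tokens from the tagged output (assuming the model does tag them as DATE):
In [ ]:
tagged = st.tag(word_tokenize(sample))
print([token for token, tag in tagged if tag == 'DATE'])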