In [5]:
import nltk

In [6]:
from nltk import word_tokenize
from nltk import pos_tag
from nltk.chunk import ne_chunk
# Sample sentence: tokenize -> POS-tag -> named-entity chunk, then print the tree.
text = "Jack and Jill went to Capitol Hill"
print(ne_chunk(pos_tag(word_tokenize(text))))


(S
  (PERSON Jack/NNP)
  and/CC
  (PERSON Jill/NNP)
  went/VBD
  to/TO
  (PERSON Capitol/NNP Hill/NNP))

In [7]:
t = ne_chunk(pos_tag(word_tokenize(text)))
%matplotlib inline 
%pylab inline
nltk.draw.tree.demo()
t.draw()


Populating the interactive namespace from numpy and matplotlib
WARNING: pylab import has clobbered these variables: ['text']
`%matplotlib` prevents importing * from pylab and numpy

You Try It

  1. TBD
  2. TBD
  3. TBD

Measuring NER Accuracy

Comparing the output from NLTK and the Rosette API


In [8]:
import httplib, urllib
import os, csv
import codecs
import json

In [9]:
#GET /rest/v1/ping HTTP/1.1
#Host: api.rosette.com
#user_key: <your Rosette API key>
#Content-Type: application/json
#Accept: application/json
def ping(user_key=None):
    """Send a GET request to the Rosette API ping endpoint.

    Args:
        user_key: Rosette API key. When omitted, the key is read from the
            ROSETTE_USER_KEY environment variable so that no credential is
            stored in the notebook.

    Returns:
        The raw response body exactly as read from the HTTPS connection.

    Raises:
        ValueError: if no key is passed and ROSETTE_USER_KEY is unset or empty.
    """
    if user_key is None:
        user_key = os.environ.get("ROSETTE_USER_KEY")
    if not user_key:
        raise ValueError(
            "Rosette API key missing: pass user_key or set ROSETTE_USER_KEY")
    headers = {"user_key": user_key,
               "Content-Type": "application/json",
               "Accept": "application/json"}
    conn = httplib.HTTPSConnection("api.rosette.com")
    try:
        conn.request("GET", "/rest/v1/ping", "", headers)
        return conn.getresponse().read()
    finally:
        # Close the connection even when the request or the read fails.
        conn.close()

#POST /rest/v1/entities HTTP/1.1
#Host: api.rosette.com
#user_key: <your Rosette API key>
#Content-Type: application/json
#Accept: application/json
def get_entities(input_text, user_key=None):
    """POST a text to the Rosette API entities endpoint.

    Args:
        input_text: Text to send as the "content" field of the request.
        user_key: Rosette API key. When omitted, the key is read from the
            ROSETTE_USER_KEY environment variable so that no credential is
            stored in the notebook.

    Returns:
        The raw response body exactly as read from the HTTPS connection
        (callers in this notebook decode it with json.loads).

    Raises:
        ValueError: if no key is passed and ROSETTE_USER_KEY is unset or empty.
    """
    if user_key is None:
        user_key = os.environ.get("ROSETTE_USER_KEY")
    if not user_key:
        raise ValueError(
            "Rosette API key missing: pass user_key or set ROSETTE_USER_KEY")
    headers = {"user_key": user_key,
               "Content-Type": "application/json",
               "Accept": "application/json"}
    # Serialize with json.dumps instead of string concatenation so quotes,
    # backslashes and newlines in input_text are escaped correctly.
    body = json.dumps({"type": "text", "content": input_text})
    conn = httplib.HTTPSConnection("api.rosette.com")
    try:
        conn.request("POST", "/rest/v1/entities", body, headers)
        return conn.getresponse().read()
    finally:
        # Close the connection even when the request or the read fails.
        conn.close()

#POST /rest/v1/morphology/parts-of-speech HTTP/1.1
#(use /rest/v1/morphology/complete in the path below for the complete analysis)
#Host: api.rosette.com
#user_key: <your Rosette API key>
#Content-Type: application/json
#Accept: application/json
def get_morphology(input_text, user_key=None):
    """POST a text to the Rosette API morphology (parts-of-speech) endpoint.

    Args:
        input_text: Text to send as the "content" field of the request. The
            request always declares the language as "eng".
        user_key: Rosette API key. When omitted, the key is read from the
            ROSETTE_USER_KEY environment variable so that no credential is
            stored in the notebook.

    Returns:
        The raw response body exactly as read from the HTTPS connection.

    Raises:
        ValueError: if no key is passed and ROSETTE_USER_KEY is unset or empty.
    """
    if user_key is None:
        user_key = os.environ.get("ROSETTE_USER_KEY")
    if not user_key:
        raise ValueError(
            "Rosette API key missing: pass user_key or set ROSETTE_USER_KEY")
    headers = {"user_key": user_key,
               "Content-Type": "application/json",
               "Accept": "application/json"}
    # Serialize with json.dumps instead of string concatenation so quotes,
    # backslashes and newlines in input_text are escaped correctly.
    body = json.dumps({"language": "eng", "content": input_text})
    conn = httplib.HTTPSConnection("api.rosette.com")
    try:
        conn.request("POST", "/rest/v1/morphology/parts-of-speech", body, headers)
        return conn.getresponse().read()
    finally:
        # Close the connection even when the request or the read fails.
        conn.close()

#print ping()
#print get_entities("Mary had a little lamb.");
#print get_morphology("Mary had a little lamb.");

In [10]:
# Sample input texts. Only `jj` is referenced by the cells visible below;
# `la`, `ny` and `cnn` are presumably alternative inputs to swap in by hand.
la = "LA beat New York last night. John was at the game. There are teams in the NHL."
ny = "I grew up on the town of Milton. I grew up on the town of Milton. Milton is a great place to live. Only in Milton will you find this."
cnn = "Their handling of politically perilous issues this week couldn't have been more different: Jeb Bush, persistently fielding questions in public, hemmed and hawed for days over Iraq, while Hillary Clinton stayed radio silent while her party waged an internal fight over trade."
# Short sentence passed to both the NLTK and the Rosette extraction cells.
jj = "Jack and Jill went to the Red River"

In [11]:
#ENTITIES FROM NLTK
def extract_entity_names(t):
    """Collect (entity_text, entity_label) pairs from an NLTK chunk tree.

    Args:
        t: A tree node as produced by ne_chunk, or a leaf. Anything without
            a label (e.g. a (word, tag) tuple) yields no entities.

    Returns:
        A list of (text, label) tuples in tree order. For a subtree whose
        label is a known entity type, text is the first element of each
        child joined with single spaces. Other labelled subtrees are
        searched recursively.
    """
    entity_labels = ('NE', 'ORGANIZATION', 'PERSON', 'LOCATION', 'DATE', 'TIME',
                     'MONEY', 'PERCENT', 'FACILITY', 'GPE')

    # NLTK 3 replaced the Tree.node attribute with the Tree.label() method.
    # Checking only `node` makes every tree look unlabelled there, so the
    # function silently returned []. Prefer label(), fall back to node.
    label_getter = getattr(t, 'label', None)
    if callable(label_getter):
        label = label_getter()
    else:
        try:
            label = getattr(t, 'node', None)
        except NotImplementedError:
            # Some NLTK 3 releases keep `node` as a property that raises.
            label = None

    entity_names = []
    if label:
        if label in entity_labels:
            entity_names.append((' '.join([child[0] for child in t]), label))
        else:
            for child in t:
                entity_names.extend(extract_entity_names(child))
    return entity_names

# Run the NLTK pipeline on the sample sentence and flatten the chunk tree
# into (entity_text, entity_label) pairs.
raw_response = ne_chunk(pos_tag(word_tokenize(jj)))
tuple_response = extract_entity_names(raw_response)

# The loop variables are deliberately not named `t`: an earlier cell keeps its
# chunk tree in the global `t`, and rebinding it here would break re-running
# that cell's t.draw().
for entity_text, entity_label in tuple_response:
    print("%s (%s)" % (entity_text, entity_label))

In [12]:
#ENTITIES FROM ROSETTE API
# Query the Rosette entities endpoint for the sample sentence and decode the
# JSON reply.
raw_response = get_entities(jj)
json_response = json.loads(raw_response)

# One line per returned entity: "<mention> (<type>)".
for e in json_response['entities']:
    print("%s (%s)" % (e['mention'], e['type']))


Jack (PERSON)
Jill (PERSON)
Red River (LOCATION)

In [ ]: