In [0]:
%pylab inline
In [0]:
%load_ext autoreload
%autoreload 2
In [0]:
from io import StringIO
import json
import re
import unicodedata
In [0]:
from collections import namedtuple
from collections import defaultdict
In [0]:
from IPython.display import Image
In [0]:
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.image as img
In [0]:
import pandas as pd
import numpy as np
In [0]:
from bs4 import UnicodeDammit
In [0]:
import html2text
In [0]:
pd.options.display.max_rows = 999
In [0]:
import pydot
In [0]:
from stanford_corenlp_pywrapper import sockwrap
In [0]:
CORENLP_DIR = '/home/blannon/src/stanford-corenlp-full-2015-04-20'
CORENLP_VER = '3.5.2'
CORENLP_JARS = [
'{d}/stanford-corenlp-{v}.jar'.format(d=CORENLP_DIR, v=CORENLP_VER),
'{d}/stanford-corenlp-{v}-models.jar'.format(d=CORENLP_DIR, v=CORENLP_VER),
]
In [0]:
CORENLP_CONFIG = {
'annotators': 'tokenize, ssplit, pos, lemma, ner, parse',
'sutime.markTimeRanges': 'true',
'sutime.includeRange': 'true',
'ssplit.newlineIsSentenceBreak': 'two',
'parse.flags': '-makeCopulaHead'
}
OUTPUT_TYPES = ['pos', 'lemmas', 'parse', 'deps_basic', 'deps_cc', 'ner', 'normner']
In [0]:
CORENLP_NN_CONFIG = {
'annotators': 'tokenize, ssplit, pos, lemma, ner, depparse',
'sutime.markTimeRanges': 'true',
'sutime.includeRange': 'true',
'ssplit.newlineIsSentenceBreak': 'two',
'parse.flags': '-makeCopulaHead'
}
NN_OUTPUT_TYPES = ['pos', 'lemmas', 'deps_cc', 'ner', 'normner']
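Note: each sentence returned by parse_doc carries parallel token-level lists plus dependency triples. A rough sketch of that shape, with made-up values rather than real parser output:
In [0]:
# Illustrative only: the field names below are the ones consumed later,
# but the values are made up, not actual CoreNLP output.
example_sent = {
    'tokens': ['Vicki', 'served', '.'],
    'pos': ['NNP', 'VBD', '.'],
    'lemmas': ['Vicki', 'serve', '.'],
    'ner': ['PERSON', 'O', 'O'],
    'char_offsets': [[0, 5], [6, 12], [12, 13]],
    'deps_cc': [['nsubj', 1, 0]],  # (relation, governor index, dependent index)
}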
In [0]:
parser = sockwrap.SockWrap(
mode=None,
corenlp_jars=CORENLP_JARS,
corenlp_dir=CORENLP_DIR,
configdict=CORENLP_CONFIG,
output_types=OUTPUT_TYPES
)
In [0]:
nn_parser = sockwrap.SockWrap(
mode=None,
corenlp_jars=CORENLP_JARS,
corenlp_dir=CORENLP_DIR,
configdict=CORENLP_NN_CONFIG,
output_types=NN_OUTPUT_TYPES
)
In [0]:
#parser.kill_proc_if_running()
In [0]:
#nn_parser.kill_proc_if_running()
In [0]:
test_text = "From 1981 to 1983, Vicki served as an assistant city representative at the National Center for Municipal Development."
In [0]:
parsed = parser.parse_doc(test_text)
sent = parsed['sentences'][0]
In [0]:
parsed
Out[0]:
In [0]:
pdg = pydot.Dot()
In [0]:
pdg.write_dot?
In [0]:
TokenNode = namedtuple('TokenNode', ('index', 'token', 'pos', 'lemma', 'ner', 'char_offsets'))
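Note: TokenNode bundles everything we know about one token; namedtuples are hashable, so they can double as networkx node objects. A hypothetical instance:
In [0]:
# Hypothetical example values; real nodes are built in build_dep_graph below.
TokenNode(index=1, token='served', pos='VBD', lemma='serve', ner='O', char_offsets=(6, 12))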
In [0]:
def build_dep_graph(sent, dep_type='cc'):
    # One TokenNode per token; namedtuples are hashable, so they serve
    # directly as graph nodes.
    token_nodes = [TokenNode(i, t, p, l, n, tuple(o))
                   for i, (t, p, l, n, o)
                   in enumerate(zip(sent['tokens'], sent['pos'], sent['lemmas'],
                                    sent['ner'], sent['char_offsets']))]
    token_lookup = {tn.index: tn for tn in token_nodes}
    dg = nx.DiGraph()
    for tn in token_nodes:
        dg.add_node(tn, label=tn.lemma, ner=tn.ner, pos=tn.pos)
    # Each dependency is a (relation, governor index, dependent index) triple;
    # the ROOT pseudo-token (index -1) is absent from token_lookup, so its
    # edge is skipped by the KeyError handler below.
    sorted_deps = sorted(sent['deps_' + dep_type], key=lambda x: x[0])
    for rel, gix, dix in sorted_deps:
        try:
            gnode = token_lookup[gix]
            dnode = token_lookup[dix]
            dg.add_edge(gnode, dnode, label=rel.replace(':', '_'))
        except KeyError:
            continue
    # Collapsed (cc-processed) dependencies can introduce self-loops; drop them.
    for e in dg.selfloop_edges():
        dg.remove_edge(*e)
    return dg
In [0]:
def display_parse(dep_graph, filename):
    pdg = pydot.Dot()
    for u, v in dep_graph.edges():
        # Label nodes as "lemma-index" so repeated lemmas stay distinct.
        ulabel = '{lemma}-{index}'.format(**u._asdict())
        vlabel = '{lemma}-{index}'.format(**v._asdict())
        pdg.add_edge(pydot.Edge(ulabel, vlabel, **dep_graph.edge[u][v]))
    pdg.write_png('images/{fn}.png'.format(fn=filename), prog='dot')
    pdg.write_dot('images/{fn}.dot'.format(fn=filename), prog='dot')
In [0]:
def subtree_to_string(head, dg):
    # Linearize the head plus all of its descendants by token index,
    # skipping the head's own 'case' markers (prepositions).
    others = [d for d in nx.algorithms.descendants(dg, head)
              if dg[head].get(d, {'label': ''})['label'] != 'case']
    linearized = sorted([head] + others, key=lambda x: x.index)
    return ' '.join(t.token for t in linearized)
In [0]:
def simple_pas(predicate, dg):
    # Build a flat predicate-argument structure: each outgoing dependency
    # relation maps to the linearized subtree of its dependent.
    arguments = dg[predicate]
    _pas = defaultdict(list)
    for arg, rel in arguments.items():
        _pas[rel['label']].append(subtree_to_string(arg, dg))
    _pas[u'predicate'] = predicate.token
    return dict(_pas)
In [0]:
def collect_all_predicates(dg):
    # Every verb (POS tag V*) counts as a predicate; topological order
    # visits governors before their dependents.
    predicates = [n for n in nx.topological_sort_recursive(dg) if n.pos.startswith('V')]
    return [simple_pas(p, dg) for p in predicates]
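Note: collect_all_predicates yields one dict per verb, keyed by dependency relation (colons rewritten to underscores). Roughly, for the test sentence (illustrative values, not actual output):
In [0]:
# Sketch of the expected output shape:
# [{'predicate': 'served',
#   'nsubj': ['Vicki'],
#   'nmod_as': ['an assistant city representative'],
#   'nmod_at': ['the National Center for Municipal Development']}]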
In [0]:
parsed = parser.parse_doc(test_text)
In [0]:
mydg = build_dep_graph(parsed['sentences'][0])
In [0]:
ner_nodes = [n for n in mydg.nodes() if n.ner != 'O']
In [0]:
sorted(ner_nodes, key=lambda x: x.index)
Out[0]:
In [0]:
mydg.edges(nbunch=ner_nodes,data=True)
Out[0]:
In [0]:
display_parse(build_dep_graph(parsed['sentences'][0]), 'test')
Image('images/test.png')
Out[0]:
In [0]:
simple = "From 1981 to 1983, Vicki served as an assistant city representative at the National Center for Municipal Development."
copula = "She was the assistant to the executive director of the Democratic Study Group in the US House of Representatives from 1979 to 1981."
twoverb = "Vicki has also served as a government relations consultant, representing the interests of Portland, Oregon in Washington DC."
smallclause = "The Department of Agriculture had appointed Vicki president"
relclause_subj = "The clients whom Vicki has represented include Coca-Cola, Texaco, and Giant Foods."
subclause = "Vicki lobbied for health insurance companies that supported Obamacare"
passive = "Giant is represented by Victoria Cram as of June 3, 2006."
In [0]:
print ' '.join([simple, copula])
In [0]:
simple_parse = parser.parse_doc(simple)
simple_dg_cc = build_dep_graph(simple_parse['sentences'][0])
display_parse(simple_dg_cc, 'pcfg-simple')
Image('images/pcfg-simple.png')
Out[0]:
In [0]:
collect_all_predicates(simple_dg_cc)
Out[0]:
In [0]:
tn = simple_dg_cc.nodes()[0]
In [0]:
pd.DataFrame.from_records(simple_dg_cc.nodes(), columns=simple_dg_cc.nodes()[0].__dict__.keys()).sort('index')
Out[0]:
In [0]:
tn.__dict__.keys()
tn.__dict__
Out[0]:
Note: "1979 - 1981" got parsed wrong!
In [0]:
copula_parse = parser.parse_doc(copula)
copula_dg_cc = build_dep_graph(copula_parse['sentences'][0])
display_parse(copula_dg_cc, 'pcfg-copula')
Image('images/pcfg-copula.png')
Out[0]:
In [0]:
collect_all_predicates(copula_dg_cc)
Out[0]:
In [0]:
twoverb_parse = parser.parse_doc(twoverb)
twoverb_dg_cc = build_dep_graph(twoverb_parse['sentences'][0])
display_parse(twoverb_dg_cc, 'pcfg-twoverb')
Image('images/pcfg-twoverb.png')
Out[0]:
In [0]:
collect_all_predicates(twoverb_dg_cc)
Out[0]:
In [0]:
smallclause_parse = parser.parse_doc(smallclause)
smallclause_dg_cc = build_dep_graph(smallclause_parse['sentences'][0])
display_parse(smallclause_dg_cc, 'pcfg-smallclause')
Image('images/pcfg-smallclause.png')
Out[0]:
In [0]:
collect_all_predicates(smallclause_dg_cc)
Out[0]:
In [0]:
relclause_parse = parser.parse_doc(relclause_subj)
relclause_dg_cc = build_dep_graph(relclause_parse['sentences'][0])
display_parse(relclause_dg_cc, 'pcfg-relclause')
Image('images/pcfg-relclause.png')
Out[0]:
In [0]:
nx.algorithms.traversal.bfs_successors
Out[0]:
In [0]:
collect_all_predicates(relclause_dg_cc)
Out[0]:
In [0]:
subclause_parse = parser.parse_doc(subclause)
subclause_dg_cc = build_dep_graph(subclause_parse['sentences'][0])
display_parse(subclause_dg_cc, 'pcfg-subclause')
Image('images/pcfg-subclause.png')
Out[0]:
In [0]:
collect_all_predicates(subclause_dg_cc)
Out[0]:
In [0]:
passive_parse = parser.parse_doc(passive)
passive_dg_cc = build_dep_graph(passive_parse['sentences'][0])
display_parse(passive_dg_cc, 'pcfg-passive')
Image('images/pcfg-passive.png')
Out[0]:
In [0]:
collect_all_predicates(passive_dg_cc)
Out[0]:
In [0]:
nx.topological_sort(simple_dg_cc)
Out[0]:
In [0]:
nx.topological_sort_recursive(simple_dg_cc)
Out[0]:
In [0]:
nx.topological_sort(twoverb_dg_cc)
Out[0]:
In [0]:
nx.topological_sort_recursive(twoverb_dg_cc)
Out[0]:
In [0]:
h2t = html2text.HTML2Text()
In [0]:
h2t.body_width = 0
h2t.unicode_snob = 1
h2t.emphasis_mark = ''
In [0]:
def asciify(bio):
    # Decompose accented characters, force ASCII, and scrub the "A??"
    # mojibake sequences left over from mis-decoded bytes.
    asciitext = re.sub(r'[Aa]\?\?', ' ', unicodedata.normalize('NFD', bio).encode('ascii', 'replace'))
    return asciitext

def filter_lists(asciitext):
    # Drop bulleted list lines (education, bar admissions, etc.), keeping prose.
    for l in re.split(r'(\r\n|\n)', asciitext):
        lstrip = l.strip()
        if len(lstrip) > 0:
            if lstrip[0] not in ['*', '-']:
                yield l

def clean_bio(bio):
    text = h2t.handle(bio)
    return '\n'.join(filter_lists(text))
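Note: a quick sanity check of the cleanup pipeline on a made-up HTML fragment (hypothetical, not from the data). html2text renders list items with a leading `*`, which filter_lists then drops:
In [0]:
# Hypothetical fragment; only the prose sentence should survive cleaning.
sample_html = u'<p>Vicki advises municipal clients.</p><ul><li>J.D., Georgetown</li></ul>'
print clean_bio(sample_html)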
In [0]:
manatt_data = json.load(open('data/manatt-out-html-full.json'))
In [0]:
manatt_data[0]
Out[0]:
In [0]:
s = manatt_data[0]['bio'][2]
In [0]:
h2t.handle(s)
Out[0]:
In [0]:
h2t.handle(manatt_data[0]['bio'][0]).strip()
Out[0]:
In [0]:
def parse_to_predicates(_parsed):
    for sent in _parsed['sentences']:
        _dg = build_dep_graph(sent)
        try:
            for pas in collect_all_predicates(_dg):
                yield pas
        except nx.NetworkXUnfeasible:
            # Collapsed dependencies occasionally form cycles, which the
            # topological sort rejects; skip those sentences.
            continue
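Note: a usage sketch for the wrapper above, chaining cleanup, parsing, and predicate extraction over one bio paragraph:
In [0]:
# HTML bio -> cleaned text -> CoreNLP parse -> predicate-argument structures
list(parse_to_predicates(parser.parse_doc(clean_bio(manatt_data[0]['bio'][0]))))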
In [0]:
def bio_to_parses(bio):
    # Parse each cleaned bio paragraph and pool all resulting sentences.
    parses = []
    for text in bio:
        for sent in parser.parse_doc(clean_bio(text))['sentences']:
            parses.append(sent)
    return parses

def process_data(d):
    d['parses'] = bio_to_parses(d['bio'])
    d['name'] = h2t.handle(d['name'][0])
    return d
In [0]:
%time processed = [process_data(d) for d in manatt_data[0:1]]
In [0]:
processed[0]
In [0]:
manatt_data[0]['bio'][-1]
In [0]:
p = nn_parser.parse_doc(clean_bio(manatt_data[0]['bio'][-1]))
In [0]:
display_parse(build_dep_graph(p['sentences'][-2]),'test2')
Image('images/test2.png')
In [0]:
all_preds = [collect_all_predicates(build_dep_graph(p)) for p in processed[0]['parses']]
In [0]:
all_preds
In [0]:
display_parse(build_dep_graph(processed[0]['parses'][-2]), 'test')
Image('images/test.png')
In [0]:
dg = build_dep_graph(p['sentences'][-2])
In [0]:
pd.DataFrame.from_records(dg.nodes(), columns=dg.nodes()[0].__dict__.keys()).sort('index')
In [0]:
collect_all_predicates(dg)