In [0]:
%pylab inline


Populating the interactive namespace from numpy and matplotlib

In [0]:
%load_ext autoreload
%autoreload 2

Imports


In [0]:
from io import StringIO
import json
import re
import unicodedata

In [0]:
from collections import namedtuple
from collections import defaultdict

In [0]:
from IPython.display import Image

In [0]:
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.image as img


Couldn't import dot_parser, loading of dot files will not be possible.

In [0]:
import pandas as pd
import numpy as np

In [0]:
from bs4 import UnicodeDammit

In [0]:
import html2text

In [0]:
pd.options.display.max_rows = 999

In [0]:
import pydot

In [0]:
from stanford_corenlp_pywrapper import sockwrap

CoreNLP

Start Server


In [0]:
CORENLP_DIR = '/home/blannon/src/stanford-corenlp-full-2015-04-20'
CORENLP_VER = '3.5.2'

CORENLP_JARS = [
    '{d}/stanford-corenlp-{v}.jar'.format(d=CORENLP_DIR, v=CORENLP_VER),
    '{d}/stanford-corenlp-{v}-models.jar'.format(d=CORENLP_DIR, v=CORENLP_VER),
]

In [0]:
CORENLP_CONFIG = {
    'annotators': 'tokenize, ssplit, pos, lemma, ner, parse',
    'sutime.markTimeRanges': 'true',
    'sutime.includeRange': 'true',
    'ssplit.newlineIsSentenceBreak': 'two',
    'parse.flags': '-makeCopulaHead'
}

OUTPUT_TYPES = ['pos', 'lemmas', 'parse', 'deps_basic', 'deps_cc', 'ner', 'normner']

In [0]:
# same pipeline as above, but with the neural-network dependency parser
# (depparse) in place of the PCFG parser, so no constituency parse comes back
CORENLP_NN_CONFIG = {
    'annotators': 'tokenize, ssplit, pos, lemma, ner, depparse',
    'sutime.markTimeRanges': True,
    'sutime.includeRange': True,
    'ssplit.newlineIsSentenceBreak': 'two',
    'parse.flags': '-makeCopulaHead'
}

NN_OUTPUT_TYPES = ['pos', 'lemmas', 'deps_cc', 'ner', 'normner']

In [0]:
parser = sockwrap.SockWrap(
    mode=None,
    corenlp_jars=CORENLP_JARS,
    corenlp_dir=CORENLP_DIR,
    configdict=CORENLP_CONFIG,
    output_types=OUTPUT_TYPES
)


INFO:StanfordSocketWrap:Starting pipe subprocess, and waiting for signal it's ready, with command:  exec java -Xmx4g -cp '/home/blannon/src/stanford_corenlp_pywrapper/stanford_corenlp_pywrapper/lib/piperunner.jar:/home/blannon/src/stanford_corenlp_pywrapper/stanford_corenlp_pywrapper/lib/guava-13.0.1.jar:/home/blannon/src/stanford_corenlp_pywrapper/stanford_corenlp_pywrapper/lib/jackson-all-1.9.11.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/ejml-0.23.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/javax.json.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/joda-time.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/jollyday.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/protobuf.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/stanford-corenlp-3.5.2-javadoc.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/stanford-corenlp-3.5.2-models.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/stanford-corenlp-3.5.2.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/stanford-srparser-2014-10-23-models.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/xom-1.2.10-src.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/xom.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/stanford-corenlp-3.5.2.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/stanford-corenlp-3.5.2-models.jar'     corenlp.PipeCommandRunner --server 12340  --configdict '{"ssplit.newlineIsSentenceBreak": "two", "sutime.includeRange": "true", "sutime.markTimeRanges": "true", "annotators": "tokenize, ssplit, pos, lemma, ner, parse", "parse.flags": "-makeCopulaHead"}' --output-types 'pos, lemmas, parse, deps_basic, deps_cc, ner, normner'
INFO:StanfordSocketWrap:socket error when making connection ([Errno 111] Connection refused)
INFO:StanfordSocketWrap:pausing before retry
INFO:StanfordSocketWrap:Successful ping. The server has started.
INFO:StanfordSocketWrap:Subprocess is ready.
goo!

In [0]:
nn_parser = sockwrap.SockWrap(
    mode=None,
    corenlp_jars=CORENLP_JARS,
    corenlp_dir=CORENLP_DIR,
    configdict=CORENLP_NN_CONFIG,
    output_types=NN_OUTPUT_TYPES
)


INFO:StanfordSocketWrap:Starting pipe subprocess, and waiting for signal it's ready, with command:  exec java -Xmx4g -cp '/home/blannon/src/stanford_corenlp_pywrapper/stanford_corenlp_pywrapper/lib/piperunner.jar:/home/blannon/src/stanford_corenlp_pywrapper/stanford_corenlp_pywrapper/lib/guava-13.0.1.jar:/home/blannon/src/stanford_corenlp_pywrapper/stanford_corenlp_pywrapper/lib/jackson-all-1.9.11.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/ejml-0.23.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/javax.json.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/joda-time.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/jollyday.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/protobuf.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/stanford-corenlp-3.5.2-javadoc.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/stanford-corenlp-3.5.2-models.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/stanford-corenlp-3.5.2.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/stanford-srparser-2014-10-23-models.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/xom-1.2.10-src.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/xom.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/stanford-corenlp-3.5.2.jar:/home/blannon/src/stanford-corenlp-full-2015-04-20/stanford-corenlp-3.5.2-models.jar'     corenlp.PipeCommandRunner --server 12340  --configdict '{"ssplit.newlineIsSentenceBreak": "two", "sutime.includeRange": true, "sutime.markTimeRanges": true, "annotators": "tokenize, ssplit, pos, lemma, ner, depparse", "parse.flags": "-makeCopulaHead"}' --output-types 'pos, lemmas, deps_cc, ner, normner'
INFO:StanfordSocketWrap:Successful ping. The server has started.
INFO:StanfordSocketWrap:Subprocess is ready.
goo!

In [0]:
#parser.kill_proc_if_running()

In [0]:
#nn_parser.kill_proc_if_running()

Parse


In [0]:
test_text = "From 1981 to 1983, Vicki served as an assistant city representative at the National Center for Municipal Development."

In [0]:
parsed = parser.parse_doc(test_text)
sent = parsed['sentences'][0]

In [0]:
parsed


Out[0]:
{u'sentences': [{u'char_offsets': [[0, 4],
    [5, 9],
    [10, 12],
    [13, 17],
    [17, 18],
    [19, 24],
    [25, 31],
    [32, 34],
    [35, 37],
    [38, 47],
    [48, 52],
    [53, 67],
    [68, 70],
    [71, 74],
    [75, 83],
    [84, 90],
    [91, 94],
    [95, 104],
    [105, 116],
    [116, 117]],
   u'deps_basic': [[u'root', -1, 6],
    [u'case', 3, 0],
    [u'nummod', 3, 1],
    [u'dep', 3, 2],
    [u'nmod', 6, 15],
    [u'nmod', 6, 3],
    [u'nsubj', 6, 5],
    [u'nmod', 6, 11],
    [u'case', 11, 7],
    [u'det', 11, 8],
    [u'amod', 11, 9],
    [u'compound', 11, 10],
    [u'nmod', 15, 18],
    [u'case', 15, 12],
    [u'det', 15, 13],
    [u'compound', 15, 14],
    [u'case', 18, 16],
    [u'compound', 18, 17]],
   u'deps_cc': [[u'root', -1, 6],
    [u'case', 3, 0],
    [u'nummod', 3, 1],
    [u'dep', 3, 2],
    [u'nmod:at', 6, 15],
    [u'nmod:from', 6, 3],
    [u'nsubj', 6, 5],
    [u'nmod:as', 6, 11],
    [u'case', 11, 7],
    [u'det', 11, 8],
    [u'amod', 11, 9],
    [u'compound', 11, 10],
    [u'nmod:for', 15, 18],
    [u'case', 15, 12],
    [u'det', 15, 13],
    [u'compound', 15, 14],
    [u'case', 18, 16],
    [u'compound', 18, 17]],
   u'lemmas': [u'from',
    u'1981',
    u'to',
    u'1983',
    u',',
    u'Vicki',
    u'serve',
    u'as',
    u'a',
    u'assistant',
    u'city',
    u'representative',
    u'at',
    u'the',
    u'National',
    u'Center',
    u'for',
    u'Municipal',
    u'Development',
    u'.'],
   u'ner': [u'O',
    u'DATE',
    u'O',
    u'DATE',
    u'O',
    u'PERSON',
    u'O',
    u'O',
    u'O',
    u'O',
    u'O',
    u'O',
    u'O',
    u'O',
    u'ORGANIZATION',
    u'ORGANIZATION',
    u'ORGANIZATION',
    u'ORGANIZATION',
    u'ORGANIZATION',
    u'O'],
   u'normner': [u'',
    u'1981',
    u'',
    u'1983',
    u'',
    u'',
    u'',
    u'',
    u'',
    u'',
    u'',
    u'',
    u'',
    u'',
    u'',
    u'',
    u'',
    u'',
    u'',
    u''],
   u'parse': u'(ROOT (S (PP (IN From) (NP (CD 1981) (TO to) (CD 1983))) (, ,) (NP (NNP Vicki)) (VP (VBD served) (PP (IN as) (NP (DT an) (JJ assistant) (NN city) (NN representative))) (PP (IN at) (NP (NP (DT the) (NNP National) (NNP Center)) (PP (IN for) (NP (NNP Municipal) (NNP Development)))))) (. .)))',
   u'pos': [u'IN',
    u'CD',
    u'TO',
    u'CD',
    u',',
    u'NNP',
    u'VBD',
    u'IN',
    u'DT',
    u'JJ',
    u'NN',
    u'NN',
    u'IN',
    u'DT',
    u'NNP',
    u'NNP',
    u'IN',
    u'NNP',
    u'NNP',
    u'.'],
   u'tokens': [u'From',
    u'1981',
    u'to',
    u'1983',
    u',',
    u'Vicki',
    u'served',
    u'as',
    u'an',
    u'assistant',
    u'city',
    u'representative',
    u'at',
    u'the',
    u'National',
    u'Center',
    u'for',
    u'Municipal',
    u'Development',
    u'.']}]}

Utils


In [0]:
pdg = pydot.Dot()

In [0]:
pdg.write_dot?

In [0]:
TokenNode = namedtuple('TokenNode', ('index', 'token', 'pos', 'lemma', 'ner', 'char_offsets'))

In [0]:
def build_dep_graph(sent, dep_type='cc'):
    # wrap each token in a TokenNode so graph nodes carry their annotations
    token_nodes = [TokenNode(i, t, p, l, n, tuple(o))
                   for i, t, p, l, n, o in zip(xrange(len(sent['tokens'])),
                                               sent['tokens'], sent['pos'],
                                               sent['lemmas'], sent['ner'],
                                               sent['char_offsets'])]
    token_lookup = {i: t for i, t in enumerate(token_nodes)}
    dg = nx.DiGraph()
    for tn in token_nodes:
        dg.add_node(tn, label=tn.lemma, ner=tn.ner, pos=tn.pos)
    # dependency triples are [relation, governor_index, dependent_index]
    sorted_deps = sorted(sent['deps_' + dep_type], key=lambda x: x[0])
    for rel, lix, rix in sorted_deps:
        try:
            lnode = token_lookup[lix]
            rnode = token_lookup[rix]
            dg.add_edge(lnode, rnode, label=rel.replace(':', '_'))
        except KeyError:
            # the ROOT edge uses governor index -1, which has no token node
            continue
    # drop any self-loops so the graph stays a DAG for topological sorting
    for e in dg.selfloop_edges():
        dg.remove_edge(*e)
    return dg

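The wrapper emits each dependency as a [relation, governor_index, dependent_index] triple, with -1 standing in for ROOT; the KeyError branch above is what silently drops the ROOT edge. A quick sanity check (hypothetical cell):

In [0]:
# every deps_cc triple except the ROOT edge should survive as a graph edge
print len(sent['deps_cc']) - 1, len(build_dep_graph(sent).edges())
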
In [0]:
def display_parse(dep_graph, filename):
    pdg = pydot.Dot()
    for u,v in dep_graph.edges():
        ulabel = '{lemma}-{index}'.format(**u.__dict__)
        vlabel = '{lemma}-{index}'.format(**v.__dict__)
        pdg.add_edge(pydot.Edge(ulabel,vlabel,**dep_graph.edge[u][v]))
    pdg.write_png('images/{fn}.png'.format(fn=filename), prog='dot')
    pdg.write_dot('images/{fn}.dot'.format(fn=filename), prog='dot')

In [0]:
def subtree_to_string(head, dg):
    # collect all descendants of the head, skipping direct 'case' dependents
    # (prepositions), then linearize them back into surface order
    others = [d for d in nx.algorithms.descendants(dg, head)
              if dg[head].get(d, {'label': ''})['label'] != 'case']
    linearized = sorted([head] + others, key=lambda x: x.index)
    return ' '.join([t.token for t in linearized])

In [0]:
def simple_pas(predicate, dg):
    # dg[predicate] maps each dependent node to its edge data;
    # bucket the dependents' subtrees by relation label
    arguments = dg[predicate]
    _pas = defaultdict(list)
    for arg, rel in arguments.items():
        _pas[rel['label']].append(subtree_to_string(arg, dg))
    _pas[u'predicate'] = predicate.token
    return dict(_pas)

In [0]:
def collect_all_predicates(dg):
    # topological order puts governing verbs before the verbs they dominate
    predicates = [n for n in nx.topological_sort_recursive(dg) if n.pos.startswith('V')]
    return [simple_pas(p, dg) for p in predicates]

Build DAG


In [0]:
parsed = parser.parse_doc(test_text)

In [0]:
mydg = build_dep_graph(parsed['sentences'][0])

In [0]:


In [0]:
ner_nodes = [n for n in mydg.nodes() if n.ner != 'O']

In [0]:
sorted(ner_nodes, key=lambda x: x.index)


Out[0]:
[TokenNode(index=1, token=u'1981', pos=u'CD', lemma=u'1981', ner=u'DATE', char_offsets=(5, 9)),
 TokenNode(index=3, token=u'1983', pos=u'CD', lemma=u'1983', ner=u'DATE', char_offsets=(13, 17)),
 TokenNode(index=5, token=u'Vicki', pos=u'NNP', lemma=u'Vicki', ner=u'PERSON', char_offsets=(19, 24)),
 TokenNode(index=14, token=u'National', pos=u'NNP', lemma=u'National', ner=u'ORGANIZATION', char_offsets=(75, 83)),
 TokenNode(index=15, token=u'Center', pos=u'NNP', lemma=u'Center', ner=u'ORGANIZATION', char_offsets=(84, 90)),
 TokenNode(index=16, token=u'for', pos=u'IN', lemma=u'for', ner=u'ORGANIZATION', char_offsets=(91, 94)),
 TokenNode(index=17, token=u'Municipal', pos=u'NNP', lemma=u'Municipal', ner=u'ORGANIZATION', char_offsets=(95, 104)),
 TokenNode(index=18, token=u'Development', pos=u'NNP', lemma=u'Development', ner=u'ORGANIZATION', char_offsets=(105, 116))]

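The NER tags come back per token, so multi-word entities show up as runs of identically tagged nodes. A hedged sketch (hypothetical helper) for merging contiguous runs into entity spans:

In [0]:
def merge_ner_spans(nodes):
    # group consecutive token indices sharing an NER tag into one span
    spans = []
    for n in sorted(nodes, key=lambda x: x.index):
        if spans and spans[-1][0] == n.ner and spans[-1][2] == n.index - 1:
            spans[-1][1].append(n.token)
            spans[-1][2] = n.index
        else:
            spans.append([n.ner, [n.token], n.index])
    return [(' '.join(toks), tag) for tag, toks, _ in spans]

merge_ner_spans(ner_nodes)
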
In [0]:
mydg.edges(nbunch=ner_nodes,data=True)


Out[0]:
[(TokenNode(index=3, token=u'1983', pos=u'CD', lemma=u'1983', ner=u'DATE', char_offsets=(13, 17)),
  TokenNode(index=0, token=u'From', pos=u'IN', lemma=u'from', ner=u'O', char_offsets=(0, 4)),
  {'label': u'case'}),
 (TokenNode(index=3, token=u'1983', pos=u'CD', lemma=u'1983', ner=u'DATE', char_offsets=(13, 17)),
  TokenNode(index=1, token=u'1981', pos=u'CD', lemma=u'1981', ner=u'DATE', char_offsets=(5, 9)),
  {'label': u'nummod'}),
 (TokenNode(index=3, token=u'1983', pos=u'CD', lemma=u'1983', ner=u'DATE', char_offsets=(13, 17)),
  TokenNode(index=2, token=u'to', pos=u'TO', lemma=u'to', ner=u'O', char_offsets=(10, 12)),
  {'label': u'dep'}),
 (TokenNode(index=18, token=u'Development', pos=u'NNP', lemma=u'Development', ner=u'ORGANIZATION', char_offsets=(105, 116)),
  TokenNode(index=16, token=u'for', pos=u'IN', lemma=u'for', ner=u'ORGANIZATION', char_offsets=(91, 94)),
  {'label': u'case'}),
 (TokenNode(index=18, token=u'Development', pos=u'NNP', lemma=u'Development', ner=u'ORGANIZATION', char_offsets=(105, 116)),
  TokenNode(index=17, token=u'Municipal', pos=u'NNP', lemma=u'Municipal', ner=u'ORGANIZATION', char_offsets=(95, 104)),
  {'label': u'compound'}),
 (TokenNode(index=15, token=u'Center', pos=u'NNP', lemma=u'Center', ner=u'ORGANIZATION', char_offsets=(84, 90)),
  TokenNode(index=12, token=u'at', pos=u'IN', lemma=u'at', ner=u'O', char_offsets=(68, 70)),
  {'label': u'case'}),
 (TokenNode(index=15, token=u'Center', pos=u'NNP', lemma=u'Center', ner=u'ORGANIZATION', char_offsets=(84, 90)),
  TokenNode(index=13, token=u'the', pos=u'DT', lemma=u'the', ner=u'O', char_offsets=(71, 74)),
  {'label': u'det'}),
 (TokenNode(index=15, token=u'Center', pos=u'NNP', lemma=u'Center', ner=u'ORGANIZATION', char_offsets=(84, 90)),
  TokenNode(index=14, token=u'National', pos=u'NNP', lemma=u'National', ner=u'ORGANIZATION', char_offsets=(75, 83)),
  {'label': u'compound'}),
 (TokenNode(index=15, token=u'Center', pos=u'NNP', lemma=u'Center', ner=u'ORGANIZATION', char_offsets=(84, 90)),
  TokenNode(index=18, token=u'Development', pos=u'NNP', lemma=u'Development', ner=u'ORGANIZATION', char_offsets=(105, 116)),
  {'label': u'nmod_for'})]

Display a parse


In [0]:
display_parse(build_dep_graph(parsed['sentences'][0]), 'test')
Image('images/test.png')


Out[0]:

Examples


In [0]:
simple = "From 1981 to 1983, Vicki served as an assistant city representative at the National Center for Municipal Development."
copula = "She was the assistant to the executive director of the Democratic Study Group in the US House of Representatives from 1979 to 1981."
twoverb = "Vicki has also served as a government relations consultant, representing the interests of Portland, Oregon in Washington DC."
smallclause = "The Department of Agriculture had appointed Vicki president"
relclause_subj = "The clients whom Vicki has represented include Coca-Cola, Texaco, and Giant Foods."
subclause = "Vicki lobbied for health insurance companies that supported Obamacare"
passive = "Giant is represented by Victoria Cram as of June 3, 2006."

In [0]:
print ' '.join([simple, copula])


From 1981 to 1983, Vicki served as an assistant city representative at the National Center for Municipal Development. She was the assistant to the executive director of the Democratic Study Group in the US House of Representatives from 1979 to 1981.

PCFG

simple


In [0]:
simple_parse = parser.parse_doc(simple)
simple_dg_cc = build_dep_graph(simple_parse['sentences'][0])
display_parse(simple_dg_cc, 'pcfg-simple')
Image('images/pcfg-simple.png')


Out[0]:

In [0]:
collect_all_predicates(simple_dg_cc)


Out[0]:
[{u'nmod_as': [u'an assistant city representative'],
  u'nmod_at': [u'the National Center for Municipal Development'],
  u'nmod_from': [u'1981 to 1983'],
  u'nsubj': [u'Vicki'],
  u'predicate': u'served'}]

In [0]:
tn = simple_dg_cc.nodes()[0]

In [0]:


In [0]:
pd.DataFrame.from_records(simple_dg_cc.nodes(), columns=simple_dg_cc.nodes()[0].__dict__.keys()).sort('index')


Out[0]:
index token pos lemma ner char_offsets
4 0 From IN from O (0, 4)
1 1 1981 CD 1981 DATE (5, 9)
9 2 to TO to O (10, 12)
2 3 1983 CD 1983 DATE (13, 17)
5 4 , , , O (17, 18)
3 5 Vicki NNP Vicki PERSON (19, 24)
7 6 served VBD serve O (25, 31)
15 7 as IN as O (32, 34)
16 8 an DT a O (35, 37)
8 9 assistant JJ assistant O (38, 47)
10 10 city NN city O (48, 52)
19 11 representative NN representative O (53, 67)
6 12 at IN at O (68, 70)
17 13 the DT the O (71, 74)
13 14 National NNP National ORGANIZATION (75, 83)
14 15 Center NNP Center ORGANIZATION (84, 90)
11 16 for IN for ORGANIZATION (91, 94)
18 17 Municipal NNP Municipal ORGANIZATION (95, 104)
12 18 Development NNP Development ORGANIZATION (105, 116)
0 19 . . . O (116, 117)

In [0]:
tn.__dict__.keys()
tn.__dict__


Out[0]:
OrderedDict([('index', 19), ('token', u'.'), ('pos', u'.'), ('lemma', u'.'), ('ner', u'O'), ('char_offsets', (116, 117))])

copula

Note: "1979 - 1981" got parsed wrong!


In [0]:
copula_parse = parser.parse_doc(copula)
copula_dg_cc = build_dep_graph(copula_parse['sentences'][0])
display_parse(copula_dg_cc, 'pcfg-copula')
Image('images/pcfg-copula.png')


Out[0]:

In [0]:
collect_all_predicates(copula_dg_cc)


Out[0]:
[{u'nmod_from': [u'1979 to 1981'],
  u'nsubj': [u'She'],
  u'predicate': u'was',
  u'xcomp': [u'the assistant to the executive director of the Democratic Study Group in the US House of Representatives']}]

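As flagged above, the parse misattaches the "from 1979 to 1981" range, but the token-level normner values still carry the years. A hedged fallback (hypothetical helper) reads the dates straight off the NER layer:

In [0]:
def extract_date_tokens(sent):
    # fall back to SUTime's normalized values when the dependency
    # parse mangles a "from X to Y" construction
    return [v for tag, v in zip(sent['ner'], sent['normner']) if tag == 'DATE' and v]

extract_date_tokens(copula_parse['sentences'][0])
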
twoverb


In [0]:
twoverb_parse = parser.parse_doc(twoverb)
twoverb_dg_cc = build_dep_graph(twoverb_parse['sentences'][0])
display_parse(twoverb_dg_cc, 'pcfg-twoverb')
Image('images/pcfg-twoverb.png')


Out[0]:

In [0]:
collect_all_predicates(twoverb_dg_cc)


Out[0]:
[{u'advmod': [u'also'],
  u'aux': [u'has'],
  u'nmod_as': [u'a government relations consultant'],
  u'nsubj': [u'Vicki'],
  u'predicate': u'served',
  u'xcomp': [u'representing the interests of Portland Oregon in Washington DC']},
 {u'dobj': [u'the interests of Portland Oregon'],
  u'nmod_in': [u'Washington DC'],
  u'predicate': u'representing'},
 {u'predicate': u'has'}]

smallclause


In [0]:
smallclause_parse = parser.parse_doc(smallclause)
smallclause_dg_cc = build_dep_graph(smallclause_parse['sentences'][0])
display_parse(smallclause_dg_cc, 'pcfg-smallclause')
Image('images/pcfg-smallclause.png')


Out[0]:

In [0]:
collect_all_predicates(smallclause_dg_cc)


Out[0]:
[{u'aux': [u'had'],
  u'dobj': [u'Vicki president'],
  u'nsubj': [u'The Department of Agriculture'],
  u'predicate': u'appointed'},
 {u'predicate': u'had'}]

relclause


In [0]:
relclause_parse = parser.parse_doc(relclause_subj)
relclause_dg_cc = build_dep_graph(relclause_parse['sentences'][0])
display_parse(relclause_dg_cc, 'pcfg-relclause')
Image('images/pcfg-relclause.png')


Out[0]:

In [0]:
nx.algorithms.traversal.bfs_successors


Out[0]:
<function networkx.algorithms.traversal.breadth_first_search.bfs_successors>

In [0]:
collect_all_predicates(relclause_dg_cc)


Out[0]:
[{u'dobj': [u'Texaco', u'Coca-Cola Texaco and Giant Foods', u'Giant Foods'],
  u'nsubj': [u'The clients whom Vicki has represented'],
  u'predicate': u'include'},
 {u'aux': [u'has'],
  u'dobj': [u'whom'],
  u'nsubj': [u'Vicki'],
  u'predicate': u'represented'},
 {u'predicate': u'has'}]

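The inner predicate keeps the bare relative pronoun ("whom") as its dobj. A hedged sketch for resolving it back to the head noun, assuming deps_cc labels the relative clause acl:relcl (rewritten to acl_relcl by build_dep_graph):

In [0]:
def resolve_relative_pronoun(pron, dg):
    # walk up to the verb governing the pronoun, then look for the
    # noun governing that verb via an acl_relcl edge
    for verb, _, _ in dg.in_edges(pron, data=True):
        for noun, _, d in dg.in_edges(verb, data=True):
            if d['label'] == 'acl_relcl':
                return noun
    return pron

whom = [n for n in relclause_dg_cc.nodes() if n.token == u'whom'][0]
resolve_relative_pronoun(whom, relclause_dg_cc).token
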
subclause


In [0]:
subclause_parse = parser.parse_doc(subclause)
subclause_dg_cc = build_dep_graph(subclause_parse['sentences'][0])
display_parse(subclause_dg_cc, 'pcfg-subclause')
Image('images/pcfg-subclause.png')


Out[0]:

In [0]:
collect_all_predicates(subclause_dg_cc)


Out[0]:
[{u'nmod_for': [u'health insurance companies that supported Obamacare'],
  u'nsubj': [u'Vicki'],
  u'predicate': u'lobbied'},
 {u'dobj': [u'Obamacare'], u'nsubj': [u'that'], u'predicate': u'supported'}]

passive


In [0]:
passive_parse = parser.parse_doc(passive)
passive_dg_cc = build_dep_graph(passive_parse['sentences'][0])
display_parse(passive_dg_cc, 'pcfg-passive')
Image('images/pcfg-passive.png')


Out[0]:

In [0]:
collect_all_predicates(passive_dg_cc)


Out[0]:
[{u'auxpass': [u'is'],
  u'nmod_agent': [u'Victoria Cram'],
  u'nmod_as_of': [u'of June 3 2006'],
  u'nsubjpass': [u'Giant'],
  u'predicate': u'represented'},
 {u'predicate': u'is'}]

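Passives surface with nsubjpass/nmod_agent slots rather than nsubj/dobj. A hedged normalizer (hypothetical helper, purely illustrative) remaps them onto the active-voice slots:

In [0]:
def normalize_passive(pas):
    # map the passive subject onto dobj and the agent onto nsubj
    pas = dict(pas)
    if u'nsubjpass' in pas:
        pas[u'dobj'] = pas.pop(u'nsubjpass')
        if u'nmod_agent' in pas:
            pas[u'nsubj'] = pas.pop(u'nmod_agent')
    return pas

[normalize_passive(p) for p in collect_all_predicates(passive_dg_cc)]
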
In [0]:
nx.topological_sort(simple_dg_cc)


Out[0]:
[TokenNode(index=6, token=u'served', pos=u'VBD', lemma=u'serve', ner=u'O', char_offsets=(25, 31)),
 TokenNode(index=11, token=u'representative', pos=u'NN', lemma=u'representative', ner=u'O', char_offsets=(53, 67)),
 TokenNode(index=10, token=u'city', pos=u'NN', lemma=u'city', ner=u'O', char_offsets=(48, 52)),
 TokenNode(index=8, token=u'an', pos=u'DT', lemma=u'a', ner=u'O', char_offsets=(35, 37)),
 TokenNode(index=7, token=u'as', pos=u'IN', lemma=u'as', ner=u'O', char_offsets=(32, 34)),
 TokenNode(index=9, token=u'assistant', pos=u'JJ', lemma=u'assistant', ner=u'O', char_offsets=(38, 47)),
 TokenNode(index=15, token=u'Center', pos=u'NNP', lemma=u'Center', ner=u'ORGANIZATION', char_offsets=(84, 90)),
 TokenNode(index=13, token=u'the', pos=u'DT', lemma=u'the', ner=u'O', char_offsets=(71, 74)),
 TokenNode(index=14, token=u'National', pos=u'NNP', lemma=u'National', ner=u'ORGANIZATION', char_offsets=(75, 83)),
 TokenNode(index=18, token=u'Development', pos=u'NNP', lemma=u'Development', ner=u'ORGANIZATION', char_offsets=(105, 116)),
 TokenNode(index=16, token=u'for', pos=u'IN', lemma=u'for', ner=u'ORGANIZATION', char_offsets=(91, 94)),
 TokenNode(index=17, token=u'Municipal', pos=u'NNP', lemma=u'Municipal', ner=u'ORGANIZATION', char_offsets=(95, 104)),
 TokenNode(index=12, token=u'at', pos=u'IN', lemma=u'at', ner=u'O', char_offsets=(68, 70)),
 TokenNode(index=4, token=u',', pos=u',', lemma=u',', ner=u'O', char_offsets=(17, 18)),
 TokenNode(index=5, token=u'Vicki', pos=u'NNP', lemma=u'Vicki', ner=u'PERSON', char_offsets=(19, 24)),
 TokenNode(index=3, token=u'1983', pos=u'CD', lemma=u'1983', ner=u'DATE', char_offsets=(13, 17)),
 TokenNode(index=0, token=u'From', pos=u'IN', lemma=u'from', ner=u'O', char_offsets=(0, 4)),
 TokenNode(index=2, token=u'to', pos=u'TO', lemma=u'to', ner=u'O', char_offsets=(10, 12)),
 TokenNode(index=1, token=u'1981', pos=u'CD', lemma=u'1981', ner=u'DATE', char_offsets=(5, 9)),
 TokenNode(index=19, token=u'.', pos=u'.', lemma=u'.', ner=u'O', char_offsets=(116, 117))]

In [0]:
nx.topological_sort_recursive(simple_dg_cc)


Out[0]:
[TokenNode(index=6, token=u'served', pos=u'VBD', lemma=u'serve', ner=u'O', char_offsets=(25, 31)),
 TokenNode(index=15, token=u'Center', pos=u'NNP', lemma=u'Center', ner=u'ORGANIZATION', char_offsets=(84, 90)),
 TokenNode(index=18, token=u'Development', pos=u'NNP', lemma=u'Development', ner=u'ORGANIZATION', char_offsets=(105, 116)),
 TokenNode(index=17, token=u'Municipal', pos=u'NNP', lemma=u'Municipal', ner=u'ORGANIZATION', char_offsets=(95, 104)),
 TokenNode(index=16, token=u'for', pos=u'IN', lemma=u'for', ner=u'ORGANIZATION', char_offsets=(91, 94)),
 TokenNode(index=14, token=u'National', pos=u'NNP', lemma=u'National', ner=u'ORGANIZATION', char_offsets=(75, 83)),
 TokenNode(index=13, token=u'the', pos=u'DT', lemma=u'the', ner=u'O', char_offsets=(71, 74)),
 TokenNode(index=11, token=u'representative', pos=u'NN', lemma=u'representative', ner=u'O', char_offsets=(53, 67)),
 TokenNode(index=9, token=u'assistant', pos=u'JJ', lemma=u'assistant', ner=u'O', char_offsets=(38, 47)),
 TokenNode(index=7, token=u'as', pos=u'IN', lemma=u'as', ner=u'O', char_offsets=(32, 34)),
 TokenNode(index=8, token=u'an', pos=u'DT', lemma=u'a', ner=u'O', char_offsets=(35, 37)),
 TokenNode(index=10, token=u'city', pos=u'NN', lemma=u'city', ner=u'O', char_offsets=(48, 52)),
 TokenNode(index=12, token=u'at', pos=u'IN', lemma=u'at', ner=u'O', char_offsets=(68, 70)),
 TokenNode(index=4, token=u',', pos=u',', lemma=u',', ner=u'O', char_offsets=(17, 18)),
 TokenNode(index=5, token=u'Vicki', pos=u'NNP', lemma=u'Vicki', ner=u'PERSON', char_offsets=(19, 24)),
 TokenNode(index=3, token=u'1983', pos=u'CD', lemma=u'1983', ner=u'DATE', char_offsets=(13, 17)),
 TokenNode(index=2, token=u'to', pos=u'TO', lemma=u'to', ner=u'O', char_offsets=(10, 12)),
 TokenNode(index=0, token=u'From', pos=u'IN', lemma=u'from', ner=u'O', char_offsets=(0, 4)),
 TokenNode(index=1, token=u'1981', pos=u'CD', lemma=u'1981', ner=u'DATE', char_offsets=(5, 9)),
 TokenNode(index=19, token=u'.', pos=u'.', lemma=u'.', ner=u'O', char_offsets=(116, 117))]

In [0]:
nx.topological_sort(twoverb_dg_cc)


Out[0]:
[TokenNode(index=3, token=u'served', pos=u'VBN', lemma=u'serve', ner=u'O', char_offsets=(15, 21)),
 TokenNode(index=10, token=u'representing', pos=u'VBG', lemma=u'represent', ner=u'O', char_offsets=(60, 72)),
 TokenNode(index=8, token=u'consultant', pos=u'NN', lemma=u'consultant', ner=u'O', char_offsets=(48, 58)),
 TokenNode(index=9, token=u',', pos=u',', lemma=u',', ner=u'O', char_offsets=(58, 59)),
 TokenNode(index=19, token=u'DC', pos=u'NNP', lemma=u'DC', ner=u'LOCATION', char_offsets=(121, 123)),
 TokenNode(index=18, token=u'Washington', pos=u'NNP', lemma=u'Washington', ner=u'LOCATION', char_offsets=(110, 120)),
 TokenNode(index=20, token=u'.', pos=u'.', lemma=u'.', ner=u'O', char_offsets=(123, 124)),
 TokenNode(index=15, token=u',', pos=u',', lemma=u',', ner=u'O', char_offsets=(98, 99)),
 TokenNode(index=7, token=u'relations', pos=u'NNS', lemma=u'relation', ner=u'O', char_offsets=(38, 47)),
 TokenNode(index=0, token=u'Vicki', pos=u'NNP', lemma=u'Vicki', ner=u'PERSON', char_offsets=(0, 5)),
 TokenNode(index=6, token=u'government', pos=u'NN', lemma=u'government', ner=u'O', char_offsets=(27, 37)),
 TokenNode(index=5, token=u'a', pos=u'DT', lemma=u'a', ner=u'O', char_offsets=(25, 26)),
 TokenNode(index=17, token=u'in', pos=u'IN', lemma=u'in', ner=u'O', char_offsets=(107, 109)),
 TokenNode(index=1, token=u'has', pos=u'VBZ', lemma=u'have', ner=u'O', char_offsets=(6, 9)),
 TokenNode(index=12, token=u'interests', pos=u'NNS', lemma=u'interest', ner=u'O', char_offsets=(77, 86)),
 TokenNode(index=11, token=u'the', pos=u'DT', lemma=u'the', ner=u'O', char_offsets=(73, 76)),
 TokenNode(index=2, token=u'also', pos=u'RB', lemma=u'also', ner=u'O', char_offsets=(10, 14)),
 TokenNode(index=4, token=u'as', pos=u'IN', lemma=u'as', ner=u'O', char_offsets=(22, 24)),
 TokenNode(index=16, token=u'Oregon', pos=u'NNP', lemma=u'Oregon', ner=u'LOCATION', char_offsets=(100, 106)),
 TokenNode(index=13, token=u'of', pos=u'IN', lemma=u'of', ner=u'O', char_offsets=(87, 89)),
 TokenNode(index=14, token=u'Portland', pos=u'NNP', lemma=u'Portland', ner=u'LOCATION', char_offsets=(90, 98))]

In [0]:
nx.topological_sort_recursive(twoverb_dg_cc)


Out[0]:
[TokenNode(index=3, token=u'served', pos=u'VBN', lemma=u'serve', ner=u'O', char_offsets=(15, 21)),
 TokenNode(index=10, token=u'representing', pos=u'VBG', lemma=u'represent', ner=u'O', char_offsets=(60, 72)),
 TokenNode(index=8, token=u'consultant', pos=u'NN', lemma=u'consultant', ner=u'O', char_offsets=(48, 58)),
 TokenNode(index=9, token=u',', pos=u',', lemma=u',', ner=u'O', char_offsets=(58, 59)),
 TokenNode(index=19, token=u'DC', pos=u'NNP', lemma=u'DC', ner=u'LOCATION', char_offsets=(121, 123)),
 TokenNode(index=18, token=u'Washington', pos=u'NNP', lemma=u'Washington', ner=u'LOCATION', char_offsets=(110, 120)),
 TokenNode(index=20, token=u'.', pos=u'.', lemma=u'.', ner=u'O', char_offsets=(123, 124)),
 TokenNode(index=15, token=u',', pos=u',', lemma=u',', ner=u'O', char_offsets=(98, 99)),
 TokenNode(index=7, token=u'relations', pos=u'NNS', lemma=u'relation', ner=u'O', char_offsets=(38, 47)),
 TokenNode(index=0, token=u'Vicki', pos=u'NNP', lemma=u'Vicki', ner=u'PERSON', char_offsets=(0, 5)),
 TokenNode(index=6, token=u'government', pos=u'NN', lemma=u'government', ner=u'O', char_offsets=(27, 37)),
 TokenNode(index=5, token=u'a', pos=u'DT', lemma=u'a', ner=u'O', char_offsets=(25, 26)),
 TokenNode(index=17, token=u'in', pos=u'IN', lemma=u'in', ner=u'O', char_offsets=(107, 109)),
 TokenNode(index=1, token=u'has', pos=u'VBZ', lemma=u'have', ner=u'O', char_offsets=(6, 9)),
 TokenNode(index=12, token=u'interests', pos=u'NNS', lemma=u'interest', ner=u'O', char_offsets=(77, 86)),
 TokenNode(index=11, token=u'the', pos=u'DT', lemma=u'the', ner=u'O', char_offsets=(73, 76)),
 TokenNode(index=2, token=u'also', pos=u'RB', lemma=u'also', ner=u'O', char_offsets=(10, 14)),
 TokenNode(index=4, token=u'as', pos=u'IN', lemma=u'as', ner=u'O', char_offsets=(22, 24)),
 TokenNode(index=16, token=u'Oregon', pos=u'NNP', lemma=u'Oregon', ner=u'LOCATION', char_offsets=(100, 106)),
 TokenNode(index=13, token=u'of', pos=u'IN', lemma=u'of', ner=u'O', char_offsets=(87, 89)),
 TokenNode(index=14, token=u'Portland', pos=u'NNP', lemma=u'Portland', ner=u'LOCATION', char_offsets=(90, 98))]

Test data


In [0]:
h2t = html2text.HTML2Text()

In [0]:
h2t.body_width = 0
h2t.unicode_snob = 1
h2t.emphasis_mark = ''

In [0]:
def asciify(bio):
    # decompose accented characters, force to ASCII, then scrub the
    # 'a??'-style mojibake left behind by the replacement characters
    asciitext = re.sub(r'[Aa]\?\?', ' ', unicodedata.normalize('NFD', bio).encode('ascii', 'replace'))
    return asciitext

def filter_lists(asciitext):
    # drop bulleted list lines, which rarely contain parseable sentences
    for l in re.split(r'(\r\n|\n)', asciitext):
        lstrip = l.strip()
        if len(lstrip) > 0:
            if lstrip[0] not in ['*', '-']:
                yield l

def clean_bio(bio):
    # convert HTML to text, then strip out list items
    text = h2t.handle(bio)
    return '\n'.join([l for l in filter_lists(text)])

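A quick check of the cleaning pipeline on a small snippet (hypothetical input, just to show the list filtering):

In [0]:
sample_html = u'<p>Jane Doe advises clients.</p><ul><li>Harvard Law, J.D.</li></ul>'
print clean_bio(sample_html)
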
In [0]:
manatt_data = json.load(open('data/manatt-out-html-full.json'))

In [0]:
manatt_data[0]


Out[0]:
{u'bio': [u'<div id="profile-education"><strong>Education</strong><br/><education>&#13;\n  <p>University of San Diego School of Law, J.D., May 1994<br/>Appellate Moot Court Board, National Team Member</p>&#13;\n  <p class="resumetext">&#160;</p>&#13;\n  <p class="resumetext">University of California at San Diego, B.A., cum laude<em>,</em>&#160;in political science with distinction, December 1990</p>&#13;\n</education></div>',
  u'<div id="ctl00_MainContent_DropZone1_columnDisplay_ctl00_controlcolumn_ctl00_WidgetHost_WidgetHost_widget_admissions">&#13;\n                    <strong>Bar Admissions</strong><br/>&#13;\n                    <span id="ctl00_MainContent_DropZone1_columnDisplay_ctl00_controlcolumn_ctl00_WidgetHost_WidgetHost_widget_lblbarAdmissions"><baradmissions>California</baradmissions></span>&#13;\n                </div>&#13;',
  u'<professionalexperience><p class="resumetext">Jack&#160;Yeh is a trial lawyer whose practice encompasses a wide range of complex commercial litigation matters, including competitive business litigation, unfair competition, government litigation, land use, intellectual property, entertainment, and other general business contract and tort claims.&#160;Mr. Yeh represents both publicly traded companies and emerging businesses in the advertising, entertainment and media, energy, real estate and technology industries.</p>&#13;\n<p class="resumetext">Mr. Yeh has been publicly recognized by the National Asian Pacific American Bar Association, which named him one of 25 of the nation&#8217;s &#8220;Best Lawyers Under 40; by <em>Los Angeles</em> Magazine, which, on multiple occasions, has accorded him its prestigious &#8220;Southern California Superlawyer&#8221; designation; and by LawDragon.com, a national portal listing of lawyers, which named him to its <em>500 New Stars, New Worlds</em> list.</p>&#13;\n<p class="resumetext">Representing his clients in federal and state courts throughout the country, Mr. Yeh has amassed substantial experience litigating commercial disputes at all stages, including arbitrations, mediations, court and jury trials, and appeals, including before the California Court of Appeal, the Ninth Circuit Court of Appeals and the California Supreme Court.</p>&#13;\n<p class="resumetext">In addition to his considerable trial experience, Mr. Yeh has developed a particular expertise in the litigation of complex consumer and competitor class action brought under California&#8217;s Unfair Competition Law (California Business and Professions Code Section 17200); governmental-related matters involving local and regional public agencies, such as the City of Los Angeles, the Los Angeles Unified School District and the Los Angeles County Metropolitan Transportation Authority; and matters involving State of California agencies, including the Department of General Services, the Department of Conservation, the Department of Transportation, and the Transportation Commission.&#160;Mr. Yeh has developed, coordinated and implemented successful litigation strategies for project applicants and public agencies at the trial court and appellate levels on a variety of subject matter, including the California Environmental Quality Act, public contracting, competitive bidding, negotiated procurement, conflicts of interest, political contributions and the California Public Records Act.</p>&#13;\n</professionalexperience>'],
 u'name': [u'<span id="ctl00_MainContent_DropZone1_columnDisplay_ctl00_controlcolumn_ctl00_WidgetHost_WidgetHost_widget_lblFullName">Jack S. Yeh</span>'],
 u'url': u'http://manatt.com/JackYeh.aspx'}

In [0]:
s = manatt_data[0]['bio'][2]

In [0]:
h2t.handle(s)


Out[0]:
u'Jack\xa0Yeh is a trial lawyer whose practice encompasses a wide range of complex commercial litigation matters, including competitive business litigation, unfair competition, government litigation, land use, intellectual property, entertainment, and other general business contract and tort claims.\xa0Mr. Yeh represents both publicly traded companies and emerging businesses in the advertising, entertainment and media, energy, real estate and technology industries.\n\nMr. Yeh has been publicly recognized by the National Asian Pacific American Bar Association, which named him one of 25 of the nation\u2019s \u201cBest Lawyers Under 40; by Los Angeles Magazine, which, on multiple occasions, has accorded him its prestigious \u201cSouthern California Superlawyer\u201d designation; and by LawDragon.com, a national portal listing of lawyers, which named him to its 500 New Stars, New Worlds list.\n\nRepresenting his clients in federal and state courts throughout the country, Mr. Yeh has amassed substantial experience litigating commercial disputes at all stages, including arbitrations, mediations, court and jury trials, and appeals, including before the California Court of Appeal, the Ninth Circuit Court of Appeals and the California Supreme Court.\n\nIn addition to his considerable trial experience, Mr. Yeh has developed a particular expertise in the litigation of complex consumer and competitor class action brought under California\u2019s Unfair Competition Law (California Business and Professions Code Section 17200); governmental-related matters involving local and regional public agencies, such as the City of Los Angeles, the Los Angeles Unified School District and the Los Angeles County Metropolitan Transportation Authority; and matters involving State of California agencies, including the Department of General Services, the Department of Conservation, the Department of Transportation, and the Transportation Commission.\xa0Mr. Yeh has developed, coordinated and implemented successful litigation strategies for project applicants and public agencies at the trial court and appellate levels on a variety of subject matter, including the California Environmental Quality Act, public contracting, competitive bidding, negotiated procurement, conflicts of interest, political contributions and the California Public Records Act.\n'

In [0]:
h2t.handle(manatt_data[0]['bio'][0]).strip()


Out[0]:
u'**Education**  \n\n\nUniversity of San Diego School of Law, J.D., May 1994  \nAppellate Moot Court Board, National Team Member\n\n\xa0\n\nUniversity of California at San Diego, B.A., cum laude,\xa0in political science with distinction, December 1990'

In [0]:
def parse_to_predicates(_parsed):
    # yield predicate-argument structures for every sentence in a parsed doc
    for sent in _parsed['sentences']:
        _dg = build_dep_graph(sent)
        try:
            for pas in collect_all_predicates(_dg):
                yield pas
        except nx.NetworkXException:
            # guard against graphs that cannot be topologically sorted
            # (e.g. if a cycle slips through)
            continue

In [0]:
def bio_to_parses(bio):
    parses = []
    for text in bio:
        for sent in parser.parse_doc(clean_bio(text))['sentences']:
            parses.append(sent)
    return parses

def process_data(d):
    d['parses'] = bio_to_parses(d['bio'])
    d['name'] = h2t.handle(d['name'][0])
    return d

In [0]:
%time processed = [process_data(d) for d in manatt_data[0:1]]

In [0]:
processed[0]

In [0]:
manatt_data[0]['bio'][-1]

In [0]:
p = nn_parser.parse_doc(clean_bio(manatt_data[0]['bio'][-1]))

In [0]:
display_parse(build_dep_graph(p['sentences'][-2]),'test2')
Image('images/test2.png')

In [0]:
all_preds = [collect_all_predicates(build_dep_graph(p)) for p in processed[0]['parses']]

In [0]:
all_preds

In [0]:
display_parse(build_dep_graph(processed[0]['parses'][-2]), 'test')
Image('images/test.png')

In [0]:
dg = build_dep_graph(p['sentences'][-2])

In [0]:
pd.DataFrame.from_records(dg.nodes(), columns=dg.nodes()[0].__dict__.keys()).sort('index')

In [0]:
collect_all_predicates(dg)

In [0]: