In [176]:
import sys
import os
import re
from glob import glob
import pprint

In [3]:
from lxml import etree
from collections import Counter

In [4]:
PROJ_DIR = os.pardir
sys.path.append(os.path.abspath(PROJ_DIR))

In [5]:
import settings

In [6]:
sys.path


Out[6]:
['',
 '/home/blannon/.virtualenvs/lobbying/local/lib/python2.7/site-packages/setuptools-0.6c11-py2.7.egg',
 '/home/blannon/.virtualenvs/lobbying/lib/python2.7/site-packages/setuptools-0.6c11-py2.7.egg',
 '/home/blannon/.virtualenvs/lobbying/lib/python2.7',
 '/home/blannon/.virtualenvs/lobbying/lib/python2.7/plat-x86_64-linux-gnu',
 '/home/blannon/.virtualenvs/lobbying/lib/python2.7/lib-tk',
 '/home/blannon/.virtualenvs/lobbying/lib/python2.7/lib-old',
 '/home/blannon/.virtualenvs/lobbying/lib/python2.7/lib-dynload',
 '/usr/lib/python2.7',
 '/usr/lib/python2.7/plat-x86_64-linux-gnu',
 '/usr/lib/python2.7/lib-tk',
 '/home/blannon/.virtualenvs/lobbying/local/lib/python2.7/site-packages',
 '/home/blannon/.virtualenvs/lobbying/lib/python2.7/site-packages',
 '/home/blannon/.virtualenvs/lobbying/local/lib/python2.7/site-packages/IPython/extensions',
 '/home/blannon/dev/influence-usa/lobbying-federal-domestic']

In [7]:
HOUSE_ORIG = os.path.join(settings.ORIG_DIR, 'house_xml')

In [8]:
!tree -d ../data/original/house_xml


../data/original/house_xml
├── LD1
│   ├── 2004
│   │   └── ALL
│   ├── 2005
│   │   └── ALL
│   ├── 2006
│   │   └── ALL
│   ├── 2007
│   │   └── ALL
│   ├── 2008
│   │   └── ALL
│   ├── 2009
│   │   └── ALL
│   ├── 2010
│   │   └── ALL
│   ├── 2011
│   │   └── ALL
│   ├── 2012
│   │   └── ALL
│   ├── 2013
│   │   └── ALL
│   └── 2014
│       └── ALL
└── LD2
    ├── 2004
    │   ├── Q2
    │   └── Q4
    ├── 2005
    │   ├── Q2
    │   └── Q4
    ├── 2006
    │   ├── Q2
    │   └── Q4
    ├── 2007
    │   ├── Q2
    │   └── Q4
    ├── 2008
    │   ├── Q1
    │   ├── Q2
    │   ├── Q3
    │   └── Q4
    ├── 2009
    │   ├── Q1
    │   ├── Q2
    │   ├── Q3
    │   └── Q4
    ├── 2010
    │   ├── Q1
    │   ├── Q2
    │   ├── Q3
    │   └── Q4
    ├── 2011
    │   ├── Q1
    │   ├── Q2
    │   ├── Q3
    │   └── Q4
    ├── 2012
    │   ├── Q1
    │   ├── Q2
    │   ├── Q3
    │   └── Q4
    ├── 2013
    │   ├── Q1
    │   ├── Q2
    │   ├── Q3
    │   └── Q4
    └── 2014
        ├── Q1
        └── Q2

69 directories

In [9]:
LD1_DIR = os.path.join(HOUSE_ORIG,'LD1')
LD2_DIR = os.path.join(HOUSE_ORIG,'LD2')

Utility functions


In [183]:
def get_root_tag(fname):
    _tree = etree.parse(open(fname))
    return _tree.getroot().tag

def get_top_level_fields(fname):
    _tree = etree.parse(open(fname))
    children = _tree.getroot().getchildren()
    return [c.tag for c in children]

def fields_nonempty(node):
    if node is not None:
        texts = filter(lambda x: len(re.sub(r'\s+', '', x)) > 0, node.xpath('.//text()'))
        return len(texts) > 0
    else:
        False

In [167]:
def all_files(file_list, inspect_function, **kwargs):
    for _fname in file_list:
        try:
            yield inspect_function(_fname, **kwargs)
        except etree.XMLSyntaxError as e:
            sys.stderr.write("issue with {fn}".format(fn=_fname))
            sys.stderr.write(str(e))
            continue
            
def at_least_one_file(file_list, inspect_function, **kwargs):
    for _fname in file_list:
        try:
            result = inspect_function(_fname, **kwargs)
            if result:
                sys.stderr.write("found in {fn}".format(fn=_fname))
                return result
        except etree.XMLSyntaxError as e:
            sys.stderr.write("issue with {fn}".format(fn=_fname))
            sys.stderr.write(str(e))
            continue

def find_egs(_fname, **kwargs):
    xpath_query = kwargs['xpath_query']
    with open(_fname) as fin:
        tree = etree.parse(fin)
        root = tree.getroot()
        egs = root.xpath(xpath_query)
        if egs:
            return egs
            #for eg in egs:
                #print etree.tostring(eg, pretty_print=True)
     
def find_all_parents(_fname):
    parents = set()
    with open(_fname) as fin:
        tree = etree.parse(fin)
        root = tree.getroot()
        egs = root.xpath('.//*')
        if egs:
            for eg in egs:
                parent = eg.getparent()
                parents.add(parent.tag)
    return parents

def find_nonempty_egs(_fname, **kwargs):
    xpath_query = kwargs['xpath_query']
    with open(_fname) as fin:
        tree = etree.parse(fin)
        root = tree.getroot()
        egs = filter(fields_nonempty, root.xpath(xpath_query))
        if egs:
            return egs
        
def find_egs_with_multiple_children(_fname, **kwargs):
    xpath_query = kwargs['xpath_query']
    with open(_fname) as fin:
        tree = etree.parse(fin)
        root = tree.getroot()
        egs = filter(lambda x: len(x.getchildren()) > 1, root.xpath(xpath_query))
        if egs:
            return egs

Form LD-1


In [12]:
LD1_files = glob(os.path.join(LD1_DIR, '*', '*', '*.xml'))

In [13]:
LD1_files[0:5]


Out[13]:
['/home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD1/2013/ALL/300612247.xml',
 '/home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD1/2013/ALL/300546614.xml',
 '/home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD1/2013/ALL/300620793.xml',
 '/home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD1/2013/ALL/300631469.xml',
 '/home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD1/2013/ALL/300614078.xml']

In [14]:
len(LD1_files)


Out[14]:
55680

In [15]:
LD1tree = etree.parse(open(LD1_files[0]))

In [16]:
LD1tree.getroot()


Out[16]:
<Element LOBBYINGDISCLOSURE1 at 0x7fa23810bf38>

In [17]:
LD1r = LD1tree.getroot()

In [18]:
LD1r.tag


Out[18]:
'LOBBYINGDISCLOSURE1'

In [67]:
LD1c = LD1r.getchildren()[0]

In [68]:
LD1c.getparent()


Out[68]:
<Element LOBBYINGDISCLOSURE1 at 0x7fa23810bf38>

In [69]:
LD1r.getparent()

In [130]:
#formtypes = Counter()
#for f in LD1_files:
#    tree = etree.parse(open(f))
#    formtypes.update([tree.getroot().tag,])
#formtypes


Out[130]:
Counter({'LOBBYINGDISCLOSURE1': 55652, '{http://www.PureEdge.com/XFDL/Custom}LOBBYINGDISCLOSURE1': 27, 'LOBBYINGDISCLOSURE2': 1})

In [19]:
print etree.tostring(LD1tree, pretty_print=True)


<?xml-stylesheet type='text/xsl' href='../../ld1.xsl'?>
<LOBBYINGDISCLOSURE1>
   <imported>Y</imported>
   <pages>2</pages>
   <regType>3</regType>
   <organizationName>Patton Boggs LLP</organizationName>
   <prefix/>
   <firstName/>
   <lastName/>
   <address1>2550 M STREET, NW</address1>
   <address2/>
   <city>WASHINGTON</city>
   <state>DC</state>
   <zip>20037</zip>
   <zipext/>
   <country>USA</country>
   <principal_city/>
   <principal_state/>
   <principal_zip/>
   <principal_zipext/>
   <principal_country/>
   <registrantGeneralDescription>Law firm</registrantGeneralDescription>
   <selfSelect>N</selfSelect>
   <clientName>Siemens Corportation</clientName>
   <clientAddress>300 New Jersey Avenue SE</clientAddress>
   <clientCity>Washington </clientCity>
   <clientState>DC</clientState>
   <clientZip>20001</clientZip>
   <clientZipExt/>
   <clientCountry>USA</clientCountry>
   <prinClientCity/>
   <prinClientState/>
   <prinClientZip/>
   <prinClientZipExt/>
   <prinClientCountry/>
   <clientGeneralDescription> Global corporation focused on energy, health, smart cities and transportation. </clientGeneralDescription>
   <senateID>30906-2551</senateID>
   <houseID>319170238</houseID>
   <lobbyists>
      <lobbyist>
         <lobbyistFirstName>Norma</lobbyistFirstName>
         <lobbyistLastName>Krayem</lobbyistLastName>
         <lobbyistSuffix/>
         <coveredPosition>DCSDOT99-01DAFedRailAdmin99-SofSt97-99DSPDOS96-97</coveredPosition>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
      <lobbyist>
         <lobbyistFirstName/>
         <lobbyistLastName/>
         <lobbyistSuffix/>
         <coveredPosition/>
         <lobbyistNew>Y</lobbyistNew>
      </lobbyist>
   </lobbyists>
   <alis>
      <ali_Code>TRA</ali_Code>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
      <ali_Code/>
   </alis>
   <specific_issues>High Speed Rail issues 
</specific_issues>
   <affiliatedUrl/>
   <affiliatedOrgs>
      <affiliatedOrg>
         <affiliatedOrgName/>
         <affiliatedOrgAddress/>
         <affiliatedOrgCity/>
         <affiliatedOrgState/>
         <affiliatedOrgZip/>
         <affiliatedOrgCountry/>
         <affiliatedPrinOrgCity/>
         <affiliatedPrinOrgState/>
         <affiliatedPrinOrgCountry/>
      </affiliatedOrg>
      <affiliatedOrg>
         <affiliatedOrgName/>
         <affiliatedOrgAddress/>
         <affiliatedOrgCity/>
         <affiliatedOrgState/>
         <affiliatedOrgZip/>
         <affiliatedOrgCountry/>
         <affiliatedPrinOrgCity/>
         <affiliatedPrinOrgState/>
         <affiliatedPrinOrgCountry/>
      </affiliatedOrg>
      <affiliatedOrg>
         <affiliatedOrgName/>
         <affiliatedOrgAddress/>
         <affiliatedOrgCity/>
         <affiliatedOrgState/>
         <affiliatedOrgZip/>
         <affiliatedOrgCountry/>
         <affiliatedPrinOrgCity/>
         <affiliatedPrinOrgState/>
         <affiliatedPrinOrgCountry/>
      </affiliatedOrg>
      <affiliatedOrg>
         <affiliatedOrgName/>
         <affiliatedOrgAddress/>
         <affiliatedOrgCity/>
         <affiliatedOrgState/>
         <affiliatedOrgZip/>
         <affiliatedOrgCountry/>
         <affiliatedPrinOrgCity/>
         <affiliatedPrinOrgState/>
         <affiliatedPrinOrgCountry/>
      </affiliatedOrg>
      <affiliatedOrg>
         <affiliatedOrgName/>
         <affiliatedOrgAddress/>
         <affiliatedOrgCity/>
         <affiliatedOrgState/>
         <affiliatedOrgZip/>
         <affiliatedOrgCountry/>
         <affiliatedPrinOrgCity/>
         <affiliatedPrinOrgState/>
         <affiliatedPrinOrgCountry/>
      </affiliatedOrg>
      <affiliatedOrg>
         <affiliatedOrgName/>
         <affiliatedOrgAddress/>
         <affiliatedOrgCity/>
         <affiliatedOrgState/>
         <affiliatedOrgZip/>
         <affiliatedOrgCountry/>
         <affiliatedPrinOrgCity/>
         <affiliatedPrinOrgState/>
         <affiliatedPrinOrgCountry/>
      </affiliatedOrg>
      <affiliatedOrg>
         <affiliatedOrgName/>
         <affiliatedOrgAddress/>
         <affiliatedOrgCity/>
         <affiliatedOrgState/>
         <affiliatedOrgZip/>
         <affiliatedOrgCountry/>
         <affiliatedPrinOrgCity/>
         <affiliatedPrinOrgState/>
         <affiliatedPrinOrgCountry/>
      </affiliatedOrg>
      <affiliatedOrg>
         <affiliatedOrgName/>
         <affiliatedOrgAddress/>
         <affiliatedOrgCity/>
         <affiliatedOrgState/>
         <affiliatedOrgZip/>
         <affiliatedOrgCountry/>
         <affiliatedPrinOrgCity/>
         <affiliatedPrinOrgState/>
         <affiliatedPrinOrgCountry/>
      </affiliatedOrg>
      <affiliatedOrg>
         <affiliatedOrgName/>
         <affiliatedOrgAddress/>
         <affiliatedOrgCity/>
         <affiliatedOrgState/>
         <affiliatedOrgZip/>
         <affiliatedOrgCountry/>
         <affiliatedPrinOrgCity/>
         <affiliatedPrinOrgState/>
         <affiliatedPrinOrgCountry/>
      </affiliatedOrg>
      <affiliatedOrg>
         <affiliatedOrgName/>
         <affiliatedOrgAddress/>
         <affiliatedOrgCity/>
         <affiliatedOrgState/>
         <affiliatedOrgZip/>
         <affiliatedOrgCountry/>
         <affiliatedPrinOrgCity/>
         <affiliatedPrinOrgState/>
         <affiliatedPrinOrgCountry/>
      </affiliatedOrg>
      <affiliatedOrg>
         <affiliatedOrgName/>
         <affiliatedOrgAddress/>
         <affiliatedOrgCity/>
         <affiliatedOrgState/>
         <affiliatedOrgZip/>
         <affiliatedOrgCountry/>
         <affiliatedPrinOrgCity/>
         <affiliatedPrinOrgState/>
         <affiliatedPrinOrgCountry/>
      </affiliatedOrg>
      <affiliatedOrg>
         <affiliatedOrgName/>
         <affiliatedOrgAddress/>
         <affiliatedOrgCity/>
         <affiliatedOrgState/>
         <affiliatedOrgZip/>
         <affiliatedOrgCountry/>
         <affiliatedPrinOrgCity/>
         <affiliatedPrinOrgState/>
         <affiliatedPrinOrgCountry/>
      </affiliatedOrg>
   </affiliatedOrgs>
   <foreignEntities>
      <foreignEntity>
         <name/>
         <address/>
         <city/>
         <state/>
         <country/>
         <prinCity/>
         <prinState/>
         <prinCountry/>
         <contribution/>
         <ownership_Percentage/>
      </foreignEntity>
      <foreignEntity>
         <name/>
         <address/>
         <city/>
         <state/>
         <country/>
         <prinCity/>
         <prinState/>
         <prinCountry/>
         <contribution/>
         <ownership_Percentage/>
      </foreignEntity>
      <foreignEntity>
         <name/>
         <address/>
         <city/>
         <state/>
         <country/>
         <prinCity/>
         <prinState/>
         <prinCountry/>
         <contribution/>
         <ownership_Percentage/>
      </foreignEntity>
      <foreignEntity>
         <name/>
         <address/>
         <city/>
         <state/>
         <country/>
         <prinCity/>
         <prinState/>
         <prinCountry/>
         <contribution/>
         <ownership_Percentage/>
      </foreignEntity>
      <foreignEntity>
         <name/>
         <address/>
         <city/>
         <state/>
         <country/>
         <prinCity/>
         <prinState/>
         <prinCountry/>
         <contribution/>
         <ownership_Percentage/>
      </foreignEntity>
      <foreignEntity>
         <name/>
         <address/>
         <city/>
         <state/>
         <country/>
         <prinCity/>
         <prinState/>
         <prinCountry/>
         <contribution/>
         <ownership_Percentage/>
      </foreignEntity>
      <foreignEntity>
         <name/>
         <address/>
         <city/>
         <state/>
         <country/>
         <prinCity/>
         <prinState/>
         <prinCountry/>
         <contribution/>
         <ownership_Percentage/>
      </foreignEntity>
      <foreignEntity>
         <name/>
         <address/>
         <city/>
         <state/>
         <country/>
         <prinCity/>
         <prinState/>
         <prinCountry/>
         <contribution/>
         <ownership_Percentage/>
      </foreignEntity>
      <foreignEntity>
         <name/>
         <address/>
         <city/>
         <state/>
         <country/>
         <prinCity/>
         <prinState/>
         <prinCountry/>
         <contribution/>
         <ownership_Percentage/>
      </foreignEntity>
      <foreignEntity>
         <name/>
         <address/>
         <city/>
         <state/>
         <country/>
         <prinCity/>
         <prinState/>
         <prinCountry/>
         <contribution/>
         <ownership_Percentage/>
      </foreignEntity>
      <foreignEntity>
         <name/>
         <address/>
         <city/>
         <state/>
         <country/>
         <prinCity/>
         <prinState/>
         <prinCountry/>
         <contribution/>
         <ownership_Percentage/>
      </foreignEntity>
   </foreignEntities>
   <reportYear>2013</reportYear>
   <reportType>RA</reportType>
   <effectiveDate>12/06/2013</effectiveDate>
   <printedName>James B. Christian, Partner</printedName>
   <signedDate>01/09/2014</signedDate>
</LOBBYINGDISCLOSURE1>


In [25]:
#LD1_top_level_counts = Counter()
#for children_list in all_files(LD1_files, get_top_level_fields):
#    LD1_top_level_counts.update(children_list)
#LD1_top_level_counts
LD1_top_level_counts = Counter({'selfSelect': 55653, 'prefix': 55653, 'principal_zip': 55653, 'houseID': 55653, 'address1': 55653, 'address2': 55653, 'organizationName': 55653, 'clientName': 55653, 'printedName': 55653, 'senateID': 55653, 'principal_zipext': 55653, 'state': 55653, 'reportYear': 55653, 'lastName': 55653, 'zipext': 55653, 'city': 55653, 'zip': 55653, 'reportType': 55653, 'firstName': 55653, 'country': 55653, 'alis': 55653, 'principal_state': 55653, 'signedDate': 55653, 'principal_city': 55653, 'pages': 55653, 'principal_country': 55653, 'registrantGeneralDescription': 55652, 'clientAddress': 55652, 'foreignEntities': 55652, 'clientCity': 55652, 'prinClientCountry': 55652, 'effectiveDate': 55652, 'prinClientState': 55652, 'regType': 55652, 'clientState': 55652, 'specific_issues': 55652, 'lobbyists': 55652, 'clientCountry': 55652, 'prinClientZipExt': 55652, 'affiliatedOrgs': 55652, 'clientGeneralDescription': 55652, 'prinClientCity': 55652, 'clientZipExt': 55652, 'prinClientZip': 55652, 'clientZip': 55652, 'affiliatedUrl': 36819, 'imported': 32332, 'signerEmail': 22722, 'clientGovtEntity': 4487, 'contactIntlPhone': 3087, 'contactPhone': 422, 'contactPrefix': 418, 'contactEmail': 418, 'contactName': 418, '{http://www.PureEdge.com/XFDL/Custom}clientCity': 27, '{http://www.PureEdge.com/XFDL/Custom}specific_issues': 27, '{http://www.PureEdge.com/XFDL/Custom}principal_zipext': 27, '{http://www.PureEdge.com/XFDL/Custom}organizationName': 27, '{http://www.PureEdge.com/XFDL/Custom}principal_city': 27, '{http://www.PureEdge.com/XFDL/Custom}principal_zip': 27, '{http://www.PureEdge.com/XFDL/Custom}clientName': 27, '{http://www.PureEdge.com/XFDL/Custom}city': 27, '{http://www.PureEdge.com/XFDL/Custom}clientZip': 27, '{http://www.PureEdge.com/XFDL/Custom}prinClientCity': 27, '{http://www.PureEdge.com/XFDL/Custom}foreignEntities': 27, '{http://www.PureEdge.com/XFDL/Custom}regType': 27, '{http://www.PureEdge.com/XFDL/Custom}senateID': 27, '{http://www.PureEdge.com/XFDL/Custom}prefix': 27, '{http://www.PureEdge.com/XFDL/Custom}effectiveDate': 27, '{http://www.PureEdge.com/XFDL/Custom}state': 27, '{http://www.PureEdge.com/XFDL/Custom}registrantGeneralDescription': 27, '{http://www.PureEdge.com/XFDL/Custom}alis': 27, '{http://www.PureEdge.com/XFDL/Custom}lastName': 27, '{http://www.PureEdge.com/XFDL/Custom}prinClientCountry': 27, '{http://www.PureEdge.com/XFDL/Custom}prinClientZip': 27, '{http://www.PureEdge.com/XFDL/Custom}lobbyists': 27, '{http://www.PureEdge.com/XFDL/Custom}reportType': 27, '{http://www.PureEdge.com/XFDL/Custom}clientZipExt': 27, '{http://www.PureEdge.com/XFDL/Custom}reportYear': 27, '{http://www.PureEdge.com/XFDL/Custom}selfSelect': 27, '{http://www.PureEdge.com/XFDL/Custom}country': 27, '{http://www.PureEdge.com/XFDL/Custom}clientState': 27, '{http://www.PureEdge.com/XFDL/Custom}zip': 27, '{http://www.PureEdge.com/XFDL/Custom}firstName': 27, '{http://www.PureEdge.com/XFDL/Custom}signedDate': 27, '{http://www.PureEdge.com/XFDL/Custom}zipext': 27, '{http://www.PureEdge.com/XFDL/Custom}address1': 27, '{http://www.PureEdge.com/XFDL/Custom}principal_state': 27, '{http://www.PureEdge.com/XFDL/Custom}principal_country': 27, '{http://www.PureEdge.com/XFDL/Custom}prinClientState': 27, '{http://www.PureEdge.com/XFDL/Custom}pages': 27, '{http://www.PureEdge.com/XFDL/Custom}affiliatedOrgs': 27, '{http://www.PureEdge.com/XFDL/Custom}clientGeneralDescription': 27, '{http://www.PureEdge.com/XFDL/Custom}clientAddress': 27, '{http://www.PureEdge.com/XFDL/Custom}prinClientZipExt': 27, '{http://www.PureEdge.com/XFDL/Custom}clientCountry': 27, '{http://www.PureEdge.com/XFDL/Custom}houseID': 27, '{http://www.PureEdge.com/XFDL/Custom}address2': 27, '{http://www.PureEdge.com/XFDL/Custom}signerEmail': 27, '{http://www.PureEdge.com/XFDL/Custom}printedName': 27, '{http://www.PureEdge.com/XFDL/Custom}contactEmail': 20, '{http://www.PureEdge.com/XFDL/Custom}contactPrefix': 20, '{http://www.PureEdge.com/XFDL/Custom}contactName': 20, '{http://www.PureEdge.com/XFDL/Custom}contactPhone': 20, '{http://www.PureEdge.com/XFDL/Custom}contactIntlPhone': 20, 'noLobbying': 1, 'updates': 1, 'registrantDifferentAddress': 1, 'terminationDate': 1, 'submitURL': 1, 'expensesMethod': 1, 'expenses': 1, 'income': 1})

In [28]:
LD1_sorted_top_level_counts = sorted(LD1_top_level_counts.items(), key=lambda x: -x[1])
LD1_sorted_top_level_counts


Out[28]:
[('zipext', 55653),
 ('selfSelect', 55653),
 ('prefix', 55653),
 ('city', 55653),
 ('zip', 55653),
 ('pages', 55653),
 ('reportType', 55653),
 ('principal_zip', 55653),
 ('houseID', 55653),
 ('address1', 55653),
 ('address2', 55653),
 ('principal_state', 55653),
 ('signedDate', 55653),
 ('organizationName', 55653),
 ('firstName', 55653),
 ('alis', 55653),
 ('clientName', 55653),
 ('printedName', 55653),
 ('senateID', 55653),
 ('country', 55653),
 ('principal_city', 55653),
 ('principal_zipext', 55653),
 ('state', 55653),
 ('reportYear', 55653),
 ('lastName', 55653),
 ('principal_country', 55653),
 ('registrantGeneralDescription', 55652),
 ('prinClientZip', 55652),
 ('specific_issues', 55652),
 ('lobbyists', 55652),
 ('prinClientZipExt', 55652),
 ('clientCountry', 55652),
 ('clientAddress', 55652),
 ('clientGeneralDescription', 55652),
 ('foreignEntities', 55652),
 ('clientCity', 55652),
 ('affiliatedOrgs', 55652),
 ('prinClientCountry', 55652),
 ('prinClientCity', 55652),
 ('effectiveDate', 55652),
 ('clientZipExt', 55652),
 ('prinClientState', 55652),
 ('clientZip', 55652),
 ('regType', 55652),
 ('clientState', 55652),
 ('affiliatedUrl', 36819),
 ('imported', 32332),
 ('signerEmail', 22722),
 ('clientGovtEntity', 4487),
 ('contactIntlPhone', 3087),
 ('contactPhone', 422),
 ('contactPrefix', 418),
 ('contactEmail', 418),
 ('contactName', 418),
 ('{http://www.PureEdge.com/XFDL/Custom}firstName', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}selfSelect', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}country', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}clientCountry', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}zip', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}clientCity', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}specific_issues', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}organizationName', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_zipext', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_city', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_zip', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}zipext', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}printedName', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}pages', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}clientName', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_country', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}prinClientState', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}city', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}clientZip', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}affiliatedOrgs', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}prinClientCity', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}foreignEntities', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}regType', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}senateID', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}prefix', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}effectiveDate', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_state', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}clientAddress', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}state', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}registrantGeneralDescription', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}alis', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}prinClientZipExt', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}lastName', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}prinClientCountry', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}clientState', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}prinClientZip', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}lobbyists', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}clientGeneralDescription', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}reportType', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}signedDate', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}clientZipExt', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}houseID', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}address2', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}reportYear', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}address1', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}signerEmail', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}contactName', 20),
 ('{http://www.PureEdge.com/XFDL/Custom}contactPhone', 20),
 ('{http://www.PureEdge.com/XFDL/Custom}contactIntlPhone', 20),
 ('{http://www.PureEdge.com/XFDL/Custom}contactEmail', 20),
 ('{http://www.PureEdge.com/XFDL/Custom}contactPrefix', 20),
 ('noLobbying', 1),
 ('updates', 1),
 ('registrantDifferentAddress', 1),
 ('expenses', 1),
 ('income', 1),
 ('terminationDate', 1),
 ('submitURL', 1),
 ('expensesMethod', 1)]

In [29]:
sorted(LD1_top_level_counts.items(), key=lambda x: x[0])


Out[29]:
[('address1', 55653),
 ('address2', 55653),
 ('affiliatedOrgs', 55652),
 ('affiliatedUrl', 36819),
 ('alis', 55653),
 ('city', 55653),
 ('clientAddress', 55652),
 ('clientCity', 55652),
 ('clientCountry', 55652),
 ('clientGeneralDescription', 55652),
 ('clientGovtEntity', 4487),
 ('clientName', 55653),
 ('clientState', 55652),
 ('clientZip', 55652),
 ('clientZipExt', 55652),
 ('contactEmail', 418),
 ('contactIntlPhone', 3087),
 ('contactName', 418),
 ('contactPhone', 422),
 ('contactPrefix', 418),
 ('country', 55653),
 ('effectiveDate', 55652),
 ('expenses', 1),
 ('expensesMethod', 1),
 ('firstName', 55653),
 ('foreignEntities', 55652),
 ('houseID', 55653),
 ('imported', 32332),
 ('income', 1),
 ('lastName', 55653),
 ('lobbyists', 55652),
 ('noLobbying', 1),
 ('organizationName', 55653),
 ('pages', 55653),
 ('prefix', 55653),
 ('prinClientCity', 55652),
 ('prinClientCountry', 55652),
 ('prinClientState', 55652),
 ('prinClientZip', 55652),
 ('prinClientZipExt', 55652),
 ('principal_city', 55653),
 ('principal_country', 55653),
 ('principal_state', 55653),
 ('principal_zip', 55653),
 ('principal_zipext', 55653),
 ('printedName', 55653),
 ('regType', 55652),
 ('registrantDifferentAddress', 1),
 ('registrantGeneralDescription', 55652),
 ('reportType', 55653),
 ('reportYear', 55653),
 ('selfSelect', 55653),
 ('senateID', 55653),
 ('signedDate', 55653),
 ('signerEmail', 22722),
 ('specific_issues', 55652),
 ('state', 55653),
 ('submitURL', 1),
 ('terminationDate', 1),
 ('updates', 1),
 ('zip', 55653),
 ('zipext', 55653),
 ('{http://www.PureEdge.com/XFDL/Custom}address1', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}address2', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}affiliatedOrgs', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}alis', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}city', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}clientAddress', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}clientCity', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}clientCountry', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}clientGeneralDescription', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}clientName', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}clientState', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}clientZip', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}clientZipExt', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}contactEmail', 20),
 ('{http://www.PureEdge.com/XFDL/Custom}contactIntlPhone', 20),
 ('{http://www.PureEdge.com/XFDL/Custom}contactName', 20),
 ('{http://www.PureEdge.com/XFDL/Custom}contactPhone', 20),
 ('{http://www.PureEdge.com/XFDL/Custom}contactPrefix', 20),
 ('{http://www.PureEdge.com/XFDL/Custom}country', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}effectiveDate', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}firstName', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}foreignEntities', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}houseID', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}lastName', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}lobbyists', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}organizationName', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}pages', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}prefix', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}prinClientCity', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}prinClientCountry', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}prinClientState', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}prinClientZip', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}prinClientZipExt', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_city', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_country', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_state', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_zip', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_zipext', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}printedName', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}regType', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}registrantGeneralDescription', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}reportType', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}reportYear', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}selfSelect', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}senateID', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}signedDate', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}signerEmail', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}specific_issues', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}state', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}zip', 27),
 ('{http://www.PureEdge.com/XFDL/Custom}zipext', 27)]

In [54]:
LD1_top_level_counts.keys()


Out[54]:
['zipext',
 '{http://www.PureEdge.com/XFDL/Custom}firstName',
 'contactPhone',
 '{http://www.PureEdge.com/XFDL/Custom}selfSelect',
 'registrantGeneralDescription',
 '{http://www.PureEdge.com/XFDL/Custom}contactName',
 'selfSelect',
 '{http://www.PureEdge.com/XFDL/Custom}country',
 'prefix',
 '{http://www.PureEdge.com/XFDL/Custom}contactPhone',
 'prinClientZip',
 '{http://www.PureEdge.com/XFDL/Custom}clientCountry',
 'specific_issues',
 '{http://www.PureEdge.com/XFDL/Custom}zip',
 'city',
 'zip',
 '{http://www.PureEdge.com/XFDL/Custom}clientCity',
 'noLobbying',
 '{http://www.PureEdge.com/XFDL/Custom}contactIntlPhone',
 '{http://www.PureEdge.com/XFDL/Custom}specific_issues',
 'pages',
 '{http://www.PureEdge.com/XFDL/Custom}organizationName',
 'reportType',
 'lobbyists',
 'principal_zip',
 '{http://www.PureEdge.com/XFDL/Custom}principal_zipext',
 '{http://www.PureEdge.com/XFDL/Custom}principal_city',
 'affiliatedUrl',
 'houseID',
 'contactPrefix',
 '{http://www.PureEdge.com/XFDL/Custom}principal_zip',
 'address1',
 'address2',
 '{http://www.PureEdge.com/XFDL/Custom}zipext',
 'prinClientZipExt',
 'clientCountry',
 'clientAddress',
 'principal_state',
 'updates',
 'signedDate',
 '{http://www.PureEdge.com/XFDL/Custom}printedName',
 '{http://www.PureEdge.com/XFDL/Custom}pages',
 'clientGeneralDescription',
 '{http://www.PureEdge.com/XFDL/Custom}clientName',
 'foreignEntities',
 '{http://www.PureEdge.com/XFDL/Custom}principal_country',
 '{http://www.PureEdge.com/XFDL/Custom}prinClientState',
 'organizationName',
 'firstName',
 '{http://www.PureEdge.com/XFDL/Custom}contactEmail',
 'signerEmail',
 '{http://www.PureEdge.com/XFDL/Custom}city',
 '{http://www.PureEdge.com/XFDL/Custom}clientZip',
 '{http://www.PureEdge.com/XFDL/Custom}affiliatedOrgs',
 'registrantDifferentAddress',
 '{http://www.PureEdge.com/XFDL/Custom}prinClientCity',
 '{http://www.PureEdge.com/XFDL/Custom}foreignEntities',
 '{http://www.PureEdge.com/XFDL/Custom}regType',
 'clientCity',
 'alis',
 'clientName',
 '{http://www.PureEdge.com/XFDL/Custom}senateID',
 '{http://www.PureEdge.com/XFDL/Custom}prefix',
 'affiliatedOrgs',
 'prinClientCountry',
 '{http://www.PureEdge.com/XFDL/Custom}effectiveDate',
 'printedName',
 '{http://www.PureEdge.com/XFDL/Custom}principal_state',
 'expenses',
 'contactEmail',
 '{http://www.PureEdge.com/XFDL/Custom}clientAddress',
 'contactName',
 'prinClientCity',
 'clientGovtEntity',
 '{http://www.PureEdge.com/XFDL/Custom}state',
 'imported',
 'senateID',
 'effectiveDate',
 'country',
 '{http://www.PureEdge.com/XFDL/Custom}registrantGeneralDescription',
 'clientZipExt',
 'principal_city',
 '{http://www.PureEdge.com/XFDL/Custom}alis',
 'principal_zipext',
 'state',
 '{http://www.PureEdge.com/XFDL/Custom}prinClientZipExt',
 '{http://www.PureEdge.com/XFDL/Custom}lastName',
 'income',
 '{http://www.PureEdge.com/XFDL/Custom}prinClientCountry',
 'terminationDate',
 'submitURL',
 'prinClientState',
 'clientZip',
 '{http://www.PureEdge.com/XFDL/Custom}clientState',
 '{http://www.PureEdge.com/XFDL/Custom}prinClientZip',
 '{http://www.PureEdge.com/XFDL/Custom}lobbyists',
 '{http://www.PureEdge.com/XFDL/Custom}clientGeneralDescription',
 'reportYear',
 'regType',
 '{http://www.PureEdge.com/XFDL/Custom}reportType',
 '{http://www.PureEdge.com/XFDL/Custom}signedDate',
 'expensesMethod',
 'clientState',
 '{http://www.PureEdge.com/XFDL/Custom}clientZipExt',
 '{http://www.PureEdge.com/XFDL/Custom}houseID',
 '{http://www.PureEdge.com/XFDL/Custom}address2',
 '{http://www.PureEdge.com/XFDL/Custom}reportYear',
 'lastName',
 'principal_country',
 '{http://www.PureEdge.com/XFDL/Custom}address1',
 '{http://www.PureEdge.com/XFDL/Custom}signerEmail',
 'contactIntlPhone',
 '{http://www.PureEdge.com/XFDL/Custom}contactPrefix']

In [55]:
LD1_parents = {}

tags = filter(lambda x: 'PureEdge' not in x, LD1_top_level_counts.keys())

for tag in tags:
    egs = find_egs(LD1_files, '{tag}/*'.format(tag=tag))
    if egs:
        LD1_parents[tag] = egs

In [82]:
LD1_parents = set()
for parentset in all_files(LD1_files, find_all_parents):
    LD1_parents.update(parentset)

In [85]:
LD1_parents


Out[85]:
{'LOBBYINGDISCLOSURE1',
 'LOBBYINGDISCLOSURE2',
 'affiliatedOrg',
 'affiliatedOrgs',
 'ali_info',
 'alis',
 'error',
 'federal_agencies',
 'foreignEntities',
 'foreignEntity',
 'inactiveOrgs',
 'inactive_ALIs',
 'inactive_ForeignEntities',
 'inactive_lobbyist',
 'inactive_lobbyists',
 'lobbyist',
 'lobbyists',
 'specific_issues',
 'updates',
 '{http://www.PureEdge.com/XFDL/Custom}LOBBYINGDISCLOSURE1',
 '{http://www.PureEdge.com/XFDL/Custom}affiliatedOrg',
 '{http://www.PureEdge.com/XFDL/Custom}affiliatedOrgs',
 '{http://www.PureEdge.com/XFDL/Custom}alis',
 '{http://www.PureEdge.com/XFDL/Custom}foreignEntities',
 '{http://www.PureEdge.com/XFDL/Custom}foreignEntity',
 '{http://www.PureEdge.com/XFDL/Custom}lobbyist',
 '{http://www.PureEdge.com/XFDL/Custom}lobbyists'}

In [209]:
for x in at_least_one_file(LD1_files, find_nonempty_egs, xpath_query='.//affiliatedUrl'):
    print etree.tostring(x)


<affiliatedUrl>http://www.contracttower.org/</affiliatedUrl>
   
found in /home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD1/2013/ALL/300586528.xml

LD-2


In [30]:
LD2_files = glob(os.path.join(LD2_DIR, '*', '*', '*.xml'))

In [31]:
LD2_files[0:5]


Out[31]:
['/home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300625213.xml',
 '/home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300618105.xml',
 '/home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300625771.xml',
 '/home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300628923.xml',
 '/home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300626594.xml']

In [32]:
len(LD2_files)


Out[32]:
669003

In [33]:
LD2tree = etree.parse(open(LD2_files[0]))

In [34]:
print etree.tostring(LD2tree, pretty_print=True)


<?xml-stylesheet type='text/xsl' href='../../ld2.xsl'?>
<LOBBYINGDISCLOSURE2>
   <imported>N</imported>
   <pages>5</pages>
   <submitURL/>
   <organizationName>Murray, Montgomery &amp; O'Donnell</organizationName>
   <prefix/>
   <firstName/>
   <lastName/>
   <registrantDifferentAddress>N</registrantDifferentAddress>
   <address1>101 Constitution Ave, NW</address1>
   <address2/>
   <city>Washington</city>
   <state>DC</state>
   <zip>20001</zip>
   <zipext/>
   <country>USA</country>
   <principal_city/>
   <principal_state/>
   <principal_zip/>
   <principal_zipext/>
   <principal_country/>
   <selfSelect>N</selfSelect>
   <clientName>CITY OF OXNARD</clientName>
   <clientGovtEntity>Y</clientGovtEntity>
   <senateID>26227-63</senateID>
   <houseID>317760034</houseID>
   <reportYear>2013</reportYear>
   <reportType>Q4</reportType>
   <terminationDate/>
   <noLobbying/>
   <income>8000.00</income>
   <expenses/>
   <expensesMethod/>
   <printedName>John O'Donnell, Partner </printedName>
   <signedDate>01/21/2014</signedDate>
   <alis>
      <ali_info>
         <issueAreaCode>BUD</issueAreaCode>
         <specific_issues>
            <description>FY 14 Appropriations</description>
         </specific_issues>
         <federal_agencies>U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE</federal_agencies>
         <lobbyists>
            <lobbyist>
               <lobbyistFirstName>John</lobbyistFirstName>
               <lobbyistLastName>O'Donnell </lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName>Kyriakos </lobbyistFirstName>
               <lobbyistLastName>Pagonis </lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
         </lobbyists>
         <foreign_entity_issues/>
      </ali_info>
      <ali_info>
         <issueAreaCode>ENV</issueAreaCode>
         <specific_issues>
            <description>Halaco Site Remediation </description>
         </specific_issues>
         <federal_agencies>U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE, Environmental Protection Agency (EPA)</federal_agencies>
         <lobbyists>
            <lobbyist>
               <lobbyistFirstName>John</lobbyistFirstName>
               <lobbyistLastName>O'Donnell</lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName>Kyriakos </lobbyistFirstName>
               <lobbyistLastName>Pagonis </lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
         </lobbyists>
         <foreign_entity_issues/>
      </ali_info>
      <ali_info>
         <issueAreaCode>TAX</issueAreaCode>
         <specific_issues>
            <description>SBIR reauthorization</description>
         </specific_issues>
         <federal_agencies>U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE</federal_agencies>
         <lobbyists>
            <lobbyist>
               <lobbyistFirstName>John</lobbyistFirstName>
               <lobbyistLastName>O'Donnell</lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName>Kyriakos </lobbyistFirstName>
               <lobbyistLastName>Pagonis </lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
         </lobbyists>
         <foreign_entity_issues/>
      </ali_info>
      <ali_info>
         <issueAreaCode>LAW</issueAreaCode>
         <specific_issues>
            <description>Gang Violence and Public Safety </description>
         </specific_issues>
         <federal_agencies>U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE, Bureau of Justice Assistance, Office of Justice Program</federal_agencies>
         <lobbyists>
            <lobbyist>
               <lobbyistFirstName>John</lobbyistFirstName>
               <lobbyistLastName>O'Donnell</lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName>Kyriakos </lobbyistFirstName>
               <lobbyistLastName>Pagonis </lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
         </lobbyists>
         <foreign_entity_issues/>
      </ali_info>
   </alis>
   <updates>
      <clientAddress/>
      <clientCity/>
      <clientState/>
      <clientZip/>
      <clientZipext/>
      <clientCountry/>
      <prinClientCity/>
      <prinClientState/>
      <prinClientZip/>
      <prinClientZipext/>
      <prinClientCountry/>
      <generalDescription/>
      <inactive_lobbyists>
         <inactive_lobbyist>
            <firstName/>
            <lastName/>
            <suffix/>
         </inactive_lobbyist>
         <inactive_lobbyist>
            <firstName/>
            <lastName/>
            <suffix/>
         </inactive_lobbyist>
         <inactive_lobbyist>
            <firstName/>
            <lastName/>
            <suffix/>
         </inactive_lobbyist>
         <inactive_lobbyist>
            <firstName/>
            <lastName/>
            <suffix/>
         </inactive_lobbyist>
      </inactive_lobbyists>
      <inactive_ALIs>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
      </inactive_ALIs>
      <affiliatedUrl/>
      <affiliatedOrgs>
         <affiliatedOrg>
            <affiliatedOrgName/>
            <affiliatedOrgAddress/>
            <affiliatedOrgCity/>
            <affiliatedOrgState/>
            <affiliatedOrgZip/>
            <affiliatedOrgCountry/>
            <affiliatedPrinOrgCity/>
            <affiliatedPrinOrgState/>
            <affiliatedPrinOrgCountry/>
         </affiliatedOrg>
         <affiliatedOrg>
            <affiliatedOrgName/>
            <affiliatedOrgAddress/>
            <affiliatedOrgCity/>
            <affiliatedOrgState/>
            <affiliatedOrgZip/>
            <affiliatedOrgCountry/>
            <affiliatedPrinOrgCity/>
            <affiliatedPrinOrgState/>
            <affiliatedPrinOrgCountry/>
         </affiliatedOrg>
      </affiliatedOrgs>
      <inactiveOrgs>
         <inactiveOrgName/>
         <inactiveOrgName/>
         <inactiveOrgName/>
      </inactiveOrgs>
      <foreignEntities>
         <foreignEntity>
            <name/>
            <address/>
            <city/>
            <state/>
            <country/>
            <prinCity/>
            <prinState/>
            <prinCountry/>
            <contribution/>
            <ownership_Percentage/>
         </foreignEntity>
      </foreignEntities>
      <inactive_ForeignEntities>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
      </inactive_ForeignEntities>
   </updates>
</LOBBYINGDISCLOSURE2>


In [35]:
LD2r = LD2tree.getroot()

In [36]:
LD2r.tag


Out[36]:
'LOBBYINGDISCLOSURE2'

In [142]:
#formtypes = Counter()
#for f in LD2_files:
#    try:
#        tree = etree.parse(open(f))
#        formtypes.update([tree.getroot().tag,])
#    except Exception as e:
#        print str(e)
#        print f
#formtypes


xmlParseCharRef: invalid xmlChar value 16, line 277, column 44
/home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300616796.xml
Out[142]:
Counter({'LOBBYINGDISCLOSURE2': 668905, '{http://www.PureEdge.com/XFDL/Custom}LOBBYINGDISCLOSURE2': 97})

In [37]:
formtypes = Counter({'LOBBYINGDISCLOSURE2': 668905, '{http://www.PureEdge.com/XFDL/Custom}LOBBYINGDISCLOSURE2': 97})
formtypes


Out[37]:
Counter({'LOBBYINGDISCLOSURE2': 668905, '{http://www.PureEdge.com/XFDL/Custom}LOBBYINGDISCLOSURE2': 97})

In [145]:
#tag_counts = Counter(all_files(LD2_files, get_root_tag))

In [38]:
tag_counts = Counter({'LOBBYINGDISCLOSURE2': 668905, '{http://www.PureEdge.com/XFDL/Custom}LOBBYINGDISCLOSURE2': 97})
tag_counts


Out[38]:
Counter({'LOBBYINGDISCLOSURE2': 668905, '{http://www.PureEdge.com/XFDL/Custom}LOBBYINGDISCLOSURE2': 97})

In [166]:
#ld2_top_level_counts = Counter()
#for children_list in all_files(LD2_files, get_top_level_fields):
#    ld2_top_level_counts.update(children_list)
#ld2_top_level_counts


issue with /home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300616796.xmlxmlParseCharRef: invalid xmlChar value 16, line 277, column 44
Out[166]:
Counter({'zipext': 668905, 'printedName': 668905, 'principal_state': 668905, 'selfSelect': 668905, 'signedDate': 668905, 'expenses': 668905, 'prefix': 668905, 'city': 668905, 'senateID': 668905, 'zip': 668905, 'noLobbying': 668905, 'principal_zipext': 668905, 'state': 668905, 'reportType': 668905, 'principal_city': 668905, 'principal_zip': 668905, 'terminationDate': 668905, 'submitURL': 668905, 'houseID': 668905, 'address1': 668905, 'address2': 668905, 'income': 668905, 'updates': 668905, 'reportYear': 668905, 'pages': 668905, 'expensesMethod': 668905, 'organizationName': 668905, 'firstName': 668905, 'lastName': 668905, 'registrantDifferentAddress': 668905, 'country': 668905, 'alis': 668905, 'clientName': 668905, 'principal_country': 668904, 'clientGovtEntity': 536861, 'imported': 500575, 'signerEmail': 320584, 'contactIntlPhone': 37472, 'contactPhone': 209, 'contactPrefix': 179, 'contactEmail': 178, 'contactName': 178, '{http://www.PureEdge.com/XFDL/Custom}expensesMethod': 97, '{http://www.PureEdge.com/XFDL/Custom}firstName': 97, '{http://www.PureEdge.com/XFDL/Custom}prefix': 97, '{http://www.PureEdge.com/XFDL/Custom}houseID': 97, '{http://www.PureEdge.com/XFDL/Custom}senateID': 97, '{http://www.PureEdge.com/XFDL/Custom}registrantDifferentAddress': 97, '{http://www.PureEdge.com/XFDL/Custom}noLobbying': 97, '{http://www.PureEdge.com/XFDL/Custom}zip': 97, '{http://www.PureEdge.com/XFDL/Custom}submitURL': 97, '{http://www.PureEdge.com/XFDL/Custom}signedDate': 97, '{http://www.PureEdge.com/XFDL/Custom}contactIntlPhone': 97, '{http://www.PureEdge.com/XFDL/Custom}alis': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_zipext': 97, '{http://www.PureEdge.com/XFDL/Custom}income': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_city': 97, '{http://www.PureEdge.com/XFDL/Custom}clientName': 97, '{http://www.PureEdge.com/XFDL/Custom}selfSelect': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_zip': 97, '{http://www.PureEdge.com/XFDL/Custom}expenses': 97, '{http://www.PureEdge.com/XFDL/Custom}zipext': 97, '{http://www.PureEdge.com/XFDL/Custom}address2': 97, '{http://www.PureEdge.com/XFDL/Custom}address1': 97, '{http://www.PureEdge.com/XFDL/Custom}state': 97, '{http://www.PureEdge.com/XFDL/Custom}pages': 97, '{http://www.PureEdge.com/XFDL/Custom}reportType': 97, '{http://www.PureEdge.com/XFDL/Custom}organizationName': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_country': 97, '{http://www.PureEdge.com/XFDL/Custom}country': 97, '{http://www.PureEdge.com/XFDL/Custom}lastName': 97, '{http://www.PureEdge.com/XFDL/Custom}reportYear': 97, '{http://www.PureEdge.com/XFDL/Custom}city': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_state': 97, '{http://www.PureEdge.com/XFDL/Custom}updates': 97, '{http://www.PureEdge.com/XFDL/Custom}signerEmail': 97, '{http://www.PureEdge.com/XFDL/Custom}printedName': 97, '{http://www.PureEdge.com/XFDL/Custom}terminationDate': 97, '{http://www.PureEdge.com/XFDL/Custom}contactPhone': 95, '{http://www.PureEdge.com/XFDL/Custom}contactEmail': 95, '{http://www.PureEdge.com/XFDL/Custom}contactName': 95, '{http://www.PureEdge.com/XFDL/Custom}contactPrefix': 95, '{http://www.PureEdge.com/XFDL/Custom}imported': 2, '{http://www.PureEdge.com/XFDL/Custom}clientGovtEntity': 2})

In [39]:
ld2_top_level_counts = Counter({'zipext': 668905, 'printedName': 668905, 'principal_state': 668905, 'selfSelect': 668905, 'signedDate': 668905, 'expenses': 668905, 'prefix': 668905, 'city': 668905, 'senateID': 668905, 'zip': 668905, 'noLobbying': 668905, 'principal_zipext': 668905, 'state': 668905, 'reportType': 668905, 'principal_city': 668905, 'principal_zip': 668905, 'terminationDate': 668905, 'submitURL': 668905, 'houseID': 668905, 'address1': 668905, 'address2': 668905, 'income': 668905, 'updates': 668905, 'reportYear': 668905, 'pages': 668905, 'expensesMethod': 668905, 'organizationName': 668905, 'firstName': 668905, 'lastName': 668905, 'registrantDifferentAddress': 668905, 'country': 668905, 'alis': 668905, 'clientName': 668905, 'principal_country': 668904, 'clientGovtEntity': 536861, 'imported': 500575, 'signerEmail': 320584, 'contactIntlPhone': 37472, 'contactPhone': 209, 'contactPrefix': 179, 'contactEmail': 178, 'contactName': 178, '{http://www.PureEdge.com/XFDL/Custom}expensesMethod': 97, '{http://www.PureEdge.com/XFDL/Custom}firstName': 97, '{http://www.PureEdge.com/XFDL/Custom}prefix': 97, '{http://www.PureEdge.com/XFDL/Custom}houseID': 97, '{http://www.PureEdge.com/XFDL/Custom}senateID': 97, '{http://www.PureEdge.com/XFDL/Custom}registrantDifferentAddress': 97, '{http://www.PureEdge.com/XFDL/Custom}noLobbying': 97, '{http://www.PureEdge.com/XFDL/Custom}zip': 97, '{http://www.PureEdge.com/XFDL/Custom}submitURL': 97, '{http://www.PureEdge.com/XFDL/Custom}signedDate': 97, '{http://www.PureEdge.com/XFDL/Custom}contactIntlPhone': 97, '{http://www.PureEdge.com/XFDL/Custom}alis': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_zipext': 97, '{http://www.PureEdge.com/XFDL/Custom}income': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_city': 97, '{http://www.PureEdge.com/XFDL/Custom}clientName': 97, '{http://www.PureEdge.com/XFDL/Custom}selfSelect': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_zip': 97, '{http://www.PureEdge.com/XFDL/Custom}expenses': 97, '{http://www.PureEdge.com/XFDL/Custom}zipext': 97, '{http://www.PureEdge.com/XFDL/Custom}address2': 97, '{http://www.PureEdge.com/XFDL/Custom}address1': 97, '{http://www.PureEdge.com/XFDL/Custom}state': 97, '{http://www.PureEdge.com/XFDL/Custom}pages': 97, '{http://www.PureEdge.com/XFDL/Custom}reportType': 97, '{http://www.PureEdge.com/XFDL/Custom}organizationName': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_country': 97, '{http://www.PureEdge.com/XFDL/Custom}country': 97, '{http://www.PureEdge.com/XFDL/Custom}lastName': 97, '{http://www.PureEdge.com/XFDL/Custom}reportYear': 97, '{http://www.PureEdge.com/XFDL/Custom}city': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_state': 97, '{http://www.PureEdge.com/XFDL/Custom}updates': 97, '{http://www.PureEdge.com/XFDL/Custom}signerEmail': 97, '{http://www.PureEdge.com/XFDL/Custom}printedName': 97, '{http://www.PureEdge.com/XFDL/Custom}terminationDate': 97, '{http://www.PureEdge.com/XFDL/Custom}contactPhone': 95, '{http://www.PureEdge.com/XFDL/Custom}contactEmail': 95, '{http://www.PureEdge.com/XFDL/Custom}contactName': 95, '{http://www.PureEdge.com/XFDL/Custom}contactPrefix': 95, '{http://www.PureEdge.com/XFDL/Custom}imported': 2, '{http://www.PureEdge.com/XFDL/Custom}clientGovtEntity': 2})

In [40]:
sorted(ld2_top_level_counts.items(), key=lambda x: x[0])


Out[40]:
[('address1', 668905),
 ('address2', 668905),
 ('alis', 668905),
 ('city', 668905),
 ('clientGovtEntity', 536861),
 ('clientName', 668905),
 ('contactEmail', 178),
 ('contactIntlPhone', 37472),
 ('contactName', 178),
 ('contactPhone', 209),
 ('contactPrefix', 179),
 ('country', 668905),
 ('expenses', 668905),
 ('expensesMethod', 668905),
 ('firstName', 668905),
 ('houseID', 668905),
 ('imported', 500575),
 ('income', 668905),
 ('lastName', 668905),
 ('noLobbying', 668905),
 ('organizationName', 668905),
 ('pages', 668905),
 ('prefix', 668905),
 ('principal_city', 668905),
 ('principal_country', 668904),
 ('principal_state', 668905),
 ('principal_zip', 668905),
 ('principal_zipext', 668905),
 ('printedName', 668905),
 ('registrantDifferentAddress', 668905),
 ('reportType', 668905),
 ('reportYear', 668905),
 ('selfSelect', 668905),
 ('senateID', 668905),
 ('signedDate', 668905),
 ('signerEmail', 320584),
 ('state', 668905),
 ('submitURL', 668905),
 ('terminationDate', 668905),
 ('updates', 668905),
 ('zip', 668905),
 ('zipext', 668905),
 ('{http://www.PureEdge.com/XFDL/Custom}address1', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}address2', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}alis', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}city', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}clientGovtEntity', 2),
 ('{http://www.PureEdge.com/XFDL/Custom}clientName', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}contactEmail', 95),
 ('{http://www.PureEdge.com/XFDL/Custom}contactIntlPhone', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}contactName', 95),
 ('{http://www.PureEdge.com/XFDL/Custom}contactPhone', 95),
 ('{http://www.PureEdge.com/XFDL/Custom}contactPrefix', 95),
 ('{http://www.PureEdge.com/XFDL/Custom}country', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}expenses', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}expensesMethod', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}firstName', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}houseID', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}imported', 2),
 ('{http://www.PureEdge.com/XFDL/Custom}income', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}lastName', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}noLobbying', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}organizationName', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}pages', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}prefix', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_city', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_country', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_state', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_zip', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_zipext', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}printedName', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}registrantDifferentAddress', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}reportType', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}reportYear', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}selfSelect', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}senateID', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}signedDate', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}signerEmail', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}state', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}submitURL', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}terminationDate', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}updates', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}zip', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}zipext', 97)]

In [41]:
sorted(ld2_top_level_counts.items(), key=lambda x: -x[1])


Out[41]:
[('zipext', 668905),
 ('noLobbying', 668905),
 ('reportYear', 668905),
 ('organizationName', 668905),
 ('prefix', 668905),
 ('city', 668905),
 ('zip', 668905),
 ('pages', 668905),
 ('reportType', 668905),
 ('expensesMethod', 668905),
 ('principal_zip', 668905),
 ('houseID', 668905),
 ('principal_city', 668905),
 ('address1', 668905),
 ('address2', 668905),
 ('updates', 668905),
 ('firstName', 668905),
 ('registrantDifferentAddress', 668905),
 ('country', 668905),
 ('alis', 668905),
 ('clientName', 668905),
 ('principal_state', 668905),
 ('printedName', 668905),
 ('signedDate', 668905),
 ('expenses', 668905),
 ('senateID', 668905),
 ('principal_zipext', 668905),
 ('state', 668905),
 ('income', 668905),
 ('terminationDate', 668905),
 ('submitURL', 668905),
 ('selfSelect', 668905),
 ('lastName', 668905),
 ('principal_country', 668904),
 ('clientGovtEntity', 536861),
 ('imported', 500575),
 ('signerEmail', 320584),
 ('contactIntlPhone', 37472),
 ('contactPhone', 209),
 ('contactPrefix', 179),
 ('contactEmail', 178),
 ('contactName', 178),
 ('{http://www.PureEdge.com/XFDL/Custom}expensesMethod', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}selfSelect', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}alis', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}country', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}noLobbying', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}zip', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}firstName', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}contactIntlPhone', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_zipext', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}organizationName', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_city', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_zip', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}expenses', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}zipext', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}address2', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}address1', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}income', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}clientName', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_country', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}pages', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}principal_state', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}city', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}terminationDate', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}senateID', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}prefix', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}registrantDifferentAddress', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}submitURL', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}lastName', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}updates', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}state', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}reportType', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}signedDate', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}houseID', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}reportYear', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}signerEmail', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}printedName', 97),
 ('{http://www.PureEdge.com/XFDL/Custom}contactPhone', 95),
 ('{http://www.PureEdge.com/XFDL/Custom}contactEmail', 95),
 ('{http://www.PureEdge.com/XFDL/Custom}contactName', 95),
 ('{http://www.PureEdge.com/XFDL/Custom}contactPrefix', 95),
 ('{http://www.PureEdge.com/XFDL/Custom}clientGovtEntity', 2),
 ('{http://www.PureEdge.com/XFDL/Custom}imported', 2)]

In [86]:
LD2_parents = set()
for parentset in all_files(LD2_files, find_all_parents):
    LD2_parents.update(parentset)


issue with /home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300616796.xmlxmlParseCharRef: invalid xmlChar value 16, line 277, column 44

In [87]:
LD2_parents


Out[87]:
{'LOBBYINGDISCLOSURE2',
 'ae',
 'affiliatedOrg',
 'affiliatedOrgs',
 'ali_info',
 'alis',
 'error',
 'federal_agencies',
 'foreignEntities',
 'foreignEntity',
 'inactiveOrgs',
 'inactive_ALIs',
 'inactive_ForeignEntities',
 'inactive_lobbyist',
 'inactive_lobbyists',
 'lobbyist',
 'lobbyists',
 'specific_issues',
 'updates',
 '{http://www.PureEdge.com/XFDL/Custom}LOBBYINGDISCLOSURE2',
 '{http://www.PureEdge.com/XFDL/Custom}affiliatedOrg',
 '{http://www.PureEdge.com/XFDL/Custom}affiliatedOrgs',
 '{http://www.PureEdge.com/XFDL/Custom}ali_info',
 '{http://www.PureEdge.com/XFDL/Custom}alis',
 '{http://www.PureEdge.com/XFDL/Custom}federal_agencies',
 '{http://www.PureEdge.com/XFDL/Custom}foreignEntities',
 '{http://www.PureEdge.com/XFDL/Custom}foreignEntity',
 '{http://www.PureEdge.com/XFDL/Custom}inactiveOrgs',
 '{http://www.PureEdge.com/XFDL/Custom}inactive_ALIs',
 '{http://www.PureEdge.com/XFDL/Custom}inactive_ForeignEntities',
 '{http://www.PureEdge.com/XFDL/Custom}inactive_lobbyist',
 '{http://www.PureEdge.com/XFDL/Custom}inactive_lobbyists',
 '{http://www.PureEdge.com/XFDL/Custom}lobbyist',
 '{http://www.PureEdge.com/XFDL/Custom}lobbyists',
 '{http://www.PureEdge.com/XFDL/Custom}specific_issues',
 '{http://www.PureEdge.com/XFDL/Custom}updates'}

In [135]:
has_child_array = {}

In [134]:
for fa in at_least_one_file(LD2_files, 
                            find_egs, 
                            xpath_query='.//ae'):
    print etree.tostring(fa)


issue with /home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300616796.xmlxmlParseCharRef: invalid xmlChar value 16, line 277, column 44found in /home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2006/Q4/200031565.xml
<ae>
      <specific_issues>
        <ae/>
        <ae/>
        <ae>All provisions that take prudent steps to improve the process for vetting foreign investments in the U.S. in a way that does not become so unwieldy and uninviting that valuable investment from abroad is discouraged, 
hurting jobs and economic growth.</ae>
      </specific_issues>
    </ae>
  
<ae/>
        
<ae/>
        
<ae>All provisions that take prudent steps to improve the process for vetting foreign investments in the U.S. in a way that does not become so unwieldy and uninviting that valuable investment from abroad is discouraged, 
hurting jobs and economic growth.</ae>
      

In [98]:
for fa in at_least_one_file(LD2_files, 
                            find_egs, 
                            xpath_query='.//federal_agencies'):
    print etree.tostring(fa)


<federal_agencies>U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE</federal_agencies>
         
<federal_agencies>U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE, Environmental Protection Agency (EPA)</federal_agencies>
         
<federal_agencies>U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE</federal_agencies>
         
<federal_agencies>U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE, Bureau of Justice Assistance, Office of Justice Program</federal_agencies>
         
found in /home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300625213.xml

In [136]:
has_child_array['federal_agencies'] = True

In [185]:
for il in at_least_one_file(LD2_files, 
                            find_nonempty_egs, 
                            xpath_query='.//affiliatedOrgs'):
    print etree.tostring(il)


<affiliatedOrgs>
         <affiliatedOrg>
            <affiliatedOrgName>WellPoint, Inc.</affiliatedOrgName>
            <affiliatedOrgAddress>120 Monument Circle</affiliatedOrgAddress>
            <affiliatedOrgCity>Indianapolis</affiliatedOrgCity>
            <affiliatedOrgState>IN</affiliatedOrgState>
            <affiliatedOrgZip>46204</affiliatedOrgZip>
            <affiliatedOrgCountry>USA</affiliatedOrgCountry>
            <affiliatedPrinOrgCity/>
            <affiliatedPrinOrgState/>
            <affiliatedPrinOrgCountry/>
         </affiliatedOrg>
         <affiliatedOrg>
            <affiliatedOrgName/>
            <affiliatedOrgAddress/>
            <affiliatedOrgCity/>
            <affiliatedOrgState/>
            <affiliatedOrgZip/>
            <affiliatedOrgCountry/>
            <affiliatedPrinOrgCity/>
            <affiliatedPrinOrgState/>
            <affiliatedPrinOrgCountry/>
         </affiliatedOrg>
         <affiliatedOrg>
            <affiliatedOrgName/>
            <affiliatedOrgAddress/>
            <affiliatedOrgCity/>
            <affiliatedOrgState/>
            <affiliatedOrgZip/>
            <affiliatedOrgCountry/>
            <affiliatedPrinOrgCity/>
            <affiliatedPrinOrgState/>
            <affiliatedPrinOrgCountry/>
         </affiliatedOrg>
         <affiliatedOrg>
            <affiliatedOrgName/>
            <affiliatedOrgAddress/>
            <affiliatedOrgCity/>
            <affiliatedOrgState/>
            <affiliatedOrgZip/>
            <affiliatedOrgCountry/>
            <affiliatedPrinOrgCity/>
            <affiliatedPrinOrgState/>
            <affiliatedPrinOrgCountry/>
         </affiliatedOrg>
      </affiliatedOrgs>
      
found in /home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300623989.xml

In [139]:
has_child_array['affiliatedOrgs'] = True

In [140]:
for il in at_least_one_file(LD2_files, 
                            find_egs, 
                            xpath_query='.//alis'):
    print etree.tostring(il)


<alis>
      <ali_info>
         <issueAreaCode>BUD</issueAreaCode>
         <specific_issues>
            <description>FY 14 Appropriations</description>
         </specific_issues>
         <federal_agencies>U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE</federal_agencies>
         <lobbyists>
            <lobbyist>
               <lobbyistFirstName>John</lobbyistFirstName>
               <lobbyistLastName>O'Donnell </lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName>Kyriakos </lobbyistFirstName>
               <lobbyistLastName>Pagonis </lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
         </lobbyists>
         <foreign_entity_issues/>
      </ali_info>
      <ali_info>
         <issueAreaCode>ENV</issueAreaCode>
         <specific_issues>
            <description>Halaco Site Remediation </description>
         </specific_issues>
         <federal_agencies>U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE, Environmental Protection Agency (EPA)</federal_agencies>
         <lobbyists>
            <lobbyist>
               <lobbyistFirstName>John</lobbyistFirstName>
               <lobbyistLastName>O'Donnell</lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName>Kyriakos </lobbyistFirstName>
               <lobbyistLastName>Pagonis </lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
         </lobbyists>
         <foreign_entity_issues/>
      </ali_info>
      <ali_info>
         <issueAreaCode>TAX</issueAreaCode>
         <specific_issues>
            <description>SBIR reauthorization</description>
         </specific_issues>
         <federal_agencies>U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE</federal_agencies>
         <lobbyists>
            <lobbyist>
               <lobbyistFirstName>John</lobbyistFirstName>
               <lobbyistLastName>O'Donnell</lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName>Kyriakos </lobbyistFirstName>
               <lobbyistLastName>Pagonis </lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
         </lobbyists>
         <foreign_entity_issues/>
      </ali_info>
      <ali_info>
         <issueAreaCode>LAW</issueAreaCode>
         <specific_issues>
            <description>Gang Violence and Public Safety </description>
         </specific_issues>
         <federal_agencies>U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE, Bureau of Justice Assistance, Office of Justice Program</federal_agencies>
         <lobbyists>
            <lobbyist>
               <lobbyistFirstName>John</lobbyistFirstName>
               <lobbyistLastName>O'Donnell</lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName>Kyriakos </lobbyistFirstName>
               <lobbyistLastName>Pagonis </lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
         </lobbyists>
         <foreign_entity_issues/>
      </ali_info>
   </alis>
   
found in /home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300625213.xml

In [141]:
has_child_array['alis'] = True

In [102]:
for il in at_least_one_file(LD2_files, 
                            find_egs, 
                            xpath_query='.//error'):
    print etree.tostring(il)


<error>
            <prinCountry/>
         </error>
         
found in /home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300616866.xml

In [186]:
for il in at_least_one_file(LD2_files, 
                            find_nonempty_egs, 
                            xpath_query='.//foreignEntities'):
    print etree.tostring(il)


<foreignEntities>
         <foreignEntity>
            <name>Mitsubishi Tanabe Pharma Corporation</name>
            <address>6-18  Kitahama,2-chome,Chuo-ku</address>
            <city>Osaka</city>
            <state/>
            <country>JPN</country>
            <prinCity>Osaka</prinCity>
            <prinState/>
            <prinCountry>JPN</prinCountry>
            <contribution>0.00</contribution>
            <ownership_Percentage>60</ownership_Percentage>
         </foreignEntity>
         <foreignEntity>
            <name>Philip Morris Investments B.V</name>
            <address>Marconilaan 20</address>
            <city>Bergen op Zoom</city>
            <state/>
            <country>NED</country>
            <prinCity>Bergen op Zoom</prinCity>
            <prinState/>
            <prinCountry>NED</prinCountry>
            <contribution>0.00</contribution>
            <ownership_Percentage>40</ownership_Percentage>
         </foreignEntity>
         <foreignEntity>
            <name/>
            <address/>
            <city/>
            <state/>
            <country/>
            <prinCity/>
            <prinState/>
            <prinCountry/>
            <contribution/>
            <ownership_Percentage/>
         </foreignEntity>
      </foreignEntities>
      
found in /home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300629698.xml

In [143]:
has_child_array['foreignEntities'] = True

In [187]:
for il in at_least_one_file(LD2_files, 
                            find_nonempty_egs, 
                            xpath_query='.//inactive_ALIs'):
    print etree.tostring(il)


<inactive_ALIs>
         <ali_Code>MED</ali_Code>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
      </inactive_ALIs>
      
found in /home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300629698.xml

In [144]:
has_child_array['inactive_ALIs'] = True

In [188]:
for il in at_least_one_file(LD2_files, 
                            find_nonempty_egs, 
                            xpath_query='.//inactiveOrgs'):
    print etree.tostring(il)


<inactiveOrgs>
         <inactiveOrgName>The Ashcroft Group</inactiveOrgName>
         <inactiveOrgName/>
         <inactiveOrgName/>
         <inactiveOrgName/>
         <inactiveOrgName/>
         <inactiveOrgName/>
      </inactiveOrgs>
      
found in /home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300594225.xml

In [189]:
has_child_array['inactiveOrgs'] = True

In [190]:
for il in at_least_one_file(LD2_files, 
                            find_nonempty_egs, 
                            xpath_query='.//inactive_ForeignEntities'):
    print etree.tostring(il)


<inactive_ForeignEntities>
         <inactive_ForeignEntity>Accenture SCA</inactive_ForeignEntity>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
      </inactive_ForeignEntities>
   
found in /home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300619266.xml

In [191]:
has_child_array['inactive_ForeignEntities'] = True

In [192]:
for il in at_least_one_file(LD2_files, 
                            find_nonempty_egs, 
                            xpath_query='.//inactive_lobbyists'):
    print etree.tostring(il)


<inactive_lobbyists>
         <inactive_lobbyist>
            <firstName>Heather</firstName>
            <lastName>Dumont</lastName>
            <suffix/>
         </inactive_lobbyist>
         <inactive_lobbyist>
            <firstName/>
            <lastName/>
            <suffix/>
         </inactive_lobbyist>
         <inactive_lobbyist>
            <firstName/>
            <lastName/>
            <suffix/>
         </inactive_lobbyist>
         <inactive_lobbyist>
            <firstName/>
            <lastName/>
            <suffix/>
         </inactive_lobbyist>
         <inactive_lobbyist>
            <firstName/>
            <lastName/>
            <suffix/>
         </inactive_lobbyist>
         <inactive_lobbyist>
            <firstName/>
            <lastName/>
            <suffix/>
         </inactive_lobbyist>
         <inactive_lobbyist>
            <firstName/>
            <lastName/>
            <suffix/>
         </inactive_lobbyist>
         <inactive_lobbyist>
            <firstName/>
            <lastName/>
            <suffix/>
         </inactive_lobbyist>
      </inactive_lobbyists>
      
found in /home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300625771.xml

In [193]:
has_child_array['inactive_lobbyists'] = True

In [194]:
for il in at_least_one_file(LD2_files, 
                            find_nonempty_egs, 
                            xpath_query='.//lobbyists'):
    print etree.tostring(il)


<lobbyists>
            <lobbyist>
               <lobbyistFirstName>John</lobbyistFirstName>
               <lobbyistLastName>O'Donnell </lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName>Kyriakos </lobbyistFirstName>
               <lobbyistLastName>Pagonis </lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
         </lobbyists>
         
<lobbyists>
            <lobbyist>
               <lobbyistFirstName>John</lobbyistFirstName>
               <lobbyistLastName>O'Donnell</lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName>Kyriakos </lobbyistFirstName>
               <lobbyistLastName>Pagonis </lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
         </lobbyists>
         
<lobbyists>
            <lobbyist>
               <lobbyistFirstName>John</lobbyistFirstName>
               <lobbyistLastName>O'Donnell</lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName>Kyriakos </lobbyistFirstName>
               <lobbyistLastName>Pagonis </lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
         </lobbyists>
         
<lobbyists>
            <lobbyist>
               <lobbyistFirstName>John</lobbyistFirstName>
               <lobbyistLastName>O'Donnell</lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName>Kyriakos </lobbyistFirstName>
               <lobbyistLastName>Pagonis </lobbyistLastName>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
            <lobbyist>
               <lobbyistFirstName/>
               <lobbyistLastName/>
               <lobbyistSuffix/>
               <coveredPosition/>
               <lobbyistNew>N</lobbyistNew>
            </lobbyist>
         </lobbyists>
         
found in /home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300625213.xml

In [195]:
has_child_array['lobbyists'] = True

In [153]:
for il in at_least_one_file(LD2_files, 
                            find_egs, 
                            xpath_query='.//specific_issues'):
    print etree.tostring(il)


<specific_issues>
            <description>FY 14 Appropriations</description>
         </specific_issues>
         
<specific_issues>
            <description>Halaco Site Remediation </description>
         </specific_issues>
         
<specific_issues>
            <description>SBIR reauthorization</description>
         </specific_issues>
         
<specific_issues>
            <description>Gang Violence and Public Safety </description>
         </specific_issues>
         
found in /home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300625213.xml

In [159]:
for il in at_least_one_file(LD2_files, 
                            find_egs_with_multiple_children, 
                            xpath_query='.//specific_issues'):
    print etree.tostring(il)


<specific_issues>
            <description>Issues relating to the reduction of waste and inefficiency in the federal government;
H.R. 310, S. 124, No Budget, No Pay Act;
H.R. 1869, Biennial Budgeting and Enhanced Oversight Act of 2013;
S. 554, Biennial Budgeting and Appropriations Act; </description>
            <description>H.R. 2506, S. 1231, Duplication Elimination Act;
H.R. 2590, 21st Century Health Care for Heroes Act;
H.R. 2694, S. 1304, Buy Smarter and Save Act; 
H.R. 2686 , S. 1321, To amend title 31, United States Code, to provide that the Presidents annual budget submission to Congress list the current fiscal year spending level for each proposed program and a separate amount for any proposed spending increases, and for other purposes;
S.1296, Servicemember's Electronic Health Records Act of 2013;
H.R. 2643, Stay in Place, Cut the Waste Act of 2013;
H.R. 2689, Energy Savings Through Public-Private Partnerships Act of 2013; 
S. 1308, A bill to amend the National Energy Conservation Policy Act to encourage the increased use of performance contracting in Federal facilities; 
H.R. 2675, Government Transformation Act; 
S. 1297, Government Transformation Act of 2013 
</description>
         </specific_issues>
         
found in /home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300625771.xml

In [160]:
has_child_array['specific_issues'] = True

In [196]:
for il in at_least_one_file(LD2_files, 
                            find_nonempty_egs, 
                            xpath_query='.//updates'):
    print etree.tostring(il)


<updates>
      <clientAddress/>
      <clientCity/>
      <clientState/>
      <clientZip/>
      <clientZipext/>
      <clientCountry/>
      <prinClientCity/>
      <prinClientState/>
      <prinClientZip/>
      <prinClientZipext/>
      <prinClientCountry/>
      <generalDescription/>
      <inactive_lobbyists>
         <inactive_lobbyist>
            <firstName>Heather</firstName>
            <lastName>Dumont</lastName>
            <suffix/>
         </inactive_lobbyist>
         <inactive_lobbyist>
            <firstName/>
            <lastName/>
            <suffix/>
         </inactive_lobbyist>
         <inactive_lobbyist>
            <firstName/>
            <lastName/>
            <suffix/>
         </inactive_lobbyist>
         <inactive_lobbyist>
            <firstName/>
            <lastName/>
            <suffix/>
         </inactive_lobbyist>
         <inactive_lobbyist>
            <firstName/>
            <lastName/>
            <suffix/>
         </inactive_lobbyist>
         <inactive_lobbyist>
            <firstName/>
            <lastName/>
            <suffix/>
         </inactive_lobbyist>
         <inactive_lobbyist>
            <firstName/>
            <lastName/>
            <suffix/>
         </inactive_lobbyist>
         <inactive_lobbyist>
            <firstName/>
            <lastName/>
            <suffix/>
         </inactive_lobbyist>
      </inactive_lobbyists>
      <inactive_ALIs>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
         <ali_Code/>
      </inactive_ALIs>
      <affiliatedUrl/>
      <affiliatedOrgs>
         <affiliatedOrg>
            <affiliatedOrgName/>
            <affiliatedOrgAddress/>
            <affiliatedOrgCity/>
            <affiliatedOrgState/>
            <affiliatedOrgZip/>
            <affiliatedOrgCountry/>
            <affiliatedPrinOrgCity/>
            <affiliatedPrinOrgState/>
            <affiliatedPrinOrgCountry/>
         </affiliatedOrg>
         <affiliatedOrg>
            <affiliatedOrgName/>
            <affiliatedOrgAddress/>
            <affiliatedOrgCity/>
            <affiliatedOrgState/>
            <affiliatedOrgZip/>
            <affiliatedOrgCountry/>
            <affiliatedPrinOrgCity/>
            <affiliatedPrinOrgState/>
            <affiliatedPrinOrgCountry/>
         </affiliatedOrg>
         <affiliatedOrg>
            <affiliatedOrgName/>
            <affiliatedOrgAddress/>
            <affiliatedOrgCity/>
            <affiliatedOrgState/>
            <affiliatedOrgZip/>
            <affiliatedOrgCountry/>
            <affiliatedPrinOrgCity/>
            <affiliatedPrinOrgState/>
            <affiliatedPrinOrgCountry/>
         </affiliatedOrg>
         <affiliatedOrg>
            <affiliatedOrgName/>
            <affiliatedOrgAddress/>
            <affiliatedOrgCity/>
            <affiliatedOrgState/>
            <affiliatedOrgZip/>
            <affiliatedOrgCountry/>
            <affiliatedPrinOrgCity/>
            <affiliatedPrinOrgState/>
            <affiliatedPrinOrgCountry/>
         </affiliatedOrg>
      </affiliatedOrgs>
      <inactiveOrgs>
         <inactiveOrgName/>
         <inactiveOrgName/>
         <inactiveOrgName/>
         <inactiveOrgName/>
         <inactiveOrgName/>
         <inactiveOrgName/>
      </inactiveOrgs>
      <foreignEntities>
         <foreignEntity>
            <name/>
            <address/>
            <city/>
            <state/>
            <country/>
            <prinCity/>
            <prinState/>
            <prinCountry/>
            <contribution/>
            <ownership_Percentage/>
         </foreignEntity>
         <foreignEntity>
            <name/>
            <address/>
            <city/>
            <state/>
            <country/>
            <prinCity/>
            <prinState/>
            <prinCountry/>
            <contribution/>
            <ownership_Percentage/>
         </foreignEntity>
      </foreignEntities>
      <inactive_ForeignEntities>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
         <inactive_ForeignEntity/>
      </inactive_ForeignEntities>
   </updates>

found in /home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300625771.xml

In [197]:
has_child_array['updates'] = False

In [269]:
ERROR_FIELDS = ['ae', 'error']

In [267]:
ARRAY_FIELDS = [f for f,v in has_child_array.items() if v]

In [268]:
ARRAY_FIELDS


Out[268]:
['inactiveOrgs',
 'inactive_lobbyists',
 'federal_agencies',
 'inactive_ALIs',
 'lobbyists',
 'affiliatedOrgs',
 'alis',
 'specific_issues',
 'foreignEntities',
 'inactive_ForeignEntities']

General Approach


In [272]:
def _add_element_array(children, json_array):
    for c in children:
        new_obj = _add_element(c, {})
        json_array.append(new_obj[c.tag])
        

def _add_element(element, json_dict):
    children = element.getchildren()
    if children:
        if element.tag in ARRAY_FIELDS:
            json_dict[element.tag] = []
            _add_element_array(children, json_dict[element.tag])
        else:
            json_dict[element.tag] = {}
            for child in children:
                _add_element(child, json_dict[element.tag])    
    else:
        text = element.text or ''
        json_dict[element.tag] = text.strip()
    return json_dict

In [273]:
_add_element(LD1r, {})


Out[273]:
{'LOBBYINGDISCLOSURE1': {'address1': '2550 M STREET, NW',
  'address2': '',
  'affiliatedOrgs': [{'affiliatedOrgAddress': '',
    'affiliatedOrgCity': '',
    'affiliatedOrgCountry': '',
    'affiliatedOrgName': '',
    'affiliatedOrgState': '',
    'affiliatedOrgZip': '',
    'affiliatedPrinOrgCity': '',
    'affiliatedPrinOrgCountry': '',
    'affiliatedPrinOrgState': ''},
   {'affiliatedOrgAddress': '',
    'affiliatedOrgCity': '',
    'affiliatedOrgCountry': '',
    'affiliatedOrgName': '',
    'affiliatedOrgState': '',
    'affiliatedOrgZip': '',
    'affiliatedPrinOrgCity': '',
    'affiliatedPrinOrgCountry': '',
    'affiliatedPrinOrgState': ''},
   {'affiliatedOrgAddress': '',
    'affiliatedOrgCity': '',
    'affiliatedOrgCountry': '',
    'affiliatedOrgName': '',
    'affiliatedOrgState': '',
    'affiliatedOrgZip': '',
    'affiliatedPrinOrgCity': '',
    'affiliatedPrinOrgCountry': '',
    'affiliatedPrinOrgState': ''},
   {'affiliatedOrgAddress': '',
    'affiliatedOrgCity': '',
    'affiliatedOrgCountry': '',
    'affiliatedOrgName': '',
    'affiliatedOrgState': '',
    'affiliatedOrgZip': '',
    'affiliatedPrinOrgCity': '',
    'affiliatedPrinOrgCountry': '',
    'affiliatedPrinOrgState': ''},
   {'affiliatedOrgAddress': '',
    'affiliatedOrgCity': '',
    'affiliatedOrgCountry': '',
    'affiliatedOrgName': '',
    'affiliatedOrgState': '',
    'affiliatedOrgZip': '',
    'affiliatedPrinOrgCity': '',
    'affiliatedPrinOrgCountry': '',
    'affiliatedPrinOrgState': ''},
   {'affiliatedOrgAddress': '',
    'affiliatedOrgCity': '',
    'affiliatedOrgCountry': '',
    'affiliatedOrgName': '',
    'affiliatedOrgState': '',
    'affiliatedOrgZip': '',
    'affiliatedPrinOrgCity': '',
    'affiliatedPrinOrgCountry': '',
    'affiliatedPrinOrgState': ''},
   {'affiliatedOrgAddress': '',
    'affiliatedOrgCity': '',
    'affiliatedOrgCountry': '',
    'affiliatedOrgName': '',
    'affiliatedOrgState': '',
    'affiliatedOrgZip': '',
    'affiliatedPrinOrgCity': '',
    'affiliatedPrinOrgCountry': '',
    'affiliatedPrinOrgState': ''},
   {'affiliatedOrgAddress': '',
    'affiliatedOrgCity': '',
    'affiliatedOrgCountry': '',
    'affiliatedOrgName': '',
    'affiliatedOrgState': '',
    'affiliatedOrgZip': '',
    'affiliatedPrinOrgCity': '',
    'affiliatedPrinOrgCountry': '',
    'affiliatedPrinOrgState': ''},
   {'affiliatedOrgAddress': '',
    'affiliatedOrgCity': '',
    'affiliatedOrgCountry': '',
    'affiliatedOrgName': '',
    'affiliatedOrgState': '',
    'affiliatedOrgZip': '',
    'affiliatedPrinOrgCity': '',
    'affiliatedPrinOrgCountry': '',
    'affiliatedPrinOrgState': ''},
   {'affiliatedOrgAddress': '',
    'affiliatedOrgCity': '',
    'affiliatedOrgCountry': '',
    'affiliatedOrgName': '',
    'affiliatedOrgState': '',
    'affiliatedOrgZip': '',
    'affiliatedPrinOrgCity': '',
    'affiliatedPrinOrgCountry': '',
    'affiliatedPrinOrgState': ''},
   {'affiliatedOrgAddress': '',
    'affiliatedOrgCity': '',
    'affiliatedOrgCountry': '',
    'affiliatedOrgName': '',
    'affiliatedOrgState': '',
    'affiliatedOrgZip': '',
    'affiliatedPrinOrgCity': '',
    'affiliatedPrinOrgCountry': '',
    'affiliatedPrinOrgState': ''},
   {'affiliatedOrgAddress': '',
    'affiliatedOrgCity': '',
    'affiliatedOrgCountry': '',
    'affiliatedOrgName': '',
    'affiliatedOrgState': '',
    'affiliatedOrgZip': '',
    'affiliatedPrinOrgCity': '',
    'affiliatedPrinOrgCountry': '',
    'affiliatedPrinOrgState': ''}],
  'affiliatedUrl': '',
  'alis': ['TRA',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   '',
   ''],
  'city': 'WASHINGTON',
  'clientAddress': '300 New Jersey Avenue SE',
  'clientCity': 'Washington',
  'clientCountry': 'USA',
  'clientGeneralDescription': 'Global corporation focused on energy, health, smart cities and transportation.',
  'clientName': 'Siemens Corportation',
  'clientState': 'DC',
  'clientZip': '20001',
  'clientZipExt': '',
  'country': 'USA',
  'effectiveDate': '12/06/2013',
  'firstName': '',
  'foreignEntities': [{'address': '',
    'city': '',
    'contribution': '',
    'country': '',
    'name': '',
    'ownership_Percentage': '',
    'prinCity': '',
    'prinCountry': '',
    'prinState': '',
    'state': ''},
   {'address': '',
    'city': '',
    'contribution': '',
    'country': '',
    'name': '',
    'ownership_Percentage': '',
    'prinCity': '',
    'prinCountry': '',
    'prinState': '',
    'state': ''},
   {'address': '',
    'city': '',
    'contribution': '',
    'country': '',
    'name': '',
    'ownership_Percentage': '',
    'prinCity': '',
    'prinCountry': '',
    'prinState': '',
    'state': ''},
   {'address': '',
    'city': '',
    'contribution': '',
    'country': '',
    'name': '',
    'ownership_Percentage': '',
    'prinCity': '',
    'prinCountry': '',
    'prinState': '',
    'state': ''},
   {'address': '',
    'city': '',
    'contribution': '',
    'country': '',
    'name': '',
    'ownership_Percentage': '',
    'prinCity': '',
    'prinCountry': '',
    'prinState': '',
    'state': ''},
   {'address': '',
    'city': '',
    'contribution': '',
    'country': '',
    'name': '',
    'ownership_Percentage': '',
    'prinCity': '',
    'prinCountry': '',
    'prinState': '',
    'state': ''},
   {'address': '',
    'city': '',
    'contribution': '',
    'country': '',
    'name': '',
    'ownership_Percentage': '',
    'prinCity': '',
    'prinCountry': '',
    'prinState': '',
    'state': ''},
   {'address': '',
    'city': '',
    'contribution': '',
    'country': '',
    'name': '',
    'ownership_Percentage': '',
    'prinCity': '',
    'prinCountry': '',
    'prinState': '',
    'state': ''},
   {'address': '',
    'city': '',
    'contribution': '',
    'country': '',
    'name': '',
    'ownership_Percentage': '',
    'prinCity': '',
    'prinCountry': '',
    'prinState': '',
    'state': ''},
   {'address': '',
    'city': '',
    'contribution': '',
    'country': '',
    'name': '',
    'ownership_Percentage': '',
    'prinCity': '',
    'prinCountry': '',
    'prinState': '',
    'state': ''},
   {'address': '',
    'city': '',
    'contribution': '',
    'country': '',
    'name': '',
    'ownership_Percentage': '',
    'prinCity': '',
    'prinCountry': '',
    'prinState': '',
    'state': ''}],
  'houseID': '319170238',
  'imported': 'Y',
  'lastName': '',
  'lobbyists': [{'coveredPosition': 'DCSDOT99-01DAFedRailAdmin99-SofSt97-99DSPDOS96-97',
    'lobbyistFirstName': 'Norma',
    'lobbyistLastName': 'Krayem',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''},
   {'coveredPosition': '',
    'lobbyistFirstName': '',
    'lobbyistLastName': '',
    'lobbyistNew': 'Y',
    'lobbyistSuffix': ''}],
  'organizationName': 'Patton Boggs LLP',
  'pages': '2',
  'prefix': '',
  'prinClientCity': '',
  'prinClientCountry': '',
  'prinClientState': '',
  'prinClientZip': '',
  'prinClientZipExt': '',
  'principal_city': '',
  'principal_country': '',
  'principal_state': '',
  'principal_zip': '',
  'principal_zipext': '',
  'printedName': 'James B. Christian, Partner',
  'regType': '3',
  'registrantGeneralDescription': 'Law firm',
  'reportType': 'RA',
  'reportYear': '2013',
  'selfSelect': 'N',
  'senateID': '30906-2551',
  'signedDate': '01/09/2014',
  'specific_issues': 'High Speed Rail issues',
  'state': 'DC',
  'zip': '20037',
  'zipext': ''}}

In [274]:
_add_element(LD2r, {})


Out[274]:
{'LOBBYINGDISCLOSURE2': {'address1': '101 Constitution Ave, NW',
  'address2': '',
  'alis': [{'federal_agencies': 'U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE',
    'foreign_entity_issues': '',
    'issueAreaCode': 'BUD',
    'lobbyists': [{'coveredPosition': '',
      'lobbyistFirstName': 'John',
      'lobbyistLastName': "O'Donnell",
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': 'Kyriakos',
      'lobbyistLastName': 'Pagonis',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''}],
    'specific_issues': ['FY 14 Appropriations']},
   {'federal_agencies': 'U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE, Environmental Protection Agency (EPA)',
    'foreign_entity_issues': '',
    'issueAreaCode': 'ENV',
    'lobbyists': [{'coveredPosition': '',
      'lobbyistFirstName': 'John',
      'lobbyistLastName': "O'Donnell",
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': 'Kyriakos',
      'lobbyistLastName': 'Pagonis',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''}],
    'specific_issues': ['Halaco Site Remediation']},
   {'federal_agencies': 'U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE',
    'foreign_entity_issues': '',
    'issueAreaCode': 'TAX',
    'lobbyists': [{'coveredPosition': '',
      'lobbyistFirstName': 'John',
      'lobbyistLastName': "O'Donnell",
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': 'Kyriakos',
      'lobbyistLastName': 'Pagonis',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''}],
    'specific_issues': ['SBIR reauthorization']},
   {'federal_agencies': 'U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE, Bureau of Justice Assistance, Office of Justice Program',
    'foreign_entity_issues': '',
    'issueAreaCode': 'LAW',
    'lobbyists': [{'coveredPosition': '',
      'lobbyistFirstName': 'John',
      'lobbyistLastName': "O'Donnell",
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': 'Kyriakos',
      'lobbyistLastName': 'Pagonis',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''},
     {'coveredPosition': '',
      'lobbyistFirstName': '',
      'lobbyistLastName': '',
      'lobbyistNew': 'N',
      'lobbyistSuffix': ''}],
    'specific_issues': ['Gang Violence and Public Safety']}],
  'city': 'Washington',
  'clientGovtEntity': 'Y',
  'clientName': 'CITY OF OXNARD',
  'country': 'USA',
  'expenses': '',
  'expensesMethod': '',
  'firstName': '',
  'houseID': '317760034',
  'imported': 'N',
  'income': '8000.00',
  'lastName': '',
  'noLobbying': '',
  'organizationName': "Murray, Montgomery & O'Donnell",
  'pages': '5',
  'prefix': '',
  'principal_city': '',
  'principal_country': '',
  'principal_state': '',
  'principal_zip': '',
  'principal_zipext': '',
  'printedName': "John O'Donnell, Partner",
  'registrantDifferentAddress': 'N',
  'reportType': 'Q4',
  'reportYear': '2013',
  'selfSelect': 'N',
  'senateID': '26227-63',
  'signedDate': '01/21/2014',
  'state': 'DC',
  'submitURL': '',
  'terminationDate': '',
  'updates': {'affiliatedOrgs': [{'affiliatedOrgAddress': '',
     'affiliatedOrgCity': '',
     'affiliatedOrgCountry': '',
     'affiliatedOrgName': '',
     'affiliatedOrgState': '',
     'affiliatedOrgZip': '',
     'affiliatedPrinOrgCity': '',
     'affiliatedPrinOrgCountry': '',
     'affiliatedPrinOrgState': ''},
    {'affiliatedOrgAddress': '',
     'affiliatedOrgCity': '',
     'affiliatedOrgCountry': '',
     'affiliatedOrgName': '',
     'affiliatedOrgState': '',
     'affiliatedOrgZip': '',
     'affiliatedPrinOrgCity': '',
     'affiliatedPrinOrgCountry': '',
     'affiliatedPrinOrgState': ''}],
   'affiliatedUrl': '',
   'clientAddress': '',
   'clientCity': '',
   'clientCountry': '',
   'clientState': '',
   'clientZip': '',
   'clientZipext': '',
   'foreignEntities': [{'address': '',
     'city': '',
     'contribution': '',
     'country': '',
     'name': '',
     'ownership_Percentage': '',
     'prinCity': '',
     'prinCountry': '',
     'prinState': '',
     'state': ''}],
   'generalDescription': '',
   'inactiveOrgs': ['', '', ''],
   'inactive_ALIs': ['', '', '', '', '', '', '', '', ''],
   'inactive_ForeignEntities': ['', '', '', '', '', ''],
   'inactive_lobbyists': [{'firstName': '', 'lastName': '', 'suffix': ''},
    {'firstName': '', 'lastName': '', 'suffix': ''},
    {'firstName': '', 'lastName': '', 'suffix': ''},
    {'firstName': '', 'lastName': '', 'suffix': ''}],
   'prinClientCity': '',
   'prinClientCountry': '',
   'prinClientState': '',
   'prinClientZip': '',
   'prinClientZipext': ''},
  'zip': '20001',
  'zipext': ''}}

In [280]:
xml_filepath = '/home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD1/2013/ALL/300620793.xml'
os.path.splitext(os.path.basename(xml_filepath))


Out[280]:
('300620793', '.xml')

In [277]:
from tasks.utils import translate_dir

In [279]:
translate_dir(xml_filepath,
              from_dir=settings.ORIG_DIR,
              to_dir=settings.TRANS_DIR)


Out[279]:
('/home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD2/2013/Q4/300626594.xml',
 '/home/blannon/dev/influence-usa/lobbying-federal-domestic/data/transformed/house_xml/LD2/2013/Q4')

In [281]:
translate_dir(xml_filepath,
              from_dir=settings.ORIG_DIR,
              to_dir=settings.TRANS_DIR)


Out[281]:
('/home/blannon/dev/influence-usa/lobbying-federal-domestic/data/original/house_xml/LD1/2013/ALL/300620793.xml',
 '/home/blannon/dev/influence-usa/lobbying-federal-domestic/data/transformed/house_xml/LD1/2013/ALL')

In [ ]: