In [176]:
import sys
import os
import re
from glob import glob
import pprint
In [3]:
from lxml import etree
from collections import Counter
In [4]:
# Put the parent of the current working directory on the import path;
# presumably this is what lets the project-level `settings` module be
# imported in the next cell -- TODO confirm the notebook's working directory.
PROJ_DIR = os.pardir
sys.path.append(os.path.abspath(PROJ_DIR))
In [5]:
import settings
In [6]:
sys.path
Out[6]:
In [7]:
HOUSE_ORIG = os.path.join(settings.ORIG_DIR, 'house_xml')
In [8]:
!tree -d ../data/original/house_xml
In [9]:
LD1_DIR = os.path.join(HOUSE_ORIG,'LD1')
LD2_DIR = os.path.join(HOUSE_ORIG,'LD2')
In [183]:
def get_root_tag(fname):
    """Return the tag of the root element of the XML file at ``fname``.

    The file is opened in a ``with`` block so its handle is closed as soon as
    parsing finishes; this helper is mapped over very large file lists, where
    relying on garbage collection to close handles is fragile.  Binary mode
    lets the XML parser honour the document's own encoding declaration.

    Parse errors raised by ``etree.parse`` propagate to the caller.
    """
    with open(fname, 'rb') as fin:
        return etree.parse(fin).getroot().tag
def get_top_level_fields(fname):
    """Return the tags of the root element's direct children, in document order.

    The file handle is closed deterministically via ``with``.  Children are
    obtained by iterating the root element, which yields the same nodes as the
    deprecated ``getchildren()`` call.

    Parse errors raised by ``etree.parse`` propagate to the caller.
    """
    with open(fname, 'rb') as fin:
        root = etree.parse(fin).getroot()
    return [child.tag for child in root]
def fields_nonempty(node):
    """Tell whether ``node`` contains any non-whitespace text.

    Every text node at or below ``node`` (``.//text()``) is inspected; the
    result is True as soon as one of them still has characters left after all
    whitespace is removed.

    Returns False for ``None``.  (The original ``else`` branch evaluated
    ``False`` without returning it, so callers received ``None`` instead.)
    """
    if node is None:
        return False
    # any() works whether or not filter() would be lazy, and stops at the
    # first text node that carries real content.
    return any(re.sub(r'\s+', '', text) for text in node.xpath('.//text()'))
In [167]:
def all_files(file_list, inspect_function, **kwargs):
    """Lazily apply ``inspect_function`` to every file name in ``file_list``.

    Yields ``inspect_function(fname, **kwargs)`` for each file, in order.
    Files that raise ``etree.XMLSyntaxError`` are reported on stderr and
    skipped; any other exception propagates.

    Each problem is logged as one newline-terminated line with the file name
    and the parser message separated, instead of two unterminated writes that
    ran together.
    """
    for _fname in file_list:
        try:
            result = inspect_function(_fname, **kwargs)
        except etree.XMLSyntaxError as e:
            sys.stderr.write("issue with {fn}: {err}\n".format(fn=_fname, err=e))
            continue
        # Yield outside the try so only the inspection itself is guarded.
        yield result
def at_least_one_file(file_list, inspect_function, **kwargs):
    """Return the first truthy ``inspect_function`` result found in ``file_list``.

    Files are tried in order.  The first file for which
    ``inspect_function(fname, **kwargs)`` is truthy is announced on stderr and
    its result is returned.  Files that raise ``etree.XMLSyntaxError`` are
    reported on stderr and skipped; any other exception propagates.

    When no file produces a truthy result an empty list is returned, so
    callers that iterate over the result (``for x in at_least_one_file(...)``)
    get an empty loop instead of ``TypeError: 'NoneType' object is not
    iterable``.  The empty list is still falsy for ``if result:`` checks.
    """
    for _fname in file_list:
        try:
            result = inspect_function(_fname, **kwargs)
        except etree.XMLSyntaxError as e:
            sys.stderr.write("issue with {fn}: {err}\n".format(fn=_fname, err=e))
            continue
        if result:
            sys.stderr.write("found in {fn}\n".format(fn=_fname))
            return result
    return []
def find_egs(_fname, **kwargs):
    """Run an XPath query against one XML file and return its matches.

    Keyword arguments:
        xpath_query -- XPath expression evaluated relative to the root element
                       (required; a missing key raises ``KeyError``).

    Returns the XPath result when it is truthy, otherwise ``None``.
    """
    query = kwargs['xpath_query']
    with open(_fname) as handle:
        matches = etree.parse(handle).getroot().xpath(query)
    return matches if matches else None
def find_all_parents(_fname):
    """Collect the tags of all elements in one XML file that have children.

    Every descendant of the root (``.//*``) is visited and the tag of its
    parent is recorded.  Returns a set of tags, empty when the root has no
    descendants.
    """
    with open(_fname) as handle:
        root = etree.parse(handle).getroot()
    return set(node.getparent().tag for node in root.xpath('.//*'))
def find_nonempty_egs(_fname, **kwargs):
    """Return the XPath matches in one XML file that contain real text.

    Keyword arguments:
        xpath_query -- XPath expression evaluated relative to the root element
                       (required; a missing key raises ``KeyError``).

    Matches are kept only when ``fields_nonempty`` accepts them.  Returns the
    list of surviving matches, or ``None`` when there are none.
    """
    query = kwargs['xpath_query']
    with open(_fname) as handle:
        candidates = etree.parse(handle).getroot().xpath(query)
    with_text = [node for node in candidates if fields_nonempty(node)]
    return with_text if with_text else None
def find_egs_with_multiple_children(_fname, **kwargs):
    """Return the XPath matches in one XML file that have two or more children.

    Keyword arguments:
        xpath_query -- XPath expression evaluated relative to the root element
                       (required; a missing key raises ``KeyError``).

    Returns the list of qualifying matches, or ``None`` when there are none.
    """
    query = kwargs['xpath_query']
    with open(_fname) as handle:
        candidates = etree.parse(handle).getroot().xpath(query)
    multi = [node for node in candidates if len(node.getchildren()) > 1]
    return multi if multi else None
In [12]:
LD1_files = glob(os.path.join(LD1_DIR, '*', '*', '*.xml'))
In [13]:
LD1_files[0:5]
Out[13]:
In [14]:
len(LD1_files)
Out[14]:
In [15]:
LD1tree = etree.parse(open(LD1_files[0]))
In [16]:
LD1tree.getroot()
Out[16]:
In [17]:
LD1r = LD1tree.getroot()
In [18]:
LD1r.tag
Out[18]:
In [67]:
LD1c = LD1r.getchildren()[0]
In [68]:
LD1c.getparent()
Out[68]:
In [69]:
LD1r.getparent()
In [130]:
#formtypes = Counter()
#for f in LD1_files:
# tree = etree.parse(open(f))
# formtypes.update([tree.getroot().tag,])
#formtypes
Out[130]:
In [19]:
print etree.tostring(LD1tree, pretty_print=True)
In [25]:
#LD1_top_level_counts = Counter()
#for children_list in all_files(LD1_files, get_top_level_fields):
# LD1_top_level_counts.update(children_list)
#LD1_top_level_counts
LD1_top_level_counts = Counter({'selfSelect': 55653, 'prefix': 55653, 'principal_zip': 55653, 'houseID': 55653, 'address1': 55653, 'address2': 55653, 'organizationName': 55653, 'clientName': 55653, 'printedName': 55653, 'senateID': 55653, 'principal_zipext': 55653, 'state': 55653, 'reportYear': 55653, 'lastName': 55653, 'zipext': 55653, 'city': 55653, 'zip': 55653, 'reportType': 55653, 'firstName': 55653, 'country': 55653, 'alis': 55653, 'principal_state': 55653, 'signedDate': 55653, 'principal_city': 55653, 'pages': 55653, 'principal_country': 55653, 'registrantGeneralDescription': 55652, 'clientAddress': 55652, 'foreignEntities': 55652, 'clientCity': 55652, 'prinClientCountry': 55652, 'effectiveDate': 55652, 'prinClientState': 55652, 'regType': 55652, 'clientState': 55652, 'specific_issues': 55652, 'lobbyists': 55652, 'clientCountry': 55652, 'prinClientZipExt': 55652, 'affiliatedOrgs': 55652, 'clientGeneralDescription': 55652, 'prinClientCity': 55652, 'clientZipExt': 55652, 'prinClientZip': 55652, 'clientZip': 55652, 'affiliatedUrl': 36819, 'imported': 32332, 'signerEmail': 22722, 'clientGovtEntity': 4487, 'contactIntlPhone': 3087, 'contactPhone': 422, 'contactPrefix': 418, 'contactEmail': 418, 'contactName': 418, '{http://www.PureEdge.com/XFDL/Custom}clientCity': 27, '{http://www.PureEdge.com/XFDL/Custom}specific_issues': 27, '{http://www.PureEdge.com/XFDL/Custom}principal_zipext': 27, '{http://www.PureEdge.com/XFDL/Custom}organizationName': 27, '{http://www.PureEdge.com/XFDL/Custom}principal_city': 27, '{http://www.PureEdge.com/XFDL/Custom}principal_zip': 27, '{http://www.PureEdge.com/XFDL/Custom}clientName': 27, '{http://www.PureEdge.com/XFDL/Custom}city': 27, '{http://www.PureEdge.com/XFDL/Custom}clientZip': 27, '{http://www.PureEdge.com/XFDL/Custom}prinClientCity': 27, '{http://www.PureEdge.com/XFDL/Custom}foreignEntities': 27, '{http://www.PureEdge.com/XFDL/Custom}regType': 27, '{http://www.PureEdge.com/XFDL/Custom}senateID': 27, 
'{http://www.PureEdge.com/XFDL/Custom}prefix': 27, '{http://www.PureEdge.com/XFDL/Custom}effectiveDate': 27, '{http://www.PureEdge.com/XFDL/Custom}state': 27, '{http://www.PureEdge.com/XFDL/Custom}registrantGeneralDescription': 27, '{http://www.PureEdge.com/XFDL/Custom}alis': 27, '{http://www.PureEdge.com/XFDL/Custom}lastName': 27, '{http://www.PureEdge.com/XFDL/Custom}prinClientCountry': 27, '{http://www.PureEdge.com/XFDL/Custom}prinClientZip': 27, '{http://www.PureEdge.com/XFDL/Custom}lobbyists': 27, '{http://www.PureEdge.com/XFDL/Custom}reportType': 27, '{http://www.PureEdge.com/XFDL/Custom}clientZipExt': 27, '{http://www.PureEdge.com/XFDL/Custom}reportYear': 27, '{http://www.PureEdge.com/XFDL/Custom}selfSelect': 27, '{http://www.PureEdge.com/XFDL/Custom}country': 27, '{http://www.PureEdge.com/XFDL/Custom}clientState': 27, '{http://www.PureEdge.com/XFDL/Custom}zip': 27, '{http://www.PureEdge.com/XFDL/Custom}firstName': 27, '{http://www.PureEdge.com/XFDL/Custom}signedDate': 27, '{http://www.PureEdge.com/XFDL/Custom}zipext': 27, '{http://www.PureEdge.com/XFDL/Custom}address1': 27, '{http://www.PureEdge.com/XFDL/Custom}principal_state': 27, '{http://www.PureEdge.com/XFDL/Custom}principal_country': 27, '{http://www.PureEdge.com/XFDL/Custom}prinClientState': 27, '{http://www.PureEdge.com/XFDL/Custom}pages': 27, '{http://www.PureEdge.com/XFDL/Custom}affiliatedOrgs': 27, '{http://www.PureEdge.com/XFDL/Custom}clientGeneralDescription': 27, '{http://www.PureEdge.com/XFDL/Custom}clientAddress': 27, '{http://www.PureEdge.com/XFDL/Custom}prinClientZipExt': 27, '{http://www.PureEdge.com/XFDL/Custom}clientCountry': 27, '{http://www.PureEdge.com/XFDL/Custom}houseID': 27, '{http://www.PureEdge.com/XFDL/Custom}address2': 27, '{http://www.PureEdge.com/XFDL/Custom}signerEmail': 27, '{http://www.PureEdge.com/XFDL/Custom}printedName': 27, '{http://www.PureEdge.com/XFDL/Custom}contactEmail': 20, '{http://www.PureEdge.com/XFDL/Custom}contactPrefix': 20, 
'{http://www.PureEdge.com/XFDL/Custom}contactName': 20, '{http://www.PureEdge.com/XFDL/Custom}contactPhone': 20, '{http://www.PureEdge.com/XFDL/Custom}contactIntlPhone': 20, 'noLobbying': 1, 'updates': 1, 'registrantDifferentAddress': 1, 'terminationDate': 1, 'submitURL': 1, 'expensesMethod': 1, 'expenses': 1, 'income': 1})
In [28]:
LD1_sorted_top_level_counts = sorted(LD1_top_level_counts.items(), key=lambda x: -x[1])
LD1_sorted_top_level_counts
Out[28]:
In [29]:
sorted(LD1_top_level_counts.items(), key=lambda x: x[0])
Out[29]:
In [54]:
LD1_top_level_counts.keys()
Out[54]:
In [55]:
# For every non-namespaced top-level tag, look for one file in which that tag
# has child elements and remember the children found.
#
# find_egs() inspects a single file and takes its query as the `xpath_query`
# keyword, so it is driven through at_least_one_file() rather than being
# handed the whole file list and a positional query (which raised TypeError).
# NOTE: tags that never have children cause a scan of every file in LD1_files.
LD1_parents = {}
tags = [t for t in LD1_top_level_counts.keys() if 'PureEdge' not in t]
for tag in tags:
    egs = at_least_one_file(LD1_files,
                            find_egs,
                            xpath_query='{tag}/*'.format(tag=tag))
    if egs:
        LD1_parents[tag] = egs
In [82]:
# Union of the parent-tag sets of every parseable LD1 file.
LD1_parents = set()
for parentset in all_files(LD1_files, find_all_parents):
    LD1_parents |= parentset
In [85]:
LD1_parents
Out[85]:
In [209]:
for x in at_least_one_file(LD1_files, find_nonempty_egs, xpath_query='.//affiliatedUrl'):
print etree.tostring(x)
In [30]:
LD2_files = glob(os.path.join(LD2_DIR, '*', '*', '*.xml'))
In [31]:
LD2_files[0:5]
Out[31]:
In [32]:
len(LD2_files)
Out[32]:
In [33]:
LD2tree = etree.parse(open(LD2_files[0]))
In [34]:
print etree.tostring(LD2tree, pretty_print=True)
In [35]:
LD2r = LD2tree.getroot()
In [36]:
LD2r.tag
Out[36]:
In [142]:
#formtypes = Counter()
#for f in LD2_files:
# try:
# tree = etree.parse(open(f))
# formtypes.update([tree.getroot().tag,])
# except Exception as e:
# print str(e)
# print f
#formtypes
Out[142]:
In [37]:
formtypes = Counter({'LOBBYINGDISCLOSURE2': 668905, '{http://www.PureEdge.com/XFDL/Custom}LOBBYINGDISCLOSURE2': 97})
formtypes
Out[37]:
In [145]:
#tag_counts = Counter(all_files(LD2_files, get_root_tag))
In [38]:
tag_counts = Counter({'LOBBYINGDISCLOSURE2': 668905, '{http://www.PureEdge.com/XFDL/Custom}LOBBYINGDISCLOSURE2': 97})
tag_counts
Out[38]:
In [166]:
#ld2_top_level_counts = Counter()
#for children_list in all_files(LD2_files, get_top_level_fields):
# ld2_top_level_counts.update(children_list)
#ld2_top_level_counts
Out[166]:
In [39]:
ld2_top_level_counts = Counter({'zipext': 668905, 'printedName': 668905, 'principal_state': 668905, 'selfSelect': 668905, 'signedDate': 668905, 'expenses': 668905, 'prefix': 668905, 'city': 668905, 'senateID': 668905, 'zip': 668905, 'noLobbying': 668905, 'principal_zipext': 668905, 'state': 668905, 'reportType': 668905, 'principal_city': 668905, 'principal_zip': 668905, 'terminationDate': 668905, 'submitURL': 668905, 'houseID': 668905, 'address1': 668905, 'address2': 668905, 'income': 668905, 'updates': 668905, 'reportYear': 668905, 'pages': 668905, 'expensesMethod': 668905, 'organizationName': 668905, 'firstName': 668905, 'lastName': 668905, 'registrantDifferentAddress': 668905, 'country': 668905, 'alis': 668905, 'clientName': 668905, 'principal_country': 668904, 'clientGovtEntity': 536861, 'imported': 500575, 'signerEmail': 320584, 'contactIntlPhone': 37472, 'contactPhone': 209, 'contactPrefix': 179, 'contactEmail': 178, 'contactName': 178, '{http://www.PureEdge.com/XFDL/Custom}expensesMethod': 97, '{http://www.PureEdge.com/XFDL/Custom}firstName': 97, '{http://www.PureEdge.com/XFDL/Custom}prefix': 97, '{http://www.PureEdge.com/XFDL/Custom}houseID': 97, '{http://www.PureEdge.com/XFDL/Custom}senateID': 97, '{http://www.PureEdge.com/XFDL/Custom}registrantDifferentAddress': 97, '{http://www.PureEdge.com/XFDL/Custom}noLobbying': 97, '{http://www.PureEdge.com/XFDL/Custom}zip': 97, '{http://www.PureEdge.com/XFDL/Custom}submitURL': 97, '{http://www.PureEdge.com/XFDL/Custom}signedDate': 97, '{http://www.PureEdge.com/XFDL/Custom}contactIntlPhone': 97, '{http://www.PureEdge.com/XFDL/Custom}alis': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_zipext': 97, '{http://www.PureEdge.com/XFDL/Custom}income': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_city': 97, '{http://www.PureEdge.com/XFDL/Custom}clientName': 97, '{http://www.PureEdge.com/XFDL/Custom}selfSelect': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_zip': 97, 
'{http://www.PureEdge.com/XFDL/Custom}expenses': 97, '{http://www.PureEdge.com/XFDL/Custom}zipext': 97, '{http://www.PureEdge.com/XFDL/Custom}address2': 97, '{http://www.PureEdge.com/XFDL/Custom}address1': 97, '{http://www.PureEdge.com/XFDL/Custom}state': 97, '{http://www.PureEdge.com/XFDL/Custom}pages': 97, '{http://www.PureEdge.com/XFDL/Custom}reportType': 97, '{http://www.PureEdge.com/XFDL/Custom}organizationName': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_country': 97, '{http://www.PureEdge.com/XFDL/Custom}country': 97, '{http://www.PureEdge.com/XFDL/Custom}lastName': 97, '{http://www.PureEdge.com/XFDL/Custom}reportYear': 97, '{http://www.PureEdge.com/XFDL/Custom}city': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_state': 97, '{http://www.PureEdge.com/XFDL/Custom}updates': 97, '{http://www.PureEdge.com/XFDL/Custom}signerEmail': 97, '{http://www.PureEdge.com/XFDL/Custom}printedName': 97, '{http://www.PureEdge.com/XFDL/Custom}terminationDate': 97, '{http://www.PureEdge.com/XFDL/Custom}contactPhone': 95, '{http://www.PureEdge.com/XFDL/Custom}contactEmail': 95, '{http://www.PureEdge.com/XFDL/Custom}contactName': 95, '{http://www.PureEdge.com/XFDL/Custom}contactPrefix': 95, '{http://www.PureEdge.com/XFDL/Custom}imported': 2, '{http://www.PureEdge.com/XFDL/Custom}clientGovtEntity': 2})
In [40]:
sorted(ld2_top_level_counts.items(), key=lambda x: x[0])
Out[40]:
In [41]:
sorted(ld2_top_level_counts.items(), key=lambda x: -x[1])
Out[41]:
In [86]:
# Union of the parent-tag sets of every parseable LD2 file.
LD2_parents = set()
for parentset in all_files(LD2_files, find_all_parents):
    LD2_parents |= parentset
In [87]:
LD2_parents
Out[87]:
In [135]:
# Maps an element tag to a bool; entries are filled in by hand in the cells
# below after inspecting example elements for each tag.
has_child_array = {}
In [134]:
for fa in at_least_one_file(LD2_files,
find_egs,
xpath_query='.//ae'):
print etree.tostring(fa)
In [98]:
for fa in at_least_one_file(LD2_files,
find_egs,
xpath_query='.//federal_agencies'):
print etree.tostring(fa)
In [136]:
has_child_array['federal_agencies'] = True
In [185]:
for il in at_least_one_file(LD2_files,
find_nonempty_egs,
xpath_query='.//affiliatedOrgs'):
print etree.tostring(il)
In [139]:
has_child_array['affiliatedOrgs'] = True
In [140]:
for il in at_least_one_file(LD2_files,
find_egs,
xpath_query='.//alis'):
print etree.tostring(il)
In [141]:
has_child_array['alis'] = True
In [102]:
for il in at_least_one_file(LD2_files,
find_egs,
xpath_query='.//error'):
print etree.tostring(il)
In [186]:
for il in at_least_one_file(LD2_files,
find_nonempty_egs,
xpath_query='.//foreignEntities'):
print etree.tostring(il)
In [143]:
has_child_array['foreignEntities'] = True
In [187]:
for il in at_least_one_file(LD2_files,
find_nonempty_egs,
xpath_query='.//inactive_ALIs'):
print etree.tostring(il)
In [144]:
has_child_array['inactive_ALIs'] = True
In [188]:
for il in at_least_one_file(LD2_files,
find_nonempty_egs,
xpath_query='.//inactiveOrgs'):
print etree.tostring(il)
In [189]:
has_child_array['inactiveOrgs'] = True
In [190]:
for il in at_least_one_file(LD2_files,
find_nonempty_egs,
xpath_query='.//inactive_ForeignEntities'):
print etree.tostring(il)
In [191]:
has_child_array['inactive_ForeignEntities'] = True
In [192]:
for il in at_least_one_file(LD2_files,
find_nonempty_egs,
xpath_query='.//inactive_lobbyists'):
print etree.tostring(il)
In [193]:
has_child_array['inactive_lobbyists'] = True
In [194]:
for il in at_least_one_file(LD2_files,
find_nonempty_egs,
xpath_query='.//lobbyists'):
print etree.tostring(il)
In [195]:
has_child_array['lobbyists'] = True
In [153]:
for il in at_least_one_file(LD2_files,
find_egs,
xpath_query='.//specific_issues'):
print etree.tostring(il)
In [159]:
for il in at_least_one_file(LD2_files,
find_egs_with_multiple_children,
xpath_query='.//specific_issues'):
print etree.tostring(il)
In [160]:
has_child_array['specific_issues'] = True
In [196]:
for il in at_least_one_file(LD2_files,
find_nonempty_egs,
xpath_query='.//updates'):
print etree.tostring(il)
In [197]:
has_child_array['updates'] = False
In [269]:
# Tags probed with the './/ae' and './/error' queries above.  Not referenced
# by any other cell visible in this notebook.
ERROR_FIELDS = ['ae', 'error']
In [267]:
# Tags flagged True in has_child_array; _add_element emits a list (rather
# than a dict) for any element whose tag is in this list.
ARRAY_FIELDS = [f for f,v in has_child_array.items() if v]
In [268]:
ARRAY_FIELDS
Out[268]:
In [272]:
def _add_element_array(children, json_array):
    """Append the converted form of each element in ``children`` to ``json_array``.

    Each child is converted with ``_add_element`` into a fresh dict, and the
    value stored under the child's own tag is appended, so the array holds the
    children's contents without their tag names.  Mutates ``json_array`` in
    place and returns ``None``.
    """
    json_array.extend(_add_element(child, {})[child.tag] for child in children)
def _add_element(element, json_dict):
    """Store the converted form of ``element`` in ``json_dict`` under its tag.

    * Leaf element (no children): its text, stripped of surrounding
      whitespace, with missing text treated as ``''``.
    * Element whose tag is in ``ARRAY_FIELDS``: a list built by
      ``_add_element_array`` from its children.
    * Any other element with children: a dict built by converting each child
      recursively.  Children sharing a tag overwrite one another, so only the
      last one is kept.

    Mutates ``json_dict`` in place and also returns it.
    """
    tag = element.tag
    kids = element.getchildren()
    if not kids:
        json_dict[tag] = (element.text or '').strip()
    elif tag in ARRAY_FIELDS:
        items = []
        json_dict[tag] = items
        _add_element_array(kids, items)
    else:
        nested = {}
        json_dict[tag] = nested
        for kid in kids:
            _add_element(kid, nested)
    return json_dict
In [273]:
_add_element(LD1r, {})
Out[273]:
In [274]:
_add_element(LD2r, {})
Out[274]:
In [280]:
# Sample LD1 filing used to try out translate_dir below.  The path is built
# from LD1_DIR (itself derived from settings.ORIG_DIR) instead of a
# hard-coded home-directory path, so the cell works on any checkout and the
# path shares its prefix with the from_dir passed to translate_dir.
xml_filepath = os.path.join(LD1_DIR, '2013', 'ALL', '300620793.xml')
os.path.splitext(os.path.basename(xml_filepath))
Out[280]:
In [277]:
from tasks.utils import translate_dir
In [279]:
translate_dir(xml_filepath,
from_dir=settings.ORIG_DIR,
to_dir=settings.TRANS_DIR)
Out[279]:
In [281]:
translate_dir(xml_filepath,
from_dir=settings.ORIG_DIR,
to_dir=settings.TRANS_DIR)
Out[281]:
In [ ]: