In [176]:
    
import sys
import os
import re
from glob import glob
import pprint
    
In [3]:
    
from lxml import etree
from collections import Counter
    
In [4]:
    
# Put the parent directory (as an absolute path) on the import search path.
# NOTE(review): the following `import settings` cell presumably depends on
# this -- confirm that settings.py lives one level above the notebook.
PROJ_DIR = os.pardir
sys.path.append(os.path.abspath(PROJ_DIR))
    
In [5]:
    
import settings
    
In [6]:
    
sys.path
    
    Out[6]:
In [7]:
    
HOUSE_ORIG = os.path.join(settings.ORIG_DIR, 'house_xml')
    
In [8]:
    
!tree -d ../data/original/house_xml
    
    
In [9]:
    
LD1_DIR = os.path.join(HOUSE_ORIG,'LD1')
LD2_DIR = os.path.join(HOUSE_ORIG,'LD2')
    
In [183]:
    
def get_root_tag(fname):
    """Return the tag of the root element of the XML file at ``fname``.

    Any error raised while opening or parsing the file propagates to the
    caller.
    """
    # The ``with`` block closes the handle deterministically; this helper is
    # called once per file over very large file lists, so leaving the close
    # to the garbage collector can exhaust file descriptors. Binary mode lets
    # the parser honour the document's own encoding declaration.
    with open(fname, 'rb') as fin:
        return etree.parse(fin).getroot().tag
def get_top_level_fields(fname):
    """Return the tags of the root element's direct children, in document order.

    Repeated tags appear once per occurrence. Any error raised while opening
    or parsing the file propagates to the caller.
    """
    # Close the handle as soon as parsing is done instead of leaking it.
    with open(fname, 'rb') as fin:
        root = etree.parse(fin).getroot()
    # Iterating an element yields its direct children; this replaces the
    # deprecated ``getchildren()`` call.
    return [child.tag for child in root]
def fields_nonempty(node):
    """Return True if ``node`` contains any text that is not pure whitespace.

    ``node`` must provide an lxml-style ``xpath`` method; every text node
    selected by ``.//text()`` is inspected. ``None`` yields False.
    """
    if node is None:
        # Return an explicit False (rather than falling through to None) so
        # the function always hands back a real boolean.
        return False
    # A text node counts when something is left after removing all
    # whitespace; ``any`` stops at the first such node and works whether or
    # not the surrounding interpreter's ``filter`` is lazy.
    return any(re.sub(r'\s+', '', text) for text in node.xpath('.//text()'))
    
In [167]:
    
def all_files(file_list, inspect_function, **kwargs):
    """Lazily apply ``inspect_function`` to every path in ``file_list``.

    Yields one result per file, in order. Extra keyword arguments are
    forwarded to ``inspect_function`` unchanged. A file that raises
    ``etree.XMLSyntaxError`` is reported on stderr and skipped; any other
    exception propagates.
    """
    for path in file_list:
        try:
            yield inspect_function(path, **kwargs)
        except etree.XMLSyntaxError as err:
            # Note the unparseable file, then fall through to the next path.
            sys.stderr.write("issue with {fn}".format(fn=path))
            sys.stderr.write(str(err))
            
def at_least_one_file(file_list, inspect_function, **kwargs):
    """Return the first truthy ``inspect_function`` result over ``file_list``.

    Files are tried in order and extra keyword arguments are forwarded to
    ``inspect_function``. The matching file name is written to stderr. Files
    that raise ``etree.XMLSyntaxError`` are reported on stderr and skipped.
    Returns None when no file produces a truthy result.
    """
    for path in file_list:
        try:
            outcome = inspect_function(path, **kwargs)
            if not outcome:
                continue
            sys.stderr.write("found in {fn}".format(fn=path))
            return outcome
        except etree.XMLSyntaxError as err:
            sys.stderr.write("issue with {fn}".format(fn=path))
            sys.stderr.write(str(err))
def find_egs(_fname, **kwargs):
    """Return what ``kwargs['xpath_query']`` selects in the file ``_fname``.

    The query is evaluated against the document's root element. The result
    is returned when truthy; otherwise the function returns None. A missing
    ``xpath_query`` keyword raises KeyError before the file is opened.
    """
    query = kwargs['xpath_query']
    with open(_fname) as handle:
        matches = etree.parse(handle).getroot().xpath(query)
    return matches if matches else None
     
def find_all_parents(_fname):
    """Return the set of tags of all elements in ``_fname`` that have a child element.

    Every descendant element of the root is visited and the tag of its
    parent is collected. A document whose root has no child elements gives
    an empty set.
    """
    with open(_fname) as handle:
        root = etree.parse(handle).getroot()
        return {node.getparent().tag for node in root.xpath('.//*')}
def find_nonempty_egs(_fname, **kwargs):
    """Return the matches of ``kwargs['xpath_query']`` in ``_fname`` that hold text.

    Matches are kept only when ``fields_nonempty`` is truthy for them. The
    kept matches are returned as a list; when none remain the function
    returns None. A missing ``xpath_query`` keyword raises KeyError before
    the file is opened.
    """
    query = kwargs['xpath_query']
    with open(_fname) as handle:
        root = etree.parse(handle).getroot()
        kept = [node for node in root.xpath(query) if fields_nonempty(node)]
    return kept if kept else None
        
def find_egs_with_multiple_children(_fname, **kwargs):
    """Return the matches of ``kwargs['xpath_query']`` in ``_fname`` with 2+ children.

    A match is kept when ``getchildren()`` reports more than one child. The
    kept matches are returned as a list; when none remain the function
    returns None. A missing ``xpath_query`` keyword raises KeyError before
    the file is opened.
    """
    query = kwargs['xpath_query']
    with open(_fname) as handle:
        root = etree.parse(handle).getroot()
        kept = [node for node in root.xpath(query) if len(node.getchildren()) > 1]
    return kept if kept else None
    
In [12]:
    
LD1_files = glob(os.path.join(LD1_DIR, '*', '*', '*.xml'))
    
In [13]:
    
LD1_files[0:5]
    
    Out[13]:
In [14]:
    
len(LD1_files)
    
    Out[14]:
In [15]:
    
LD1tree = etree.parse(open(LD1_files[0]))
    
In [16]:
    
LD1tree.getroot()
    
    Out[16]:
In [17]:
    
LD1r = LD1tree.getroot()
    
In [18]:
    
LD1r.tag
    
    Out[18]:
In [67]:
    
LD1c = LD1r.getchildren()[0]
    
In [68]:
    
LD1c.getparent()
    
    Out[68]:
In [69]:
    
LD1r.getparent()
    
In [130]:
    
#formtypes = Counter()
#for f in LD1_files:
#    tree = etree.parse(open(f))
#    formtypes.update([tree.getroot().tag,])
#formtypes
    
    Out[130]:
In [19]:
    
print etree.tostring(LD1tree, pretty_print=True)
    
    
In [25]:
    
#LD1_top_level_counts = Counter()
#for children_list in all_files(LD1_files, get_top_level_fields):
#    LD1_top_level_counts.update(children_list)
#LD1_top_level_counts
LD1_top_level_counts = Counter({'selfSelect': 55653, 'prefix': 55653, 'principal_zip': 55653, 'houseID': 55653, 'address1': 55653, 'address2': 55653, 'organizationName': 55653, 'clientName': 55653, 'printedName': 55653, 'senateID': 55653, 'principal_zipext': 55653, 'state': 55653, 'reportYear': 55653, 'lastName': 55653, 'zipext': 55653, 'city': 55653, 'zip': 55653, 'reportType': 55653, 'firstName': 55653, 'country': 55653, 'alis': 55653, 'principal_state': 55653, 'signedDate': 55653, 'principal_city': 55653, 'pages': 55653, 'principal_country': 55653, 'registrantGeneralDescription': 55652, 'clientAddress': 55652, 'foreignEntities': 55652, 'clientCity': 55652, 'prinClientCountry': 55652, 'effectiveDate': 55652, 'prinClientState': 55652, 'regType': 55652, 'clientState': 55652, 'specific_issues': 55652, 'lobbyists': 55652, 'clientCountry': 55652, 'prinClientZipExt': 55652, 'affiliatedOrgs': 55652, 'clientGeneralDescription': 55652, 'prinClientCity': 55652, 'clientZipExt': 55652, 'prinClientZip': 55652, 'clientZip': 55652, 'affiliatedUrl': 36819, 'imported': 32332, 'signerEmail': 22722, 'clientGovtEntity': 4487, 'contactIntlPhone': 3087, 'contactPhone': 422, 'contactPrefix': 418, 'contactEmail': 418, 'contactName': 418, '{http://www.PureEdge.com/XFDL/Custom}clientCity': 27, '{http://www.PureEdge.com/XFDL/Custom}specific_issues': 27, '{http://www.PureEdge.com/XFDL/Custom}principal_zipext': 27, '{http://www.PureEdge.com/XFDL/Custom}organizationName': 27, '{http://www.PureEdge.com/XFDL/Custom}principal_city': 27, '{http://www.PureEdge.com/XFDL/Custom}principal_zip': 27, '{http://www.PureEdge.com/XFDL/Custom}clientName': 27, '{http://www.PureEdge.com/XFDL/Custom}city': 27, '{http://www.PureEdge.com/XFDL/Custom}clientZip': 27, '{http://www.PureEdge.com/XFDL/Custom}prinClientCity': 27, '{http://www.PureEdge.com/XFDL/Custom}foreignEntities': 27, '{http://www.PureEdge.com/XFDL/Custom}regType': 27, '{http://www.PureEdge.com/XFDL/Custom}senateID': 27, 
'{http://www.PureEdge.com/XFDL/Custom}prefix': 27, '{http://www.PureEdge.com/XFDL/Custom}effectiveDate': 27, '{http://www.PureEdge.com/XFDL/Custom}state': 27, '{http://www.PureEdge.com/XFDL/Custom}registrantGeneralDescription': 27, '{http://www.PureEdge.com/XFDL/Custom}alis': 27, '{http://www.PureEdge.com/XFDL/Custom}lastName': 27, '{http://www.PureEdge.com/XFDL/Custom}prinClientCountry': 27, '{http://www.PureEdge.com/XFDL/Custom}prinClientZip': 27, '{http://www.PureEdge.com/XFDL/Custom}lobbyists': 27, '{http://www.PureEdge.com/XFDL/Custom}reportType': 27, '{http://www.PureEdge.com/XFDL/Custom}clientZipExt': 27, '{http://www.PureEdge.com/XFDL/Custom}reportYear': 27, '{http://www.PureEdge.com/XFDL/Custom}selfSelect': 27, '{http://www.PureEdge.com/XFDL/Custom}country': 27, '{http://www.PureEdge.com/XFDL/Custom}clientState': 27, '{http://www.PureEdge.com/XFDL/Custom}zip': 27, '{http://www.PureEdge.com/XFDL/Custom}firstName': 27, '{http://www.PureEdge.com/XFDL/Custom}signedDate': 27, '{http://www.PureEdge.com/XFDL/Custom}zipext': 27, '{http://www.PureEdge.com/XFDL/Custom}address1': 27, '{http://www.PureEdge.com/XFDL/Custom}principal_state': 27, '{http://www.PureEdge.com/XFDL/Custom}principal_country': 27, '{http://www.PureEdge.com/XFDL/Custom}prinClientState': 27, '{http://www.PureEdge.com/XFDL/Custom}pages': 27, '{http://www.PureEdge.com/XFDL/Custom}affiliatedOrgs': 27, '{http://www.PureEdge.com/XFDL/Custom}clientGeneralDescription': 27, '{http://www.PureEdge.com/XFDL/Custom}clientAddress': 27, '{http://www.PureEdge.com/XFDL/Custom}prinClientZipExt': 27, '{http://www.PureEdge.com/XFDL/Custom}clientCountry': 27, '{http://www.PureEdge.com/XFDL/Custom}houseID': 27, '{http://www.PureEdge.com/XFDL/Custom}address2': 27, '{http://www.PureEdge.com/XFDL/Custom}signerEmail': 27, '{http://www.PureEdge.com/XFDL/Custom}printedName': 27, '{http://www.PureEdge.com/XFDL/Custom}contactEmail': 20, '{http://www.PureEdge.com/XFDL/Custom}contactPrefix': 20, 
'{http://www.PureEdge.com/XFDL/Custom}contactName': 20, '{http://www.PureEdge.com/XFDL/Custom}contactPhone': 20, '{http://www.PureEdge.com/XFDL/Custom}contactIntlPhone': 20, 'noLobbying': 1, 'updates': 1, 'registrantDifferentAddress': 1, 'terminationDate': 1, 'submitURL': 1, 'expensesMethod': 1, 'expenses': 1, 'income': 1})
    
In [28]:
    
LD1_sorted_top_level_counts = sorted(LD1_top_level_counts.items(), key=lambda x: -x[1])
LD1_sorted_top_level_counts
    
    Out[28]:
In [29]:
    
sorted(LD1_top_level_counts.items(), key=lambda x: x[0])
    
    Out[29]:
In [54]:
    
LD1_top_level_counts.keys()
    
    Out[54]:
In [55]:
    
# For each non-namespaced top-level tag, record example child elements found
# under that tag (tags for which no file has such children are left out).
LD1_parents = {}
tags = [t for t in LD1_top_level_counts.keys() if 'PureEdge' not in t]
for tag in tags:
    # find_egs inspects ONE file and takes its query as the ``xpath_query``
    # keyword, so it cannot be handed the whole file list plus a positional
    # query (that raises TypeError). at_least_one_file walks the list and
    # returns the first non-empty result.
    egs = at_least_one_file(LD1_files, find_egs,
                            xpath_query='{tag}/*'.format(tag=tag))
    if egs:
        LD1_parents[tag] = egs
    
In [82]:
    
# Union of the per-file tag sets returned by find_all_parents over every LD1
# file, i.e. every tag seen with at least one child element. Files that
# all_files cannot parse are skipped and reported on stderr.
LD1_parents = set()
for parentset in all_files(LD1_files, find_all_parents):
    LD1_parents.update(parentset)
    
In [85]:
    
LD1_parents
    
    Out[85]:
In [209]:
    
for x in at_least_one_file(LD1_files, find_nonempty_egs, xpath_query='.//affiliatedUrl'):
    print etree.tostring(x)
    
    
    
In [30]:
    
LD2_files = glob(os.path.join(LD2_DIR, '*', '*', '*.xml'))
    
In [31]:
    
LD2_files[0:5]
    
    Out[31]:
In [32]:
    
len(LD2_files)
    
    Out[32]:
In [33]:
    
LD2tree = etree.parse(open(LD2_files[0]))
    
In [34]:
    
print etree.tostring(LD2tree, pretty_print=True)
    
    
In [35]:
    
LD2r = LD2tree.getroot()
    
In [36]:
    
LD2r.tag
    
    Out[36]:
In [142]:
    
#formtypes = Counter()
#for f in LD2_files:
#    try:
#        tree = etree.parse(open(f))
#        formtypes.update([tree.getroot().tag,])
#    except Exception as e:
#        print str(e)
#        print f
#formtypes
    
    
    Out[142]:
In [37]:
    
formtypes = Counter({'LOBBYINGDISCLOSURE2': 668905, '{http://www.PureEdge.com/XFDL/Custom}LOBBYINGDISCLOSURE2': 97})
formtypes
    
    Out[37]:
In [145]:
    
#tag_counts = Counter(all_files(LD2_files, get_root_tag))
    
In [38]:
    
tag_counts = Counter({'LOBBYINGDISCLOSURE2': 668905, '{http://www.PureEdge.com/XFDL/Custom}LOBBYINGDISCLOSURE2': 97})
tag_counts
    
    Out[38]:
In [166]:
    
#ld2_top_level_counts = Counter()
#for children_list in all_files(LD2_files, get_top_level_fields):
#    ld2_top_level_counts.update(children_list)
#ld2_top_level_counts
    
    
    Out[166]:
In [39]:
    
ld2_top_level_counts = Counter({'zipext': 668905, 'printedName': 668905, 'principal_state': 668905, 'selfSelect': 668905, 'signedDate': 668905, 'expenses': 668905, 'prefix': 668905, 'city': 668905, 'senateID': 668905, 'zip': 668905, 'noLobbying': 668905, 'principal_zipext': 668905, 'state': 668905, 'reportType': 668905, 'principal_city': 668905, 'principal_zip': 668905, 'terminationDate': 668905, 'submitURL': 668905, 'houseID': 668905, 'address1': 668905, 'address2': 668905, 'income': 668905, 'updates': 668905, 'reportYear': 668905, 'pages': 668905, 'expensesMethod': 668905, 'organizationName': 668905, 'firstName': 668905, 'lastName': 668905, 'registrantDifferentAddress': 668905, 'country': 668905, 'alis': 668905, 'clientName': 668905, 'principal_country': 668904, 'clientGovtEntity': 536861, 'imported': 500575, 'signerEmail': 320584, 'contactIntlPhone': 37472, 'contactPhone': 209, 'contactPrefix': 179, 'contactEmail': 178, 'contactName': 178, '{http://www.PureEdge.com/XFDL/Custom}expensesMethod': 97, '{http://www.PureEdge.com/XFDL/Custom}firstName': 97, '{http://www.PureEdge.com/XFDL/Custom}prefix': 97, '{http://www.PureEdge.com/XFDL/Custom}houseID': 97, '{http://www.PureEdge.com/XFDL/Custom}senateID': 97, '{http://www.PureEdge.com/XFDL/Custom}registrantDifferentAddress': 97, '{http://www.PureEdge.com/XFDL/Custom}noLobbying': 97, '{http://www.PureEdge.com/XFDL/Custom}zip': 97, '{http://www.PureEdge.com/XFDL/Custom}submitURL': 97, '{http://www.PureEdge.com/XFDL/Custom}signedDate': 97, '{http://www.PureEdge.com/XFDL/Custom}contactIntlPhone': 97, '{http://www.PureEdge.com/XFDL/Custom}alis': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_zipext': 97, '{http://www.PureEdge.com/XFDL/Custom}income': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_city': 97, '{http://www.PureEdge.com/XFDL/Custom}clientName': 97, '{http://www.PureEdge.com/XFDL/Custom}selfSelect': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_zip': 97, 
'{http://www.PureEdge.com/XFDL/Custom}expenses': 97, '{http://www.PureEdge.com/XFDL/Custom}zipext': 97, '{http://www.PureEdge.com/XFDL/Custom}address2': 97, '{http://www.PureEdge.com/XFDL/Custom}address1': 97, '{http://www.PureEdge.com/XFDL/Custom}state': 97, '{http://www.PureEdge.com/XFDL/Custom}pages': 97, '{http://www.PureEdge.com/XFDL/Custom}reportType': 97, '{http://www.PureEdge.com/XFDL/Custom}organizationName': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_country': 97, '{http://www.PureEdge.com/XFDL/Custom}country': 97, '{http://www.PureEdge.com/XFDL/Custom}lastName': 97, '{http://www.PureEdge.com/XFDL/Custom}reportYear': 97, '{http://www.PureEdge.com/XFDL/Custom}city': 97, '{http://www.PureEdge.com/XFDL/Custom}principal_state': 97, '{http://www.PureEdge.com/XFDL/Custom}updates': 97, '{http://www.PureEdge.com/XFDL/Custom}signerEmail': 97, '{http://www.PureEdge.com/XFDL/Custom}printedName': 97, '{http://www.PureEdge.com/XFDL/Custom}terminationDate': 97, '{http://www.PureEdge.com/XFDL/Custom}contactPhone': 95, '{http://www.PureEdge.com/XFDL/Custom}contactEmail': 95, '{http://www.PureEdge.com/XFDL/Custom}contactName': 95, '{http://www.PureEdge.com/XFDL/Custom}contactPrefix': 95, '{http://www.PureEdge.com/XFDL/Custom}imported': 2, '{http://www.PureEdge.com/XFDL/Custom}clientGovtEntity': 2})
    
In [40]:
    
sorted(ld2_top_level_counts.items(), key=lambda x: x[0])
    
    Out[40]:
In [41]:
    
sorted(ld2_top_level_counts.items(), key=lambda x: -x[1])
    
    Out[41]:
In [86]:
    
# Union of the per-file tag sets returned by find_all_parents over every LD2
# file, i.e. every tag seen with at least one child element. Files that
# all_files cannot parse are skipped and reported on stderr.
LD2_parents = set()
for parentset in all_files(LD2_files, find_all_parents):
    LD2_parents.update(parentset)
    
    
In [87]:
    
LD2_parents
    
    Out[87]:
In [135]:
    
has_child_array = {}
    
In [134]:
    
for fa in at_least_one_file(LD2_files, 
                            find_egs, 
                            xpath_query='.//ae'):
    print etree.tostring(fa)
    
    
    
In [98]:
    
for fa in at_least_one_file(LD2_files, 
                            find_egs, 
                            xpath_query='.//federal_agencies'):
    print etree.tostring(fa)
    
    
    
In [136]:
    
has_child_array['federal_agencies'] = True
    
In [185]:
    
for il in at_least_one_file(LD2_files, 
                            find_nonempty_egs, 
                            xpath_query='.//affiliatedOrgs'):
    print etree.tostring(il)
    
    
    
In [139]:
    
has_child_array['affiliatedOrgs'] = True
    
In [140]:
    
for il in at_least_one_file(LD2_files, 
                            find_egs, 
                            xpath_query='.//alis'):
    print etree.tostring(il)
    
    
    
In [141]:
    
has_child_array['alis'] = True
    
In [102]:
    
for il in at_least_one_file(LD2_files, 
                            find_egs, 
                            xpath_query='.//error'):
    print etree.tostring(il)
    
    
    
In [186]:
    
for il in at_least_one_file(LD2_files, 
                            find_nonempty_egs, 
                            xpath_query='.//foreignEntities'):
    print etree.tostring(il)
    
    
    
In [143]:
    
has_child_array['foreignEntities'] = True
    
In [187]:
    
for il in at_least_one_file(LD2_files, 
                            find_nonempty_egs, 
                            xpath_query='.//inactive_ALIs'):
    print etree.tostring(il)
    
    
    
In [144]:
    
has_child_array['inactive_ALIs'] = True
    
In [188]:
    
for il in at_least_one_file(LD2_files, 
                            find_nonempty_egs, 
                            xpath_query='.//inactiveOrgs'):
    print etree.tostring(il)
    
    
    
In [189]:
    
has_child_array['inactiveOrgs'] = True
    
In [190]:
    
for il in at_least_one_file(LD2_files, 
                            find_nonempty_egs, 
                            xpath_query='.//inactive_ForeignEntities'):
    print etree.tostring(il)
    
    
    
In [191]:
    
has_child_array['inactive_ForeignEntities'] = True
    
In [192]:
    
for il in at_least_one_file(LD2_files, 
                            find_nonempty_egs, 
                            xpath_query='.//inactive_lobbyists'):
    print etree.tostring(il)
    
    
    
In [193]:
    
has_child_array['inactive_lobbyists'] = True
    
In [194]:
    
for il in at_least_one_file(LD2_files, 
                            find_nonempty_egs, 
                            xpath_query='.//lobbyists'):
    print etree.tostring(il)
    
    
    
In [195]:
    
has_child_array['lobbyists'] = True
    
In [153]:
    
for il in at_least_one_file(LD2_files, 
                            find_egs, 
                            xpath_query='.//specific_issues'):
    print etree.tostring(il)
    
    
    
In [159]:
    
for il in at_least_one_file(LD2_files, 
                            find_egs_with_multiple_children, 
                            xpath_query='.//specific_issues'):
    print etree.tostring(il)
    
    
    
In [160]:
    
has_child_array['specific_issues'] = True
    
In [196]:
    
for il in at_least_one_file(LD2_files, 
                            find_nonempty_egs, 
                            xpath_query='.//updates'):
    print etree.tostring(il)
    
    
    
In [197]:
    
has_child_array['updates'] = False
    
In [269]:
    
ERROR_FIELDS = ['ae', 'error']
    
In [267]:
    
# Tags marked True in has_child_array. _add_element checks membership in this
# list to decide whether an element's children become a list or a dict.
ARRAY_FIELDS = [f for f,v in has_child_array.items() if v]
    
In [268]:
    
ARRAY_FIELDS
    
    Out[268]:
In [272]:
    
def _add_element_array(children, json_array):
    for c in children:
        new_obj = _add_element(c, {})
        json_array.append(new_obj[c.tag])
        
def _add_element(element, json_dict):
    children = element.getchildren()
    if children:
        if element.tag in ARRAY_FIELDS:
            json_dict[element.tag] = []
            _add_element_array(children, json_dict[element.tag])
        else:
            json_dict[element.tag] = {}
            for child in children:
                _add_element(child, json_dict[element.tag])    
    else:
        text = element.text or ''
        json_dict[element.tag] = text.strip()
    return json_dict
    
In [273]:
    
_add_element(LD1r, {})
    
    Out[273]:
In [274]:
    
_add_element(LD2r, {})
    
    Out[274]:
In [280]:
    
# Sample LD1 filing passed to translate_dir in the cells below. The path is
# built from LD1_DIR so the notebook does not depend on one developer's home
# directory.
# NOTE(review): assumes settings.ORIG_DIR points at the project's
# data/original directory -- confirm against settings.py.
xml_filepath = os.path.join(LD1_DIR, '2013', 'ALL', '300620793.xml')
os.path.splitext(os.path.basename(xml_filepath))
    
    Out[280]:
In [277]:
    
from tasks.utils import translate_dir
    
In [279]:
    
translate_dir(xml_filepath,
              from_dir=settings.ORIG_DIR,
              to_dir=settings.TRANS_DIR)
    
    Out[279]:
In [281]:
    
translate_dir(xml_filepath,
              from_dir=settings.ORIG_DIR,
              to_dir=settings.TRANS_DIR)
    
    Out[281]:
In [ ]: