In [2]:
import json
import os
import re
import sys
import urlparse
sys.path.append(os.path.join(os.getcwd(),os.path.pardir))
from datetime import datetime, date, timedelta
from io import StringIO
import requests
from pyquery import PyQuery as pq
from lxml import etree
import settings
In [11]:
html_parser = etree.HTMLParser()
In [12]:
_filing_detail_url = 'http://soprweb.senate.gov/index.cfm?'
In [13]:
_params = {'event': 'getFilingDetails',
           'filingID': 'b931620a-16aa-4834-b798-e08b0a3bddf8',
           'filingTypeID': 1}
resp = requests.get(_filing_detail_url, params=_params)
In [18]:
parsed = etree.parse(StringIO(resp.text), html_parser)
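This GET-then-parse pair repeats for every filing examined below, so a small wrapper keeps the cells shorter. A minimal sketch (the helper name is hypothetical, not part of the codebase):

def fetch_filing(filing_id, filing_type_id=1):
    # hypothetical helper: fetch a filing detail page and parse it with lxml
    params = {'event': 'getFilingDetails',
              'filingID': filing_id,
              'filingTypeID': filing_type_id}
    resp = requests.get(_filing_detail_url, params=params)
    return etree.parse(StringIO(resp.text), html_parser)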
In [28]:
e = parsed.xpath('/html/body/table[3]/tbody/tr/'
                 'td[contains(.,"Prefix")]/following-sibling::td[1]/div')[0]
print etree.tostring(e)
In [214]:
_params = {'event': 'getFilingDetails',
           'filingID': 'b4c3bd67-7c7c-45e6-8b6c-5fd6b55eec3f',
           'filingTypeID': 1}
resp = requests.get(_filing_detail_url, params=_params)
In [216]:
parsed = etree.parse(StringIO(resp.text), html_parser)
parsed.xpath('/html/body/table[position() > 2 and position() < 10]/tbody/tr/*')
Out[216]:
In [217]:
parsed.xpath('/html/body/table[8]/tbody/tr/td[4]/div/text()')
Out[217]:
In [218]:
s = parsed.xpath('/html/body/table[8]/tbody/tr/td[4]/div/text()')[0]
s.replace(u'\xa0',u'')
Out[218]:
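The cell text comes back padded with non-breaking spaces (u'\xa0'), which is why the replace above is needed before comparing values. A hypothetical cleaner that also trims ordinary whitespace:

def clean_cell(s):
    # illustrative only: drop non-breaking spaces, then trim
    return s.replace(u'\xa0', u'').strip()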
In [219]:
parsed.xpath('/html/body/table[16]/tbody/tr[position() > 3]')
Out[219]:
In [220]:
_params = {'event': 'getFilingDetails',
           'filingID': 'C3A7E902-87A2-49FB-8D27-1D031F48DC12',
           'filingTypeID': 1}
resp = requests.get(_filing_detail_url, params=_params)
multiple_lobbyists = etree.parse(StringIO(resp.text), html_parser)
In [221]:
lobb_rows = multiple_lobbyists.xpath('/html/body/table[12]/tbody/tr[position() > 2]')
In [222]:
lobb_row = lobb_rows[0]
etree.tostring(lobb_row)
Out[222]:
In [223]:
lobb_row.xpath('td[1]')
Out[223]:
In [224]:
_params = {'event': 'getFilingDetails',
           'filingID': '3A144627-84A0-4190-81A8-B40718EA37EC',
           'filingTypeID': 1}
resp = requests.get(_filing_detail_url, params=_params)
multiple_issues = etree.parse(StringIO(resp.text), html_parser)
In [225]:
multiple_issues.xpath('/html/body/table[13]/tbody/tr/td/div')
Out[225]:
In [226]:
multiple_issues.xpath('/html/body/table[13]/tbody/tr/td/div/text()')
Out[226]:
In [229]:
_params = {'event': 'getFilingDetails',
           'filingID': 'C3A7E902-87A2-49FB-8D27-1D031F48DC12',
           'filingTypeID': 1}
resp = requests.get(_filing_detail_url, params=_params)
multiple_aff_orgs = etree.parse(StringIO(resp.text), html_parser)
In [230]:
multiple_aff_orgs.xpath('/html/body/table[16]/tbody/tr[position() > 3]')
Out[230]:
In [231]:
def _zip_odd_even(arr):
    return [(arr[i].xpath('td/div/text()'),
             arr[i+1].xpath('td[position() > 1]/table/tbody/tr/td/div/text()'))
            for i in xrange(0, len(arr), 2)]
_zip_odd_even(multiple_aff_orgs.xpath('/html/body/table[16]/tbody/tr[position() > 3]'))
Out[231]:
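The affiliated-organizations table spreads each record across two physical rows (a name row followed by a detail row), which is what the odd/even pairing above reassembles. A slightly more defensive variant (assuming a trailing unpaired row should simply be dropped):

def _zip_odd_even_safe(arr):
    # zip(arr[::2], arr[1::2]) silently drops a trailing unpaired row
    return [(a.xpath('td/div/text()'),
             b.xpath('td[position() > 1]/table/tbody/tr/td/div/text()'))
            for a, b in zip(arr[::2], arr[1::2])]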
In [232]:
row_eg = multiple_aff_orgs.xpath('/html/body/table[16]/tbody/tr[position() > 3]')[0]
In [233]:
row_eg.xpath('.//td/div/text()')
Out[233]:
In [234]:
for r in multiple_aff_orgs.xpath('/html/body/table[16]/tbody/tr[position() > 3]'):
    for d in r.xpath('td[3]/table/tbody/tr/td[4]/div/text()'):
        print d
    print '='*20
In [235]:
_params = {'event': 'getFilingDetails',
           'filingID': '3A144627-84A0-4190-81A8-B40718EA37EC',
           'filingTypeID': 1}
resp = requests.get(_filing_detail_url, params=_params)
multiple_foreign = etree.parse(StringIO(resp.text), html_parser)
In [236]:
for r in multiple_foreign.xpath('/html/body/table[19]/tbody/tr'):
    for d in r.xpath('td[5]/div/text()'):
        print d
    print '='*20
In [237]:
e = parsed.xpath('/html/body/div[1]/input[2]')[0]
In [238]:
'checked' in e.attrib
Out[238]:
In [239]:
s = parsed.xpath('/html/body/table[2]/tbody/tr[1]/td[3]/div/text()')[0]
s
Out[239]:
In [240]:
datetime.strptime(s, '%m/%d/%Y')
Out[240]:
In [241]:
filter(lambda x: x % 2, [1,2,3,4,5])
Out[241]:
In [242]:
os.path.basename('/home/blannon/1223_watzman_20131101.csv')
Out[242]:
In [243]:
os.extsep
Out[243]:
In [244]:
u = 'http://soprweb.senate.gov/index.cfm?event=getFilingDetails&filingTypeID=57&filingID=33eb46ef-55a7-4233-8685-f7427c057f41'
In [245]:
urlparse.parse_qsl(urlparse.urlparse(u).query)
Out[245]:
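Going the other way, the filing ID and type can be recovered from a detail URL, which is handy for naming cache files. A hedged sketch (the helper name is illustrative):

def filing_params(url):
    # illustrative: rebuild the query string as a dict and pull out the IDs
    qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
    return qs['filingID'], qs['filingTypeID']
filing_params(u)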
In [246]:
thing = etree.parse(open('../data/cache/sopr_html/2014/REG/a25b34ae-5ccb-409f-848d-f4fa008a06b4.html'), html_parser)
In [247]:
print etree.tostring(thing)
In [248]:
sys.path.append('..')
In [249]:
from tasks import extract
from tasks.schema import ld1_schema
In [250]:
elements = filter(lambda x: 'children' not in x, ld1_schema)
containers = filter(lambda x: 'children' in x, ld1_schema)
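The split works because container specs carry a 'children' key and leaf specs do not. The same partition in one pass (equivalent to the two filter() scans above, shown only for clarity):

def partition_schema(schema):
    # one pass instead of two filter() scans; behavior is identical
    elements, containers = [], []
    for spec in schema:
        (containers if 'children' in spec else elements).append(spec)
    return elements, containers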
In [251]:
elements
Out[251]:
In [252]:
reload(extract)
extract.extract_html('../data/cache/sopr_html/2014/REG/a25b34ae-5ccb-409f-848d-f4fa008a06b4.html',
                     elements,
                     containers)
In [253]:
"goo"
Out[253]:
In [254]:
dt = datetime(2014, 4, 1, 0, 0)
In [255]:
dt.isoformat()
Out[255]:
In [256]:
_params = {'event': 'getFilingDetails',
           'filingID': '80b956e1-3448-404a-bdfd-558ffe2631ce',
           'filingTypeID': 69}
resp = requests.get(_filing_detail_url, params=_params)
multiple_issues = etree.parse(StringIO(resp.text), html_parser)
In [257]:
print etree.tostring(multiple_issues.xpath('/html/body/table[4]/tbody/tr[5]/td[1]/table')[0])
In [258]:
for e in multiple_issues.xpath('//p[@style="page-break-before:always"]'):
    print etree.tostring(e)
    print "="*80
In [259]:
page_break = multiple_issues.xpath('//p[@style="page-break-before:always"]')[0]
In [260]:
#general_issue_code = multiple_issues.xpath('//p[@style="page-break-before:always"]/following-sibling::p[1]')[0]
general_issue_code = multiple_issues.xpath('//p[contains(.,"15. General issue area")]')[0]
print etree.tostring(general_issue_code)
In [261]:
[etree.tostring(e) for e in multiple_issues.xpath('//p[contains(.,"15. General issue area")]')]
Out[261]:
In [262]:
general_issue_code = page_break.getnext()
filler = general_issue_code.getnext()
specific_lobbying_issues = filler.getnext()
congress_agency_check = specific_lobbying_issues.getnext()
congress_agency_detail = congress_agency_check.getnext()
filler = congress_agency_detail.getnext()
lobbyists = filler.getnext()
foreign_entity_check = lobbyists.getnext()
foreign_entity_list = foreign_entity_check.getnext()
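Chaining getnext() like this works but hides the offsets of the filler paragraphs. A hypothetical alternative, assuming the nine elements after the page break always appear in this fixed order, gathers the sibling run once and unpacks it:

# assumes a fixed 9-element order after the page break (illustrative only)
run = page_break.xpath('following-sibling::*')[:9]
(general_issue_code, _filler, specific_lobbying_issues,
 congress_agency_check, congress_agency_detail, _filler2,
 lobbyists, foreign_entity_check, foreign_entity_list) = run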
In [263]:
fs = [general_issue_code,
      specific_lobbying_issues,
      congress_agency_check,
      congress_agency_detail,
      lobbyists,
      foreign_entity_check,
      foreign_entity_list]
for f in fs:
    print etree.tostring(f)
    print '='*20
In [264]:
[etree.tostring(e) for e in general_issue_code.xpath('../following-sibling::p')]
Out[264]:
In [265]:
print etree.tostring(general_issue_code.xpath(
    'following-sibling::p[7]')[0])
In [266]:
second_gic = multiple_issues.xpath('//p[@style="page-break-before:always"]/following-sibling::p[1]')[1]
for r in second_gic.xpath('following-sibling::table[1]/tbody/tr[position() > 1]'):
    print etree.tostring(r)
In [267]:
print etree.tostring(multiple_issues.xpath('//p[contains(.,"20.")]')[0])
In [268]:
found = multiple_issues.xpath('//p[contains(.,"23. Name of each previously")]'
                              '/following-sibling::table[1]'
                              '/tbody/tr[position()>1]/td/table'
                              '/tbody')
for e in found:
    print etree.tostring(e)
    print '='*30
In [269]:
resp.url
Out[269]:
In [270]:
len(multiple_issues.xpath('//p[contains(.,"24. General lobbying issue")]/following-sibling::table[1]/tbody/tr/td/div'))
Out[270]:
In [271]:
_params = {'event': 'getFilingDetails',
           'filingID': '42524728-28e1-424f-9608-2b4f05f7cd2b',
           'filingTypeID': 82}
resp = requests.get(_filing_detail_url, params=_params)
multiple_added_aff = etree.parse(StringIO(resp.text), html_parser)
In [272]:
rows = multiple_added_aff.xpath('//p[contains(.,"25. Add the following")]'
                                '/following-sibling::table[1]/tbody/tr')
rows
Out[272]:
In [273]:
for r in rows:
    for e in r.xpath('td[3]/table/tbody/tr[2]/td[2]'):
        print etree.tostring(e)
        #print etree.tostring(r)
    print "="*20
In [274]:
r.xpath('td[3]/table/tbody/tr[2]/td[2]')[0].text.split()
Out[274]:
In [275]:
r.getchildren()
Out[275]:
In [276]:
_params = {'event': 'getFilingDetails',
           'filingID': '2897035b-c56e-4d05-9a51-cab6a4b505f8',
           'filingTypeID': 53}
resp = requests.get(_filing_detail_url, params=_params)
multiple_removed_aff = etree.parse(StringIO(resp.text), html_parser)
In [277]:
table = multiple_removed_aff.xpath('//p[contains(.,"26. Name of each previously")]'
                                   '/following-sibling::table[1]')[0]
In [278]:
for e in table.xpath('tbody/tr/td'):
    print etree.tostring(e)
In [279]:
table.xpath('tbody/tr/td/span')
Out[279]:
In [280]:
e = table.xpath('tbody/tr/td/span')[0]
In [281]:
e.tail
Out[281]:
In [282]:
[e.tail.strip() for e in multiple_removed_aff.xpath('//p[contains(.,"26. Name of each previously")]'
                                                    '/following-sibling::table[1]/tbody/tr/td/span')]
Out[282]:
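This works because lxml stores text that follows an element (but still sits inside its parent) on .tail rather than .text, so the organization names ride along on the span tails. A tiny self-contained illustration:

frag = etree.fromstring('<td><span>1</span> ACME CORP</td>')
span = frag.find('span')
print span.text   # '1' (text inside the span)
print span.tail   # ' ACME CORP' (text after the span, inside the td)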
In [283]:
_params = {'event': 'getFilingDetails',
           'filingID': '6e8effc6-e1e3-413e-86c9-24eda20858f2',
           'filingTypeID': 60}
resp = requests.get(_filing_detail_url, params=_params)
multiple_added_foreign = etree.parse(StringIO(resp.text), html_parser)
In [284]:
rows = multiple_added_foreign.xpath('//p[contains(.,"27. Add the following foreign")]'
                                    '/following-sibling::table[1]/tbody/tr')
In [285]:
for r in rows:
    print etree.tostring(r)
    print "="*20
In [286]:
for r in rows:
    for e in r.xpath('td[5]'):
        print etree.tostring(e)
        print "="*20
    print "="*20
In [287]:
print etree.tostring(r)
In [288]:
import locale
In [290]:
_params = {'event': 'getFilingDetails',
           'filingID': '55dd2926-23b4-489d-8132-b040cc6ddac5',
           'filingTypeID': 78}
resp = requests.get(_filing_detail_url, params=_params)
multiple_inactive_foreign = etree.parse(StringIO(resp.text), html_parser)
In [291]:
[e.tail.strip() for e in multiple_inactive_foreign.xpath(
    '//p[contains(.,"28. Name of each previously reported foreign entity")]'
    '/following-sibling::table[1]/tbody/tr/td/span')]
Out[291]:
In [292]:
from collections import defaultdict, Counter
record = defaultdict(dict)
In [293]:
record['a'] = 2
In [294]:
record
Out[294]:
In [295]:
json.dumps(record)
Out[295]:
In [296]:
from glob import glob
In [297]:
dirs = glob(os.path.join(settings.CACHE_DIR, 'sopr_html/200[89]/Q2/*.html')) \
     + glob(os.path.join(settings.CACHE_DIR, 'sopr_html/201[0-9]/Q2/*.html'))
len(dirs)
Out[297]:
In [298]:
Counter([i.split('/')[9] for i in dirs])
Out[298]:
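The hard-coded i.split('/')[9] only works for one particular cache location. A path-relative version (hypothetical helper) grabs the year directory two levels above each file instead:

def year_of(path):
    # .../sopr_html/<year>/Q2/<id>.html -> <year>
    return os.path.basename(os.path.dirname(os.path.dirname(path)))
Counter(year_of(i) for i in dirs)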
In [306]:
from pymongo import mongo_client
In [308]:
mc = mongo_client.MongoClient()
db = mc.lobbying_federal_domestic
In [311]:
db.house_ld2.find_one({"LOBBYINGDISCLOSURE2.alis.0.lobbyists":
                       {'$elemMatch': {'coveredPosition': {'$ne': ''}}}})
Out[311]:
In [299]:
floc_template = '/home/blannon/dev/influence-usa/lobbying-federal-domestic/data/cache/sopr_html/{year}/{subyear}/{id}.html'
floc = floc_template.format(year=2014,
                            subyear='Q1',
                            id='33eb46ef-55a7-4233-8685-f7427c057f41')
In [300]:
dbg = etree.parse(open(floc), parser=html_parser)
In [301]:
# query the freshly parsed dbg tree (e still points into an earlier document)
etree.tostring(dbg.xpath('//p[contains(., "15. General issue area")]'
                         '/following-sibling::p[1]')[0])
#'/tbody/tr[position() > 1]'
#'/td')[0])
Out[301]:
In [302]:
#from tasks import extract
from tasks import schema  # the module itself is needed for reload() below
reload(extract)
reload(schema)
ld2_containers = filter(lambda x: 'children' in x, schema.ld2_schema)
ld2_elements = filter(lambda x: 'children' not in x, schema.ld2_schema)
In [303]:
ld2_containers[:3]
Out[303]:
In [304]:
extract.extract_html(floc, ld2_elements, ld2_containers)
In [305]:
2385.88 - 1153
Out[305]: