In [2]:
import json
import os
import re
import sys
import urlparse

sys.path.append(os.path.join(os.getcwd(),os.path.pardir))

from datetime import datetime, date, timedelta
from io import StringIO

import requests
from pyquery import PyQuery as pq

from lxml import etree

import settings

LD-1


In [11]:
# Shared lxml HTML parser; passed to every etree.parse call in this notebook.
html_parser = etree.HTMLParser()

In [12]:
# Base URL for the filing-detail requests below; the query string is supplied
# separately through the `params` argument of requests.get.
_filing_detail_url = 'http://soprweb.senate.gov/index.cfm?'

In [13]:
# Query parameters selecting a single filing's detail page.
_params = {'event': 'getFilingDetails',
           'filingID': 'b931620a-16aa-4834-b798-e08b0a3bddf8',
           'filingTypeID': 1}

# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being parsed as a filing.
resp = requests.get(_filing_detail_url, params=_params, timeout=30)
resp.raise_for_status()

In [18]:
# Parse the response body into an lxml tree using the shared HTML parser.
parsed =  etree.parse(StringIO(resp.text), html_parser)

In [28]:
# In body table 3, find the cell whose text contains "Prefix" and take the
# <div> inside the td that immediately follows it. The trailing [0] raises
# IndexError if nothing matches.
e = parsed.xpath('/html/body/table[3]/tbody/tr/'
             'td[contains(.,"Prefix")]/following-sibling::td[1]/div')[0]
print etree.tostring(e)


<div style="border-bottom:solid 1px">&#160;Mr.</div>


In [214]:
# Query parameters selecting a different filing's detail page.
_params = {'event': 'getFilingDetails',
           'filingID': 'b4c3bd67-7c7c-45e6-8b6c-5fd6b55eec3f',
           'filingTypeID': 1}

# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being parsed as a filing.
resp = requests.get(_filing_detail_url, params=_params, timeout=30)
resp.raise_for_status()

In [216]:
# Rebind `parsed` to the tree for the newly fetched filing, then list every
# child element of every row in body tables 3 through 9.
parsed =  etree.parse(StringIO(resp.text), html_parser)
parsed.xpath('/html/body/table[position() > 2 and position() < 10]/tbody/tr/*')


Out[216]:
[<Element td at 0x7fd28f2bb830>,
 <Element td at 0x7fd28f2bbb00>,
 <Element td at 0x7fd28f2bba70>,
 <Element td at 0x7fd28f2bbab8>,
 <Element td at 0x7fd28f2bb950>,
 <Element td at 0x7fd28f22b5a8>,
 <Element td at 0x7fd28d8fb290>,
 <Element td at 0x7fd28d8fb320>,
 <Element td at 0x7fd28d8fb1b8>,
 <Element td at 0x7fd28d8fb170>,
 <Element td at 0x7fd28d8fbf80>,
 <Element td at 0x7fd28d8fb2d8>,
 <Element td at 0x7fd28d8a2050>,
 <Element td at 0x7fd28d8a20e0>,
 <Element td at 0x7fd28d8a21b8>,
 <Element td at 0x7fd28d8a2248>,
 <Element td at 0x7fd28d8a2290>,
 <Element td at 0x7fd28d8a2320>,
 <Element td at 0x7fd28d8a23b0>,
 <Element td at 0x7fd28d8a23f8>,
 <Element td at 0x7fd28d8a2488>,
 <Element td at 0x7fd28d8a24d0>,
 <Element td at 0x7fd28d8a2518>,
 <Element td at 0x7fd28d8a2560>,
 <Element td at 0x7fd28d8a25a8>,
 <Element td at 0x7fd28d8a25f0>,
 <Element td at 0x7fd28d8a2638>,
 <Element td at 0x7fd28d8a26c8>,
 <Element td at 0x7fd28d8a2710>,
 <Element td at 0x7fd28d8a2758>,
 <Element td at 0x7fd28d8a27a0>,
 <Element td at 0x7fd28d8a27e8>,
 <Element td at 0x7fd28d8a2830>,
 <Element td at 0x7fd28d8a2878>,
 <Element td at 0x7fd28d8a28c0>]

In [217]:
# Text of the <div> in the 4th cell of body table 8 (the schema listed later
# in this notebook maps this path to `registrant_phone`).
parsed.xpath('/html/body/table[8]/tbody/tr/td[4]/div/text()')


Out[217]:
[u'\xa02024146169']

In [218]:
# The raw cell text carries a non-breaking space (u'\xa0'); remove it to get
# the bare value.
s = parsed.xpath('/html/body/table[8]/tbody/tr/td[4]/div/text()')[0]
s.replace(u'\xa0',u'')


Out[218]:
u'2024146169'

In [219]:
# Rows of body table 16 after the first three; this filing has none.
parsed.xpath('/html/body/table[16]/tbody/tr[position() > 3]')


Out[219]:
[]

Multiple lobbyists


In [220]:
# Query parameters for the filing used as the multiple-lobbyists example.
_params = {'event': 'getFilingDetails',
           'filingID': 'C3A7E902-87A2-49FB-8D27-1D031F48DC12',
           'filingTypeID': 1}

# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being parsed as a filing.
resp = requests.get(_filing_detail_url, params=_params, timeout=30)
resp.raise_for_status()
multiple_lobbyists = etree.parse(StringIO(resp.text), html_parser)

In [221]:
# Rows of body table 12 after the first two. In the LD-1 HTML dumped later in
# this notebook those two rows are the column headers -- presumably the same
# layout applies here; confirm against this filing.
lobb_rows = multiple_lobbyists.xpath('/html/body/table[12]/tbody/tr[position() > 2]')

In [222]:
# Serialize the first remaining row to inspect its cell layout.
lobb_row = lobb_rows[0]
etree.tostring(lobb_row)


Out[222]:
'<tr>\n<td style="border-bottom:2px solid">&#160;R.L. (Les)</td><td style="border-bottom:2px solid">&#160;Brownlee</td><td style="border-bottom:2px solid;border-right:2px solid">&#160;</td><td style="border-bottom:2px solid">&#160;Acting Secretary/Under Secretary of the Army;</td>\n</tr>\n'

In [223]:
lobb_row.xpath('td[1]')


Out[223]:
[<Element td at 0x7fd29c0fde18>]

Multiple lobbying issues


In [224]:
# Query parameters for the filing used as the multiple-issues example.
_params = {'event': 'getFilingDetails',
           'filingID': '3A144627-84A0-4190-81A8-B40718EA37EC',
           'filingTypeID': 1}

# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being parsed as a filing.
resp = requests.get(_filing_detail_url, params=_params, timeout=30)
resp.raise_for_status()
multiple_issues = etree.parse(StringIO(resp.text), html_parser)

In [225]:
# Every <div> nested in a cell of body table 13.
multiple_issues.xpath('/html/body/table[13]/tbody/tr/td/div')


Out[225]:
[<Element div at 0x7fd28d8a2e18>,
 <Element div at 0x7fd28d8a2fc8>,
 <Element div at 0x7fd28d8a2f38>,
 <Element div at 0x7fd28d8a2c68>,
 <Element div at 0x7fd28d8a2a28>,
 <Element div at 0x7fd28d8a2950>,
 <Element div at 0x7fd28d8a2908>,
 <Element div at 0x7fd28d8a2f80>,
 <Element div at 0x7fd29c0fdd88>,
 <Element div at 0x7fd28d9eb4d0>,
 <Element div at 0x7fd28d9eb440>,
 <Element div at 0x7fd28d9eb488>,
 <Element div at 0x7fd28d9eb518>,
 <Element div at 0x7fd28d9eb320>,
 <Element div at 0x7fd28d9eb560>,
 <Element div at 0x7fd28d9eb5a8>,
 <Element div at 0x7fd28d9eb5f0>,
 <Element div at 0x7fd28d9eb638>,
 <Element div at 0x7fd28d9eb680>,
 <Element div at 0x7fd28d9eb6c8>,
 <Element div at 0x7fd28d9eb710>,
 <Element div at 0x7fd28d9eb758>,
 <Element div at 0x7fd28d9eb7a0>,
 <Element div at 0x7fd28d9eb7e8>,
 <Element div at 0x7fd28d9eb830>,
 <Element div at 0x7fd28d9eb878>,
 <Element div at 0x7fd28d9eb8c0>]

In [226]:
# Text of each of those <div>s. Every value starts with u'\xa0', and unused
# slots contain only that character.
multiple_issues.xpath('/html/body/table[13]/tbody/tr/td/div/text()')


Out[226]:
[u'\xa0HOM',
 u'\xa0AVI',
 u'\xa0BUD',
 u'\xa0DEF',
 u'\xa0ECN',
 u'\xa0FOR',
 u'\xa0GOV',
 u'\xa0LBR',
 u'\xa0LAW',
 u'\xa0TOU',
 u'\xa0TRA',
 u'\xa0',
 u'\xa0',
 u'\xa0',
 u'\xa0',
 u'\xa0',
 u'\xa0',
 u'\xa0',
 u'\xa0',
 u'\xa0',
 u'\xa0',
 u'\xa0',
 u'\xa0',
 u'\xa0',
 u'\xa0',
 u'\xa0',
 u'\xa0']

Multiple affiliated organizations


In [229]:
# Query parameters for the affiliated-organizations example; this is the same
# filingID that the multiple-lobbyists cell requests.
_params = {'event': 'getFilingDetails',
           'filingID': 'C3A7E902-87A2-49FB-8D27-1D031F48DC12',
           'filingTypeID': 1}

# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being parsed as a filing.
resp = requests.get(_filing_detail_url, params=_params, timeout=30)
resp.raise_for_status()
multiple_aff_orgs = etree.parse(StringIO(resp.text), html_parser)

In [230]:
# Rows of body table 16 after the first three.
multiple_aff_orgs.xpath('/html/body/table[16]/tbody/tr[position() > 3]')


Out[230]:
[<Element tr at 0x7fd28d9eb248>,
 <Element tr at 0x7fd28d9eb200>,
 <Element tr at 0x7fd28d9eb2d8>,
 <Element tr at 0x7fd28d9eb290>,
 <Element tr at 0x7fd28d9eb3f8>,
 <Element tr at 0x7fd28d9eb368>]

In [231]:
def _zip_odd_even(arr):
    return [(arr[i].xpath('td/div/text()'), arr[i+1].xpath('td[position() > 1]/table/tbody/tr/td/div/text()')) for i in xrange(0,len(arr),2) ]

# Pair up the rows of table 16 (after the first three) two at a time.
_zip_odd_even(multiple_aff_orgs.xpath('/html/body/table[16]/tbody/tr[position() > 3]'))


Out[231]:
[([u'\xa0AT&T Services Inc.', u'\xa01133 21st Street NW, #900'],
  [u'\xa0Washington',
   u'\xa0DC',
   u'\xa020036',
   u'\xa0USA',
   u'\xa0TX',
   u'\xa0USA']),
 ([u'\xa0T-Mobile US, Inc.', u'\xa0601 Pennsylvania Avenue, NW #800 North'],
  [u'\xa0Washington',
   u'\xa0DC',
   u'\xa020004',
   u'\xa0USA',
   u'\xa0WA',
   u'\xa0USA']),
 ([u'\xa0Verizon Communications Inc.', u'\xa01300 I Street, NW'],
  [u'\xa0Washington',
   u'\xa0DC',
   u'\xa020005',
   u'\xa0USA',
   u'\xa0NY',
   u'\xa0USA'])]

In [232]:
# Keep the first of those rows as a sample to experiment on.
row_eg = multiple_aff_orgs.xpath('/html/body/table[16]/tbody/tr[position() > 3]')[0]

In [233]:
# './/td' matches cells at any depth below the row, not just its direct children.
row_eg.xpath('.//td/div/text()')


Out[233]:
[u'\xa0AT&T Services Inc.', u'\xa01133 21st Street NW, #900', u'\xa0Dallas']

In [234]:
# For each row, print the text found at td[3]/table/tbody/tr/td[4]/div,
# followed by a separator line. Rows without a match print only the
# separator, which is why every other separator in the output has no value.
for r in multiple_aff_orgs.xpath('/html/body/table[16]/tbody/tr[position() > 3]'):
    for d in r.xpath('td[3]/table/tbody/tr/td[4]/div/text()'):
        print d
    print '='*20


====================
 USA
====================
====================
 USA
====================
====================
 USA
====================

Multiple foreign entities


In [235]:
# Query parameters for the foreign-entities example; this is the same
# filingID that the multiple-issues cell requests.
_params = {'event': 'getFilingDetails',
           'filingID': '3A144627-84A0-4190-81A8-B40718EA37EC',
           'filingTypeID': 1}

# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops an HTTP error page from being parsed as a filing.
resp = requests.get(_filing_detail_url, params=_params, timeout=30)
resp.raise_for_status()
multiple_foreign = etree.parse(StringIO(resp.text), html_parser)

In [236]:
# For each row of body table 19, print the text of the <div> in its 5th cell,
# followed by a separator line. Rows without a match print only the separator.
for r in multiple_foreign.xpath('/html/body/table[19]/tbody/tr'):
    for d in r.xpath('td[5]/div/text()'):
        print d
    print '='*20


====================
 100
====================
====================
 0
====================
====================
 0
====================

Checkbox


In [237]:
# Second <input> of the first body-level <div> (the schema listed later in
# this notebook maps this path to `new_client_for_existing_registrant`).
e = parsed.xpath('/html/body/div[1]/input[2]')[0]

In [238]:
# A checkbox is ticked when its element carries a `checked` attribute.
# Membership test instead of the deprecated has_key().
'checked' in e.attrib


Out[238]:
True

Parse date


In [239]:
# Raw date text from the first row of body table 2 (the schema listed later in
# this notebook maps this path to `effective_date`).
s = parsed.xpath('/html/body/table[2]/tbody/tr[1]/td[3]/div/text()')[0]
s


Out[239]:
'5/28/2013'

In [240]:
# The form renders dates as month/day/four-digit-year.
datetime.strptime(s, '%m/%d/%Y')


Out[240]:
datetime.datetime(2013, 5, 28, 0, 0)

In [241]:
filter(lambda x: x % 2, [1,2,3,4,5])


Out[241]:
[1, 3, 5]

In [242]:
os.path.basename('/home/blannon/1223_watzman_20131101.csv')


Out[242]:
'1223_watzman_20131101.csv'

In [243]:
os.extsep


Out[243]:
'.'

In [244]:
# Sample filing-detail URL used to try out query-string parsing.
u = 'http://soprweb.senate.gov/index.cfm?event=getFilingDetails&filingTypeID=57&filingID=33eb46ef-55a7-4233-8685-f7427c057f41'

In [245]:
# Split the URL's query string into (name, value) pairs, in order.
urlparse.parse_qsl(urlparse.urlparse(u).query)


Out[245]:
[('event', 'getFilingDetails'),
 ('filingTypeID', '57'),
 ('filingID', '33eb46ef-55a7-4233-8685-f7427c057f41')]

In [246]:
# Parse a cached filing from disk. The context manager closes the file handle
# as soon as parsing finishes instead of leaving it open.
with open('../data/cache/sopr_html/2014/REG/a25b34ae-5ccb-409f-848d-f4fa008a06b4.html') as cached_html:
    thing = etree.parse(cached_html, html_parser)

In [247]:
print etree.tostring(thing)


<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<title>LD-1 Disclosure Form</title>
<style type="text/css">
                    * {font-size:9pt}
                    hr {height:3px;background:black}
                    table {border-color:black;table-layout:fixed}
                </style>
</head>
<body>
<table style="border-style:double;border-width:3px" width="60%">
<col width="60%"/>
<col width="40%"/>
<tbody>
<tr>
<td>Clerk of the House of Representatives<br/>
                                Legislative Resource Center<br/>
                                B-106 Cannon Building<br/>
                                Washington, DC 20515<br/>
<a style="text-decoration:underline" href="http://lobbyingdisclosure.house.gov">http://lobbyingdisclosure.house.gov</a></td><td>Secretary of the Senate<br/>
                                Office of Public Records<br/>
                                232 Hart Building<br/>
                                Washington, DC 20510<br/>
<a style="text-decoration:underline" href="http://www.senate.gov/lobby">http://www.senate.gov/lobby</a></td>
</tr>
</tbody>
</table>
<p style="font-size:24pt;font-weight:bold">LOBBYING REGISTRATION</p>
<p style="font-size:12pt">Lobbying Disclosure Act of 1995 (Section 4)</p>
<div>
<span style="font-style:italic;text-decoration:underline;font-size:8pt">Check One:</span><input disabled="disabled" type="checkbox"/> New Registrant <input disabled="disabled" type="checkbox" checked="checked"/> New Client for Existing Registrant<input disabled="disabled" type="checkbox"/> Amendment</div>
<br/>
<table width="100%">
<col span="2" width="20%"/>
<col span="2" width="10%"/>
<col span="2" width="20%"/>
<tbody>
<tr valign="baseline" style="font-size:10pt">
<td colspan="3"/><td style="text-align:center" colspan="2">1. Effective Date of Registration</td><td>
<div style="border-bottom:solid 1px">05/01/2014</div>
</td>
</tr>
<tr valign="baseline" style="font-size:10pt">
<td style="font-weight:bold">2. House Identification</td><td>
<div style="border-bottom:solid 1px">&#160;42262</div>
</td><td colspan="2"/><td style="font-weight:bold">Senate Identification</td><td>
<div style="border-bottom:solid 1px">&#160;401047945</div>
</td>
</tr>
</tbody>
</table>
<hr/>
<p>
<span style="font-weight:bold;font-size:14pt">REGISTRANT </span><input disabled="disabled" type="checkbox"/> Organization/Lobbying Firm <input disabled="disabled" type="checkbox" checked="checked"/> Self Employed Individual</p>
<table width="100%">
<col width="11%"/>
<col width="9%"/>
<col width="5%"/>
<col width="10%"/>
<col width="20%"/>
<col width="10%"/>
<col width="35%"/>
<tbody>
<tr>
<td>3. Registrant</td><td>Prefix</td><td>
<div style="border-bottom:solid 1px">&#160;Mr.</div>
</td><td>First</td><td>
<div style="border-bottom:solid 1px">&#160;Andrew</div>
</td><td>Last</td><td>
<div style="border-bottom:solid 1px">&#160;Wahlquist</div>
</td>
</tr>
</tbody>
</table>
<table width="100%">
<col width="8%"/>
<col width="42%"/>
<col width="8%"/>
<col width="42%"/>
<tbody>
<tr>
<td>Address</td><td style="padding-right:.5em">
<div style="border-bottom:solid 1px">6622 Madison McLean Drive</div>
</td><td>Address2</td><td>
<div style="border-bottom:solid 1px">&#160;</div>
</td>
</tr>
</tbody>
</table>
<table width="100%">
<col width="5%"/>
<col width="40%"/>
<col width="5%"/>
<col width="5%"/>
<col width="5%"/>
<col width="20%"/>
<col width="6%"/>
<col width="14%"/>
<tbody>
<tr>
<td>City</td><td style="padding-right:.5em">
<div style="border-bottom:1px solid">McLean</div>
</td><td>State</td><td style="padding-right:.5em">
<div style="border-bottom:1px solid">VA</div>
</td><td>Zip</td><td style="padding-right:.5em">
<div style="border-bottom:1px solid">22101</div>
</td><td>Country</td><td style="padding-left:1em">
<div style="border-bottom:solid 1px">USA</div>
</td>
</tr>
</tbody>
</table>4. Principal place of business (if different than line 3)<br/>
<table width="100%">
<col width="5%"/>
<col width="40%"/>
<col width="5%"/>
<col width="5%"/>
<col width="5%"/>
<col width="20%"/>
<col width="6%"/>
<col width="14%"/>
<tbody>
<tr>
<td>City</td><td style="padding-right:.5em">
<div style="border-bottom:1px solid">&#160;</div>
</td><td>State</td><td style="padding-right:.5em">
<div style="border-bottom:1px solid">&#160;</div>
</td><td>Zip</td><td style="padding-right:.5em">
<div style="border-bottom:1px solid">&#160;</div>
</td><td>Country</td><td style="padding-left:1em">
<div style="border-bottom:solid 1px">&#160;</div>
</td>
</tr>
</tbody>
</table>
<table width="100%">
<col span="2" width="50%"/>
<tbody>
<tr>
<td>5. Contact name and telephone number</td><td><input disabled="disabled" type="checkbox"/><span style="font-weight:bold;font-size:75%"> International Number</span></td>
</tr>
</tbody>
</table>
<table width="100%">
<col width="7%"/>
<col width="34%"/>
<col width="7%"/>
<col width="20%"/>
<col width="7%"/>
<col width="25%"/>
<tbody>
<tr>
<td>Contact</td><td style="padding-right:.5em;padding-left:.5em">
<div style="border-bottom:solid 1px">&#160;Mr. Andrew Wahlquist</div>
</td><td>Telephone</td><td style="padding-right:.5em;padding-left:2em">
<div style="border-bottom:solid 1px">&#160;2025778848</div>
</td><td>E-mail</td><td>
<div style="border-bottom:solid 1px">&#160;awahlquist@cox.net</div>
</td>
</tr>
</tbody>
</table>6. General description of registrant&#8217;s business or activities<div style="border-bottom:solid 1px">&#160;Government relations and business development</div>
<hr/>
<p style="margin-bottom:-.25em;margin-top:-.5em;font-size:14pt;font-weight:bold">CLIENT
                    <span style="font-weight:normal;font-style:italic;font-size:8pt">A Lobbying Firm is required to file a separate
                        registration for each client. Organizations employing in-house lobbyists should
                        check the box labeled &#8220;Self&#8221; and proceed to line 10.</span> <input disabled="disabled" type="checkbox"/><span style="font-style:italic;font-weight:bold;font-size:8pt">Self</span>
</p>
<table width="100%">
<col width="12%"/>
<col width="88%"/>
<tbody>
<tr>
<td>7. Client name</td><td>
<div style="border-bottom:solid 1px">&#160;Technology and Suipply Management, LLC</div>
</td>
</tr>
<tr>
<td>Address</td><td>
<div style="border-bottom:solid 1px">&#160;3877 Fairfax Ridge Road - Suite 110N</div>
</td>
</tr>
</tbody>
</table>
<table width="100%">
<col width="7%"/>
<col width="55%"/>
<col width="5%"/>
<col width="5%"/>
<col width="5%"/>
<col width="10%"/>
<col width="6%"/>
<col width="7%"/>
<tbody>
<tr>
<td>City</td><td style="padding-right:.5em">
<div style="border-bottom:1px solid">&#160;Fairfax</div>
</td><td>State</td><td style="padding-right:.5em">
<div style="border-bottom:1px solid">&#160;VA</div>
</td><td>Zip</td><td style="padding-right:.5em">
<div style="border-bottom:1px solid">&#160;22030</div>
</td><td>Country</td><td>
<div style="border-bottom:solid 1px">&#160;USA</div>
</td>
</tr>
</tbody>
</table>8. Principal place of business (if different than line 7)<table width="100%">
<col width="7%"/>
<col width="55%"/>
<col width="5%"/>
<col width="5%"/>
<col width="5%"/>
<col width="10%"/>
<col width="6%"/>
<col width="7%"/>
<tbody>
<tr>
<td>City</td><td style="padding-right:.5em">
<div style="border-bottom:1px solid">&#160;</div>
</td><td>State</td><td style="padding-right:.5em">
<div style="border-bottom:1px solid">&#160;</div>
</td><td>Zip</td><td style="padding-right:.5em">
<div style="border-bottom:1px solid">&#160;</div>
</td><td>Country</td><td>
<div style="border-bottom:solid 1px">&#160;</div>
</td>
</tr>
</tbody>
</table>9. General description of client&#8217;s business or activities<div style="border-bottom:solid 1px">&#160;Government contracting and commercial logistics</div>
<hr/>
<p style="margin-bottom:-.5em;margin-top:-.5em;font-size:14pt;font-weight:bold">LOBBYISTS</p>
<p style="font-size:10pt">10. Name of each individual who has acted or is expected to act as a lobbyist
                    for the client identified on line 7. If any person listed in this section has
                    served as a &#8220;covered executive branch official&#8221; or
                    &#8220;covered legislative branch official&#8221; within twenty years
                    of first acting as a lobbyist for the client,
                    <span style="font-style:italic">state the executive and/or legislative
                        position(s) in which the person served.</span>
</p>
<table style="border-collapse:collapse" width="100%">
<col span="2" width="22.5%"/>
<col width="5%"/>
<col width="50%"/>
<tbody>
<tr>
<td/><td style="text-align:center">Name</td><td style="border-right:solid 2px">&#160;</td><td style="text-align:center">
                                Covered Official Position (if applicable)
                            </td>
</tr>
<tr style="font-size:8pt">
<td style="border-bottom:solid 2px">
<div style="border:solid 1px">First</div>
</td><td style="border-bottom:solid 2px">
<div style="border:solid 1px">Last</div>
</td><td style="border-right:2px solid;border-bottom:solid 2px">
<div style="border:solid 1px">Suffix</div>
</td><td style="border-bottom:solid 2px"/>
</tr>
<tr>
<td style="border-bottom:2px solid">&#160;Andrew </td><td style="border-bottom:2px solid">&#160;Wahlquist</td><td style="border-bottom:2px solid;border-right:2px solid">&#160;Mr.</td><td style="border-bottom:2px solid">&#160;</td>
</tr>
</tbody>
</table>
<hr/>
<p style="margin-bottom:-.5em;margin-top:-.5em;                                 font-size:14pt;font-weight:bold">LOBBYING ISSUES</p>
<p>11. General lobbying issue areas (Select all applicable codes).</p>
<table width="90%">
<col span="9" width="11.1%"/>
<tbody>
<tr>
<td style="padding-right:.5em">
<div style="border-bottom:solid 1px">&#160;DEF</div>
</td><td style="padding-right:.5em">
<div style="border-bottom:solid 1px">&#160;HOM</div>
</td><td style="padding-right:.5em">
<div style="border-bottom:solid 1px">&#160;</div>
</td><td style="padding-right:.5em">
<div style="border-bottom:solid 1px">&#160;</div>
</td><td style="padding-right:.5em">
<div style="border-bottom:solid 1px">&#160;</div>
</td><td style="padding-right:.5em">
<div style="border-bottom:solid 1px">&#160;</div>
</td><td style="padding-right:.5em">
<div style="border-bottom:solid 1px">&#160;</div>
</td><td style="padding-right:.5em">
<div style="border-bottom:solid 1px">&#160;</div>
</td><td style="padding-right:.5em">
<div style="border-bottom:solid 1px">&#160;</div>
</td>
</tr>
</tbody>
</table>
<hr/>
<p>12. Specific lobbying issues (current and anticipated)</p>
<p>Promote aerostat technology for border security</p>
<hr/>
<p style="margin-bottom:-.5em;margin-top:-.5em;font-size:14pt;font-weight:bold">AFFILIATED ORGANIZATIONS</p>
<p>13. Is there an entity other than the client that contributes more than
                    $5,000 to the lobbying activities of the registrant in a quarterly period
                    and either actively participates in and/or in whole or in major part supervises, plans, or
                    controls the registrant&#8217;s lobbying activities?</p>
<table width="100%">
<col width="40%"/>
<col width="60%"/>
<tbody>
<tr>
<td><input disabled="disabled" type="checkbox" checked="checked"/>  No --&gt; Go to line 14.</td><td><input disabled="disabled" type="checkbox"/>  
                                Yes --&gt; Complete the rest of this section for each entity matching the
                                criteria above, then proceed to line 14.
                            </td>
</tr>
</tbody>
</table>
<hr style="height:1px"/>
<table width="100%">
<col width="10%"/>
<col width="90%"/>
<tbody>
<td>Internet Address:</td><td>
<div style="border-bottom:solid 1px">&#160;</div>
</td>
</tbody>
</table>
<table width="100%">
<col span="3" width="33.3%"/>
<tbody>
<tr style="text-align:center">
<td>Name</td><td>Address</td><td>Principal Place of Business</td>
</tr>
<tr>
<td/><td>Street</td><td/>
</tr>
<tr>
<td/><td>
<table width="100%">
<col width="25%"/>
<col width="40%"/>
<col width="15%"/>
<col width="20%"/>
<tbody>
<tr>
<td>City</td><td style="text-align:right">State/Province</td><td>Zip Code</td><td>Country</td>
</tr>
</tbody>
</table>
</td><td/>
</tr>
</tbody>
</table>
<hr/>
<p style="margin-bottom:-.5em;margin-top:-.5em;font-size:14pt;font-weight:bold">FOREIGN ENTITIES</p>
<p>14. Is there any foreign entity</p>
<p style="margin-left:5em">a) holds at least 20% equitable ownership in the client or
                    any organization identified on line 13; or</p>
<p style="margin-left:5em">b) directly or indirectly, in whole or in major part, plans,
                    supervises, controls, directs, finances or subsidizes activities of the client or
                    any organization identified on line 13; or</p>
<p style="margin-left:5em">c) is an affiliate of the client or any organization identified
                    on line 13 and has a direct interest in the outcome of the lobbying activity?</p>
<table width="100%">
<col width="5%"/>
<col width="30%"/>
<col width="5%"/>
<col width="60%"/>
<tbody>
<tr>
<td><input disabled="disabled" type="checkbox" checked="checked"/></td><td>No --&gt; Sign and date the registration.</td><td><input disabled="disabled" type="checkbox"/></td><td>Yes --&gt; Complete the rest of this section for each entity matching the
                                criteria above, then sign the registration.</td>
</tr>
</tbody>
</table>
<hr/>
<table width="100%">
<col width="20%"/>
<col width="25%"/>
<col width="20%"/>
<col width="20%"/>
<col width="10%"/>
<col width="5%"/>
<tbody>
<tr>
<td/><td style="text-align:center">Address</td><td/><td/><td style="text-align:center" colspan="2">Ownership</td>
</tr>
<tr>
<td style="text-align:center">Name</td><td>Street</td><td style="text-align:center">Principal place of business</td><td style="text-align:center">Amount of contribution</td>
</tr>
<tr>
<td/><td>
<table width="100%">
<col width="25%"/>
<col width="50%"/>
<col width="25%"/>
<tbody>
<tr>
<td>City</td><td style="text-align:right">State/Province</td><td>Country</td>
</tr>
</tbody>
</table>
</td><td style="text-align:center">(city and state or country)</td><td style="text-align:center">for lobbying activities</td>
</tr>
</tbody>
</table>
<hr style="height:1px"/>
<table width="100%">
<col width="20%"/>
<col width="25%"/>
<col width="28%"/>
<col width="20%"/>
<col width="5%"/>
<col width="2%"/>
<tbody/>
</table>
<table width="100%">
<col width="10%"/>
<col width="70%"/>
<col span="2" width="10%"/>
<tbody>
<tr>
<td style="font-weight:bold">Signature</td><td>
<div style="border:solid 1px">Digitally Signed By: Andrew Wahlquist - Consultant</div>
</td><td style="text-align:center;font-weight:bold">Date</td><td>
<div style="border-bottom:solid 2px">06/30/2014</div>
</td>
</tr>
</tbody>
</table>
</body>
</html>

In [248]:
# Make the parent directory importable for the `tasks` imports that follow.
# NOTE(review): the import cell at the top already appends
# os.path.join(os.getcwd(), os.path.pardir) to sys.path, so this looks
# redundant -- confirm before removing.
sys.path.append('..')

In [249]:
from tasks import extract
from tasks.schema import ld1_schema

In [250]:
# Split the schema in two: entries with a 'children' key are containers, and
# everything else is a plain element.
elements = list(entry for entry in ld1_schema if 'children' not in entry)
containers = list(entry for entry in ld1_schema if 'children' in entry)

In [251]:
elements


Out[251]:
[{'field': 'new_registrant',
  'lda_question': None,
  'parser': <function tasks.schema.checkbox_boolean>,
  'path': '/html/body/div[1]/input[1]',
  'section': 'registration_type'},
 {'field': 'new_client_for_existing_registrant',
  'lda_question': None,
  'parser': <function tasks.schema.checkbox_boolean>,
  'path': '/html/body/div[1]/input[2]',
  'section': 'registration_type'},
 {'field': 'amendment',
  'lda_question': None,
  'parser': <function tasks.schema.checkbox_boolean>,
  'path': '/html/body/div[1]/input[3]',
  'section': 'registration_type'},
 {'field': 'effective_date',
  'lda_question': '1',
  'parser': <function tasks.schema.parse_datetime>,
  'path': '/html/body/table[2]/tbody/tr[1]/td[3]/div',
  'section': 'datetimes'},
 {'field': 'registrant_house_id',
  'lda_question': '2',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[2]/tbody/tr[2]/td[2]/div',
  'section': 'identifiers'},
 {'field': 'registrant_senate_id',
  'lda_question': '2',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[2]/tbody/tr[2]/td[5]/div',
  'section': 'identifiers'},
 {'field': 'organization_or_lobbying_firm',
  'lda_question': None,
  'parser': <function tasks.schema.checkbox_boolean>,
  'path': '/html/body/p[3]/input[1]',
  'section': 'registrant'},
 {'field': 'self_employed_individual',
  'lda_question': None,
  'parser': <function tasks.schema.checkbox_boolean>,
  'path': '/html/body/p[3]/input[2]',
  'section': 'registrant'},
 {'field': 'registrant_name',
  'lda_question': '3',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[3]/tbody/tr/td[3]/div',
  'section': 'registrant'},
 {'field': 'registrant_address_one',
  'lda_question': '3',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[4]/tbody/tr/td[2]/div',
  'section': 'registrant'},
 {'field': 'registrant_address_two',
  'lda_question': '3',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[4]/tbody/tr/td[4]/div',
  'section': 'registrant'},
 {'field': 'registrant_city',
  'lda_question': '3',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[5]/tbody/tr/td[2]/div',
  'section': 'registrant'},
 {'field': 'registrant_state',
  'lda_question': '3',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[5]/tbody/tr/td[4]/div',
  'section': 'registrant'},
 {'field': 'registrant_zip',
  'lda_question': '3',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[5]/tbody/tr/td[6]/div',
  'section': 'registrant'},
 {'field': 'registrant_country',
  'lda_question': '3',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[5]/tbody/tr/td[8]/div',
  'section': 'registrant'},
 {'field': 'registrant_ppb_city',
  'lda_question': '4',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[6]/tbody/tr/td[2]/div',
  'section': 'registrant'},
 {'field': 'registrant_ppb_state',
  'lda_question': '4',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[6]/tbody/tr/td[4]/div',
  'section': 'registrant'},
 {'field': 'registrant_ppb_zip',
  'lda_question': '4',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[6]/tbody/tr/td[6]/div',
  'section': 'registrant'},
 {'field': 'registrant_ppb_country',
  'lda_question': '4',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[6]/tbody/tr/td[8]/div',
  'section': 'registrant'},
 {'field': 'registrant_international_phone',
  'lda_question': '5',
  'parser': <function tasks.schema.checkbox_boolean>,
  'path': '/html/body/table[7]/tbody/tr/td[2]/input',
  'section': 'registrant'},
 {'field': 'registrant_contact',
  'lda_question': '5',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[8]/tbody/tr/td[2]/div',
  'section': 'registrant'},
 {'field': 'registrant_phone',
  'lda_question': '5',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[8]/tbody/tr/td[4]/div',
  'section': 'registrant'},
 {'field': 'registrant_email',
  'lda_question': '5',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[8]/tbody/tr/td[6]/div',
  'section': 'registrant'},
 {'field': 'registrant_email',
  'lda_question': '5',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[8]/tbody/tr/td[6]/div',
  'section': 'registrant'},
 {'field': 'registrant_general_description',
  'lda_question': '6',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/div[2]',
  'section': 'registrant'},
 {'field': 'client_self',
  'lda_question': None,
  'parser': <function tasks.schema.checkbox_boolean>,
  'path': '/html/body/p[4]/input',
  'section': 'client'},
 {'field': 'client_name',
  'lda_question': '7',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[9]/tbody/tr[1]/td[2]/div',
  'section': 'client'},
 {'field': 'client_address',
  'lda_question': '7',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[9]/tbody/tr[2]/td[2]/div',
  'section': 'client'},
 {'field': 'client_city',
  'lda_question': '7',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[10]/tbody/tr/td[2]/div',
  'section': 'client'},
 {'field': 'client_state',
  'lda_question': '7',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[10]/tbody/tr/td[4]/div',
  'section': 'client'},
 {'field': 'client_zip',
  'lda_question': '7',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[10]/tbody/tr/td[6]/div',
  'section': 'client'},
 {'field': 'client_country',
  'lda_question': '7',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[10]/tbody/tr/td[8]/div',
  'section': 'client'},
 {'field': 'client_ppb_city',
  'lda_question': '8',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[11]/tbody/tr/td[2]/div',
  'section': 'client'},
 {'field': 'client_ppb_state',
  'lda_question': '8',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[11]/tbody/tr/td[4]/div',
  'section': 'client'},
 {'field': 'client_ppb_zip',
  'lda_question': '8',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[11]/tbody/tr/td[6]/div',
  'section': 'client'},
 {'field': 'client_ppb_country',
  'lda_question': '8',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[11]/tbody/tr/td[8]/div',
  'section': 'client'},
 {'field': 'client_general_description',
  'lda_question': '9',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/div[3]',
  'section': 'client'},
 {'field': 'lobbying_issues_detail',
  'lda_question': '12',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/p[10]',
  'section': 'lobbying_issues_detail'},
 {'field': 'affiliated_organizations_no',
  'lda_question': '13',
  'parser': <function tasks.schema.checkbox_boolean>,
  'path': '/html/body/table[14]/tbody/tr/td[1]/input',
  'section': 'affiliated_organizations'},
 {'field': 'affiliated_organizations_yes',
  'lda_question': '13',
  'parser': <function tasks.schema.checkbox_boolean>,
  'path': '/html/body/table[14]/tbody/tr/td[2]/input',
  'section': 'affiliated_organizations'},
 {'field': 'affiliated_organizations_url',
  'lda_question': '13',
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[15]/tbody/tr/td[2]/div',
  'section': 'affiliated_organizations'},
 {'field': 'foreign_entities_no',
  'lda_question': '14',
  'parser': <function tasks.schema.checkbox_boolean>,
  'path': '/html/body/table[17]/tbody/tr/td[1]/input',
  'section': 'foreign_entities'},
 {'field': 'foreign_entities_yes',
  'lda_question': '14',
  'parser': <function tasks.schema.checkbox_boolean>,
  'path': '/html/body/table[17]/tbody/tr/td[3]/input',
  'section': 'foreign_entities'},
 {'field': 'signature',
  'lda_question': None,
  'parser': <function tasks.schema.clean_text>,
  'path': '/html/body/table[20]/tbody/tr/td[2]/div',
  'section': 'signature'},
 {'field': 'signature_date',
  'lda_question': None,
  'parser': <function tasks.schema.parse_datetime>,
  'path': '/html/body/table[20]/tbody/tr/td[4]/div',
  'section': 'datetimes'}]

In [252]:
# Re-import the extraction module so recent edits to it take effect.
reload(extract)
# Run the extractor over one cached 2014 REG filing; the dates and JSON
# record shown as this cell's output come from this call.
extract.extract_html('../data/cache/sopr_html/2014/REG/a25b34ae-5ccb-409f-848d-f4fa008a06b4.html',
                     elements,
                     containers)


05/01/2014
06/30/2014
{
  "lobbying_issues": {
    "lobbying_issues": [
      {
        "issue_code": "DEF"
      }, 
      {
        "issue_code": "HOM"
      }
    ]
  }, 
  "identifiers": {
    "registrant_house_id": "42262", 
    "registrant_senate_id": "401047945"
  }, 
  "affiliated_organizations": {
    "affiliated_organizations_yes": false, 
    "affiliated_organizations_url": null, 
    "affiliated_organizations": [], 
    "affiliated_organizations_no": true
  }, 
  "datetimes": {
    "effective_date": "2014-05-01T00:00:00", 
    "signature_date": "2014-06-30T00:00:00"
  }, 
  "client": {
    "client_country": "USA", 
    "client_ppb_city": "", 
    "client_self": false, 
    "client_name": "Technology and Suipply Management, LLC", 
    "client_city": "Fairfax", 
    "client_zip": "22030", 
    "client_ppb_zip": "", 
    "client_ppb_country": "", 
    "client_ppb_state": "", 
    "client_general_description": "Government contracting and commercial logistics", 
    "client_state": "VA", 
    "client_address": "3877 Fairfax Ridge Road - Suite 110N"
  }, 
  "foreign_entities": {
    "foreign_entities_yes": false, 
    "foreign_entities": [], 
    "foreign_entities_no": true
  }, 
  "lobbyists": {
    "lobbyists": [
      {
        "lobbyist_covered_official_position": "", 
        "lobbyist_first_name": "Andrew", 
        "lobbyist_last_name": "Wahlquist", 
        "lobbyist_suffix": "Mr."
      }
    ]
  }, 
  "signature": {
    "signature": "Digitally Signed By: Andrew Wahlquist - Consultant"
  }, 
  "lobbying_issues_detail": {
    "lobbying_issues_detail": "Promote aerostat technology for border security"
  }, 
  "registrant": {
    "registrant_name": "Mr.", 
    "registrant_zip": "22101", 
    "registrant_address_one": "6622 Madison McLean Drive", 
    "self_employed_individual": true, 
    "registrant_ppb_city": "", 
    "registrant_ppb_zip": "", 
    "registrant_city": "McLean", 
    "registrant_email": "awahlquist@cox.net", 
    "registrant_state": "VA", 
    "registrant_ppb_state": "", 
    "organization_or_lobbying_firm": false, 
    "registrant_country": "USA", 
    "registrant_address_two": "", 
    "registrant_contact": "Mr. Andrew Wahlquist", 
    "registrant_international_phone": false, 
    "registrant_phone": "2025778848", 
    "registrant_general_description": "Government relations and business development", 
    "registrant_ppb_country": ""
  }, 
  "document_id": "a25b34ae-5ccb-409f-848d-f4fa008a06b4", 
  "registration_type": {
    "amendment": false, 
    "new_registrant": false, 
    "new_client_for_existing_registrant": true
  }
}

In [253]:
"goo"


Out[253]:
'goo'

In [254]:
# Fixed sample timestamp (midnight, 1 April 2014) for checking ISO formatting.
dt = datetime(year=2014, month=4, day=1, hour=0, minute=0)

In [255]:
# Render the sample timestamp as an ISO-8601 string.
dt.isoformat()


Out[255]:
'2014-04-01T00:00:00'

LD-2


In [256]:
# Fetch one LD-2 filing that reports several general issue areas.
_params = {'event': 'getFilingDetails',
           'filingID': '80b956e1-3448-404a-bdfd-558ffe2631ce',
           'filingTypeID': 69}

resp = requests.get(_filing_detail_url, params=_params)
# Fail fast on an HTTP error instead of parsing the error page as a filing.
resp.raise_for_status()
multiple_issues = etree.parse(StringIO(resp.text), html_parser)

In [257]:
# Show the nested table found in row 5, cell 1 of the fourth body table.
print(etree.tostring(
    multiple_issues.xpath('/html/body/table[4]/tbody/tr[5]/td[1]/table')[0]))


<table width="65%">
<col width="47.77%"/>
<col span="2" width="8%"/>
<col width="36.23%"/>
<tbody>
<td style="text-decoration:underline;font-size=75%">$5,000 or more</td><td><input disabled="disabled" type="checkbox" checked="checked"/></td><td style="text-align:right">$ </td><td>
<div style="border-bottom:solid 1px">&#160;50,000.00</div>
</td>
</tbody>
</table>


In [258]:
# Dump every page-break paragraph, each followed by a rule of 80 '=' characters.
for e in multiple_issues.xpath('//p[@style="page-break-before:always"]'):
    print(etree.tostring(e))
    print("=" * 80)


<p style="page-break-before:always">
<span style="font-weight:bold">LOBBYING ACTIVITY.</span>
                        Select as many codes as necessary to reflect the general issue areas in which the registrant
                        engaged in lobbying on behalf of the client during the reporting period. Using
                        a separate page for each code, provide information as requested. Add additional page(s)
                        as needed.
                    </p>

================================================================================
<p style="page-break-before:always">
<span style="font-weight:bold">LOBBYING ACTIVITY.</span>
                        Select as many codes as necessary to reflect the general issue areas in which the registrant
                        engaged in lobbying on behalf of the client during the reporting period. Using
                        a separate page for each code, provide information as requested. Add additional page(s)
                        as needed.
                    </p>

================================================================================
<p style="page-break-before:always">
<span style="font-weight:bold">LOBBYING ACTIVITY.</span>
                        Select as many codes as necessary to reflect the general issue areas in which the registrant
                        engaged in lobbying on behalf of the client during the reporting period. Using
                        a separate page for each code, provide information as requested. Add additional page(s)
                        as needed.
                    </p>

================================================================================

In [259]:
# Keep the first page-break paragraph; a later cell walks its siblings with getnext().
page_break = multiple_issues.xpath('//p[@style="page-break-before:always"]')[0]

In [260]:
# Locate the first issue-code paragraph by its question label.
# Alternative positional lookup, kept for reference:
#   multiple_issues.xpath('//p[@style="page-break-before:always"]/following-sibling::p[1]')[0]
general_issue_code = multiple_issues.xpath(
    '//p[contains(.,"15. General issue area")]')[0]
print(etree.tostring(general_issue_code))


<p>15. General issue area code TAX </p>


In [261]:
# Serialize every paragraph that contains the question-15 label.
[etree.tostring(e) for e in multiple_issues.xpath('//p[contains(.,"15. General issue area")]')]


Out[261]:
['<p>15. General issue area code TAX </p>\n',
 '<p>15. General issue area code MMM </p>\n',
 '<p>15. General issue area code ECN </p>\n']

In [262]:
# Walk forward from the first page break one sibling at a time, naming each
# element of the issue page in document order.
general_issue_code = page_break.getnext()
# Element sitting between the issue code and the specific-issues text.
filler = general_issue_code.getnext()
specific_lobbying_issues = filler.getnext()
congress_agency_check = specific_lobbying_issues.getnext()
congress_agency_detail = congress_agency_check.getnext()
# `filler` is rebound here to the element that precedes the lobbyist table.
filler = congress_agency_detail.getnext()
lobbyists = filler.getnext()
foreign_entity_check = lobbyists.getnext()
foreign_entity_list = foreign_entity_check.getnext()

In [263]:
# The named elements of the first issue page, in document order.
fs = [
    general_issue_code,
    specific_lobbying_issues,
    congress_agency_check,
    congress_agency_detail,
    lobbyists,
    foreign_entity_check,
    foreign_entity_list,
]

# Print each element's markup followed by a 20-character rule.
for f in fs:
    print(etree.tostring(f))
    print('=' * 20)


<p>15. General issue area code TAX </p>

====================
<p class="lightbox">S. 232/H.R. 523, The Protect Medical Innovation Act of 2013.<br/>
</p>

====================
<p>17. House(s) of Congress and Federal agencies
                        <span style="padding-left:2em"><input disabled="disabled" type="checkbox"/></span>
                        Check if None
                    </p>

====================
<p class="lightbox">U.S. HOUSE OF REPRESENTATIVES,</p>

====================
<table width="100%">
<col span="2" width="23.2%"/>
<col width="5.8%"/>
<col width="42%"/>
<col width="5.8%"/>
<tbody>
<tr>
<td>First Name</td><td>Last Name</td><td style="border-right:2px solid">Suffix</td><td style="text-align:center;border-right:2px solid">
                                    Covered Official Position (if applicable)
                                </td><td style="border-right:2px solid">New</td>
</tr>
<tr>
<td style="border-bottom:2px solid">
<div class="lightbox">Jodie</div>
</td><td style="border-bottom:2px solid">
<div class="lightbox">Curtis</div>
</td><td style="border-bottom:2px solid;border-right:2px solid">
<div class="lightbox">&#160;</div>
</td><td style="border-bottom:2px solid;border-right:2px solid">
<div class="lightbox">&#160;</div>
</td><td style="border-bottom:2px solid;border-right:2px solid"><input disabled="disabled" type="checkbox"/></td>
</tr>
</tbody>
</table>

====================
<p>19. Interest of each foreign entity in the specific issues listed on line 16 above
                        <span style="padding-left:2em"><input disabled="disabled" type="checkbox" checked="checked"/></span>
                        Check if None
                    </p>

====================
<p class="lightbox">&#8203;</p>

====================

In [264]:
# Probe for <p> siblings of the paragraph's *parent*; the empty output shows there are none.
[etree.tostring(e) for e in general_issue_code.xpath('../following-sibling::p')]


Out[264]:
[]

In [265]:
# The seventh <p> sibling after the issue-code paragraph.
print(etree.tostring(
    general_issue_code.xpath('following-sibling::p[7]')[0]))


<p class="lightbox">&#8203;</p>


In [266]:
# First <p> after each page break; index 1 selects the one on the second issue page.
second_gic = multiple_issues.xpath(
    '//p[@style="page-break-before:always"]'
    '/following-sibling::p[1]')[1]

# Print the rows of the first table that follows it, skipping the header row.
for r in second_gic.xpath('following-sibling::table[1]/tbody/tr[position() > 1]'):
    print(etree.tostring(r))


<tr>
<td style="border-bottom:2px solid">
<div class="lightbox">Jodie</div>
</td><td style="border-bottom:2px solid">
<div class="lightbox">Curtis</div>
</td><td style="border-bottom:2px solid;border-right:2px solid">
<div class="lightbox">&#160;</div>
</td><td style="border-bottom:2px solid;border-right:2px solid">
<div class="lightbox">&#160;</div>
</td><td style="border-bottom:2px solid;border-right:2px solid"><input disabled="disabled" type="checkbox"/></td>
</tr>

<tr>
<td style="border-bottom:2px solid">
<div class="lightbox">Jim</div>
</td><td style="border-bottom:2px solid">
<div class="lightbox">Twaddell</div>
</td><td style="border-bottom:2px solid;border-right:2px solid">
<div class="lightbox">&#160;</div>
</td><td style="border-bottom:2px solid;border-right:2px solid">
<div class="lightbox">Leg. Aide/Dpty Comm. Dir./Senator Arlen Specter</div>
</td><td style="border-bottom:2px solid;border-right:2px solid"><input disabled="disabled" type="checkbox" checked="checked"/></td>
</tr>

<tr>
<td style="border-bottom:2px solid">
<div class="lightbox">Jeremy </div>
</td><td style="border-bottom:2px solid">
<div class="lightbox">Scott</div>
</td><td style="border-bottom:2px solid;border-right:2px solid">
<div class="lightbox">&#160;</div>
</td><td style="border-bottom:2px solid;border-right:2px solid">
<div class="lightbox">Staff Asst./Leg. Corr., Senator Mike DeWine</div>
</td><td style="border-bottom:2px solid;border-right:2px solid"><input disabled="disabled" type="checkbox" checked="checked"/></td>
</tr>

<tr>
<td style="border-bottom:2px solid">
<div class="lightbox">Ilisa</div>
</td><td style="border-bottom:2px solid">
<div class="lightbox">Halpern Paul</div>
</td><td style="border-bottom:2px solid;border-right:2px solid">
<div class="lightbox">&#160;</div>
</td><td style="border-bottom:2px solid;border-right:2px solid">
<div class="lightbox">Staff Asst./Leg. Corr., Senator Dianne Feinstein</div>
</td><td style="border-bottom:2px solid;border-right:2px solid"><input disabled="disabled" type="checkbox" checked="checked"/></td>
</tr>

<tr>
<td style="border-bottom:2px solid">
<div class="lightbox">Rebecca</div>
</td><td style="border-bottom:2px solid">
<div class="lightbox">McGrath</div>
</td><td style="border-bottom:2px solid;border-right:2px solid">
<div class="lightbox">&#160;</div>
</td><td style="border-bottom:2px solid;border-right:2px solid">
<div class="lightbox">Leg. Asst./Scheduler, Senator Chris Dodd</div>
</td><td style="border-bottom:2px solid;border-right:2px solid"><input disabled="disabled" type="checkbox" checked="checked"/></td>
</tr>

<tr>
<td style="border-bottom:2px solid">
<div class="lightbox">Julie</div>
</td><td style="border-bottom:2px solid">
<div class="lightbox">Hyams</div>
</td><td style="border-bottom:2px solid;border-right:2px solid">
<div class="lightbox">&#160;</div>
</td><td style="border-bottom:2px solid;border-right:2px solid">
<div class="lightbox">Leg. Asst./Staff Asst. Cong. Stokes</div>
</td><td style="border-bottom:2px solid;border-right:2px solid"><input disabled="disabled" type="checkbox" checked="checked"/></td>
</tr>

<tr>
<td style="border-bottom:2px solid">
<div class="lightbox">Erin</div>
</td><td style="border-bottom:2px solid">
<div class="lightbox">Morton</div>
</td><td style="border-bottom:2px solid;border-right:2px solid">
<div class="lightbox">&#160;</div>
</td><td style="border-bottom:2px solid;border-right:2px solid">
<div class="lightbox">&#160;</div>
</td><td style="border-bottom:2px solid;border-right:2px solid"><input disabled="disabled" type="checkbox" checked="checked"/></td>
</tr>

<tr>
<td style="border-bottom:2px solid">
<div class="lightbox">Anna</div>
</td><td style="border-bottom:2px solid">
<div class="lightbox">Howard</div>
</td><td style="border-bottom:2px solid;border-right:2px solid">
<div class="lightbox">&#160;</div>
</td><td style="border-bottom:2px solid;border-right:2px solid">
<div class="lightbox">&#160;</div>
</td><td style="border-bottom:2px solid;border-right:2px solid"><input disabled="disabled" type="checkbox"/></td>
</tr>


In [267]:
# First paragraph whose text contains "20." (the question-20 label in this filing).
print(etree.tostring(
    multiple_issues.xpath('//p[contains(.,"20.")]')[0]))


<p>20. Client new address</p>


In [268]:
# Inner <tbody> of every nested table in the non-header rows of the table
# that follows the question-23 label.
found = multiple_issues.xpath(
    '//p[contains(.,"23. Name of each previously")]'
    '/following-sibling::table[1]'
    '/tbody/tr[position()>1]/td/table'
    '/tbody'
)
for e in found:
    print(etree.tostring(e))
    print('=' * 30)


<tbody>
<td>
<div style="border-bottom:solid 2px">&#160;Andrew </div>
</td><td>
<div style="border-bottom:solid 2px">&#160;Bowman</div>
</td><td>
<div style="border-bottom:solid 2px">&#160;</div>
</td>
</tbody>

==============================
<tbody>
<td>
<div style="border-bottom:solid 2px">&#160;</div>
</td><td>
<div style="border-bottom:solid 2px">&#160;</div>
</td><td>
<div style="border-bottom:solid 2px">&#160;</div>
</td>
</tbody>

==============================
<tbody>
<td>
<div style="border-bottom:solid 2px">&#160;</div>
</td><td>
<div style="border-bottom:solid 2px">&#160;</div>
</td><td>
<div style="border-bottom:solid 2px">&#160;</div>
</td>
</tbody>

==============================
<tbody>
<td>
<div style="border-bottom:solid 2px">&#160;</div>
</td><td>
<div style="border-bottom:solid 2px">&#160;</div>
</td><td>
<div style="border-bottom:solid 2px">&#160;</div>
</td>
</tbody>

==============================
<tbody>
<td>
<div style="border-bottom:solid 2px">&#160;</div>
</td><td>
<div style="border-bottom:solid 2px">&#160;</div>
</td><td>
<div style="border-bottom:solid 2px">&#160;</div>
</td>
</tbody>

==============================
<tbody>
<td>
<div style="border-bottom:solid 2px">&#160;</div>
</td><td>
<div style="border-bottom:solid 2px">&#160;</div>
</td><td>
<div style="border-bottom:solid 2px">&#160;</div>
</td>
</tbody>

==============================
<tbody>
<td>
<div style="border-bottom:solid 2px">&#160;</div>
</td><td>
<div style="border-bottom:solid 2px">&#160;</div>
</td><td>
<div style="border-bottom:solid 2px">&#160;</div>
</td>
</tbody>

==============================
<tbody>
<td>
<div style="border-bottom:solid 2px">&#160;</div>
</td><td>
<div style="border-bottom:solid 2px">&#160;</div>
</td><td>
<div style="border-bottom:solid 2px">&#160;</div>
</td>
</tbody>

==============================

In [269]:
# Full URL, query string included, that requests built for the most recent fetch.
resp.url


Out[269]:
u'http://soprweb.senate.gov/index.cfm?filingTypeID=69&filingID=80b956e1-3448-404a-bdfd-558ffe2631ce&event=getFilingDetails'

In [270]:
# Count the <div> cells in the table that follows the question-24 label.
len(multiple_issues.xpath(
    '//p[contains(.,"24. General lobbying issue")]'
    '/following-sibling::table[1]/tbody/tr/td/div'))


Out[270]:
18

Multiple added affiliated organizations (LD-2 question 25)


In [271]:
# Fetch the filing used to study several added affiliated organizations.
_params = {'event': 'getFilingDetails',
           'filingID': '42524728-28e1-424f-9608-2b4f05f7cd2b',
           'filingTypeID': 82}

resp = requests.get(_filing_detail_url, params=_params)
# Fail fast on an HTTP error instead of parsing the error page as a filing.
resp.raise_for_status()
multiple_added_aff = etree.parse(StringIO(resp.text), html_parser)

In [272]:
# Rows of the first table that follows the question-25 label.
rows = multiple_added_aff.xpath('//p[contains(.,"25. Add the following")]'
                         '/following-sibling::table[1]/tbody/tr')
rows


Out[272]:
[<Element tr at 0x7fd28f2695f0>, <Element tr at 0x7fd28f269638>]

In [273]:
# For each row, print the second cell on the second line of the nested table
# in column 3, then a 20-character rule.
for r in rows:
    for e in r.xpath('td[3]/table/tbody/tr[2]/td[2]'):
        print(etree.tostring(e))
    print("=" * 20)


<td>Country USA</td>

====================
<td>Country 
        </td>

====================

In [274]:
# `r` is still bound to the last row from the preceding loop; split the text
# of its country cell on whitespace.
r.xpath('td[3]/table/tbody/tr[2]/td[2]')[0].text.split()


Out[274]:
['Country']

In [275]:
# Direct children of the last row.
# NOTE(review): lxml documents getchildren() as deprecated in favour of list(r) - confirm before reuse.
r.getchildren()


Out[275]:
[<Element td at 0x7fd28d99be18>,
 <Element td at 0x7fd28d9a47e8>,
 <Element td at 0x7fd28d9a4758>]

Multiple organizations no longer affiliated (LD-2 question 26)


In [276]:
# Fetch the filing used to study several organizations that are no longer affiliated.
_params = {'event': 'getFilingDetails',
           'filingID': '2897035b-c56e-4d05-9a51-cab6a4b505f8',
           'filingTypeID': 53}

resp = requests.get(_filing_detail_url, params=_params)
# Fail fast on an HTTP error instead of parsing the error page as a filing.
resp.raise_for_status()
multiple_removed_aff = etree.parse(StringIO(resp.text), html_parser)

In [277]:
# First table that follows the question-26 label.
table = multiple_removed_aff.xpath('//p[contains(.,"26. Name of each previously")]'
                         '/following-sibling::table[1]')[0]

In [278]:
# Print the markup of every cell in the table.
for e in table.xpath('tbody/tr/td'):
    print(etree.tostring(e))


<td><span style="font-weight:bold;padding:1px" class="lightbox">1</span> Sylvan Learning</td>
<td><span style="font-weight:bold;padding:1px" class="lightbox">2</span> Educate Online, Inc.</td>
<td><span style="font-weight:bold;padding:1px" class="lightbox">3</span> Educate, Inc.</td>


In [279]:
# The numbered <span> inside each cell.
table.xpath('tbody/tr/td/span')


Out[279]:
[<Element span at 0x7fd28d9a4680>,
 <Element span at 0x7fd28d9a4710>,
 <Element span at 0x7fd28d9a4320>]

In [280]:
# Keep the first numbered <span> for inspection.
e = table.xpath('tbody/tr/td/span')[0]

In [281]:
# Text that follows the <span> inside its parent cell (here, the organization name).
e.tail


Out[281]:
' Sylvan Learning'

In [282]:
# Organization names for question 26: each name is the text trailing a
# numbered <span>.  lxml reports a missing tail as None, so fall back to ''
# before stripping instead of raising AttributeError.
[(e.tail or '').strip() for e in multiple_removed_aff.xpath(
    '//p[contains(.,"26. Name of each previously")]'
    '/following-sibling::table[1]/tbody/tr/td/span')]


Out[282]:
['Sylvan Learning', 'Educate Online, Inc.', 'Educate, Inc.']

Multiple added foreign entities (LD-2 question 27)


In [283]:
# Fetch the filing used to study several added foreign entities.
_params = {'event': 'getFilingDetails',
           'filingID': '6e8effc6-e1e3-413e-86c9-24eda20858f2',
           'filingTypeID': 60}

resp = requests.get(_filing_detail_url, params=_params)
# Fail fast on an HTTP error instead of parsing the error page as a filing.
resp.raise_for_status()
multiple_added_foreign = etree.parse(StringIO(resp.text), html_parser)

In [284]:
# Rows of the first table that follows the question-27 label.
# Note: this rebinds `rows`, which an earlier cell used for the question-25 table.
rows = multiple_added_foreign.xpath('//p[contains(.,"27. Add the following foreign")]'
                             '/following-sibling::table[1]/tbody/tr')

In [285]:
# Print the markup of each foreign-entity row, followed by a 20-character rule.
for r in rows:
    print(etree.tostring(r))
    print("=" * 20)


<tr>
<td style="border-right:solid 1px">E.ON AG</td><td style="border-right:solid 1px">
<table width="100%">
<col width="50%"/>
<col width="30%"/>
<col width="20%"/>
<tbody>
<tr>
<td colspan="3">E.ON - Platz 1 40479</td>
</tr>
<tr>
<td>Dusseldorf</td><td>
        </td><td>DEN</td>
</tr>
</tbody>
</table>
</td><td style="border-right:solid 1px">
<table width="100%">
<col span="2" width="50%"/>
<tbody>
<tr>
<td colspan="2">City Dusseldorf</td>
</tr>
<tr>
<td>State 
        </td><td>Country DEN</td>
</tr>
</tbody>
</table>
</td><td style="border-right:solid 1px">0</td><td style="text-align:right"> %</td>
</tr>

====================
<tr>
<td style="border-right:solid 1px">E.ON AG</td><td style="border-right:solid 1px">
<table width="100%">
<col width="50%"/>
<col width="30%"/>
<col width="20%"/>
<tbody>
<tr>
<td colspan="3">E.ON - Platz 1 40479</td>
</tr>
<tr>
<td>Dusseldorf</td><td>
        </td><td>DEN</td>
</tr>
</tbody>
</table>
</td><td style="border-right:solid 1px">
<table width="100%">
<col span="2" width="50%"/>
<tbody>
<tr>
<td colspan="2">City Dusseldorf</td>
</tr>
<tr>
<td>State 
        </td><td>Country DEN</td>
</tr>
</tbody>
</table>
</td><td style="border-right:solid 1px">0</td><td style="text-align:right"> %</td>
</tr>

====================

In [286]:
# Print the fifth cell of each row; a rule follows every cell and every row.
for r in rows:
    for e in r.xpath('td[5]'):
        print(etree.tostring(e))
        print("=" * 20)
    print("=" * 20)


<td style="text-align:right"> %</td>

====================
====================
<td style="text-align:right"> %</td>

====================
====================

In [287]:
# `r` is still the last row from the preceding loop; print its full markup.
print(etree.tostring(r))


<tr>
<td style="border-right:solid 1px">E.ON AG</td><td style="border-right:solid 1px">
<table width="100%">
<col width="50%"/>
<col width="30%"/>
<col width="20%"/>
<tbody>
<tr>
<td colspan="3">E.ON - Platz 1 40479</td>
</tr>
<tr>
<td>Dusseldorf</td><td>
        </td><td>DEN</td>
</tr>
</tbody>
</table>
</td><td style="border-right:solid 1px">
<table width="100%">
<col span="2" width="50%"/>
<tbody>
<tr>
<td colspan="2">City Dusseldorf</td>
</tr>
<tr>
<td>State 
        </td><td>Country DEN</td>
</tr>
</tbody>
</table>
</td><td style="border-right:solid 1px">0</td><td style="text-align:right"> %</td>
</tr>


In [288]:
import locale

Multiple inactive foreign entities (LD-2 question 28)


In [290]:
# Fetch the filing used to study several inactive foreign entities.
_params = {'event': 'getFilingDetails',
           'filingID': '55dd2926-23b4-489d-8132-b040cc6ddac5',
           'filingTypeID': 78}

resp = requests.get(_filing_detail_url, params=_params)
# Fail fast on an HTTP error instead of parsing the error page as a filing.
resp.raise_for_status()
multiple_inactive_foreign = etree.parse(StringIO(resp.text), html_parser)

In [291]:
# Entity names for question 28: each name is the text trailing a <span>.
# lxml reports a missing tail as None, so fall back to '' before stripping
# instead of raising AttributeError.  Unused slots still come back as ''.
[(e.tail or '').strip() for e in multiple_inactive_foreign.xpath(
    '//p[contains(.,"28. Name of each previously reported foreign entity")]'
    '/following-sibling::table[1]/tbody/tr/td/span')]


Out[291]:
['CNH Global N.V.',
 '',
 '',
 'Fiat Industrial S.p.A.',
 '',
 '',
 'Fiat Netherlands Holding N.V.',
 '',
 '',
 '',
 '',
 '']

In [292]:
# A mapping whose missing keys default to a fresh empty dict.
from collections import defaultdict, Counter
record = defaultdict(dict)

In [293]:
# Direct assignment stores the int as-is; the dict default factory is not involved.
record['a'] = 2

In [294]:
# Echo the defaultdict's repr.
record


Out[294]:
defaultdict(<type 'dict'>, {'a': 2})

In [295]:
# Serialize the defaultdict to JSON; the output shows it renders like a plain dict.
json.dumps(record)


Out[295]:
'{"a": 2}'

In [296]:
from glob import glob

In [297]:
# Cached Q2 report pages for 2008-2009 plus 2010-2019.  Despite the name,
# `dirs` holds .html file paths.
dirs = (glob(os.path.join(settings.CACHE_DIR, 'sopr_html/200[89]/Q2/*.html'))
        + glob(os.path.join(settings.CACHE_DIR, 'sopr_html/201[0-9]/Q2/*.html')))
len(dirs)


Out[297]:
41

In [298]:
# Tally cached files per year.  The year is the directory two levels above
# each file (sopr_html/<year>/<quarter>/<file>.html), so derive it relative to
# the file rather than by a fixed index into the absolute path, which breaks
# whenever CACHE_DIR sits at a different depth.
Counter([os.path.basename(os.path.dirname(os.path.dirname(i))) for i in dirs])


Out[298]:
Counter({'2014': 41})

Debug


In [306]:
from pymongo import mongo_client

In [308]:
# Connect using MongoClient's default connection settings and select the
# lobbying_federal_domestic database.
mc = mongo_client.MongoClient()
db = mc.lobbying_federal_domestic

In [311]:
# Fetch one house_ld2 document whose first `alis` entry has at least one
# lobbyist with a non-empty coveredPosition.
db.house_ld2.find_one({"LOBBYINGDISCLOSURE2.alis.0.lobbyists": 
                        {'$elemMatch': 
                         {'coveredPosition': {'$ne': ''}}}})


Out[311]:
{u'LOBBYINGDISCLOSURE2': {u'address1': u'4908 Cloister Drive',
  u'address2': u'',
  u'alis': [{u'federal_agencies': u'U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE, Natl Credit Union Administration (NCUA)',
    u'foreign_entity_issues': u'',
    u'issueAreaCode': u'FIN',
    u'lobbyists': [{u'coveredPosition': u'Director of Public and Congressional Affairs,',
      u'lobbyistFirstName': u'John',
      u'lobbyistLastName': u'McKechnie',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'National Credit Union Administration',
      u'lobbyistFirstName': u'(continued)',
      u'lobbyistLastName': u'(continued)',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''}],
    u'specific_issues': [u'H.R. 688 (Credit Union Small Business Job Creation Act)\nH.R. 719 (Capital Access for Small Businesses and Jobs Act)\nCooperative development']},
   {u'federal_agencies': u'U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE',
    u'foreign_entity_issues': u'',
    u'issueAreaCode': u'TAX',
    u'lobbyists': [{u'coveredPosition': u'Director of Public and Congressional Affairs,',
      u'lobbyistFirstName': u'John',
      u'lobbyistLastName': u'McKechnie',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'National Credit Union Administration',
      u'lobbyistFirstName': u'(continued)',
      u'lobbyistLastName': u'(continued)',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''}],
    u'specific_issues': [u'Cooperative taxation, cooperative development']},
   {u'federal_agencies': u'U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE, Agriculture - Dept of  (USDA)',
    u'foreign_entity_issues': u'',
    u'issueAreaCode': u'AGR',
    u'lobbyists': [{u'coveredPosition': u'Director of Public and Congressional Affairs,',
      u'lobbyistFirstName': u'John',
      u'lobbyistLastName': u'McKechnie',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'National Credit Union Administration',
      u'lobbyistFirstName': u'(continued)',
      u'lobbyistLastName': u'(continued)',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''}],
    u'specific_issues': [u'Agricultural market policies, cooperative development']},
   {u'federal_agencies': u'U.S. HOUSE OF REPRESENTATIVES, U.S. SENATE',
    u'foreign_entity_issues': u'',
    u'issueAreaCode': u'ENG',
    u'lobbyists': [{u'coveredPosition': u'Director of Public and Congressional Affairs,',
      u'lobbyistFirstName': u'John',
      u'lobbyistLastName': u'McKechnie',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'National Credit Union Administration',
      u'lobbyistFirstName': u'(continued)',
      u'lobbyistLastName': u'(continued)',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''},
     {u'coveredPosition': u'',
      u'lobbyistFirstName': u'',
      u'lobbyistLastName': u'',
      u'lobbyistNew': u'N',
      u'lobbyistSuffix': u''}],
    u'specific_issues': [u'Electric power generation, cooperative development']}],
  u'city': u'North Bethesda',
  u'clientGovtEntity': u'N',
  u'clientName': u'National Cooperative Business Association',
  u'country': u'USA',
  u'expenses': u'',
  u'expensesMethod': u'',
  u'firstName': u'',
  u'houseID': u'416800008',
  u'imported': u'N',
  u'income': u'',
  u'lastName': u'',
  u'noLobbying': u'',
  u'organizationName': u'John McKechnie LLC',
  u'pages': u'5',
  u'prefix': u'',
  u'principal_city': u'',
  u'principal_country': u'',
  u'principal_state': u'',
  u'principal_zip': u'',
  u'principal_zipext': u'',
  u'printedName': u'Mr. John McKechnie, Owner',
  u'registrantDifferentAddress': u'N',
  u'reportType': u'Q4',
  u'reportYear': u'2013',
  u'selfSelect': u'',
  u'senateID': u'400751249-90',
  u'signedDate': u'01/21/2014',
  u'state': u'MD',
  u'submitURL': u'',
  u'terminationDate': u'',
  u'updates': {u'affiliatedOrgs': [{u'affiliatedOrgAddress': u'',
     u'affiliatedOrgCity': u'',
     u'affiliatedOrgCountry': u'',
     u'affiliatedOrgName': u'',
     u'affiliatedOrgState': u'',
     u'affiliatedOrgZip': u'',
     u'affiliatedPrinOrgCity': u'',
     u'affiliatedPrinOrgCountry': u'',
     u'affiliatedPrinOrgState': u''},
    {u'affiliatedOrgAddress': u'',
     u'affiliatedOrgCity': u'',
     u'affiliatedOrgCountry': u'',
     u'affiliatedOrgName': u'',
     u'affiliatedOrgState': u'',
     u'affiliatedOrgZip': u'',
     u'affiliatedPrinOrgCity': u'',
     u'affiliatedPrinOrgCountry': u'',
     u'affiliatedPrinOrgState': u''}],
   u'affiliatedUrl': u'',
   u'clientAddress': u'',
   u'clientCity': u'',
   u'clientCountry': u'',
   u'clientState': u'',
   u'clientZip': u'',
   u'clientZipext': u'',
   u'foreignEntities': [{u'address': u'',
     u'city': u'',
     u'contribution': u'',
     u'country': u'',
     u'name': u'',
     u'ownership_Percentage': u'',
     u'prinCity': u'',
     u'prinCountry': u'',
     u'prinState': u'',
     u'state': u''}],
   u'generalDescription': u'',
   u'inactiveOrgs': [u'', u'', u''],
   u'inactive_ALIs': [u'', u'', u'', u'', u'', u'', u'', u'', u''],
   u'inactive_ForeignEntities': [u'', u'', u'', u'', u'', u''],
   u'inactive_lobbyists': [{u'firstName': u'',
     u'lastName': u'',
     u'suffix': u''},
    {u'firstName': u'', u'lastName': u'', u'suffix': u''},
    {u'firstName': u'', u'lastName': u'', u'suffix': u''},
    {u'firstName': u'', u'lastName': u'', u'suffix': u''}],
   u'prinClientCity': u'',
   u'prinClientCountry': u'',
   u'prinClientState': u'',
   u'prinClientZip': u'',
   u'prinClientZipext': u''},
  u'zip': u'20852',
  u'zipext': u''},
 u'_id': ObjectId('537a43ff6e95522a2a96f904'),
 u'doc_id': u'300624537'}

In [299]:
# Build the cached-file path from settings.CACHE_DIR (the same root the glob
# cell uses) instead of a machine-specific absolute path, so the cell runs on
# any checkout.
floc_template = os.path.join(settings.CACHE_DIR,
                             'sopr_html', '{year}', '{subyear}', '{id}.html')
floc = floc_template.format(year=2014,
                            subyear='Q1',
                            id='33eb46ef-55a7-4233-8685-f7427c057f41')

In [300]:
# Parse the cached filing; the context manager closes the file handle once
# lxml has finished reading, rather than leaving it open.
with open(floc) as fh:
    dbg = etree.parse(fh, parser=html_parser)

In [301]:
# Query the document parsed into `dbg`.  The original ran this xpath on `e`,
# a variable left over from an earlier cell, so the absolute path searched
# whatever tree `e` belonged to rather than the debug filing.
etree.tostring(dbg.xpath('//p[contains(., "15. General issue area")]'
                         '/following-sibling::p[1]')[0])
                         #'/tbody/tr[position() > 1]'
                         #'/td')[0])


Out[301]:
'<p>16. Specific lobbying issues</p>\n'

In [302]:
# Import explicitly so this cell does not depend on names left over from an
# earlier kernel session, then reload so that edits made to the modules since
# they were first imported are picked up.
from tasks import extract
from tasks import schema
reload(extract)
reload(schema)

# Partition the LD-2 schema: entries carrying a 'children' key are containers,
# everything else is a leaf element. List comprehensions are used so the
# results are always real lists (and therefore sliceable). The loop variable is
# underscore-prefixed because Python 2 comprehensions leak it into the notebook
# namespace.
ld2_containers = [_entry for _entry in schema.ld2_schema if 'children' in _entry]
ld2_elements = [_entry for _entry in schema.ld2_schema if 'children' not in _entry]

In [303]:
# Preview the first three container definitions of the LD-2 schema.
# NOTE(review): in the displayed output the 'removed_lobbyists' children list
# 'removed_lobbyist_last_name' for both td[2] and td[3]; the td[3] entry is
# presumably meant to be a suffix field -- confirm in tasks.schema.
ld2_containers[0:3]


Out[303]:
[{'children': [{'field': 'general_issue_area',
    'lda_question': '15',
    'parser': <function tasks.schema.split_keep_rightmost>,
    'path': '.',
    'section': 'lobbying_activities'},
   {'field': 'specific_issues',
    'lda_question': '16',
    'parser': <function tasks.schema.clean_text>,
    'path': 'following-sibling::p[2]',
    'section': 'lobbying_activities'},
   {'field': 'houses_and_agencies_none',
    'lda_question': '17',
    'parser': <function tasks.schema.checkbox_boolean>,
    'path': 'following-sibling::p[3]/span/input',
    'section': 'lobbying_activities'},
   {'field': 'houses_and_agencies',
    'lda_question': '17',
    'parser': <function tasks.schema.clean_text>,
    'path': 'following-sibling::p[4]',
    'section': 'lobbying_activities'},
   {'children': [{'field': 'lobbyist_first_name',
      'lda_question': '18',
      'parser': <function tasks.schema.clean_text>,
      'path': 'td[1]/div',
      'section': 'lobbying_activities'},
     {'field': 'lobbyist_last_name',
      'lda_question': '18',
      'parser': <function tasks.schema.clean_text>,
      'path': 'td[2]/div',
      'section': 'lobbying_activities'},
     {'field': 'lobbyist_suffix',
      'lda_question': '18',
      'parser': <function tasks.schema.clean_text>,
      'path': 'td[3]/div',
      'section': 'lobbying_activities'},
     {'field': 'lobbyist_covered_position',
      'lda_question': '18',
      'parser': <function tasks.schema.clean_text>,
      'path': 'td[4]/div',
      'section': 'lobbying_activities'},
     {'field': 'lobbyist_is_new',
      'lda_question': '18',
      'parser': <function tasks.schema.checkbox_boolean>,
      'path': 'td[5]/input',
      'section': 'lobbying_activities'}],
    'field': 'lobbyists',
    'lda_question': '18',
    'parser': <function tasks.schema.parse_array>,
    'path': 'following-sibling::table[1]/tbody/tr[position() > 1]',
    'section': 'lobbying_activities'},
   {'field': 'foreign_entity_interest_none',
    'lda_question': '19',
    'parser': <function tasks.schema.checkbox_boolean>,
    'path': 'following-sibling::p[6]/span/input',
    'section': 'lobbying_activities'},
   {'field': 'foreign_entity_interest',
    'lda_question': '19',
    'parser': <function tasks.schema.clean_text>,
    'path': 'following-sibling::p[7]',
    'section': 'lobbying_activities'}],
  'field': 'lobbying_activities',
  'lda_question': None,
  'parser': <function tasks.schema.parse_array>,
  'path': '//p[contains(.,"15. General issue area")]',
  'section': 'lobbying_activities'},
 {'children': [{'field': 'removed_lobbyist_first_name',
    'lda_question': 23,
    'parser': <function tasks.schema.clean_text>,
    'path': 'td[1]/div',
    'section': 'registration_update'},
   {'field': 'removed_lobbyist_last_name',
    'lda_question': 23,
    'parser': <function tasks.schema.clean_text>,
    'path': 'td[2]/div',
    'section': 'registration_update'},
   {'field': 'removed_lobbyist_last_name',
    'lda_question': 23,
    'parser': <function tasks.schema.clean_text>,
    'path': 'td[3]/div',
    'section': 'registration_update'}],
  'field': 'removed_lobbyists',
  'lda_question': 23,
  'parser': <function tasks.schema.parse_array>,
  'path': '//p[contains(.,"23. Name of each previously")]/following-sibling::table[1]/tbody/tr[position()>1]/td/table/tbody',
  'section': 'registration_update'},
 {'children': [{'field': 'issue_code',
    'lda_question': '24',
    'parser': <function tasks.schema.clean_text>,
    'path': '.',
    'section': 'lobbying_issues'}],
  'field': 'removed_lobbying_issues',
  'lda_question': 24,
  'parser': <function tasks.schema.parse_array>,
  'path': '//p[contains(.,"24. General lobbying issue")]/following-sibling::table[1]/tbody/tr/td/div',
  'section': 'registration_update'}]

In [304]:
# Run the extractor on the cached filing at `floc`, passing the leaf-element
# and container schema lists. The cell output shows two dates followed by the
# extracted record rendered as JSON.
extract.extract_html(floc, ld2_elements, ld2_containers)


03/31/2014
06/30/2014
{
  "lobbying_activities": {
    "lobbying_activities": [
      {
        "general_issue_area": "HCR", 
        "houses_and_agencies_none": false, 
        "specific_issues": "Implementation of PL 111-148, The Patient Protection and Affordable Care Act and the economic impact of budget cuts to biopharmaceuticals.", 
        "houses_and_agencies": "U.S. SENATE, U.S. HOUSE OF REPRESENTATIVES", 
        "foreign_entity_interest": "", 
        "lobbyists": [
          {
            "lobbyist_covered_position": "", 
            "lobbyist_is_new": false, 
            "lobbyist_first_name": "Sara", 
            "lobbyist_last_name": "Froelich", 
            "lobbyist_suffix": ""
          }
        ], 
        "foreign_entity_interest_none": true
      }
    ]
  }, 
  "identifiers": {
    "client_registrant_house_id": null, 
    "client_registrant_senate_id": null
  }, 
  "expenses": {
    "expense_amount": null, 
    "expense_method_a": false, 
    "expense_method_c": false, 
    "expense_method_b": false, 
    "expense_less_than_five_thousand": false, 
    "expense_five_thousand_or_more": false
  }, 
  "registration_update": {
    "client_new_ppb_city": "", 
    "client_new_ppb_zip": "", 
    "added_foreign_entities": [], 
    "added_affiliated_organizations": [], 
    "client_new_zip": "", 
    "client_new_city": "", 
    "client_new_ppb_country": "", 
    "client_new_general_description": "", 
    "removed_lobbyists": [], 
    "removed_lobbying_issues": [], 
    "client_new_address": "", 
    "client_new_country": "", 
    "client_new_state": "", 
    "removed_affiliated_organizations": [], 
    "client_new_ppb_state": ""
  }, 
  "client": {
    "client_name": null, 
    "client_self": null, 
    "client_state_or_local_government": null
  }, 
  "income": {
    "income_less_than_five_thousand": true, 
    "income_amount": null, 
    "income_five_thousand_or_more": false
  }, 
  "report": {
    "report_quarter_one": true, 
    "report_quarter_four": false, 
    "report_no_activity": false, 
    "report_year": "2014", 
    "report_quarter_two": false, 
    "report_termination_date": "2014-03-31T00:00:00", 
    "report_quarter_three": false, 
    "report_is_amendment": true, 
    "report_is_termination": true
  }, 
  "signature": {
    "signature_date": "2014-06-30T00:00:00", 
    "signature": "Digitally Signed By: Sara Froelich"
  }, 
  "registrant": {
    "registrant_name": "TKG Federal Affairs", 
    "registrant_zip": null, 
    "registrant_contact_email": null, 
    "registrant_address_one": null, 
    "self_employed_individual": false, 
    "registrant_ppb_city": null, 
    "registrant_ppb_zip": null, 
    "registrant_city": null, 
    "registrant_contact_name": null, 
    "registrant_state": null, 
    "registrant_ppb_state": null, 
    "registrant_contact_phone": null, 
    "registrant_country": null, 
    "registrant_address_two": null, 
    "organization_lobbying_firm": true, 
    "registrant_contact_name_prefix": null
  }, 
  "document_id": "33eb46ef-55a7-4233-8685-f7427c057f41"
}

In [305]:
# NOTE(review): scratch arithmetic; neither number is referenced by any other
# visible cell -- consider deleting this cell from the notebook.
2385.88 - 1153


Out[305]:
1232.88

In [305]: