In [2]:
import json
import os
import re
import sys
import urlparse
sys.path.append(os.path.join(os.getcwd(),os.path.pardir))
from datetime import datetime, date, timedelta
from io import StringIO
import requests
from pyquery import PyQuery as pq
from lxml import etree
import settings
In [11]:
html_parser = etree.HTMLParser()
In [12]:
_filing_detail_url = 'http://soprweb.senate.gov/index.cfm?'
In [13]:
_params = {'event': 'getFilingDetails',
           'filingID': 'b931620a-16aa-4834-b798-e08b0a3bddf8',
           'filingTypeID': 1}
resp = requests.get(_filing_detail_url, params=_params)
In [18]:
parsed = etree.parse(StringIO(resp.text), html_parser)
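This GET-then-parse pair repeats for every filing examined below, so a small wrapper keeps the cells shorter. A minimal sketch (the helper name is hypothetical, not part of the codebase):

def fetch_filing(filing_id, filing_type_id=1):
    # hypothetical helper: fetch a filing detail page and parse it with lxml
    params = {'event': 'getFilingDetails',
              'filingID': filing_id,
              'filingTypeID': filing_type_id}
    resp = requests.get(_filing_detail_url, params=params)
    return etree.parse(StringIO(resp.text), html_parser)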
In [28]:
e = parsed.xpath('/html/body/table[3]/tbody/tr/'
                 'td[contains(.,"Prefix")]/following-sibling::td[1]/div')[0]
print etree.tostring(e)
In [214]:
_params = {'event': 'getFilingDetails',
           'filingID': 'b4c3bd67-7c7c-45e6-8b6c-5fd6b55eec3f',
           'filingTypeID': 1}
resp = requests.get(_filing_detail_url, params=_params)
In [216]:
parsed = etree.parse(StringIO(resp.text), html_parser)
parsed.xpath('/html/body/table[position() > 2 and position() < 10]/tbody/tr/*')
Out[216]:
In [217]:
parsed.xpath('/html/body/table[8]/tbody/tr/td[4]/div/text()')
Out[217]:
In [218]:
s = parsed.xpath('/html/body/table[8]/tbody/tr/td[4]/div/text()')[0]
s.replace(u'\xa0',u'')
Out[218]:
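The cell text comes back padded with non-breaking spaces (u'\xa0'), which is why the replace above is needed before comparing values. A hypothetical cleaner that also trims ordinary whitespace:

def clean_cell(s):
    # illustrative only: drop non-breaking spaces, then trim
    return s.replace(u'\xa0', u'').strip()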
In [219]:
parsed.xpath('/html/body/table[16]/tbody/tr[position() > 3]')
Out[219]:
In [220]:
_params = {'event': 'getFilingDetails',
           'filingID': 'C3A7E902-87A2-49FB-8D27-1D031F48DC12',
           'filingTypeID': 1}
resp = requests.get(_filing_detail_url, params=_params)
multiple_lobbyists = etree.parse(StringIO(resp.text), html_parser)
In [221]:
lobb_rows = multiple_lobbyists.xpath('/html/body/table[12]/tbody/tr[position() > 2]')
In [222]:
lobb_row = lobb_rows[0]
etree.tostring(lobb_row)
Out[222]:
In [223]:
lobb_row.xpath('td[1]')
Out[223]:
In [224]:
_params = {'event': 'getFilingDetails',
           'filingID': '3A144627-84A0-4190-81A8-B40718EA37EC',
           'filingTypeID': 1}
resp = requests.get(_filing_detail_url, params=_params)
multiple_issues = etree.parse(StringIO(resp.text), html_parser)
In [225]:
multiple_issues.xpath('/html/body/table[13]/tbody/tr/td/div')
Out[225]:
In [226]:
multiple_issues.xpath('/html/body/table[13]/tbody/tr/td/div/text()')
Out[226]:
In [229]:
_params = {'event': 'getFilingDetails',
           'filingID': 'C3A7E902-87A2-49FB-8D27-1D031F48DC12',
           'filingTypeID': 1}
resp = requests.get(_filing_detail_url, params=_params)
multiple_aff_orgs = etree.parse(StringIO(resp.text), html_parser)
In [230]:
multiple_aff_orgs.xpath('/html/body/table[16]/tbody/tr[position() > 3]')
Out[230]:
In [231]:
def _zip_odd_even(arr):
    return [(arr[i].xpath('td/div/text()'),
             arr[i+1].xpath('td[position() > 1]/table/tbody/tr/td/div/text()'))
            for i in xrange(0, len(arr), 2)]
_zip_odd_even(multiple_aff_orgs.xpath('/html/body/table[16]/tbody/tr[position() > 3]'))
Out[231]:
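The affiliated-organizations table spreads each record across two physical rows (a name row followed by a detail row), which is what the odd/even pairing above reassembles. A slightly more defensive variant (assuming a trailing unpaired row should simply be dropped):

def _zip_odd_even_safe(arr):
    # zip(arr[::2], arr[1::2]) silently drops a trailing unpaired row
    return [(a.xpath('td/div/text()'),
             b.xpath('td[position() > 1]/table/tbody/tr/td/div/text()'))
            for a, b in zip(arr[::2], arr[1::2])]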
In [232]:
row_eg = multiple_aff_orgs.xpath('/html/body/table[16]/tbody/tr[position() > 3]')[0]
In [233]:
row_eg.xpath('.//td/div/text()')
Out[233]:
In [234]:
for r in multiple_aff_orgs.xpath('/html/body/table[16]/tbody/tr[position() > 3]'):
    for d in r.xpath('td[3]/table/tbody/tr/td[4]/div/text()'):
        print d
    print '='*20
In [235]:
_params = {'event': 'getFilingDetails',
           'filingID': '3A144627-84A0-4190-81A8-B40718EA37EC',
           'filingTypeID': 1}
resp = requests.get(_filing_detail_url, params=_params)
multiple_foreign = etree.parse(StringIO(resp.text), html_parser)
In [236]:
for r in multiple_foreign.xpath('/html/body/table[19]/tbody/tr'):
    for d in r.xpath('td[5]/div/text()'):
        print d
    print '='*20
In [237]:
e = parsed.xpath('/html/body/div[1]/input[2]')[0]
In [238]:
'checked' in e.attrib
Out[238]:
In [239]:
s = parsed.xpath('/html/body/table[2]/tbody/tr[1]/td[3]/div/text()')[0]
s
Out[239]:
In [240]:
datetime.strptime(s, '%m/%d/%Y')
Out[240]:
In [241]:
filter(lambda x: x % 2, [1,2,3,4,5])
Out[241]:
In [242]:
os.path.basename('/home/blannon/1223_watzman_20131101.csv')
Out[242]:
In [243]:
os.extsep
Out[243]:
In [244]:
u = 'http://soprweb.senate.gov/index.cfm?event=getFilingDetails&filingTypeID=57&filingID=33eb46ef-55a7-4233-8685-f7427c057f41'
In [245]:
urlparse.parse_qsl(urlparse.urlparse(u).query)
Out[245]:
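Going the other way, the filing ID and type can be recovered from a detail URL, which is handy for naming cache files. A hedged sketch (the helper name is illustrative):

def filing_params(url):
    # illustrative: rebuild the query string as a dict and pull out the IDs
    qs = dict(urlparse.parse_qsl(urlparse.urlparse(url).query))
    return qs['filingID'], qs['filingTypeID']
filing_params(u)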
In [246]:
thing = etree.parse(open('../data/cache/sopr_html/2014/REG/a25b34ae-5ccb-409f-848d-f4fa008a06b4.html'), html_parser)
In [247]:
print etree.tostring(thing)
In [248]:
sys.path.append('..')
In [249]:
from tasks import extract
from tasks.schema import ld1_schema
In [250]:
elements = filter(lambda x: 'children' not in x, ld1_schema)
containers = filter(lambda x: 'children' in x, ld1_schema)
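The split works because container specs carry a 'children' key and leaf specs do not. The same partition in one pass (equivalent to the two filter() scans above, shown only for clarity):

def partition_schema(schema):
    # one pass instead of two filter() scans; behavior is identical
    elements, containers = [], []
    for spec in schema:
        (containers if 'children' in spec else elements).append(spec)
    return elements, containers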
In [251]:
elements
Out[251]:
In [252]:
reload(extract)
extract.extract_html('../data/cache/sopr_html/2014/REG/a25b34ae-5ccb-409f-848d-f4fa008a06b4.html',
                     elements,
                     containers)
In [253]:
"goo"
Out[253]:
In [254]:
dt = datetime(2014, 4, 1, 0, 0)
In [255]:
dt.isoformat()
Out[255]:
In [256]:
_params = {'event': 'getFilingDetails',
           'filingID': '80b956e1-3448-404a-bdfd-558ffe2631ce',
           'filingTypeID': 69}
resp = requests.get(_filing_detail_url, params=_params)
multiple_issues = etree.parse(StringIO(resp.text), html_parser)
In [257]:
print etree.tostring(multiple_issues.xpath('/html/body/table[4]/tbody/tr[5]/td[1]/table')[0])
In [258]:
for e in multiple_issues.xpath('//p[@style="page-break-before:always"]'):
    print etree.tostring(e)
    print "="*80
In [259]:
page_break = multiple_issues.xpath('//p[@style="page-break-before:always"]')[0]
In [260]:
#general_issue_code = multiple_issues.xpath('//p[@style="page-break-before:always"]/following-sibling::p[1]')[0]
general_issue_code = multiple_issues.xpath('//p[contains(.,"15. General issue area")]')[0]
print etree.tostring(general_issue_code)
In [261]:
[etree.tostring(e) for e in multiple_issues.xpath('//p[contains(.,"15. General issue area")]')]
Out[261]:
In [262]:
general_issue_code = page_break.getnext()
filler = general_issue_code.getnext()
specific_lobbying_issues = filler.getnext()
congress_agency_check = specific_lobbying_issues.getnext()
congress_agency_detail = congress_agency_check.getnext()
filler = congress_agency_detail.getnext()
lobbyists = filler.getnext()
foreign_entity_check = lobbyists.getnext()
foreign_entity_list = foreign_entity_check.getnext()
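Chaining getnext() like this works but hides the offsets of the filler paragraphs. A hypothetical alternative, assuming the nine elements after the page break always appear in this fixed order, gathers the sibling run once and unpacks it:

# assumes a fixed 9-element order after the page break (illustrative only)
run = page_break.xpath('following-sibling::*')[:9]
(general_issue_code, _filler, specific_lobbying_issues,
 congress_agency_check, congress_agency_detail, _filler2,
 lobbyists, foreign_entity_check, foreign_entity_list) = run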
In [263]:
fs = [general_issue_code,
      specific_lobbying_issues,
      congress_agency_check,
      congress_agency_detail,
      lobbyists,
      foreign_entity_check,
      foreign_entity_list]
for f in fs:
    print etree.tostring(f)
    print '='*20
In [264]:
[etree.tostring(e) for e in general_issue_code.xpath('../following-sibling::p')]
Out[264]:
In [265]:
print etree.tostring(general_issue_code.xpath(
    'following-sibling::p[7]')[0])
In [266]:
second_gic = multiple_issues.xpath('//p[@style="page-break-before:always"]/following-sibling::p[1]')[1]
for r in second_gic.xpath('following-sibling::table[1]/tbody/tr[position() > 1]'):
    print etree.tostring(r)
In [267]:
print etree.tostring(multiple_issues.xpath('//p[contains(.,"20.")]')[0])
In [268]:
found = multiple_issues.xpath('//p[contains(.,"23. Name of each previously")]'
                              '/following-sibling::table[1]'
                              '/tbody/tr[position()>1]/td/table'
                              '/tbody')
for e in found:
    print etree.tostring(e)
    print '='*30
In [269]:
resp.url
Out[269]:
In [270]:
len(multiple_issues.xpath('//p[contains(.,"24. General lobbying issue")]/following-sibling::table[1]/tbody/tr/td/div'))
Out[270]:
In [271]:
_params = {'event': 'getFilingDetails',
           'filingID': '42524728-28e1-424f-9608-2b4f05f7cd2b',
           'filingTypeID': 82}
resp = requests.get(_filing_detail_url, params=_params)
multiple_added_aff = etree.parse(StringIO(resp.text), html_parser)
In [272]:
rows = multiple_added_aff.xpath('//p[contains(.,"25. Add the following")]'
                                '/following-sibling::table[1]/tbody/tr')
rows
Out[272]:
In [273]:
for r in rows:
    for e in r.xpath('td[3]/table/tbody/tr[2]/td[2]'):
        print etree.tostring(e)
        #print etree.tostring(r)
    print "="*20
In [274]:
r.xpath('td[3]/table/tbody/tr[2]/td[2]')[0].text.split()
Out[274]:
In [275]:
r.getchildren()
Out[275]:
In [276]:
_params = {'event': 'getFilingDetails',
           'filingID': '2897035b-c56e-4d05-9a51-cab6a4b505f8',
           'filingTypeID': 53}
resp = requests.get(_filing_detail_url, params=_params)
multiple_removed_aff = etree.parse(StringIO(resp.text), html_parser)
In [277]:
table = multiple_removed_aff.xpath('//p[contains(.,"26. Name of each previously")]'
                                   '/following-sibling::table[1]')[0]
In [278]:
for e in table.xpath('tbody/tr/td'):
    print etree.tostring(e)
In [279]:
table.xpath('tbody/tr/td/span')
Out[279]:
In [280]:
e = table.xpath('tbody/tr/td/span')[0]
In [281]:
e.tail
Out[281]:
In [282]:
[e.tail.strip() for e in multiple_removed_aff.xpath('//p[contains(.,"26. Name of each previously")]'
                                                    '/following-sibling::table[1]/tbody/tr/td/span')]
Out[282]:
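This works because lxml stores text that follows an element (but still sits inside its parent) on .tail rather than .text, so the organization names ride along on the span tails. A tiny self-contained illustration:

frag = etree.fromstring('<td><span>1</span> ACME CORP</td>')
span = frag.find('span')
print span.text   # '1' (text inside the span)
print span.tail   # ' ACME CORP' (text after the span, inside the td)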
In [283]:
_params = {'event': 'getFilingDetails',
           'filingID': '6e8effc6-e1e3-413e-86c9-24eda20858f2',
           'filingTypeID': 60}
resp = requests.get(_filing_detail_url, params=_params)
multiple_added_foreign = etree.parse(StringIO(resp.text), html_parser)
In [284]:
rows = multiple_added_foreign.xpath('//p[contains(.,"27. Add the following foreign")]'
                                    '/following-sibling::table[1]/tbody/tr')
In [285]:
for r in rows:
    print etree.tostring(r)
    print "="*20
In [286]:
for r in rows:
    for e in r.xpath('td[5]'):
        print etree.tostring(e)
        print "="*20
    print "="*20
In [287]:
print etree.tostring(r)
In [288]:
import locale
In [290]:
_params = {'event': 'getFilingDetails',
           'filingID': '55dd2926-23b4-489d-8132-b040cc6ddac5',
           'filingTypeID': 78}
resp = requests.get(_filing_detail_url, params=_params)
multiple_inactive_foreign = etree.parse(StringIO(resp.text), html_parser)
In [291]:
[e.tail.strip() for e in multiple_inactive_foreign.xpath(
    '//p[contains(.,"28. Name of each previously reported foreign entity")]'
    '/following-sibling::table[1]/tbody/tr/td/span')]
Out[291]:
In [292]:
from collections import defaultdict, Counter
record = defaultdict(dict)
In [293]:
record['a'] = 2
In [294]:
record
Out[294]:
In [295]:
json.dumps(record)
Out[295]:
In [296]:
from glob import glob
In [297]:
dirs = glob(os.path.join(settings.CACHE_DIR, 'sopr_html/200[89]/Q2/*.html')) \
     + glob(os.path.join(settings.CACHE_DIR, 'sopr_html/201[0-9]/Q2/*.html'))
len(dirs)
Out[297]:
In [298]:
Counter([i.split('/')[9] for i in dirs])
Out[298]:
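The hard-coded i.split('/')[9] only works for one particular cache location. A path-relative version (hypothetical helper) grabs the year directory two levels above each file instead:

def year_of(path):
    # .../sopr_html/<year>/Q2/<id>.html -> <year>
    return os.path.basename(os.path.dirname(os.path.dirname(path)))
Counter(year_of(i) for i in dirs)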
In [306]:
from pymongo import mongo_client
In [308]:
mc = mongo_client.MongoClient()
db = mc.lobbying_federal_domestic
In [311]:
db.house_ld2.find_one({"LOBBYINGDISCLOSURE2.alis.0.lobbyists":
                       {'$elemMatch': {'coveredPosition': {'$ne': ''}}}})
Out[311]:
In [299]:
floc_template = '/home/blannon/dev/influence-usa/lobbying-federal-domestic/data/cache/sopr_html/{year}/{subyear}/{id}.html'
floc = floc_template.format(year=2014,
                            subyear='Q1',
                            id='33eb46ef-55a7-4233-8685-f7427c057f41')
In [300]:
dbg = etree.parse(open(floc), parser=html_parser)
In [301]:
# query the freshly parsed dbg tree (e still points into an earlier document)
etree.tostring(dbg.xpath('//p[contains(., "15. General issue area")]'
                         '/following-sibling::p[1]')[0])
#'/tbody/tr[position() > 1]'
#'/td')[0])
Out[301]:
In [302]:
#from tasks import extract
from tasks import schema  # the module itself is needed for reload() below
reload(extract)
reload(schema)
ld2_containers = filter(lambda x: 'children' in x, schema.ld2_schema)
ld2_elements = filter(lambda x: 'children' not in x, schema.ld2_schema)
In [303]:
ld2_containers[:3]
Out[303]:
In [304]:
extract.extract_html(floc, ld2_elements, ld2_containers)
In [305]:
2385.88 - 1153
Out[305]: