In [1]:
%load_ext autoreload
In [2]:
%autoreload 2
In [15]:
import copy
import requests
from validscrape.utils.data_munge import (clean_text, checkbox_boolean,
parse_datetime, parse_date)
from validscrape import target
from validscrape import extract
In [4]:
# Fuzzy date/datetime schema fragments, defined locally in place of the
# (disabled) import:
#   from pupa.scrape.schemas.common import fuzzy_date, fuzzy_datetime_blank
# Each fragment is a string type constrained by a regex pattern.
fuzzy_date = dict(
    type="string",
    pattern="(^[0-9]{4})?(-[0-9]{2}){0,2}$",
)

# Same constraint as fuzzy_date, with the extra "blank" flag set.
fuzzy_date_blank = dict(fuzzy_date, blank=True)

# Datetime variant: the date part may be followed by " HH:MM:SS".
fuzzy_datetime_blank = dict(
    type="string",
    pattern="(^[0-9]{4})?(-[0-9]{2}){0,2}( [0-9]{2}:[0-9]{2}:[0-9]{2})?$",
    blank=True,
)
def pupa_date(parse_properties):
    """Return a new schema dict: ``fuzzy_date`` plus ``parse_properties``.

    The module-level ``fuzzy_date`` template is deep-copied first, so it is
    never mutated. Entries in ``parse_properties`` are then merged in with
    ``dict.update`` and override template keys of the same name.
    """
    schema = copy.deepcopy(fuzzy_date)
    schema.update(parse_properties)
    return schema
def pupa_datetime_blank(parse_properties):
    """Return a new schema dict: ``fuzzy_datetime_blank`` plus ``parse_properties``.

    The module-level ``fuzzy_datetime_blank`` template is deep-copied first,
    so it is never mutated. Entries in ``parse_properties`` are then merged
    in with ``dict.update`` and override template keys of the same name.
    """
    schema = copy.deepcopy(fuzzy_datetime_blank)
    schema.update(parse_properties)
    return schema
In [5]:
# scrapers_us_federal: unitedstates.ref.sopr_lobbying_reference
#
# Filing-type reference table. Each row below is (code, action, name); the
# rows are expanded into dicts with the keys "action", "code" and "name"
# (in that order). Codes are kept as strings.
FILING_TYPES = [
    {"action": action, "code": code, "name": name}
    for code, action, name in (
        ("1", "registration", "REGISTRATION"),
        ("2", "registration_amendment", "REGISTRATION AMENDMENT"),
        ("3", "report", "MID-YEAR REPORT"),
        ("4", "report", "MID-YEAR (NO ACTIVITY)"),
        ("5", "report_amendment", "MID-YEAR AMENDMENT"),
        ("6", "termination", "MID-YEAR TERMINATION"),
        ("7", "termination_letter", "MID-YEAR TERMINATION LETTER"),
        ("8", "termination_amendment", "MID-YEAR TERMINATION AMENDMENT"),
        ("9", "report", "YEAR-END REPORT"),
        ("10", "report", "YEAR-END (NO ACTIVITY)"),
        ("11", "report_amendment", "YEAR-END AMENDMENT"),
        ("12", "termination", "YEAR-END TERMINATION"),
        ("13", "termination_letter", "YEAR-END TERMINATION LETTER"),
        ("14", "termination_amendment", "YEAR-END TERMINATION AMENDMENT"),
        ("15", "termination", "YEAR-END TERMINATION (NO ACTIVITY)"),
        ("16", "termination", "MID-YEAR TERMINATION (NO ACTIVITY)"),
        ("17", "misc_termination", "MISC TERM"),
        ("18", "misc_document", "MISC. DOC"),
        ("19", "termination_amendment", "MID-YEAR TERMINATION AMENDMENT (NO ACTIVITY)"),
        ("20", "report_amendment", "MID-YEAR AMENDMENT (NO ACTIVITY)"),
        ("21", "report_amendment", "YEAR-END AMENDMENT (NO ACTIVITY)"),
        ("22", "termination_amendment", "YEAR-END TERMINATION AMENDMENT (NO ACTIVITY)"),
        ("29", "misc_update", "UPDATE PAGE IN A REPORT"),
        # Quarterly filings: first quarter
        ("51", "report", "FIRST QUARTER REPORT"),
        ("52", "report", "FIRST QUARTER (NO ACTIVITY)"),
        ("53", "termination", "FIRST QUARTER TERMINATION"),
        ("54", "termination", "FIRST QUARTER TERMINATION (NO ACTIVITY)"),
        ("55", "report_amendment", "FIRST QUARTER AMENDMENT"),
        ("56", "report_amendment", "FIRST QUARTER AMENDMENT (NO ACTIVITY)"),
        ("57", "termination_amendment", "FIRST QUARTER TERMINATION AMENDMENT"),
        ("58", "termination_amendment", "FIRST QUARTER TERMINATION AMENDMENT (NO ACTIVITY)"),
        ("59", "termination_letter", "FIRST QUARTER TERMINATION LETTER"),
        # Second quarter
        ("60", "report", "SECOND QUARTER REPORT"),
        ("61", "report", "SECOND QUARTER (NO ACTIVITY)"),
        ("62", "termination", "SECOND QUARTER TERMINATION"),
        ("63", "termination", "SECOND QUARTER TERMINATION (NO ACTIVITY)"),
        ("64", "report_amendment", "SECOND QUARTER AMENDMENT"),
        ("65", "report_amendment", "SECOND QUARTER AMENDMENT (NO ACTIVITY)"),
        ("66", "termination_amendment", "SECOND QUARTER TERMINATION AMENDMENT"),
        ("67", "termination_amendment", "SECOND QUARTER TERMINATION AMENDMENT (NO ACTIVITY)"),
        ("68", "termination_letter", "SECOND QUARTER TERMINATION LETTER"),
        # Third quarter
        ("69", "report", "THIRD QUARTER REPORT"),
        ("70", "report", "THIRD QUARTER (NO ACTIVITY)"),
        ("71", "termination", "THIRD QUARTER TERMINATION"),
        ("72", "termination", "THIRD QUARTER TERMINATION (NO ACTIVITY)"),
        ("73", "report_amendment", "THIRD QUARTER AMENDMENT"),
        ("74", "report_amendment", "THIRD QUARTER AMENDMENT (NO ACTIVITY)"),
        ("75", "termination_amendment", "THIRD QUARTER TERMINATION AMENDMENT"),
        ("76", "termination_amendment", "THIRD QUARTER TERMINATION AMENDMENT (NO ACTIVITY)"),
        ("77", "termination_letter", "THIRD QUARTER TERMINATION LETTER"),
        # Fourth quarter
        ("78", "report", "FOURTH QUARTER REPORT"),
        ("79", "report", "FOURTH QUARTER (NO ACTIVITY)"),
        ("80", "termination", "FOURTH QUARTER TERMINATION"),
        ("81", "termination", "FOURTH QUARTER TERMINATION (NO ACTIVITY)"),
        ("82", "report_amendment", "FOURTH QUARTER AMENDMENT"),
        ("83", "report_amendment", "FOURTH QUARTER AMENDMENT (NO ACTIVITY)"),
        ("84", "termination_amendment", "FOURTH QUARTER TERMINATION AMENDMENT"),
        ("85", "termination_amendment", "FOURTH QUARTER TERMINATION AMENDMENT (NO ACTIVITY)"),
        ("86", "termination_letter", "FOURTH QUARTER TERMINATION LETTER"),
    )
]
# General issue-area reference table. Each row below is
# (issue_code, description); the rows are expanded into dicts with the keys
# "issue_code" and "description" (in that order). Row order is preserved.
GENERAL_ISSUE_CODES = [
    {"issue_code": issue_code, "description": description}
    for issue_code, description in (
        ("ACC", "Accounting"),
        ("CPI", "Computer Industry"),
        ("AER", "Aerospace"),
        ("REL", "Religion"),
        ("MIA", "Media (Information/Publishing)"),
        ("DOC", "District of Columbia"),
        ("CAW", "Clean Air & Water (Quality)"),
        ("CPT", "Copyright/Patent/Trademark"),
        ("ANI", "Animals"),
        ("TOB", "Tobacco"),
        ("FUE", "Fuel/Gas/Oil"),
        ("TOU", "Travel/Tourism"),
        ("CIV", "Civil Rights/Civil Liberties"),
        ("NAT", "Natural Resources"),
        ("BAN", "Banking"),
        ("BEV", "Beverage Industry"),
        ("AGR", "Agriculture"),
        ("DEF", "Defense"),
        ("CON", "Constitution"),
        ("MMM", "Medicare/Medicaid"),
        ("GOV", "Government Issues"),
        ("SCI", "Science/Technology"),
        ("URB", "Urban Development/Municipalities"),
        ("TAR", "Miscellaneous Tariff Bills"),
        ("COM", "Communications/Broadcasting/Radio/TV"),
        ("TAX", "Taxation/Internal Revenue Code"),
        ("TEC", "Telecommunications"),
        ("ROD", "Roads/Highway"),
        ("POS", "Postal"),
        ("RET", "Retirement"),
        ("TOR", "Torts"),
        ("GAM", "Gaming/Gambling/Casino"),
        ("SMB", "Small Business"),
        ("FAM", "Family Issues/Abortion/Adoption"),
        ("WAS", "Waste (hazardous/solid/interstate/nuclear)"),
        ("UTI", "Utilities"),
        ("DIS", "Disaster Planning/Emergencies"),
        ("WEL", "Welfare"),
        ("RRR", "Railroads"),
        ("BUD", "Budget/Appropriations"),
        ("MON", "Minting/Money/Gold Standard"),
        ("ADV", "Advertising"),
        ("VET", "Veterans"),
        ("HOM", "Homeland Security"),
        ("TRU", "Trucking/Shipping"),
        ("UNM", "Unemployment"),
        ("FOR", "Foreign Relations"),
        ("ENG", "Energy/Nuclear"),
        ("FIR", "Firearms/Guns/Ammunition"),
        ("EDU", "Education"),
        ("IMM", "Immigration"),
        ("CHM", "Chemicals/Chemical Industry"),
        ("TRD", "Trade (Domestic & Foreign)"),
        ("BNK", "Bankruptcy"),
        ("HCR", "Health Issues"),
        ("HOU", "Housing"),
        ("AUT", "Automotive Industry"),
        ("ENV", "Environmental/Superfund"),
        ("RES", "Real Estate/Land Use/Conservation"),
        ("FOO", "Food Industry (Safety, Labeling, etc.)"),
        ("FIN", "Financial Institutions/Investments/Securities"),
        ("CSP", "Consumer Issues/Safety/Protection"),
        ("MED", "Medical/Disease Research/Clinical Labs"),
        ("MAR", "Marine/Maritime/Boating/Fisheries"),
        ("ART", "Arts/Entertainment"),
        ("INT", "Intelligence and Surveillance"),
        ("APP", "Apparel/Clothing Industry/Textiles"),
        ("TRA", "Transportation"),
        ("ALC", "Alcohol & Drug Abuse"),
        ("INS", "Insurance"),
        ("CDT", "Commodities (Big Ticket)"),
        ("LBR", "Labor Issues/Antitrust/Workplace"),
        ("AVI", "Aviation/Aircraft/Airlines"),
        ("ECN", "Economics/Economic Development"),
        ("IND", "Indian/Native American Affairs"),
        ("SPO", "Sports/Athletics"),
        ("LAW", "Law Enforcement/Crime/Criminal Justice"),
        ("PHA", "Pharmacy"),
        ("MAN", "Manufacturing"),
    )
]
In [6]:
# Flat list of the issue-code strings, in table order; used as the "enum"
# for general_issue_area in ld1_schema.
sopr_general_issue_codes = list(
    entry['issue_code'] for entry in GENERAL_ISSUE_CODES
)
In [7]:
# Schema for a lobbying registration filing page.
#
# Standard JSON-Schema-style keys ("type", "properties", "items", "enum",
# "format") are mixed with extraction hints on each field:
#   'path'   - XPath-style locator string; absolute for top-level fields,
#              relative for fields nested inside array "items".
#   'parser' - a callable imported from validscrape.utils.data_munge.
# NOTE(review): the exact meaning of "blank", "missing" and "even_odd" is
# defined by validscrape's extractor/validator, which is not shown in this
# notebook -- confirm there before changing any of those flags.
ld1_schema = {
    "title": "Lobbying Registration",
    "description": "Lobbying Disclosure Act of 1995 (Section 4)",
    "type": "object",
    "properties": {
        # Document metadata; has no 'path'/'parser', so presumably it is not
        # scraped from the page -- TODO confirm where it is populated.
        "_meta": {
            "type": "object",
            "properties": {
                "document_id": {
                    "type": "string",
                    "format": "uuid_hex",
                },
            }
        },
        # NOTE(review): this is the only table path that goes tbody/td with
        # no "tr" step in between -- verify against the page markup.
        "affiliated_organizations_url": {
            "type": ["null", "string"],
            "format": "url_http",
            "missing": True,
            "blank": True,
            'path': '/html/body/table[15]/tbody/td[2]/div',
            'parser': clean_text
        },
        "signature": {
            "type": "string",
            "blank": False,
            'path': '/html/body/table[20]/tbody/tr/td[2]/div',
            'parser': clean_text
        },
        # Both datetimes start from the fuzzy_datetime_blank template.
        "datetimes": {
            "type": "object",
            "properties": {
                "signature_date": pupa_datetime_blank({
                    'path': '/html/body/table[20]/tbody/tr/td[4]/div',
                    'parser': parse_datetime
                }),
                "effective_date": pupa_datetime_blank({
                    'path': '/html/body/table[2]/tbody/tr[1]/td[3]/div',
                    'parser': parse_datetime
                })
            }
        },
        # Three checkbox inputs in the first <div> of the body.
        "registration_type": {
            "type": "object",
            "properties": {
                "new_registrant": {
                    "type": "boolean",
                    'path': '/html/body/div[1]/input[1]',
                    'parser': checkbox_boolean
                },
                "new_client_for_existing_registrant": {
                    "type": "boolean",
                    'path': '/html/body/div[1]/input[2]',
                    'parser': checkbox_boolean
                },
                "is_amendment": {
                    "type": "boolean",
                    'path': '/html/body/div[1]/input[3]',
                    'parser': checkbox_boolean
                }
            }
        },
        # Registrant details, read from body tables 2 through 8.
        "registrant": {
            "type": "object",
            "properties": {
                "organization_or_lobbying_firm": {
                    "type": "boolean",
                    'path': '/html/body/p[3]/input[1]',
                    'parser': checkbox_boolean
                },
                "self_employed_individual": {
                    "type": "boolean",
                    'path': '/html/body/p[3]/input[2]',
                    'parser': checkbox_boolean
                },
                # Org name and prefix are located by label text rather than
                # by cell position; first/last name below use fixed positions.
                "registrant_org_name": {
                    "type": ["null", "string"],
                    'path': '/html/body/table[3]/tbody/tr/td[contains(.,"Organization")]/following-sibling::td[1]/div',
                    'parser': clean_text,
                    'missing': True,
                },
                "registrant_individual_prefix": {
                    "type": ["null", "string"],
                    'path': '/html/body/table[3]/tbody/tr/td[contains(.,"Prefix")]/following-sibling::td[1]/div',
                    'parser': clean_text,
                    'missing': True,
                },
                "registrant_individual_firstname": {
                    "type": ["null", "string"],
                    'path': '/html/body/table[3]/tbody/tr/td[5]/div',
                    'parser': clean_text,
                    'missing': True,
                },
                "registrant_individual_lastname": {
                    "type": ["null", "string"],
                    'path': '/html/body/table[3]/tbody/tr/td[7]/div',
                    'parser': clean_text,
                    'missing': True,
                },
                "registrant_address_one": {
                    "type": "string",
                    'path': '/html/body/table[4]/tbody/tr/td[2]/div',
                    'parser': clean_text
                },
                "registrant_address_two": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[4]/tbody/tr/td[4]/div',
                    'parser': clean_text
                },
                "registrant_city": {
                    "type": "string",
                    'path': '/html/body/table[5]/tbody/tr/td[2]/div',
                    'parser': clean_text
                },
                "registrant_state": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[5]/tbody/tr/td[4]/div',
                    'parser': clean_text
                },
                "registrant_zip": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[5]/tbody/tr/td[6]/div',
                    'parser': clean_text
                },
                "registrant_country": {
                    "type": "string",
                    'path': '/html/body/table[5]/tbody/tr/td[8]/div',
                    'parser': clean_text
                },
                # "ppb" fields: presumably "principal place of business" --
                # TODO confirm against the form.
                "registrant_ppb_city": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[6]/tbody/tr/td[2]/div',
                    'parser': clean_text
                },
                "registrant_ppb_state": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[6]/tbody/tr/td[4]/div',
                    'parser': clean_text
                },
                "registrant_ppb_zip": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[6]/tbody/tr/td[6]/div',
                    'parser': clean_text
                },
                "registrant_ppb_country": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[6]/tbody/tr/td[8]/div',
                    'parser': clean_text
                },
                "registrant_international_phone": {
                    "type": "boolean",
                    'path': '/html/body/table[7]/tbody/tr/td[2]/input',
                    'parser': checkbox_boolean
                },
                "registrant_contact_name": {
                    "type": "string",
                    'path': '/html/body/table[8]/tbody/tr/td[2]/div',
                    'parser': clean_text
                },
                "registrant_contact_phone": {
                    "type": "string",
                    'path': '/html/body/table[8]/tbody/tr/td[4]/div',
                    'parser': clean_text
                },
                "registrant_contact_email": {
                    "type": "string",
                    "format": "email",
                    'path': '/html/body/table[8]/tbody/tr/td[6]/div',
                    'parser': clean_text
                },
                "registrant_general_description": {
                    "type": "string",
                    'path': '/html/body/div[2]',
                    'parser': clean_text
                },
                "registrant_house_id": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[2]/tbody/tr[2]/td[2]/div',
                    'parser': clean_text
                },
                "registrant_senate_id": {
                    "type": "string",
                    'path': '/html/body/table[2]/tbody/tr[2]/td[5]/div',
                    'parser': clean_text
                }
            }
        },
        # Client details, read from body tables 9 through 11.
        "client": {
            "type": "object",
            "properties": {
                "client_self": {
                    "type": "boolean",
                    'path': '/html/body/p[4]/input',
                    'parser': checkbox_boolean
                },
                "client_name": {
                    "type": "string",
                    'path': '/html/body/table[9]/tbody/tr[1]/td[2]/div',
                    'parser': clean_text
                },
                "client_general_description": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/div[3]',
                    'parser': clean_text
                },
                "client_address": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[9]/tbody/tr[2]/td[2]/div',
                    'parser': clean_text
                },
                "client_city": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[10]/tbody/tr/td[2]/div',
                    'parser': clean_text
                },
                "client_state": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[10]/tbody/tr/td[4]/div',
                    'parser': clean_text
                },
                "client_zip": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[10]/tbody/tr/td[6]/div',
                    'parser': clean_text
                },
                "client_country": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[10]/tbody/tr/td[8]/div',
                    'parser': clean_text
                },
                "client_ppb_city": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[11]/tbody/tr/td[2]/div',
                    'parser': clean_text
                },
                "client_ppb_state": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[11]/tbody/tr/td[4]/div',
                    'parser': clean_text
                },
                "client_ppb_zip": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[11]/tbody/tr/td[6]/div',
                    'parser': clean_text
                },
                "client_ppb_country": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[11]/tbody/tr/td[8]/div',
                    'parser': clean_text
                }
            }
        },
        "lobbying_issues_detail": {
            "type": "string",
            "blank": True,
            'path': '/html/body/p[10]',
            'parser': clean_text
        },
        # Array field: the outer 'path' selects the container, and the
        # "items" 'path' selects one node per element relative to it.
        "lobbying_issues": {
            "type": "array",
            'even_odd': False,
            'path': '/html/body/table[13]/tbody',
            "items": {
                "type": "object",
                "path": "tr//td/div",
                "properties": {
                    # Values are restricted to the codes listed in
                    # GENERAL_ISSUE_CODES (via sopr_general_issue_codes).
                    # NOTE(review): 'blank' is True, yet "" is not in the
                    # enum -- confirm how the validator combines the two.
                    "general_issue_area": {
                        "type": ["string"],
                        "enum": sopr_general_issue_codes,
                        'path': '.',
                        'parser': clean_text,
                        'blank': True
                    }
                }
            }
        },
        # Array with 'even_odd': True at array level; each property is then
        # tagged "even" or "odd". Presumably each record spans a pair of
        # table rows and the tag picks which row of the pair holds the
        # field -- TODO confirm in validscrape.
        "affiliated_organizations": {
            "type": "array",
            'even_odd': True,
            'path': '/html/body/table[16]/tbody',
            "items": {
                "type": "object",
                'path': 'tr[position() > 3]',
                'missing': True,
                "properties": {
                    "affiliated_organization_name": {
                        "type": "string",
                        "even_odd": "even",
                        'path': 'td[1]/div',
                        'parser': clean_text
                    },
                    "affiliated_organization_address": {
                        "type": "string",
                        "even_odd": "even",
                        'path': 'td[2]/div',
                        'parser': clean_text
                    },
                    "affiliated_organization_city": {
                        "type": "string",
                        "even_odd": "odd",
                        'path': 'td[2]/table/tbody/tr/td[1]/div',
                        'parser': clean_text
                    },
                    "affiliated_organization_state": {
                        "type": "string",
                        "blank": True,
                        "even_odd": "odd",
                        'path': 'td[2]/table/tbody/tr/td[2]/div',
                        'parser': clean_text
                    },
                    "affiliated_organization_zip": {
                        "type": "string",
                        "blank": True,
                        "even_odd": "odd",
                        'path': 'td[2]/table/tbody/tr/td[3]/div',
                        'parser': clean_text
                    },
                    "affiliated_organization_country": {
                        "type": "string",
                        "even_odd": "odd",
                        'path': 'td[2]/table/tbody/tr/td[4]/div',
                        'parser': clean_text
                    },
                    # NOTE(review): ppb_state and ppb_city below use the
                    # same 'path' (.../td[2]/div), and ppb_city is tagged
                    # "even" while every other nested-table field here is
                    # "odd". One of the two looks like a copy/paste slip
                    # (city at td[1], "odd"?) -- verify against the page.
                    "affiliated_organization_ppb_state": {
                        "type": "string",
                        "blank": True,
                        "even_odd": "odd",
                        'path': 'td[3]/table/tbody/tr/td[2]/div',
                        'parser': clean_text
                    },
                    "affiliated_organization_ppb_city": {
                        "type": "string",
                        "blank": True,
                        "even_odd": "even",
                        'path': 'td[3]/table/tbody/tr/td[2]/div',
                        'parser': clean_text
                    },
                    "affiliated_organization_ppb_country": {
                        "type": "string",
                        "blank": True,
                        "even_odd": "odd",
                        'path': 'td[3]/table/tbody/tr/td[4]/div',
                        'parser': clean_text
                    }
                }
            }
        },
        # Yes/no checkbox pair in table 17.
        'foreign_entities_no': {
            'type': 'boolean',
            'path': '/html/body/table[17]/tbody/tr/td[1]/input',
            'parser': checkbox_boolean
        },
        'foreign_entities_yes': {
            'type': 'boolean',
            'path': '/html/body/table[17]/tbody/tr/td[3]/input',
            'parser': checkbox_boolean
        },
        # Same even/odd layout as affiliated_organizations, but the items
        # path takes every row ("tr") with no position() filter.
        "foreign_entities": {
            "type": "array",
            'even_odd': True,
            'path': '/html/body/table[19]/tbody',
            'missing': True,
            "items": {
                "type": "object",
                "path": "tr",
                'missing': True,
                "properties": {
                    # NOTE(review): name is tagged "odd" here but "even" in
                    # affiliated_organizations, while address is "even" in
                    # both -- confirm which row actually holds the name.
                    "foreign_entity_name": {
                        "type": "string",
                        "even_odd": "odd",
                        'path': 'td[1]/div',
                        'parser': clean_text
                    },
                    "foreign_entity_address": {
                        "type": "string",
                        "even_odd": "even",
                        'path': 'td[2]/div',
                        'parser': clean_text
                    },
                    "foreign_entity_city": {
                        "type": "string",
                        "even_odd": "odd",
                        'path': 'td[2]/table/tbody/tr/td[1]/div',
                        'parser': clean_text
                    },
                    "foreign_entity_state": {
                        "type": "string",
                        "even_odd": "odd",
                        "blank": True,
                        'path': 'td[2]/table/tbody/tr/td[2]/div',
                        'parser': clean_text
                    },
                    "foreign_entity_country": {
                        "type": "string",
                        "even_odd": "odd",
                        'path': 'td[2]/table/tbody/tr/td[3]/div',
                        'parser': clean_text
                    },
                    # NOTE(review): ppb_city and ppb_state below use the
                    # same 'path' (.../td[2]/div) and differ only in the
                    # even/odd tag -- the same suspected slip as in
                    # affiliated_organizations; verify against the page.
                    "foreign_entity_ppb_city": {
                        "type": "string",
                        "even_odd": "even",
                        "blank": True,
                        'path': 'td[3]/table/tbody/tr/td[2]/div',
                        'parser': clean_text
                    },
                    "foreign_entity_ppb_state": {
                        "type": "string",
                        "even_odd": "odd",
                        "blank": True,
                        'path': 'td[3]/table/tbody/tr/td[2]/div',
                        'parser': clean_text
                    },
                    "foreign_entity_ppb_country": {
                        "type": "string",
                        "even_odd": "odd",
                        "blank": True,
                        'path': 'td[3]/table/tbody/tr/td[4]/div',
                        'parser': clean_text
                    },
                    "foreign_entity_amount": {
                        "type": "string",
                        "even_odd": "odd",
                        "blank": True,
                        'path': 'td[4]/div',
                        'parser': clean_text
                    },
                    "foreign_entity_ownership_percentage": {
                        "type": "string",
                        "even_odd": "odd",
                        "blank": True,
                        'path': 'td[5]/div',
                        'parser': clean_text
                    }
                }
            }
        },
        # One item per row of table 12 after the first two rows
        # (presumably header rows -- TODO confirm).
        "lobbyists": {
            "type": "array",
            'path': '/html/body/table[12]/tbody',
            "items": {
                "type": "object",
                "path": "tr[position() > 2]",
                "properties": {
                    "lobbyist_suffix": {
                        "type": "string",
                        "blank": True,
                        'path': 'td[3]',
                        'parser': clean_text
                    },
                    "lobbyist_first_name": {
                        "type": "string",
                        'path': 'td[1]',
                        'parser': clean_text
                    },
                    "lobbyist_last_name": {
                        "type": "string",
                        'path': 'td[2]',
                        "blank": True,
                        'parser': clean_text
                    },
                    "lobbyist_covered_official_position": {
                        "type": "string",
                        "blank": True,
                        'path': 'td[4]',
                        'parser': clean_text
                    }
                }
            }
        },
    }
}
In [8]:
# Schema for one post-employment lobbying-restriction record.
# "object_path" and each field's "path" are locator strings consumed by the
# XML extractor; their interpretation lives in validscrape, not shown here.
post_employment_schema = {
    "title": "House Post-Employment Lobbying Restriction",
    "description": "Lobbying restriction reported by the House Clerk's Office",
    "type": "object",
    "object_path": "/PostEmployment/Employee",
    "properties": {
        "_meta": {
            "type": "object",
            "properties": {
                "document_id": {"type": "string", "format": "uuid_hex"},
            },
        },
        "employee_name": {
            "type": "string",
            "path": "EmployeeName",
            "parser": clean_text,
        },
        "office_name": {
            "type": ["string"],
            "path": "OfficeName",
            "parser": clean_text,
        },
        # Both dates start from the fuzzy_date template via pupa_date().
        "termination_date": pupa_date(
            {"path": "TerminationDate", "parser": parse_date}
        ),
        "lobbying_eligibility_date": pupa_date(
            {"path": "LobbyingEligibilityDate", "parser": parse_date}
        ),
    },
}
In [49]:
class LobbyingRegistrationTarget(target.Target):
    """Target subclass bound to ``ld1_schema`` (lobbying registration filings)."""

    schema = ld1_schema
class PostEmploymentTarget(target.Target):
    """Target subclass bound to ``post_employment_schema``."""

    schema = post_employment_schema
In [80]:
# One extractor per source format: the HTML extractor is paired with the
# registration target, the XML extractor with the post-employment target.
lobbying_registration_extractor = extract.HTMLSchemaExtractor(LobbyingRegistrationTarget)
postemployment_extractor = extract.XMLSchemaExtractor(PostEmploymentTarget)
In [81]:
# Example filing-details URL used to exercise the registration extractor.
# NOTE(review): filingTypeID=1 presumably corresponds to code "1"
# (REGISTRATION) in FILING_TYPES -- confirm the two share a code space.
ld1_eg = 'http://soprweb.senate.gov/index.cfm?event=getFilingDetails&filingID=e031bb00-861b-4121-b3d6-e609e3afe62b&filingTypeID=1'
In [82]:
# Fetch the example filing page. requests applies no timeout by default, so
# an explicit one keeps this cell from hanging indefinitely on a stalled
# server. raise_for_status() surfaces HTTP 4xx/5xx responses here instead of
# letting an error page flow into the extractor.
resp = requests.get(ld1_eg, timeout=30)
resp.raise_for_status()
In [83]:
# Show the type of the raw response body that is passed to do_extract().
type(resp.content)
Out[83]:
In [84]:
from io import BytesIO
In [89]:
# Run the HTML extractor over the raw response body and materialise every
# target it yields into a list.
r_targets = list(lobbying_registration_extractor.do_extract(resp.content))
In [90]:
# Display the extracted registration targets.
r_targets
Out[90]:
In [91]:
# Take the first extracted registration target for inspection.
# The list built from the HTML extractor is named r_targets; the bare name
# `targets` is never defined in this notebook, so it raised NameError on a
# fresh kernel.
r_target = r_targets[0]
In [92]:
# Display the record attribute of the first registration target.
r_target.record
Out[92]:
In [93]:
# Feed the post-employment XML file to the XML extractor and collect every
# target it yields.
# NOTE(review): the path is absolute and user-specific, so this cell only
# runs on the original author's machine -- consider a configurable data dir.
with open('/home/blannon/og_data/post-employment/house/PostEmployment.xml') as xml_file:
    pe_targets = list(postemployment_extractor.do_extract(xml_file))
In [95]:
# Display only the first ten post-employment targets to keep output short.
pe_targets[:10]
Out[95]:
In [96]:
# Take the first post-employment target for inspection.
pe_target = pe_targets[0]
In [97]:
# Display the record attribute of the first post-employment target.
pe_target.record
Out[97]:
In [ ]: