In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

Imports


In [15]:
import copy

import requests

from validscrape.utils.data_munge import (clean_text, checkbox_boolean,
                                           parse_datetime, parse_date)
from validscrape import target
from validscrape import extract

Schema fragments copied from pupa


In [4]:
# Local copies of the fuzzy date schema fragments that would otherwise come from
#   pupa.scrape.schemas.common (fuzzy_date, fuzzy_datetime_blank)
# The patterns accept an optional 4-digit year followed by up to two "-NN" groups;
# the datetime variant additionally allows a trailing " HH:MM:SS".

fuzzy_date = dict(type="string", pattern="(^[0-9]{4})?(-[0-9]{2}){0,2}$")

# Same shape as fuzzy_date, with the "blank" flag switched on.
fuzzy_date_blank = dict(fuzzy_date, blank=True)

fuzzy_datetime_blank = dict(
    type="string",
    pattern="(^[0-9]{4})?(-[0-9]{2}){0,2}( [0-9]{2}:[0-9]{2}:[0-9]{2})?$",
    blank=True,
)

def pupa_date(parse_properties):
    """Return a fuzzy-date property schema extended with ``parse_properties``.

    The result starts as a deep copy of ``fuzzy_date`` so the shared template
    is never mutated; ``parse_properties`` (in this notebook, the ``path`` and
    ``parser`` entries) is then merged on top, winning on any overlapping key.
    """
    date_schema = copy.deepcopy(fuzzy_date)
    date_schema.update(parse_properties)
    return date_schema

def pupa_datetime_blank(parse_properties):
    """Return a blank-allowing fuzzy-datetime schema extended with ``parse_properties``.

    The result starts as a deep copy of ``fuzzy_datetime_blank`` so the shared
    template is never mutated; ``parse_properties`` (in this notebook, the
    ``path`` and ``parser`` entries) is then merged on top, winning on any
    overlapping key.
    """
    datetime_schema = copy.deepcopy(fuzzy_datetime_blank)
    datetime_schema.update(parse_properties)
    return datetime_schema

Reference data


In [5]:
# Filing-type reference table.
# Source: scrapers_us_federal: unitedstates.ref.sopr_lobbying_reference
#
# Each row below is (action, code, name) and is expanded into a dict with exactly
# those three keys, in that order. Codes are kept as strings.
FILING_TYPES = [
    {"action": action, "code": code, "name": name}
    for action, code, name in (
        # registrations
        ("registration", "1", "REGISTRATION"),
        ("registration_amendment", "2", "REGISTRATION AMENDMENT"),
        # mid-year / year-end filings
        ("report", "3", "MID-YEAR REPORT"),
        ("report", "4", "MID-YEAR (NO ACTIVITY)"),
        ("report_amendment", "5", "MID-YEAR AMENDMENT"),
        ("termination", "6", "MID-YEAR TERMINATION"),
        ("termination_letter", "7", "MID-YEAR TERMINATION LETTER"),
        ("termination_amendment", "8", "MID-YEAR TERMINATION AMENDMENT"),
        ("report", "9", "YEAR-END REPORT"),
        ("report", "10", "YEAR-END (NO ACTIVITY)"),
        ("report_amendment", "11", "YEAR-END AMENDMENT"),
        ("termination", "12", "YEAR-END TERMINATION"),
        ("termination_letter", "13", "YEAR-END TERMINATION LETTER"),
        ("termination_amendment", "14", "YEAR-END TERMINATION AMENDMENT"),
        ("termination", "15", "YEAR-END TERMINATION (NO ACTIVITY)"),
        ("termination", "16", "MID-YEAR TERMINATION (NO ACTIVITY)"),
        ("misc_termination", "17", "MISC TERM"),
        ("misc_document", "18", "MISC. DOC"),
        ("termination_amendment", "19", "MID-YEAR TERMINATION AMENDMENT (NO ACTIVITY)"),
        ("report_amendment", "20", "MID-YEAR AMENDMENT (NO ACTIVITY)"),
        ("report_amendment", "21", "YEAR-END AMENDMENT (NO ACTIVITY)"),
        ("termination_amendment", "22", "YEAR-END TERMINATION AMENDMENT (NO ACTIVITY)"),
        ("misc_update", "29", "UPDATE PAGE IN A REPORT"),
        # first quarter
        ("report", "51", "FIRST QUARTER REPORT"),
        ("report", "52", "FIRST QUARTER (NO ACTIVITY)"),
        ("termination", "53", "FIRST QUARTER TERMINATION"),
        ("termination", "54", "FIRST QUARTER TERMINATION (NO ACTIVITY)"),
        ("report_amendment", "55", "FIRST QUARTER AMENDMENT"),
        ("report_amendment", "56", "FIRST QUARTER AMENDMENT (NO ACTIVITY)"),
        ("termination_amendment", "57", "FIRST QUARTER TERMINATION AMENDMENT"),
        ("termination_amendment", "58", "FIRST QUARTER TERMINATION AMENDMENT (NO ACTIVITY)"),
        ("termination_letter", "59", "FIRST QUARTER TERMINATION LETTER"),
        # second quarter
        ("report", "60", "SECOND QUARTER REPORT"),
        ("report", "61", "SECOND QUARTER (NO ACTIVITY)"),
        ("termination", "62", "SECOND QUARTER TERMINATION"),
        ("termination", "63", "SECOND QUARTER TERMINATION (NO ACTIVITY)"),
        ("report_amendment", "64", "SECOND QUARTER AMENDMENT"),
        ("report_amendment", "65", "SECOND QUARTER AMENDMENT (NO ACTIVITY)"),
        ("termination_amendment", "66", "SECOND QUARTER TERMINATION AMENDMENT"),
        ("termination_amendment", "67", "SECOND QUARTER TERMINATION AMENDMENT (NO ACTIVITY)"),
        ("termination_letter", "68", "SECOND QUARTER TERMINATION LETTER"),
        # third quarter
        ("report", "69", "THIRD QUARTER REPORT"),
        ("report", "70", "THIRD QUARTER (NO ACTIVITY)"),
        ("termination", "71", "THIRD QUARTER TERMINATION"),
        ("termination", "72", "THIRD QUARTER TERMINATION (NO ACTIVITY)"),
        ("report_amendment", "73", "THIRD QUARTER AMENDMENT"),
        ("report_amendment", "74", "THIRD QUARTER AMENDMENT (NO ACTIVITY)"),
        ("termination_amendment", "75", "THIRD QUARTER TERMINATION AMENDMENT"),
        ("termination_amendment", "76", "THIRD QUARTER TERMINATION AMENDMENT (NO ACTIVITY)"),
        ("termination_letter", "77", "THIRD QUARTER TERMINATION LETTER"),
        # fourth quarter
        ("report", "78", "FOURTH QUARTER REPORT"),
        ("report", "79", "FOURTH QUARTER (NO ACTIVITY)"),
        ("termination", "80", "FOURTH QUARTER TERMINATION"),
        ("termination", "81", "FOURTH QUARTER TERMINATION (NO ACTIVITY)"),
        ("report_amendment", "82", "FOURTH QUARTER AMENDMENT"),
        ("report_amendment", "83", "FOURTH QUARTER AMENDMENT (NO ACTIVITY)"),
        ("termination_amendment", "84", "FOURTH QUARTER TERMINATION AMENDMENT"),
        ("termination_amendment", "85", "FOURTH QUARTER TERMINATION AMENDMENT (NO ACTIVITY)"),
        ("termination_letter", "86", "FOURTH QUARTER TERMINATION LETTER"),
    )
]

# General issue-area reference table.
#
# Each row below is (issue_code, description) and is expanded into a dict with
# exactly those two keys, in that order. The row order is kept as-is (it is not
# alphabetical).
GENERAL_ISSUE_CODES = [
    {"issue_code": issue_code, "description": description}
    for issue_code, description in (
        ("ACC", "Accounting"),
        ("CPI", "Computer Industry"),
        ("AER", "Aerospace"),
        ("REL", "Religion"),
        ("MIA", "Media (Information/Publishing)"),
        ("DOC", "District of Columbia"),
        ("CAW", "Clean Air & Water (Quality)"),
        ("CPT", "Copyright/Patent/Trademark"),
        ("ANI", "Animals"),
        ("TOB", "Tobacco"),
        ("FUE", "Fuel/Gas/Oil"),
        ("TOU", "Travel/Tourism"),
        ("CIV", "Civil Rights/Civil Liberties"),
        ("NAT", "Natural Resources"),
        ("BAN", "Banking"),
        ("BEV", "Beverage Industry"),
        ("AGR", "Agriculture"),
        ("DEF", "Defense"),
        ("CON", "Constitution"),
        ("MMM", "Medicare/Medicaid"),
        ("GOV", "Government Issues"),
        ("SCI", "Science/Technology"),
        ("URB", "Urban Development/Municipalities"),
        ("TAR", "Miscellaneous Tariff Bills"),
        ("COM", "Communications/Broadcasting/Radio/TV"),
        ("TAX", "Taxation/Internal Revenue Code"),
        ("TEC", "Telecommunications"),
        ("ROD", "Roads/Highway"),
        ("POS", "Postal"),
        ("RET", "Retirement"),
        ("TOR", "Torts"),
        ("GAM", "Gaming/Gambling/Casino"),
        ("SMB", "Small Business"),
        ("FAM", "Family Issues/Abortion/Adoption"),
        ("WAS", "Waste (hazardous/solid/interstate/nuclear)"),
        ("UTI", "Utilities"),
        ("DIS", "Disaster Planning/Emergencies"),
        ("WEL", "Welfare"),
        ("RRR", "Railroads"),
        ("BUD", "Budget/Appropriations"),
        ("MON", "Minting/Money/Gold Standard"),
        ("ADV", "Advertising"),
        ("VET", "Veterans"),
        ("HOM", "Homeland Security"),
        ("TRU", "Trucking/Shipping"),
        ("UNM", "Unemployment"),
        ("FOR", "Foreign Relations"),
        ("ENG", "Energy/Nuclear"),
        ("FIR", "Firearms/Guns/Ammunition"),
        ("EDU", "Education"),
        ("IMM", "Immigration"),
        ("CHM", "Chemicals/Chemical Industry"),
        ("TRD", "Trade (Domestic & Foreign)"),
        ("BNK", "Bankruptcy"),
        ("HCR", "Health Issues"),
        ("HOU", "Housing"),
        ("AUT", "Automotive Industry"),
        ("ENV", "Environmental/Superfund"),
        ("RES", "Real Estate/Land Use/Conservation"),
        ("FOO", "Food Industry (Safety, Labeling, etc.)"),
        ("FIN", "Financial Institutions/Investments/Securities"),
        ("CSP", "Consumer Issues/Safety/Protection"),
        ("MED", "Medical/Disease Research/Clinical Labs"),
        ("MAR", "Marine/Maritime/Boating/Fisheries"),
        ("ART", "Arts/Entertainment"),
        ("INT", "Intelligence and Surveillance"),
        ("APP", "Apparel/Clothing Industry/Textiles"),
        ("TRA", "Transportation"),
        ("ALC", "Alcohol & Drug Abuse"),
        ("INS", "Insurance"),
        ("CDT", "Commodities (Big Ticket)"),
        ("LBR", "Labor Issues/Antitrust/Workplace"),
        ("AVI", "Aviation/Aircraft/Airlines"),
        ("ECN", "Economics/Economic Development"),
        ("IND", "Indian/Native American Affairs"),
        ("SPO", "Sports/Athletics"),
        ("LAW", "Law Enforcement/Crime/Criminal Justice"),
        ("PHA", "Pharmacy"),
        ("MAN", "Manufacturing"),
    )
]

Schemas


In [6]:
# Flat list of the three-letter issue codes, in table order; used as the "enum" of
# allowed values for general_issue_area in ld1_schema.
sopr_general_issue_codes = [entry['issue_code'] for entry in GENERAL_ISSUE_CODES]

LD1


In [7]:
# Schema for LD-1 lobbying registration filings; attached to
# LobbyingRegistrationTarget and consumed by the HTML schema extractor.
#
# Alongside the JSON-Schema-style keys ("type", "format", "enum", "properties",
# "items"), entries carry extraction hints:
#   'path'   -- an XPath expression locating the value in the filing page
#   'parser' -- one of the validscrape.utils.data_munge helpers (or
#               parse_datetime); presumably applied to whatever 'path' matches
# Absolute paths select tables by position (table[2] ... table[20]), so they are
# tied to the exact layout of the filing page.
# NOTE(review): the precise meaning of "blank", "missing" and "even_odd" is defined
# by validscrape/pupa and is not visible here -- confirm before changing them.
# NOTE(review): unlike post_employment_schema, no "object_path" is set; the example
# run further down logs "no object_path specified ... assuming doc root".
ld1_schema = {
    "title": "Lobbying Registration",
    "description": "Lobbying Disclosure Act of 1995 (Section 4)",
    "type": "object",
    "properties": {
        # Bookkeeping entry: no 'path'/'parser', so nothing is scraped for it here
        # (the sample record further down shows '_meta': {}).
        "_meta": {
            "type": "object",
            "properties": {
                "document_id": {
                    "type": "string",
                    "format": "uuid_hex",
                },
            }
        },
        # NOTE(review): this is the only absolute path that goes tbody/td without a
        # tr step -- confirm against the page markup that this is intentional.
        "affiliated_organizations_url": {
            "type": ["null", "string"],
            "format": "url_http",
            "missing": True,
            "blank": True,
            'path': '/html/body/table[15]/tbody/td[2]/div',
            'parser': clean_text
        },
        "signature": {
            "type": "string",
            "blank": False,
            'path': '/html/body/table[20]/tbody/tr/td[2]/div',
            'parser': clean_text
        },
        # Both datetimes reuse the fuzzy_datetime_blank template via
        # pupa_datetime_blank and add their own path/parser.
        "datetimes": {
            "type": "object",
            "properties": {
                "signature_date": pupa_datetime_blank({
                    'path': '/html/body/table[20]/tbody/tr/td[4]/div',
                    'parser': parse_datetime
                }),
                "effective_date": pupa_datetime_blank({
                    'path': '/html/body/table[2]/tbody/tr[1]/td[3]/div',
                    'parser': parse_datetime
                })
            }
        },
        # Three checkbox inputs in the first body-level div.
        "registration_type": {
            "type": "object",
            "properties": {
                "new_registrant": {
                    "type": "boolean",
                    'path': '/html/body/div[1]/input[1]',
                    'parser': checkbox_boolean
                },
                "new_client_for_existing_registrant": {
                    "type": "boolean",
                    'path': '/html/body/div[1]/input[2]',
                    'parser': checkbox_boolean
                },
                "is_amendment": {
                    "type": "boolean",
                    'path': '/html/body/div[1]/input[3]',
                    'parser': checkbox_boolean
                }
            }
        },
        # Registrant block: tables 2-8 plus the second body-level div.
        "registrant": {
            "type": "object",
            "properties": {
                "organization_or_lobbying_firm": {
                    "type": "boolean",
                    'path': '/html/body/p[3]/input[1]',
                    'parser': checkbox_boolean
                },
                "self_employed_individual": {
                    "type": "boolean",
                    'path': '/html/body/p[3]/input[2]',
                    'parser': checkbox_boolean
                },
                # Org name and prefix are located by label text (the cell following
                # the one containing "Organization" / "Prefix"); first/last name
                # below use fixed cell positions in the same table.
                "registrant_org_name": {
                    "type": ["null", "string"],
                    'path': '/html/body/table[3]/tbody/tr/td[contains(.,"Organization")]/following-sibling::td[1]/div',
                    'parser': clean_text,
                    'missing': True,
                },
                "registrant_individual_prefix": {
                    "type": ["null", "string"],
                    'path': '/html/body/table[3]/tbody/tr/td[contains(.,"Prefix")]/following-sibling::td[1]/div',
                    'parser': clean_text,
                    'missing': True,
                },
                "registrant_individual_firstname": {
                    "type": ["null", "string"],
                    'path': '/html/body/table[3]/tbody/tr/td[5]/div',
                    'parser': clean_text,
                    'missing': True,
                },
                "registrant_individual_lastname": {
                    "type": ["null", "string"],
                    'path': '/html/body/table[3]/tbody/tr/td[7]/div',
                    'parser': clean_text,
                    'missing': True,
                },
                "registrant_address_one": {
                    "type": "string",
                    'path': '/html/body/table[4]/tbody/tr/td[2]/div',
                    'parser': clean_text
                },
                "registrant_address_two": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[4]/tbody/tr/td[4]/div',
                    'parser': clean_text
                },
                "registrant_city": {
                    "type": "string",
                    'path': '/html/body/table[5]/tbody/tr/td[2]/div',
                    'parser': clean_text
                },
                "registrant_state": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[5]/tbody/tr/td[4]/div',
                    'parser': clean_text
                },
                "registrant_zip": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[5]/tbody/tr/td[6]/div',
                    'parser': clean_text
                },
                "registrant_country": {
                    "type": "string",
                    'path': '/html/body/table[5]/tbody/tr/td[8]/div',
                    'parser': clean_text
                },
                # "ppb" fields (table 6) -- presumably "principal place of
                # business"; TODO confirm the abbreviation.
                "registrant_ppb_city": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[6]/tbody/tr/td[2]/div',
                    'parser': clean_text
                },
                "registrant_ppb_state": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[6]/tbody/tr/td[4]/div',
                    'parser': clean_text
                },
                "registrant_ppb_zip": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[6]/tbody/tr/td[6]/div',
                    'parser': clean_text
                },
                "registrant_ppb_country": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[6]/tbody/tr/td[8]/div',
                    'parser': clean_text
                },
                "registrant_international_phone": {
                    "type": "boolean",
                    'path': '/html/body/table[7]/tbody/tr/td[2]/input',
                    'parser': checkbox_boolean
                },
                "registrant_contact_name": {
                    "type": "string",
                    'path': '/html/body/table[8]/tbody/tr/td[2]/div',
                    'parser': clean_text
                },
                "registrant_contact_phone": {
                    "type": "string",
                    'path': '/html/body/table[8]/tbody/tr/td[4]/div',
                    'parser': clean_text
                },
                "registrant_contact_email": {
                    "type": "string",
                    "format": "email",
                    'path': '/html/body/table[8]/tbody/tr/td[6]/div',
                    'parser': clean_text
                },
                "registrant_general_description": {
                    "type": "string",
                    'path': '/html/body/div[2]',
                    'parser': clean_text
                },
                # House and Senate IDs sit in the second row of table 2 (the first
                # row of that table holds effective_date, see "datetimes").
                "registrant_house_id": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[2]/tbody/tr[2]/td[2]/div',
                    'parser': clean_text
                },
                "registrant_senate_id": {
                    "type": "string",
                    'path': '/html/body/table[2]/tbody/tr[2]/td[5]/div',
                    'parser': clean_text
                }
            }
        },
        # Client block: tables 9-11 plus the third body-level div.
        "client": {
            "type": "object",
            "properties": {
                "client_self": {
                    "type": "boolean",
                    'path': '/html/body/p[4]/input',
                    'parser': checkbox_boolean
                },
                "client_name": {
                    "type": "string",
                    'path': '/html/body/table[9]/tbody/tr[1]/td[2]/div',
                    'parser': clean_text
                },
                "client_general_description": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/div[3]',
                    'parser': clean_text
                },
                "client_address": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[9]/tbody/tr[2]/td[2]/div',
                    'parser': clean_text
                },
                "client_city": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[10]/tbody/tr/td[2]/div',
                    'parser': clean_text
                },
                "client_state": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[10]/tbody/tr/td[4]/div',
                    'parser': clean_text
                },
                "client_zip": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[10]/tbody/tr/td[6]/div',
                    'parser': clean_text
                },
                "client_country": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[10]/tbody/tr/td[8]/div',
                    'parser': clean_text
                },
                "client_ppb_city": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[11]/tbody/tr/td[2]/div',
                    'parser': clean_text
                },
                "client_ppb_state": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[11]/tbody/tr/td[4]/div',
                    'parser': clean_text
                },
                "client_ppb_zip": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[11]/tbody/tr/td[6]/div',
                    'parser': clean_text
                },
                "client_ppb_country": {
                    "type": "string",
                    "blank": True,
                    'path': '/html/body/table[11]/tbody/tr/td[8]/div',
                    'parser': clean_text
                }
            }
        },
        "lobbying_issues_detail": {
            "type": "string",
            "blank": True,
            'path': '/html/body/p[10]',
            'parser': clean_text
        },
        # Array entries: the outer 'path' selects the table body and the item-level
        # 'path' is relative to it. Here every td/div under table 13 is one item,
        # and the item's own node ('.') is the issue code. Unfilled cells come back
        # as '' in the sample record further down, which "blank": True allows.
        "lobbying_issues": {
            "type": "array",
            'even_odd': False,
            'path': '/html/body/table[13]/tbody',
            "items": {
                "type": "object",
                "path": "tr//td/div",
                "properties": {
                    "general_issue_area": {
                        "type": ["string"],
                        "enum": sopr_general_issue_codes,
                        'path': '.',
                        'parser': clean_text,
                        'blank': True
                    }
                }
            }
        },
        # 'even_odd': True arrays tag each property as "even" or "odd" --
        # presumably one logical record spans a pair of table rows and the tag says
        # which row of the pair holds the field; TODO confirm in validscrape.
        # Rows 1-3 of table 16 are skipped by the item path.
        "affiliated_organizations": {
            "type": "array",
            'even_odd': True,
            'path': '/html/body/table[16]/tbody',
            "items": {
                "type": "object",
                'path': 'tr[position() > 3]',
                'missing': True,
                "properties": {
                    "affiliated_organization_name": {
                        "type": "string",
                        "even_odd": "even",
                        'path': 'td[1]/div',
                        'parser': clean_text
                    },
                    "affiliated_organization_address": {
                        "type": "string",
                        "even_odd": "even",
                        'path': 'td[2]/div',
                        'parser': clean_text
                    },
                    "affiliated_organization_city": {
                        "type": "string",
                        "even_odd": "odd",
                        'path': 'td[2]/table/tbody/tr/td[1]/div',
                        'parser': clean_text
                    },
                    "affiliated_organization_state": {
                        "type": "string",
                        "blank": True,
                        "even_odd": "odd",
                        'path': 'td[2]/table/tbody/tr/td[2]/div',
                        'parser': clean_text
                    },
                    "affiliated_organization_zip": {
                        "type": "string",
                        "blank": True,
                        "even_odd": "odd",
                        'path': 'td[2]/table/tbody/tr/td[3]/div',
                        'parser': clean_text
                    },
                    "affiliated_organization_country": {
                        "type": "string",
                        "even_odd": "odd",
                        'path': 'td[2]/table/tbody/tr/td[4]/div',
                        'parser': clean_text
                    },
                    # NOTE(review): ppb_state and ppb_city use the same path and
                    # differ only in even_odd ("odd" vs "even") -- confirm.
                    "affiliated_organization_ppb_state": {
                        "type": "string",
                        "blank": True,
                        "even_odd": "odd",
                        'path': 'td[3]/table/tbody/tr/td[2]/div',
                        'parser': clean_text
                    },
                    "affiliated_organization_ppb_city": {
                        "type": "string",
                        "blank": True,
                        "even_odd": "even",
                        'path': 'td[3]/table/tbody/tr/td[2]/div',
                        'parser': clean_text
                    },
                    "affiliated_organization_ppb_country": {
                        "type": "string",
                        "blank": True,
                        "even_odd": "odd",
                        'path': 'td[3]/table/tbody/tr/td[4]/div',
                        'parser': clean_text
                    }
                }
            }
        },
        # Yes/no checkboxes for whether foreign entities are reported; the entries
        # themselves come from table 19 below.
        'foreign_entities_no': {
            'type': 'boolean',
            'path': '/html/body/table[17]/tbody/tr/td[1]/input',
            'parser': checkbox_boolean
        },
        'foreign_entities_yes': {
            'type': 'boolean',
            'path': '/html/body/table[17]/tbody/tr/td[3]/input',
            'parser': checkbox_boolean
        },
        "foreign_entities": {
            "type": "array",
            'even_odd': True,
            'path': '/html/body/table[19]/tbody',
            'missing': True,
            "items": {
                "type": "object",
                "path": "tr",
                'missing': True,
                "properties": {
                    # NOTE(review): the name is tagged "odd" here but "even" for
                    # affiliated_organization_name above, while both addresses are
                    # "even" -- confirm which row really holds the name.
                    "foreign_entity_name": {
                        "type": "string",
                        "even_odd": "odd",
                        'path': 'td[1]/div',
                        'parser': clean_text
                    },
                    "foreign_entity_address": {
                        "type": "string",
                        "even_odd": "even",
                        'path': 'td[2]/div',
                        'parser': clean_text
                    },
                    "foreign_entity_city": {
                        "type": "string",
                        "even_odd": "odd",
                        'path': 'td[2]/table/tbody/tr/td[1]/div',
                        'parser': clean_text
                    },
                    "foreign_entity_state": {
                        "type": "string",
                        "even_odd": "odd",
                        "blank": True,
                        'path': 'td[2]/table/tbody/tr/td[2]/div',
                        'parser': clean_text
                    },
                    "foreign_entity_country": {
                        "type": "string",
                        "even_odd": "odd",
                        'path': 'td[2]/table/tbody/tr/td[3]/div',
                        'parser': clean_text
                    },
                    # NOTE(review): as with affiliated organizations, ppb_city and
                    # ppb_state use the same path and differ only in even_odd.
                    "foreign_entity_ppb_city": {
                        "type": "string",
                        "even_odd": "even",
                        "blank": True,
                        'path': 'td[3]/table/tbody/tr/td[2]/div',
                        'parser': clean_text
                    },
                    "foreign_entity_ppb_state": {
                        "type": "string",
                        "even_odd": "odd",
                        "blank": True,
                        'path': 'td[3]/table/tbody/tr/td[2]/div',
                        'parser': clean_text
                    },
                    "foreign_entity_ppb_country": {
                        "type": "string",
                        "even_odd": "odd",
                        "blank": True,
                        'path': 'td[3]/table/tbody/tr/td[4]/div',
                        'parser': clean_text
                    },
                    "foreign_entity_amount": {
                        "type": "string",
                        "even_odd": "odd",
                        "blank": True,
                        'path': 'td[4]/div',
                        'parser': clean_text
                    },
                    "foreign_entity_ownership_percentage": {
                        "type": "string",
                        "even_odd": "odd",
                        "blank": True,
                        'path': 'td[5]/div',
                        'parser': clean_text
                    }
                }
            }
        },
        # One item per row of table 12 after the first two rows. Cells map as
        # td[1] first name, td[2] last name, td[3] suffix, td[4] covered position.
        "lobbyists": {
            "type": "array",
            'path': '/html/body/table[12]/tbody',
            "items": {
                "type": "object",
                "path": "tr[position() > 2]",
                "properties": {
                    "lobbyist_suffix": {
                        "type": "string",
                        "blank": True,
                        'path': 'td[3]',
                        'parser': clean_text
                    },
                    "lobbyist_first_name": {
                        "type": "string",
                        'path': 'td[1]',
                        'parser': clean_text
                    },
                    "lobbyist_last_name": {
                        "type": "string",
                        'path': 'td[2]',
                        "blank": True,
                        'parser': clean_text
                    },
                    "lobbyist_covered_official_position": {
                        "type": "string",
                        "blank": True,
                        'path': 'td[4]',
                        'parser': clean_text
                    }
                }
            }
        },
    }
}

House Post-Employment


In [8]:
# Schema for House post-employment lobbying restriction records; attached to
# PostEmploymentTarget and consumed by the XML schema extractor.
#
# "object_path" names the /PostEmployment/Employee elements; the per-field 'path'
# values are bare element names, presumably resolved relative to each matched
# Employee element -- TODO confirm in validscrape.
post_employment_schema = {
    "title": "House Post-Employment Lobbying Restriction",
    "description": "Lobbying restriction reported by the House Clerk's Office",
    "type": "object",
    "object_path": "/PostEmployment/Employee",
    "properties": {
        # Bookkeeping entry: no 'path'/'parser', so nothing is scraped for it here.
        "_meta": {
            "type": "object",
            "properties": {
                "document_id": {
                    "type": "string",
                    "format": "uuid_hex",
                },
            }
        },
        "employee_name": {
            "type": "string",
            'path': 'EmployeeName',
            'parser': clean_text,
        },
        "office_name": {
            "type": ["string"],
            'path': 'OfficeName',
            'parser': clean_text,
        },
        # Both dates reuse the fuzzy_date template via pupa_date and add their own
        # path/parser.
        "termination_date": pupa_date({
            'path': 'TerminationDate',
            'parser': parse_date
        }),
        "lobbying_eligibility_date": pupa_date({
            'path': 'LobbyingEligibilityDate',
            'parser': parse_date
        }),
    }
}

Validscrape Setup

Targets


In [49]:
class LobbyingRegistrationTarget(target.Target):
    """Target bound to ``ld1_schema`` (the lobbying registration schema)."""
    schema = ld1_schema

class PostEmploymentTarget(target.Target):
    """Target bound to ``post_employment_schema`` (House post-employment records)."""
    schema = post_employment_schema

Extractors


In [80]:
# One extractor per target class: the registration filing is fetched as an
# HTML page, the post-employment list is read from an XML file.
lobbying_registration_extractor = extract.HTMLSchemaExtractor(LobbyingRegistrationTarget)

postemployment_extractor = extract.XMLSchemaExtractor(PostEmploymentTarget)

Extracting

Registration (HTML)


In [81]:
# Example filing-details page used to exercise the registration extractor.
# filingTypeID=1 presumably corresponds to code "1" (REGISTRATION) in FILING_TYPES.
ld1_eg = 'http://soprweb.senate.gov/index.cfm?event=getFilingDetails&filingID=e031bb00-861b-4121-b3d6-e609e3afe62b&filingTypeID=1'

In [82]:
# Fetch the example filing page. The timeout stops the cell from hanging
# indefinitely on an unresponsive server, and raise_for_status() fails here on
# a 4xx/5xx reply instead of passing an error page on to the extractor.
resp = requests.get(ld1_eg, timeout=30)
resp.raise_for_status()

In [83]:
# Check the payload type: resp.content (bytes) is what gets passed to do_extract below.
type(resp.content)


Out[83]:
bytes

In [84]:
from io import BytesIO

In [89]:
# Collect every target the HTML extractor yields for the fetched page.
r_targets = list(lobbying_registration_extractor.do_extract(resp.content))


WARNING:extractor:no object_path specified for <property object at 0x7f6e0c4b9c28> schema, assuming doc root

In [90]:
# Display the extracted registration targets.
r_targets


Out[90]:
[<__main__.LobbyingRegistrationTarget at 0x7f6e0c4c6898>]

In [91]:
# Take the first extracted registration target. The list built above is named
# `r_targets`; the bare name `targets` is not defined in this notebook and
# would raise NameError on a fresh kernel.
r_target = r_targets[0]

In [92]:
# Show the record extracted for the registration filing.
r_target.record


Out[92]:
{'_meta': {},
 'affiliated_organizations': [],
 'affiliated_organizations_url': '',
 'client': {'client_address': '210 E. Third Street - Suite 100',
  'client_city': 'Royal Oak',
  'client_country': 'USA',
  'client_general_description': 'Private, web-based inter-pharmacy marketplace',
  'client_name': 'MatchRx',
  'client_ppb_city': '',
  'client_ppb_country': '',
  'client_ppb_state': '',
  'client_ppb_zip': '',
  'client_self': False,
  'client_state': 'MI',
  'client_zip': '48067'},
 'datetimes': {'effective_date': '2015-04-30 00:00:00',
  'signature_date': '2015-06-02 14:58:14'},
 'foreign_entities': [],
 'foreign_entities_no': True,
 'foreign_entities_yes': False,
 'lobbying_issues': [{'general_issue_area': 'HCR'},
  {'general_issue_area': ''},
  {'general_issue_area': ''},
  {'general_issue_area': ''},
  {'general_issue_area': ''},
  {'general_issue_area': ''},
  {'general_issue_area': ''},
  {'general_issue_area': ''},
  {'general_issue_area': ''}],
 'lobbying_issues_detail': 'Implementation of Drug Quality and Security Act',
 'lobbyists': [{'lobbyist_covered_official_position': 'Hlth pol advsr, Chrmn Harkin, Sen Comm on Hlth Ed Lbr & Pnsns, Mar 2013 - Apr 3, 2014',
   'lobbyist_first_name': 'Nate',
   'lobbyist_last_name': 'Brown',
   'lobbyist_suffix': ''}],
 'registrant': {'organization_or_lobbying_firm': True,
  'registrant_address_one': '1333 NEW HAMPSHIRE AVE., NW',
  'registrant_address_two': '',
  'registrant_city': 'WASHINGTON',
  'registrant_contact_email': 'mstosik@akingump.com',
  'registrant_contact_name': 'Ms. Margaret A. Stosik',
  'registrant_contact_phone': '2028874117',
  'registrant_country': 'USA',
  'registrant_general_description': 'Law firm',
  'registrant_house_id': '31784',
  'registrant_individual_firstname': None,
  'registrant_individual_lastname': None,
  'registrant_individual_prefix': None,
  'registrant_international_phone': False,
  'registrant_org_name': 'AKIN GUMP STRAUSS HAUER & FELD',
  'registrant_ppb_city': '',
  'registrant_ppb_country': '',
  'registrant_ppb_state': '',
  'registrant_ppb_zip': '',
  'registrant_senate_id': '682',
  'registrant_state': 'DC',
  'registrant_zip': '20036',
  'self_employed_individual': False},
 'registration_type': {'is_amendment': False,
  'new_client_for_existing_registrant': True,
  'new_registrant': False},
 'signature': 'Digitally Signed By: Margaret A. Stosik'}

Post-Employment


In [93]:
# NOTE(review): absolute, machine-specific path -- consider a configurable data dir.
with open('/home/blannon/og_data/post-employment/house/PostEmployment.xml') as xml_file:
    # Build the list inside the `with` block, in case do_extract reads the file lazily.
    pe_targets = list(postemployment_extractor.do_extract(xml_file))

In [95]:
# Preview only the first ten targets rather than dumping the whole list.
pe_targets[:10]


Out[95]:
[<__main__.PostEmploymentTarget at 0x7f6e1c15b400>,
 <__main__.PostEmploymentTarget at 0x7f6e1c15bef0>,
 <__main__.PostEmploymentTarget at 0x7f6e2305e438>,
 <__main__.PostEmploymentTarget at 0x7f6e2305ef98>,
 <__main__.PostEmploymentTarget at 0x7f6e2305e5f8>,
 <__main__.PostEmploymentTarget at 0x7f6e2305e588>,
 <__main__.PostEmploymentTarget at 0x7f6e23056400>,
 <__main__.PostEmploymentTarget at 0x7f6e23056160>,
 <__main__.PostEmploymentTarget at 0x7f6e23056518>,
 <__main__.PostEmploymentTarget at 0x7f6e23056438>]

In [96]:
# Take the first post-employment target for inspection.
pe_target = pe_targets[0]

In [97]:
# Show the record extracted for that first post-employment target.
pe_target.record


Out[97]:
{'_meta': {},
 'employee_name': 'BACO-SANCHEZ, LUIS, E',
 'lobbying_eligibility_date': '2008-12-05',
 'office_name': 'FORTUNO, LUIS G.',
 'termination_date': '2007-12-05'}

In [ ]: