In [1]:
import os
import settings
import pandas as pd
In [2]:
filings = pd.read_csv(os.path.join(settings.data_dir, "filings.csv"))
In [3]:
filings.head()
Out[3]:
In [4]:
monetary_sql_template = """
SELECT *
FROM calaccess_processed_form460scheduleaitem
WHERE filing_id IN ({})
"""
In [5]:
monetary_sql = monetary_sql_template.format(
",".join(map(str, list(filings.calaccess_filing_id)))
)
In [6]:
contribs = pd.read_sql_query(monetary_sql, settings.db_connection)
In [7]:
trimmed_contribs = contribs.rename(columns={
'filing_id': 'calaccess_filing_id',
'id': 'ccdc_contrib_id',
'transaction_id': 'calaccess_contrib_id'
}).drop([
'line_item',
'date_received_thru',
'transaction_type',
'memo_reference_number',
'cumulative_ytd_amount',
'cumulative_election_amount',
], axis=1)
In [8]:
trimmed_contribs.head()
Out[8]:
In [9]:
merged = pd.merge(
filings,
trimmed_contribs,
how="inner",
on="calaccess_filing_id"
)
In [10]:
unwanted_columns = [
'from_date',
'thru_date',
'total_contributions',
'total_expenditures_made',
'ccdc_contrib_id',
'date_filed',
'calaccess_contrib_id',
'contributor_code',
'contributor_committee_id',
'contributor_title',
'contributor_name_suffix',
'intermediary_committee_id',
'intermediary_title',
'intermediary_lastname',
'intermediary_firstname',
'intermediary_name_suffix',
'intermediary_city',
'intermediary_state',
'intermediary_zip',
'intermediary_employer',
'intermediary_occupation',
'intermediary_is_self_employed'
]
In [11]:
trimmed_merged = merged.drop(unwanted_columns, axis=1)
In [12]:
trimmed_merged.head()
Out[12]:
In [13]:
trimmed_merged.to_csv(
os.path.join(settings.data_dir, 'contributions.csv'),
index=False
)
In [14]:
recent_filings = pd.read_csv(os.path.join(settings.data_dir, "filings-2016.csv"))
In [15]:
recent_contribs = trimmed_merged[trimmed_merged.calaccess_filing_id.isin(recent_filings.calaccess_filing_id)]
In [16]:
recent_contribs.head(10)
Out[16]:
In [17]:
recent_contribs.to_csv(
os.path.join(settings.data_dir, 'contributions-2016.csv'),
index=False
)