In [29]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
In [2]:
#unzip all data
In [2]:
sc_2012 = pd.read_excel('../new_data/DOE-SC_Grants_FY2012.xlsx')
usa_2012 = pd.read_csv('../new_data/2012_089_Assistance_Full_20190131_1.csv')
In [3]:
print len(sc_2012), len(usa_2012)
In [8]:
len(usa_2012.groupby('award_id_fain').count())
Out[8]:
In [9]:
len(sc_2012.groupby('Award Number').count())
Out[9]:
So USA has many more awards listed than the SC file. Maybe not all of the awards in USA are office of science?
In [24]:
usa_2012.columns
Out[24]:
In [5]:
usa_2012[usa_2012['total_loan_value'] == usa_2012['total_loan_value'].dropna().max()]['recipient_name']
Out[5]:
In [7]:
usa_2012['awarding_office_code']
Out[7]:
OK, so usaspending officially sucks -- it doesn't have any information below "DOE". Let's see if the contracts are equally bad.
In [8]:
usa_2012 = pd.read_csv('../new_data/2012_089_Contracts_Full_20190131_1.csv')
In [9]:
usa_2012.columns
Out[9]:
In [10]:
usa_2012['awarding_office_name']
Out[10]:
What about directly downloading grants from 2012?
In [12]:
usa_2012 = pd.read_csv('~/Downloads/all_assistance_prime_transactions_1 2.csv')
In [13]:
usa_2012['awarding_office_name']
Out[13]:
hmm, so the DATA act was only passed in 2014 -- maybe more recent data has this info?
In [4]:
usa = pd.read_csv('../new_data/2012_089_Contracts_Full_20190131_1.csv')
In [15]:
usa.columns.values
Out[15]:
In [23]:
print usa['action_date']
In [82]:
def clean_doe_contract_data():
print 'Generating DOE Contract data...'
contract_file_list = glob('../new_data/*089_Contracts*.csv')
contract_df_list = []
for contract_file in contract_file_list:
df = pd.read_csv(contract_file)
df['Fiscal Year'] = contract_file[:4]
contract_df_list.append(df)
fulldata = pd.concat(contract_df_list,ignore_index=True)
print len(fulldata)
sc_awarding_offices = ['CHICAGO SERVICE CENTER (OFFICE OF SCIENCE)',
'OAK RIDGE OFFICE (OFFICE OF SCIENCE)',
'SC CHICAGO SERVICE CENTER',
'SC OAK RIDGE OFFICE']
sc_funding_offices = ['CHICAGO SERVICE CENTER (OFFICE OF SCIENCE)',
'OAK RIDGE OFFICE (OFFICE OF SCIENCE)',
'SCIENCE',
'SC OAK RIDGE OFFICE',
'SC CHICAGO SERVICE CENTER'
]
sc_contracts = fulldata[(fulldata['awarding_office_name'].isin(
sc_awarding_offices)) | (fulldata['funding_office_name'].isin(sc_funding_offices))]
#Clean data
sc_contracts = sc_contracts[['award_id_piid', 'federal_action_obligation','recipient_name',
'primary_place_of_performance_state_code',
'primary_place_of_performance_congressional_district',
'product_or_service_code_description',
'Fiscal Year'
]]
sc_contracts = sc_contracts.rename(columns = {
'federal_action_obligation':'Amount ($)',
'award_id_piid' : 'award_id',
'recipient_name' : 'Vendor',
'primary_place_of_performance_state_code' : 'State',
'primary_place_of_performance_congressional_district' : 'District',
'product_or_service_code_description' : 'Item'
})
sc_contracts = sc_contracts.dropna(subset=['District'])
sc_contracts['District'] = sc_contracts['State'] + sc_contracts['District'].map(int).map(str).str.zfill(2)
sc_contracts.to_pickle('../cleaned_data/sc_contracts.pkl')
In [83]:
contracts = clean_doe_contract_data()
In [84]:
contracts_by_state = contracts.groupby(['State'])
contracts_by_district = contracts.groupby('District')
In [91]:
currentFY = 2019
n_years_desired = 6
N_YEARS_STR = '6'
END_YEAR = 2019
def get_state_contracts(distcode):
try:
contracts_by_state.get_group(distcode)
except KeyError:
print 'This state received no SC contracts from 2012-'+str(END_YEAR+1)
return
n_contracts = contracts_by_state.get_group(distcode)['Amount ($)'].count()
total_contract_value = contracts_by_state.get_group(distcode)['Amount ($)'].sum()
print 'In the past '+N_YEARS_STR+' years, this state has received:'
print n_contracts, 'Office of Science contracts, totalling', '${:,.2f}'.format(total_contract_value)
print ' '
n_firms = len(contracts_by_state.get_group(distcode).groupby(['Vendor'])[['Amount ($)']])
if n_firms < 4:
print contracts_by_state.get_group(distcode).groupby(['Vendor','fiscal_year'])[['Amount ($)']].sum()
else:
print contracts_by_state.get_group(distcode).groupby(['Vendor'])[['Amount ($)']].sum().sort_values(['Amount ($)'],ascending=False).head(n=10)
if n_firms > 10: print 'and ', n_firms-10, ' other firms.'
In [92]:
get_state_contracts('CA')
In [121]:
import re
def clean_doe_grant_data():
print 'Generating DOE Grant data...'
dataA = pd.read_excel('../new_data/DOE-SC_Grants_FY2018.xlsx')
data0 = pd.read_excel('../new_data/DOE-SC_Grants_FY2017.xlsx')
data = pd.read_excel('../new_data/DOE-SC_Grants_FY2016.xlsx')
data2 = pd.read_excel('../new_data/DOE-SC_Grants_FY2015.xlsx',
sheet_name='DOE SC Awards FY 2015')
data3 = pd.read_excel('../new_data/DOE-SC_Grants_FY2014.xlsx',
sheet_name='DOE SC Awards FY 2014')
data4 = pd.read_excel('../new_data/DOE-SC_Grants_FY2013.xlsx',
sheet_name='DOE SC Awards FY 2013', skiprows=1)
data5 = pd.read_excel('../new_data/DOE-SC_Grants_FY2012.xlsx',
sheet_name='DOE SC Awards FY 2012')
### FIXES TO RAW DATA
data2.loc[data2['Institution'] == 'University of Minnesota', 'Congressional District'] = 'MN-05'
data4.loc[data4['Institution'] == 'CALIFORNIA INST. OF TECHNOLOGY', 'Congressional District *'] = 'CA-27'
data3.loc[data3['Institution'] == 'California Institute of Technology (CalTech)', 'Congressional District'] = 'CA-27'
data2.loc[data2['Institution'] == 'California Institute of Technology', 'Congressional District'] = 'CA-27'
data.loc[data['Institution'] == 'California Institute of Technology', 'Congressional District'] = 'CA-27'
data0.loc[data0['Institution'] == 'California Institute of Technology', 'Congressional District'] = 'CA-27'
### END FIXES
institutions = pd.concat([dataA['Institution'],data0['Institution'], data['Institution'], data2['Institution'],
data3['Institution'], data4['Institution'], data5['Institution']], ignore_index=True, axis=0)
districts = pd.concat([dataA['Congressional District'],data0['Congressional District'], data['Congressional District'], data2['Congressional District'],
data3['Congressional District'], data4['Congressional District *'], data5['Congressional District']], ignore_index=True, axis=0)
amounts = pd.concat([dataA['Awarded Amount'],data0['Awarded Amount'], data['Awarded Amount'], data2['Awarded Amount'],
data3['Awarded Amount'], data4['FY 2013 Funding'], data5['2012 Funding']], ignore_index=True, axis=0)
years = pd.Series(np.concatenate([2018 * np.ones(len(dataA)), 2017 * np.ones(len(data0)), 2016 * np.ones(len(data)), 2015 * np.ones(
len(data2)), 2014 * np.ones(len(data3)), 2013 * np.ones(len(data4)), 2012 * np.ones(len(data5))]))
states = pd.concat([dataA['State'], data0['State'], data['State/Territory'], data2['State/Territory'],
data3['State'], data4['State'], data5['State']], ignore_index=True, axis=0)
programs = pd.concat([dataA['Program Office'], data0['Organization'], data['Organization'], data2['Organization'],
data3['Organization'], data4['SC Program'], data5['SC Program']], ignore_index=True, axis=0)
fulldata = pd.concat([programs, years, states, districts, institutions, amounts], axis=1, keys=[
'SC Office', 'Year', 'State', 'District', 'Institution', 'Amount ($)'])
fulldata['State'] = fulldata['State'].map(str).map(str.strip)
agencies = fulldata['SC Office'].values
abbrev_agencies = []
for entry in list(agencies):
test = re.split(r"\(|\)", str(entry))
if len(test) > 1:
abbrev_agencies.append(test[1])
else:
abbrev_agencies.append(test[0])
fulldata['SC Office'] = abbrev_agencies
hepdata = fulldata[fulldata['SC Office'] == 'HEP']
return hepdata
In [122]:
test = clean_doe_grant_data()
In [124]:
test.tail()
Out[124]:
In [106]:
test = pd.read_excel('../new_data/DOE-SC_Grants_FY2018.xlsx')
In [107]:
test.columns
Out[107]:
In [110]:
test['Program Office']
Out[110]:
In [166]:
def clean_nsf_grant_data():
print 'Generating NSF Grant data...'
contract_file_list = glob('../new_data/*049_Assistance*.csv')
contract_df_list = []
for contract_file in contract_file_list:
df = pd.read_csv(contract_file)
df['Fiscal Year'] = contract_file[:4]
contract_df_list.append(df)
fulldata = pd.concat(contract_df_list,ignore_index=True)
mps_grants = fulldata[fulldata['cfda_title'].map(str.strip).map(str.lower) == 'mathematical and physical sciences']
mps_grants = mps_grants[['cfda_title','federal_action_obligation',
'recipient_state_code', 'recipient_congressional_district',
'recipient_name'
]]
mps_grants = mps_grants.rename(columns = {
'federal_action_obligation' : 'Amount ($)',
'recipient_state_code' : 'State',
'recipient_congressional_district' : 'District',
'recipient_name' : 'Institution'
})
mps_grants = mps_grants.dropna(subset=['District'])
mps_grants['District'] = mps_grants['State'] + mps_grants['District'].map(int).map(str).str.zfill(2)
mps_grants.loc[mps_grants['District'] == 'OR00', 'State'] = 'PR'
mps_grants.loc[mps_grants['District'] == 'OR00', 'District'] = 'PR00'
return mps_grants
In [167]:
test = clean_nsf_grant_data()
In [170]:
test.groupby('District').get_group('PR00')
Out[170]:
In [129]:
usa_2012 = pd.read_csv('../new_data/2012_049_Assistance_Full_20190131_1.csv')
In [134]:
usa_2012.columns
Out[134]:
In [146]:
Out[146]:
In [172]:
np.ones(10,dtype=int)
Out[172]:
In [176]:
test['Amount ($)'] = test['Amount ($)'].map(int)
In [177]:
test
Out[177]:
In [ ]: