In [1]:
import glob
import pandas as pd
from pandas import DataFrame as df
from os import path
import csv
from io import StringIO, BytesIO
import numpy as np
In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
In [3]:
from tools import GetNIH, GetNSF
In [6]:
NIH_df = GetNIH()
NSF_df = GetNSF()
In [5]:
NIH_df.head()
Out[5]:
In [6]:
NSF_df.head()
Out[6]:
In [7]:
!mkdir data/processed
In [8]:
NSF_df.to_csv("data/Grants/processed/nsf_combined.csv", encoding='utf-8')
NIH_df.to_csv("data/Grants/processed/nih_combined.csv", encoding='utf-8')
In [9]:
NIH = df.from_csv("data/Grants/processed/nih_combined.csv", encoding='utf-8')
The goal to map each university that is mentioned in the NIH database to a state. We will rely on two source of information to build a partial mapping:
After building the partial list, we will fill the gaps manually.
In [10]:
university_df = pd.DataFrame.from_csv("data/Grants/Accreditation_2015_09.csv")
university_df[["Institution_Name", "Institution_State"]].head()
Out[10]:
In [11]:
code_to_state = {
'AK': 'Alaska',
'AL': 'Alabama',
'AR': 'Arkansas',
'AS': 'American Samoa',
'AZ': 'Arizona',
'CA': 'California',
'CO': 'Colorado',
'CT': 'Connecticut',
'DC': 'District of Columbia',
'DE': 'Delaware',
'FL': 'Florida',
'GA': 'Georgia',
'GU': 'Guam',
'HI': 'Hawaii',
'IA': 'Iowa',
'ID': 'Idaho',
'IL': 'Illinois',
'IN': 'Indiana',
'KS': 'Kansas',
'KY': 'Kentucky',
'LA': 'Louisiana',
'MA': 'Massachusetts',
'MD': 'Maryland',
'ME': 'Maine',
'MI': 'Michigan',
'MN': 'Minnesota',
'MO': 'Missouri',
'MP': 'Northern Mariana Islands',
'MS': 'Mississippi',
'MT': 'Montana',
'NA': 'National',
'NC': 'North Carolina',
'ND': 'North Dakota',
'NE': 'Nebraska',
'NH': 'New Hampshire',
'NJ': 'New Jersey',
'NM': 'New Mexico',
'NV': 'Nevada',
'NY': 'New York',
'OH': 'Ohio',
'OK': 'Oklahoma',
'OR': 'Oregon',
'PA': 'Pennsylvania',
'PR': 'Puerto Rico',
'RI': 'Rhode Island',
'SC': 'South Carolina',
'SD': 'South Dakota',
'TN': 'Tennessee',
'TX': 'Texas',
'UT': 'Utah',
'VA': 'Virginia',
'VI': 'Virgin Islands',
'VT': 'Vermont',
'WA': 'Washington',
'WI': 'Wisconsin',
'WV': 'West Virginia',
'WY': 'Wyoming'
}
In [12]:
state_to_code = {state.lower():code for code,state in code_to_state.items()}
In [13]:
university_to_state = {uni.lower():state for uni,state in university_df[["Institution_Name", "Institution_State"]].values}
In [14]:
universities = set(university_to_state.keys())
In [15]:
def FindState(name):
name = name.lower()
if name in universities:
return university_to_state[name]
for state in state_to_code.keys():
if state in name:
return state_to_code[state]
for n in universities:
if name in n:
return university_to_state[n]
return ""
In [16]:
NIH_universities = {x.lower() for x in set(NIH["Organization Name"].fillna("").values)}
In [17]:
NIH_uni_state_df = pd.DataFrame.from_dict([{"University":uni, "State":FindState(uni)} for uni in NIH_universities])
In [18]:
NIH_uni_state_df.to_csv("data/Grants/processed/university_to_state.csv")
In [19]:
tmp = df.from_csv("data/Grants/university_to_state.csv").fillna("")
university_to_state = {x[1]:x[0] for x in tmp.values}
In [20]:
NIH = df.from_csv("data/Grants/processed/nih_combined.csv", encoding='utf-8')
NIH.loc[:, "FY Total Cost "] = NIH["FY Total Cost "].fillna(0)
NIH.loc[:, "FY Total Cost (Sub Projects)"] = NIH["FY Total Cost (Sub Projects)"].fillna(0)
NIH["Award Amount"] = NIH["FY Total Cost (Sub Projects)"] + NIH["FY Total Cost "]
remaining_cols = [c for c in NIH.columns if c not in
{'ARRA Indicator', 'Administering IC', 'Application ID', 'Contact PI Person ID',
'Other PI or Project Leader(s)', 'Project Number', 'Serial Number', 'Subproject Number',
'Suffix', "Contact PI / Project Leader",
"FY Total Cost ", "FY Total Cost (Sub Projects)", 'Subproject Number '}]
NIH_minimal = NIH[remaining_cols]
NIH_minimal.index.rename("ID", inplace=True)
NIH_minimal = NIH_minimal.rename(columns={"word": "Term",
"Activity": "Grant Program",
"FY": "Start Year",
"Type": "Grant Type",
"Grant Organization": "Funded Organization"})
NIH_minimal = NIH_minimal.rename(columns={"IC": "Grant Organization"})
id_to_term = { 1: "multiculturalism", 2: "polyculturalism", 3: "cultural pluralism", 4: "monocultural",
5: "monoracial", 6: "bicultural", 7: "biracial", 8: "biethnic", 9: "interracial",
10: "multicultural", 11: "multiracial", 12: "polycultural", 13: "polyracial",
14: "polyethnic", 15: "mixed race", 16: "mixed ethnicity", 17: "other race", 18: "other ethnicity"}
term_to_id = {term:id for id, term in id_to_term.items()}
NIH_minimal.loc[:, "TermCode"] = NIH_minimal.Term.map(term_to_id)
NIH_minimal.loc[:, "Grant Type"] = NIH_minimal["Grant Type"].map(lambda x: 1 if x=='1' else 2)
NIH_minimal.loc[:, "State"] = NIH_minimal["Organization Name"].fillna("").map(lambda x: university_to_state[x.lower()])
title_occurrence = []
for x,y in NIH_minimal[["Term", "Project Title"]].fillna("").values:
if x.lower() in y.lower():
title_occurrence.append(1)
else:
title_occurrence.append(0)
NIH_minimal["term_in_title"] = title_occurrence
NIH_minimal.head()
Out[20]:
In [21]:
Xs = []
for x in NIH_minimal["Organization Name"].fillna(""):
if x.lower() not in university_to_state:
Xs.append(x.lower())
print("\n".join(set(Xs)))
In [22]:
NIH_minimal.to_csv("data/Grants/processed/nih_clean.csv", encoding='utf-8')
In [23]:
NSF = df.from_csv("data/Grants/processed/nsf_combined.csv", encoding='utf-8')
In [24]:
NSF["AwardInstrument"].value_counts(dropna=False)
Out[24]:
In [25]:
NSF.columns
Out[25]:
In [26]:
NSF = df.from_csv("data/Grants/processed/nsf_combined.csv", encoding='utf-8')
NSF.loc[:, "Support Year"] = 2015 - pd.DatetimeIndex(NSF["StartDate"]).year
NSF.loc[:, "Award Length"] = pd.DatetimeIndex(NSF["EndDate"]).year - pd.DatetimeIndex(NSF["StartDate"]).year
NSF.loc[:, "Start Year"] = pd.DatetimeIndex(NSF["StartDate"]).year
grant_to_code = {"Standard Grant": 1, "Continuing grant": 2, "Fellowship": 3, "Cooperative Agreement": 4, "":np.nan}
NSF.loc[:, "AwardInstrument"] = NSF.AwardInstrument.fillna("").map(lambda x: grant_to_code.get(x, 5))
remaining_cols = [c for c in NSF.columns if c not in
{"ARRAAmount", "LastAmmendmentDate", "OrganizationCity", "OrganizationZip",
"ProgramElementCode", "StartDate", "AwardNumber", "Co-PIName(s)", "LastAmmendmentDate",
"OrganizationPhone", "OrganizationStreet", "PIEmailAddress", "PrincipalInvestigator",
"Program(s)", "ProgramElementCode(s)", "ProgramManager", "ProgramReferenceCode(s)",
"OrganizationState"}]
NSF_minimal = NSF[remaining_cols]
NSF_minimal.index.rename("ID", inplace=True)
NSF_minimal = NSF_minimal.rename(columns={"word": "Term", "Abstract": "Project Abstract",
"AwardedAmountToDate": "Award Amount",
"OrganizationName": "Funded Organization", "Title":"Project Title",
"StartDate":"Start Year",
"AwardInstrument": "Grant Type"})
id_to_term = { 1: "multiculturalism", 2: "polyculturalism", 3: "cultural pluralism", 4: "monocultural",
5: "monoracial", 6: "bicultural", 7: "biracial", 8: "biethnic", 9: "interracial",
10: "multicultural", 11: "multiracial", 12: "polycultural", 13: "polyracial",
14: "polyethnic", 15: "mixed race", 16: "mixed ethnicity", 17: "other race", 18: "other ethnicity"}
term_to_id = {term:id for id, term in id_to_term.items()}
NSF_minimal.loc[:, "TermCode"] = NSF_minimal.Term.map(term_to_id)
title_occurrence = []
for x,y in NSF_minimal[["Term", "Project Title"]].fillna("").values:
if x.lower() in y.lower():
title_occurrence.append(1)
else:
title_occurrence.append(0)
NSF_minimal["term_in_title"] = title_occurrence
abstract_occurrence = []
for x,y in NSF_minimal[["Term", "Project Abstract"]].fillna("").values:
if x.lower() in y.lower():
abstract_occurrence.append(1)
else:
abstract_occurrence.append(0)
NSF_minimal["term_in_abstract"] = abstract_occurrence
NSF_minimal.head()
Out[26]:
In [27]:
NSF_minimal.to_csv("data/Grants/processed/nsf_clean.csv", encoding='utf-8')
NIH
Add NIH to the beginning of each remaining column
Remove the following columns:
NSF
Add NSF to the beginning of each remaining column
Remove the following columns:
In [28]:
NSF_minimal = NSF_minimal.rename(columns={"Organization": "Organization Name"})
In [76]:
common_cols = ['Term', 'Start Year',
'Organization Name', 'Project Title', 'Support Year', 'Grant Type',
'Award Amount', 'State', "TermCode"]
NSF_merge = NSF_minimal[common_cols]
NSF_merge.insert(0, "Grant Agency", ["NSF"]*len(NSF_merge))
NIH_merge = NIH_minimal[common_cols]
NIH_merge.insert(0, "Grant Agency", ["NIH"]*len(NIH_merge))
In [77]:
merged = pd.concat([NSF_merge, NIH_merge])
merged.loc[:, "AgencyCode"] = merged["Grant Agency"].map(lambda x: 1 if x == 'NIH' else 2)
title_occurrence = []
for x,y in merged[["Term", "Project Title"]].fillna("").values:
if x.lower() in y.lower():
title_occurrence.append(1)
else:
title_occurrence.append(0)
merged["term_in_title"] = title_occurrence
In [31]:
merged.tail()
Out[31]:
In [32]:
merged.to_csv("data/Grants/processed/nsf_nih_merged.csv")
In [89]:
merged = pd.read_csv("data/Grants/processed/nsf_nih_merged.csv")
In [120]:
years = merged["Start Year"].fillna("2017")
years = years.map(lambda x: {" ": "2017"}.get(x, x))
years = years.astype(np.int64)
In [129]:
merged[years==1956]
Out[129]:
In [127]:
years[years<=1981].sort_values()
Out[127]:
In [218]:
import xml.etree.ElementTree as ET
import locale
In [210]:
file = "data/Grants/NSF/NSFBudgetHistory.xml"
locale.setlocale(locale.LC_ALL, 'en_US.UTF8')
tree = ET.parse(file)
In [224]:
root = tree.getroot()
records = []
seen = {}
for child in root.getchildren():
record = {}
if child.tag.endswith("Omnibus") or child.tag.endswith("ARRA"): continue
year_ = child.tag.replace("FY", "")
year = int(year_.replace("Total", ""))
if(year > 2009) and not year_.endswith("Total"): continue
record["year"] = year
for grandchild in child.getchildren():
if grandchild.tag == "ConstantDollars":
for grandchild2 in grandchild.getchildren():
if grandchild2.tag == "NSFTotal":
v = list(grandchild2.itertext())[0].strip()
record["Total"] = locale.atof(v.strip("$"))
records.append(record)
nsf_budget = df.from_records(records)
nsf_budget.set_index("year", inplace=True)
In [226]:
nsf_budget.to_csv("data/Grants/processed/nsf_budget.csv")
In [ ]:
NIH
Add NIH to the beginning of each remaining column
Remove the following columns:
NSF
Add NSF to the beginning of each remaining column
Remove the following columns: