In [1]:

    
import glob
import pandas as pd
from pandas import DataFrame as df
from os import path
import csv
from io import StringIO, BytesIO
import numpy as np



In [2]:

    
import matplotlib.pyplot as plt
%matplotlib inline



In [3]:

    
from tools import GetNIH, GetNSF

Merge CSV files

Each cvs file represent a specific word results obtained from the NSF and NIH websites.



In [6]:

    
NIH_df = GetNIH()
NSF_df = GetNSF()



In [5]:

    
NIH_df.head()









    Out[5]:






  
    
      
      word
      ARRA Indicator
      Activity
      Administering  IC
      Application ID
      Contact PI  Person ID
      Contact PI / Project Leader
      FY
      FY Total Cost
      FY Total Cost  (Sub Projects)
      IC
      Organization Name
      Other PI or Project Leader(s)
      Project Number
      Project Title
      Serial Number
      Subproject Number
      Suffix
      Support Year
      Type
    
  
  
    
      0
      bicultural
      
      R01
      NICHD
      6521173
      1896188
      ALTMAN, JACKI
      2002
      335137
      
      HD
      UNIVERSITY OF CONNECTICUT STORRS
      Not Applicable
      5R01HD037515-03
      CULTURAL CHANGE IN CHILDREARING BELIEFS AND PR...
      37515
      
      
      3
      5
    
    
      1
      bicultural
      
      R01
      NICHD
      6388079
      1896188
      ALTMAN, JACKI
      2001
      355033
      
      HD
      UNIVERSITY OF CONNECTICUT STORRS
      Not Applicable
      5R01HD037515-02
      CULTURAL CHANGE IN CHILDREARING BELIEFS AND PR...
      37515
      
      
      2
      5
    
    
      2
      bicultural
      
      R01
      NICHD
      6138870
      1896188
      ALTMAN, JACKI
      2000
      415255
      
      HD
      UNIVERSITY OF CONNECTICUT STORRS
      Not Applicable
      1R01HD037515-01A1
      CULTURAL CHANGE IN CHILDREARING BELIEFS AND PR...
      37515
      
      A1
      1
      1
    
    
      3
      bicultural
      
      R01
      NIMH
      7664631
      1861565
      AMBADY, NALINI
      2009
      207082
      
      MH
      TUFTS UNIVERSITY MEDFORD
      Not Applicable
      5R01MH070833-05
      COMMUNICATION OF EMOTION: BEHAVIORAL AND NEURA...
      70833
      
      
      5
      5
    
    
      4
      bicultural
      
      R01
      NIMH
      7488888
      1861565
      AMBADY, NALINI
      2008
      207082
      
      MH
      TUFTS UNIVERSITY MEDFORD
      Not Applicable
      5R01MH070833-04
      COMMUNICATION OF EMOTION: BEHAVIORAL AND NEURA...
      70833
      
      
      4
      5



In [6]:

    
NSF_df.head()









    Out[6]:






  
    
      
      word
      ARRAAmount
      Abstract
      AwardInstrument
      AwardNumber
      AwardedAmountToDate
      Co-PIName(s)
      EndDate
      LastAmendmentDate
      NSFDirectorate
      ...
      OrganizationZip
      PIEmailAddress
      PrincipalInvestigator
      Program(s)
      ProgramElementCode(s)
      ProgramManager
      ProgramReferenceCode(s)
      StartDate
      State
      Title
    
  
  
    
      0
      bicultural
      $0.00
      This project is an empirical research study us...
      Standard Grant
      1251588
      $347,496.00
      
      08/31/2016
      09/16/2013
      EHR
      ...
      871199828
      pam@aises.org
      Pamala Silas
      REAL
      7625
      Jolene K. Jesse
      9150, 8212
      09/15/2013
      NM
      Collaborative Research to Understand the Role ...
    
    
      1
      bicultural
      $0.00
      This project is an empirical research study us...
      Standard Grant
      1251516
      $311,981.00
      
      08/31/2017
      09/07/2013
      EHR
      ...
      602013149
      medin@northwestern.edu
      Douglas Medin
      REAL
      7625
      Jolene K. Jesse
      8212
      09/15/2013
      IL
      Collaborative Research to Understand the Role ...
    
    
      2
      bicultural
      $0.00
      This project is an empirical research study us...
      Standard Grant
      1251532
      $732,512.00
      
      08/31/2016
      09/07/2013
      EHR
      ...
      871310001
      JPage-Reeves@salud.unm.edu
      Janet Page-Reeves
      REAL
      7625
      Jolene K. Jesse
      9150, 9177, SMET, 8212
      09/15/2013
      NM
      Collaborative Research to Understand the Role ...
    
    
      3
      bicultural
      $0.00
      U.S. surveys of minorities and immigrants allo...
      Standard Grant
      0751899
      $6,920.00
      Robert Groves, Emilia Peytcheva
      04/30/2013
      05/23/2012
      SBE
      ...
      481091274
      nschwarz@umich.edu
      Norbert Schwarz
      METHOD, MEASURE & STATS
      1333
      Cheryl L. Eavey
      9179, SMET
      05/15/2008
      MI
      Doctoral Dissertation Research: Language of Ad...
    
    
      4
      bicultural
      $0.00
      
      Standard Grant
      8316803
      $261,256.00
      
      07/31/1987
      07/22/1985
      EHR
      ...
      860110001
      
      Gordon Johnson
      INSTRUCTIONAL MATERIALS DEVELP
      7355
      Alice Moses
      
      02/01/1984
      AZ
      Outdoor World Science - A Model Bicultural Hig...
    
  

5 rows × 26 columns



In [7]:

    
!mkdir data/processed









    



mkdir: cannot create directory ‘data/processed’: File exists

Save the combined query results



In [8]:

    
NSF_df.to_csv("data/Grants/processed/nsf_combined.csv", encoding='utf-8')
NIH_df.to_csv("data/Grants/processed/nih_combined.csv", encoding='utf-8')

Process NIH database



In [9]:

    
NIH = df.from_csv("data/Grants/processed/nih_combined.csv", encoding='utf-8')

Load University Database

The goal to map each university that is mentioned in the NIH database to a state. We will rely on two source of information to build a partial mapping:

University accredation database.
List of states and their acronyms.

After building the partial list, we will fill the gaps manually.



In [10]:

    
university_df = pd.DataFrame.from_csv("data/Grants/Accreditation_2015_09.csv")
university_df[["Institution_Name", "Institution_State"]].head()









    Out[10]:






  
    
      
      Institution_Name
      Institution_State
    
    
      Institution_ID
      
      
    
  
  
    
      180319
      21st Century Oncology, Inc. School for Radiati...
      FL
    
    
      180319
      21st Century Oncology, Inc. School for Radiati...
      FL
    
    
      180319
      21st Century Oncology, Inc. School for Radiati...
      FL
    
    
      203827
      360 Degrees Beauty Academy
      TX
    
    
      211194
      4-States Academy of Cosmetology
      OK



In [11]:

    
code_to_state = {
        'AK': 'Alaska',
        'AL': 'Alabama',
        'AR': 'Arkansas',
        'AS': 'American Samoa',
        'AZ': 'Arizona',
        'CA': 'California',
        'CO': 'Colorado',
        'CT': 'Connecticut',
        'DC': 'District of Columbia',
        'DE': 'Delaware',
        'FL': 'Florida',
        'GA': 'Georgia',
        'GU': 'Guam',
        'HI': 'Hawaii',
        'IA': 'Iowa',
        'ID': 'Idaho',
        'IL': 'Illinois',
        'IN': 'Indiana',
        'KS': 'Kansas',
        'KY': 'Kentucky',
        'LA': 'Louisiana',
        'MA': 'Massachusetts',
        'MD': 'Maryland',
        'ME': 'Maine',
        'MI': 'Michigan',
        'MN': 'Minnesota',
        'MO': 'Missouri',
        'MP': 'Northern Mariana Islands',
        'MS': 'Mississippi',
        'MT': 'Montana',
        'NA': 'National',
        'NC': 'North Carolina',
        'ND': 'North Dakota',
        'NE': 'Nebraska',
        'NH': 'New Hampshire',
        'NJ': 'New Jersey',
        'NM': 'New Mexico',
        'NV': 'Nevada',
        'NY': 'New York',
        'OH': 'Ohio',
        'OK': 'Oklahoma',
        'OR': 'Oregon',
        'PA': 'Pennsylvania',
        'PR': 'Puerto Rico',
        'RI': 'Rhode Island',
        'SC': 'South Carolina',
        'SD': 'South Dakota',
        'TN': 'Tennessee',
        'TX': 'Texas',
        'UT': 'Utah',
        'VA': 'Virginia',
        'VI': 'Virgin Islands',
        'VT': 'Vermont',
        'WA': 'Washington',
        'WI': 'Wisconsin',
        'WV': 'West Virginia',
        'WY': 'Wyoming'
}



In [12]:

    
state_to_code = {state.lower():code for code,state in code_to_state.items()}



In [13]:

    
university_to_state = {uni.lower():state for uni,state in university_df[["Institution_Name", "Institution_State"]].values}



In [14]:

    
universities = set(university_to_state.keys())



In [15]:

    
def FindState(name):
  name = name.lower()
  if name in universities:
    return university_to_state[name]
  for state in state_to_code.keys():
    if state in name:
      return state_to_code[state]
  for n in universities:
    if name in n:
      return university_to_state[n]
  return ""



In [16]:

    
NIH_universities = {x.lower() for x in set(NIH["Organization Name"].fillna("").values)}



In [17]:

    
NIH_uni_state_df = pd.DataFrame.from_dict([{"University":uni, "State":FindState(uni)} for uni in NIH_universities])



In [18]:

    
NIH_uni_state_df.to_csv("data/Grants/processed/university_to_state.csv")

Load manual university -> state dictionary



In [19]:

    
tmp = df.from_csv("data/Grants/university_to_state.csv").fillna("")
university_to_state = {x[1]:x[0] for x in tmp.values}

NIH Modification Steps

~~Remove the following columns: ARRA indicator, Administering IC, Application ID, Contact PI Person ID, Contact PI, Other PI, Project Number, Serial Number, Subproject Number, Suffix~~
~~First column --> give title 'ID #'~~
Copy the word data into a new column (title it 'terms')--> code them as the following: 1 = multiculturalism, 2 = polyculturalism, 3 = cultural pluralism, 4 = monocultural, 5 = monoracial, 6 = bicultural, 7 = biracial, 8 = biethnic, 9 = interracial, 10 = multicultural, 11 = multiracial, 12 = polycultural, 13 = polyracial, 14 = polyethnic, 15 = mixed race, 16 = mixed ethnicity, 17 = other race, 18 = other ethnicity
~~'Activity' --> Rename 'Grant Program'~~
~~'FY' - 1 --> Rename 'Start Year'~~
~~Combine 'FY Total Cost' and 'FY Total Cost Subproject' Columns --> Rename 'Award Amount'~~
~~'IC' --> Rename 'Grant Organization'~~
~~'Organization Name' --> Rename 'Funded Organization'~~
Add new column --> 'Organization State' [use csv file in email to get state info for universities in list]
~~'Type'--> Rename 'Grant Type' [Recode as the following: 1 = 1 (New Grant), 2+ = 2 (Continuing Grant)]~~
Remove the following columns: Contact PI, FY Total Cost, FY Total Cost (SubProjects), Subproject Number



In [20]:

    
NIH = df.from_csv("data/Grants/processed/nih_combined.csv", encoding='utf-8')
NIH.loc[:, "FY Total Cost "] = NIH["FY Total Cost "].fillna(0)
NIH.loc[:, "FY Total Cost  (Sub Projects)"] = NIH["FY Total Cost  (Sub Projects)"].fillna(0)
NIH["Award Amount"] = NIH["FY Total Cost  (Sub Projects)"] + NIH["FY Total Cost "]

remaining_cols = [c for c in NIH.columns if c not in
                  {'ARRA Indicator', 'Administering  IC', 'Application ID', 'Contact PI  Person ID',
                   'Other PI or Project Leader(s)', 'Project Number', 'Serial Number', 'Subproject Number',
                   'Suffix', "Contact PI / Project Leader",
                  "FY Total Cost ", "FY Total Cost  (Sub Projects)", 'Subproject Number '}]
NIH_minimal = NIH[remaining_cols]
NIH_minimal.index.rename("ID", inplace=True)
NIH_minimal = NIH_minimal.rename(columns={"word": "Term",
                                          "Activity": "Grant Program",
                                          "FY": "Start Year",
                                          "Type": "Grant Type",
                                          "Grant Organization": "Funded Organization"})
NIH_minimal = NIH_minimal.rename(columns={"IC": "Grant Organization"})
id_to_term = { 1: "multiculturalism", 2: "polyculturalism", 3: "cultural pluralism", 4: "monocultural",
              5: "monoracial", 6: "bicultural", 7: "biracial", 8: "biethnic", 9: "interracial",
              10: "multicultural", 11: "multiracial", 12: "polycultural", 13: "polyracial",
              14: "polyethnic", 15: "mixed race", 16: "mixed ethnicity", 17: "other race", 18: "other ethnicity"}
term_to_id = {term:id for id, term in id_to_term.items()}
NIH_minimal.loc[:, "TermCode"] = NIH_minimal.Term.map(term_to_id)
NIH_minimal.loc[:, "Grant Type"] = NIH_minimal["Grant Type"].map(lambda x: 1 if x=='1' else 2)
NIH_minimal.loc[:, "State"] = NIH_minimal["Organization Name"].fillna("").map(lambda x: university_to_state[x.lower()])
title_occurrence = []
for x,y in NIH_minimal[["Term", "Project Title"]].fillna("").values:
  if x.lower() in y.lower():
    title_occurrence.append(1)
  else:
    title_occurrence.append(0)
NIH_minimal["term_in_title"] = title_occurrence
NIH_minimal.head()









    Out[20]:






  
    
      
      Term
      Grant Program
      Start Year
      Grant Organization
      Organization Name
      Project Title
      Support Year
      Grant Type
      Award Amount
      TermCode
      State
      term_in_title
    
    
      ID
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      0
      bicultural
      R01
      2002
      HD
      UNIVERSITY OF CONNECTICUT STORRS
      CULTURAL CHANGE IN CHILDREARING BELIEFS AND PR...
      3
      2
      335137
      6
      CT
      0
    
    
      1
      bicultural
      R01
      2001
      HD
      UNIVERSITY OF CONNECTICUT STORRS
      CULTURAL CHANGE IN CHILDREARING BELIEFS AND PR...
      2
      2
      355033
      6
      CT
      0
    
    
      2
      bicultural
      R01
      2000
      HD
      UNIVERSITY OF CONNECTICUT STORRS
      CULTURAL CHANGE IN CHILDREARING BELIEFS AND PR...
      1
      1
      415255
      6
      CT
      0
    
    
      3
      bicultural
      R01
      2009
      MH
      TUFTS UNIVERSITY MEDFORD
      COMMUNICATION OF EMOTION: BEHAVIORAL AND NEURA...
      5
      2
      207082
      6
      MA
      0
    
    
      4
      bicultural
      R01
      2008
      MH
      TUFTS UNIVERSITY MEDFORD
      COMMUNICATION OF EMOTION: BEHAVIORAL AND NEURA...
      4
      2
      207082
      6
      MA
      0



In [21]:

    
Xs = []
for x in NIH_minimal["Organization Name"].fillna(""):
    if x.lower() not in university_to_state:
        Xs.append(x.lower())
print("\n".join(set(Xs)))

Save new NIH dataset



In [22]:

    
NIH_minimal.to_csv("data/Grants/processed/nih_clean.csv", encoding='utf-8')

Process NSF Database



In [23]:

    
NSF = df.from_csv("data/Grants/processed/nsf_combined.csv", encoding='utf-8')

NSF Modification Steps

~~Remove the following columns: ARRA amount, Last Amendment Date, Organization City, Organization Zip, Program Element Code, State~~
~~First column --> give title 'ID #'~~
Copy the word data into a new column (title it 'terms')--> code them as the following: 1 = multiculturalism, 2 = polyculturalism, 3 = cultural pluralism, 4 = monocultural, 5 = monoracial, 6 = bicultural, 7 = biracial, 8 = biethnic, 9 = interracial, 10 = multicultural, 11 = multiracial, 12 = polycultural, 13 = polyracial, 14 = polyethnic, 15 = mixed race, 16 = mixed ethnicity, 17 = other race, 18 = other ethnicity
~~'Abstract' --> Rename 'Project Abstract'~~
~~'Award Instrument'--> Rename 'Grant Type' [Recode as the following: 1 = Standard Grant, 2 = Continuing Grant, 3 = Fellowship, 4 = Cooperative Agreement, 5 = Other]~~
~~'Awarded Amount to Date' --> Rename 'Award Amount'~~
~~'NSF Organization' --> Rename 'Grant Organization' --> Note: subdivision of the NSF directorate~~
~~'Organization Name' --> Rename 'Funded Organization'~~
~~'Start Date' --> Rename 'Start Year' [Only use year from date information]~~
~~New Column 'Support Year': 2015-Start Year~~ (I do not understand this one!)
~~New Column 'Award Length': End Date - Start Year (Years only)~~
~~Title --> Rename 'Project Title'~~
Remove the following columns: Award number, Co-PI names, Last Amendment Date, Organization Phone, Organization Street, PI Email Address, Principal Investigator, Program(s), Program Element Code(s), Program Manager, Program Reference Codes



In [24]:

    
NSF["AwardInstrument"].value_counts(dropna=False)









    Out[24]:





Standard Grant                    2527
Continuing grant                   770
Fellowship                          40
Cooperative Agreement               33
NaN                                 17
Interagency Agreement                4
Fixed Price Award                    3
Contract Interagency Agreement       2
Contract                             1
Name: AwardInstrument, dtype: int64



In [25]:

    
NSF.columns









    Out[25]:





Index(['word', 'ARRAAmount', 'Abstract', 'AwardInstrument', 'AwardNumber',
       'AwardedAmountToDate', 'Co-PIName(s)', 'EndDate', 'LastAmendmentDate',
       'NSFDirectorate', 'NSFOrganization', 'Organization', 'OrganizationCity',
       'OrganizationPhone', 'OrganizationState', 'OrganizationStreet',
       'OrganizationZip', 'PIEmailAddress', 'PrincipalInvestigator',
       'Program(s)', 'ProgramElementCode(s)', 'ProgramManager',
       'ProgramReferenceCode(s)', 'StartDate', 'State', 'Title'],
      dtype='object')



In [26]:

    
NSF = df.from_csv("data/Grants/processed/nsf_combined.csv", encoding='utf-8')
NSF.loc[:, "Support Year"] = 2015 - pd.DatetimeIndex(NSF["StartDate"]).year
NSF.loc[:, "Award Length"] = pd.DatetimeIndex(NSF["EndDate"]).year - pd.DatetimeIndex(NSF["StartDate"]).year
NSF.loc[:, "Start Year"] = pd.DatetimeIndex(NSF["StartDate"]).year
grant_to_code = {"Standard Grant": 1, "Continuing grant": 2, "Fellowship": 3, "Cooperative Agreement": 4, "":np.nan}
NSF.loc[:, "AwardInstrument"] = NSF.AwardInstrument.fillna("").map(lambda x: grant_to_code.get(x, 5))
remaining_cols = [c for c in NSF.columns if c not in
                  {"ARRAAmount", "LastAmmendmentDate", "OrganizationCity", "OrganizationZip",
                   "ProgramElementCode", "StartDate", "AwardNumber", "Co-PIName(s)", "LastAmmendmentDate",
                  "OrganizationPhone", "OrganizationStreet", "PIEmailAddress", "PrincipalInvestigator",
                  "Program(s)", "ProgramElementCode(s)", "ProgramManager", "ProgramReferenceCode(s)",
                   "OrganizationState"}]

NSF_minimal = NSF[remaining_cols]
NSF_minimal.index.rename("ID", inplace=True)
NSF_minimal = NSF_minimal.rename(columns={"word": "Term", "Abstract": "Project Abstract",
                                         "AwardedAmountToDate": "Award Amount",
                                          "OrganizationName": "Funded Organization", "Title":"Project Title",
                                         "StartDate":"Start Year",
                                         "AwardInstrument": "Grant Type"})
id_to_term = { 1: "multiculturalism", 2: "polyculturalism", 3: "cultural pluralism", 4: "monocultural",
              5: "monoracial", 6: "bicultural", 7: "biracial", 8: "biethnic", 9: "interracial",
              10: "multicultural", 11: "multiracial", 12: "polycultural", 13: "polyracial",
              14: "polyethnic", 15: "mixed race", 16: "mixed ethnicity", 17: "other race", 18: "other ethnicity"}
term_to_id = {term:id for id, term in id_to_term.items()}
NSF_minimal.loc[:, "TermCode"] = NSF_minimal.Term.map(term_to_id)
title_occurrence = []
for x,y in NSF_minimal[["Term", "Project Title"]].fillna("").values:
  if x.lower() in y.lower():
    title_occurrence.append(1)
  else:
    title_occurrence.append(0)
NSF_minimal["term_in_title"] = title_occurrence

abstract_occurrence = []
for x,y in NSF_minimal[["Term", "Project Abstract"]].fillna("").values:
  if x.lower() in y.lower():
    abstract_occurrence.append(1)
  else:
    abstract_occurrence.append(0)
NSF_minimal["term_in_abstract"] = abstract_occurrence
NSF_minimal.head()









    Out[26]:






  
    
      
      Term
      Project Abstract
      Grant Type
      Award Amount
      EndDate
      LastAmendmentDate
      NSFDirectorate
      NSFOrganization
      Organization
      State
      Project Title
      Support Year
      Award Length
      Start Year
      TermCode
      term_in_title
      term_in_abstract
    
    
      ID
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      0
      bicultural
      This project is an empirical research study us...
      1
      $347,496.00
      08/31/2016
      09/16/2013
      EHR
      DRL
      American Indian Science and Engineering Societ...
      NM
      Collaborative Research to Understand the Role ...
      2
      3
      2013
      6
      0
      0
    
    
      1
      bicultural
      This project is an empirical research study us...
      1
      $311,981.00
      08/31/2017
      09/07/2013
      EHR
      DRL
      Northwestern University
      IL
      Collaborative Research to Understand the Role ...
      2
      4
      2013
      6
      0
      0
    
    
      2
      bicultural
      This project is an empirical research study us...
      1
      $732,512.00
      08/31/2016
      09/07/2013
      EHR
      DRL
      University of New Mexico Health Sciences Center
      NM
      Collaborative Research to Understand the Role ...
      2
      3
      2013
      6
      0
      0
    
    
      3
      bicultural
      U.S. surveys of minorities and immigrants allo...
      1
      $6,920.00
      04/30/2013
      05/23/2012
      SBE
      SES
      University of Michigan Ann Arbor
      MI
      Doctoral Dissertation Research: Language of Ad...
      7
      5
      2008
      6
      0
      1
    
    
      4
      bicultural
      NaN
      1
      $261,256.00
      07/31/1987
      07/22/1985
      EHR
      DRL
      Northern Arizona University
      AZ
      Outdoor World Science - A Model Bicultural Hig...
      31
      3
      1984
      6
      1
      0

Save results



In [27]:

    
NSF_minimal.to_csv("data/Grants/processed/nsf_clean.csv", encoding='utf-8')

Mega NIH/NSF Dataset

New Column: 'Grant Agency' [Code --> 1 = NIH, 2 = NSF]
Combine the two datasets (after following the instructions below)

NIH

Add NIH to the beginning of each remaining column

Remove the following columns:

Grant Program
Grant Organization
Funded Organization?

NSF

Add NSF to the beginning of each remaining column

Remove the following columns:

Abstract
NSF Organization/NSF Directorate
Funded Organization?
Program Topic



In [28]:

    
NSF_minimal = NSF_minimal.rename(columns={"Organization": "Organization Name"})



In [76]:

    
common_cols = ['Term', 'Start Year', 
       'Organization Name', 'Project Title', 'Support Year', 'Grant Type',
       'Award Amount', 'State', "TermCode"]
NSF_merge = NSF_minimal[common_cols]
NSF_merge.insert(0, "Grant Agency", ["NSF"]*len(NSF_merge))
NIH_merge = NIH_minimal[common_cols]
NIH_merge.insert(0, "Grant Agency", ["NIH"]*len(NIH_merge))



In [77]:

    
merged = pd.concat([NSF_merge, NIH_merge])
merged.loc[:, "AgencyCode"] = merged["Grant Agency"].map(lambda x: 1 if x == 'NIH' else 2)
title_occurrence = []
for x,y in merged[["Term", "Project Title"]].fillna("").values:
  if x.lower() in y.lower():
    title_occurrence.append(1)
  else:
    title_occurrence.append(0)
merged["term_in_title"] = title_occurrence



In [31]:

    
merged.tail()









    Out[31]:






  
    
      
      Grant Agency
      Term
      Start Year
      Organization Name
      Project Title
      Support Year
      Grant Type
      Award Amount
      State
      TermCode
      AgencyCode
      term_in_title
    
    
      ID
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      241
      NIH
      multiracial
      2010
      EMORY UNIVERSITY
      NEUROLOGICAL EMERGENCIES TREATMENT TRIALS (NETT)
      4
      2
      194117
      GA
      11
      1
      0
    
    
      242
      NIH
      multiracial
      2009
      EMORY UNIVERSITY
      NEUROLOGICAL EMERGENCIES TREATMENT TRIALS (NETT)
      3
      2
      194117
      GA
      11
      1
      0
    
    
      243
      NIH
      multiracial
      2008
      EMORY UNIVERSITY
      NEUROLOGICAL EMERGENCIES TREATMENT TRIALS (NETT)
      2
      2
      194117
      GA
      11
      1
      0
    
    
      244
      NIH
      multiracial
      2007
      EMORY UNIVERSITY
      NEUROLOGICAL EMERGENCIES TREATMENT TRIALS (NETT)
      1
      1
      382500
      GA
      11
      1
      0
    
    
      245
      NIH
      multiracial
      2005
      UNIVERSITY OF PITTSBURGH AT PITTSBURGH
      OFFICE BARRIES & FACILITATORS TO OVERCOMING DI...
      1
      1
      250000
      PA
      11
      1
      0



In [32]:

    
merged.to_csv("data/Grants/processed/nsf_nih_merged.csv")



In [89]:

    
merged = pd.read_csv("data/Grants/processed/nsf_nih_merged.csv")



In [120]:

    
years = merged["Start Year"].fillna("2017")
years = years.map(lambda x: {" ": "2017"}.get(x, x))
years = years.astype(np.int64)



In [129]:

    
merged[years==1956]









    Out[129]:






  
    
      
      ID
      Grant Agency
      Term
      Start Year
      Organization Name
      Project Title
      Support Year
      Grant Type
      Award Amount
      State
      TermCode
      AgencyCode
      term_in_title
    
  
  
    
      1084
      582
      NSF
      other race
      1956
      University of California-Berkeley
      Race and Species Formation in the Genus Clarkia
      59
      NaN
      $22,500.00
      CA
      17
      2
      0



In [127]:

    
years[years<=1981].sort_values()









    Out[127]:





1084    1956
3329    1959
1086    1959
1085    1964
2437    1968
1087    1970
1081    1970
1083    1970
5       1971
3333    1971
3330    1972
2435    1972
2438    1972
3178    1972
2436    1973
879     1973
1082    1973
3172    1974
3179    1974
2421    1975
2419    1975
894     1977
2366    1978
1078    1980
2433    1980
2425    1981
Name: Start Year, dtype: int64

NSF Budget



In [218]:

    
import xml.etree.ElementTree as ET
import locale



In [210]:

    
file = "data/Grants/NSF/NSFBudgetHistory.xml"
locale.setlocale(locale.LC_ALL, 'en_US.UTF8')
tree = ET.parse(file)



In [224]:

    
root = tree.getroot()
records = []
seen  = {}
for child in root.getchildren():
  record = {}
  if child.tag.endswith("Omnibus") or child.tag.endswith("ARRA"): continue
  year_ = child.tag.replace("FY", "")
  year = int(year_.replace("Total", ""))
  if(year > 2009) and not year_.endswith("Total"): continue
  record["year"] = year
  for grandchild in child.getchildren():
    if grandchild.tag == "ConstantDollars":
      for grandchild2 in grandchild.getchildren():
        if grandchild2.tag == "NSFTotal":
          v = list(grandchild2.itertext())[0].strip()
          record["Total"] = locale.atof(v.strip("$"))
  records.append(record)
nsf_budget = df.from_records(records)
nsf_budget.set_index("year", inplace=True)



In [226]:

    
nsf_budget.to_csv("data/Grants/processed/nsf_budget.csv")

NIH Combined Modified Dataset

Remove the following columns: ARRA indicator, Administering IC, Application ID, Contact PI Person ID, Contact PI, Other PI, Project Number, Serial Number, Subproject Number, Suffix
First column --> give title 'ID #'
Copy the word data into a new column (title it 'terms')--> code them as the following: 1 = multiculturalism, 2 = polyculturalism, 3 = cultural pluralism, 4 = monocultural, 5 = monoracial, 6 = bicultural, 7 = biracial, 8 = biethnic, 9 = interracial, 10 = multicultural, 11 = multiracial, 12 = polycultural, 13 = polyracial, 14 = polyethnic, 15 = mixed race, 16 = mixed ethnicity, 17 = other race, 18 = other ethnicity
'Activity' --> Rename 'Grant Program'
'FY' - 1 --> Rename 'Start Year'
Combine 'FY Total Cost' and 'FY Total Cost Subproject' Columns --> Rename 'Award Amount'
'IC' --> Rename 'Grant Organization'
'Organization Name' --> Rename 'Funded Organization'
Add new column --> 'Organization State' [use csv file in email to get state info for universities in list]
'Type'--> Rename 'Grant Type' [Recode as the following: 1 = 1 (New Grant), 2+ = 2 (Continuing Grant)]

NSF Combined Modified Dataset

Remove the following columns: ARRA amount, Last Amendment Date, Organization City, Organization Zip, Program Element Code, State
First column --> give title 'ID #'
Copy the word data into a new column (title it 'terms')--> code them as the following: 1 = multiculturalism, 2 = polyculturalism, 3 = cultural pluralism, 4 = monocultural, 5 = monoracial, 6 = bicultural, 7 = biracial, 8 = biethnic, 9 = interracial, 10 = multicultural, 11 = multiracial, 12 = polycultural, 13 = polyracial, 14 = polyethnic, 15 = mixed race, 16 = mixed ethnicity, 17 = other race, 18 = other ethnicity
'Abstract' --> Rename 'Project Abstract'
'Award Instrument'--> Rename 'Grant Type' [Recode as the following: 1 = Standard Grant, 2 = Continuing Grant, 3 = Fellowship, ...] we should speak about this to see all possible responses in this column
'Awarded Amount to Date' --> Rename 'Award Amount'
'NSF Directorate' OR 'NSF Organization' --> Rename 'Grant Organization' We should discuss this before you execute this
'Organization Name' --> Rename 'Funded Organization'
'Program(s)' --> Rename 'Program Topic' [Recode --> first we must have a list of all possible responses in dataset]
'Start Date' --> Rename 'Start Year' [Only use year from date information]
New Column 'Support Year': 2015-Start Year
New Column 'Award Length': End Date - Start Year (Years only)
Title --> Rename 'Project Title'



In [ ]:

Mega NIH/NSF Dataset

New Column: 'Grant Agency' [Code --> 1 = NIH, 2 = NSF]
Combine the two datasets (after following the instructions below)

NIH

Add NIH to the beginning of each remaining column

Remove the following columns:

Grant Program
Grant Organization
Funded Organization?

NSF

Add NSF to the beginning of each remaining column

Remove the following columns:

Abstract
NSF Organization/NSF Directorate
Funded Organization?
Program Topic

	word	Activity	Administering IC	Application ID	Contact PI Person ID	Contact PI / Project Leader	FY	FY Total Cost	IC	Organization Name	Other PI or Project Leader(s)	Project Number	Project Title	Serial Number	Suffix	Support Year	Type
0	bicultural	R01	NICHD	6521173	1896188	ALTMAN, JACKI	2002	335137	HD	UNIVERSITY OF CONNECTICUT STORRS	Not Applicable	5R01HD037515-03	CULTURAL CHANGE IN CHILDREARING BELIEFS AND PR...	37515		3	5
1	bicultural	R01	NICHD	6388079	1896188	ALTMAN, JACKI	2001	355033	HD	UNIVERSITY OF CONNECTICUT STORRS	Not Applicable	5R01HD037515-02	CULTURAL CHANGE IN CHILDREARING BELIEFS AND PR...	37515		2	5
2	bicultural	R01	NICHD	6138870	1896188	ALTMAN, JACKI	2000	415255	HD	UNIVERSITY OF CONNECTICUT STORRS	Not Applicable	1R01HD037515-01A1	CULTURAL CHANGE IN CHILDREARING BELIEFS AND PR...	37515	A1	1	1
3	bicultural	R01	NIMH	7664631	1861565	AMBADY, NALINI	2009	207082	MH	TUFTS UNIVERSITY MEDFORD	Not Applicable	5R01MH070833-05	COMMUNICATION OF EMOTION: BEHAVIORAL AND NEURA...	70833		5	5
4	bicultural	R01	NIMH	7488888	1861565	AMBADY, NALINI	2008	207082	MH	TUFTS UNIVERSITY MEDFORD	Not Applicable	5R01MH070833-04	COMMUNICATION OF EMOTION: BEHAVIORAL AND NEURA...	70833		4	5

	word	ARRAAmount	Abstract	AwardInstrument	AwardNumber	AwardedAmountToDate	Co-PIName(s)	EndDate	LastAmendmentDate	NSFDirectorate	...	OrganizationZip	PIEmailAddress	PrincipalInvestigator	Program(s)	ProgramElementCode(s)	ProgramManager	ProgramReferenceCode(s)	StartDate	State	Title
0	bicultural	$0.00	This project is an empirical research study us...	Standard Grant	1251588	$347,496.00		08/31/2016	09/16/2013	EHR	...	871199828	pam@aises.org	Pamala Silas	REAL	7625	Jolene K. Jesse	9150, 8212	09/15/2013	NM	Collaborative Research to Understand the Role ...
1	bicultural	$0.00	This project is an empirical research study us...	Standard Grant	1251516	$311,981.00		08/31/2017	09/07/2013	EHR	...	602013149	medin@northwestern.edu	Douglas Medin	REAL	7625	Jolene K. Jesse	8212	09/15/2013	IL	Collaborative Research to Understand the Role ...
2	bicultural	$0.00	This project is an empirical research study us...	Standard Grant	1251532	$732,512.00		08/31/2016	09/07/2013	EHR	...	871310001	JPage-Reeves@salud.unm.edu	Janet Page-Reeves	REAL	7625	Jolene K. Jesse	9150, 9177, SMET, 8212	09/15/2013	NM	Collaborative Research to Understand the Role ...
3	bicultural	$0.00	U.S. surveys of minorities and immigrants allo...	Standard Grant	0751899	$6,920.00	Robert Groves, Emilia Peytcheva	04/30/2013	05/23/2012	SBE	...	481091274	nschwarz@umich.edu	Norbert Schwarz	METHOD, MEASURE & STATS	1333	Cheryl L. Eavey	9179, SMET	05/15/2008	MI	Doctoral Dissertation Research: Language of Ad...
4	bicultural	$0.00		Standard Grant	8316803	$261,256.00		07/31/1987	07/22/1985	EHR	...	860110001		Gordon Johnson	INSTRUCTIONAL MATERIALS DEVELP	7355	Alice Moses		02/01/1984	AZ	Outdoor World Science - A Model Bicultural Hig...

	Institution_Name	Institution_State
Institution_ID
180319	21st Century Oncology, Inc. School for Radiati...	FL
180319	21st Century Oncology, Inc. School for Radiati...	FL
180319	21st Century Oncology, Inc. School for Radiati...	FL
203827	360 Degrees Beauty Academy	TX
211194	4-States Academy of Cosmetology	OK

	Grant Agency	Term	Start Year	Organization Name	Project Title	Support Year	Grant Type	Award Amount	State	TermCode	AgencyCode	term_in_title
ID
241	NIH	multiracial	2010	EMORY UNIVERSITY	NEUROLOGICAL EMERGENCIES TREATMENT TRIALS (NETT)	4	2	194117	GA	11	1	0
242	NIH	multiracial	2009	EMORY UNIVERSITY	NEUROLOGICAL EMERGENCIES TREATMENT TRIALS (NETT)	3	2	194117	GA	11	1	0
243	NIH	multiracial	2008	EMORY UNIVERSITY	NEUROLOGICAL EMERGENCIES TREATMENT TRIALS (NETT)	2	2	194117	GA	11	1	0
244	NIH	multiracial	2007	EMORY UNIVERSITY	NEUROLOGICAL EMERGENCIES TREATMENT TRIALS (NETT)	1	1	382500	GA	11	1	0
245	NIH	multiracial	2005	UNIVERSITY OF PITTSBURGH AT PITTSBURGH	OFFICE BARRIES & FACILITATORS TO OVERCOMING DI...	1	1	250000	PA	11	1	0