In [4]:
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [5]:
# scrape org names from Fast Forward directory
npages = 30
org_names_raw = []

for ipage in range(1, npages+1):
    url = 'http://www.ffwd.org/wp-admin/admin-ajax.php?action=get_results&sfid=2724&paged=%d' % ipage
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')  # name the parser explicitly so bs4 doesn't warn
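    # each org name is nested one level inside an <h2> (e.g. <h2><a>Name</a></h2>)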
    for elt in soup.find_all('h2'):
        org_names_raw.append(elt.contents[0].contents[0])

print 'Got %d org names from %s to %s' % (len(org_names_raw), org_names_raw[0], org_names_raw[-1])


Got 296 org names from #IGottaMakeIt to Zoonk

In [19]:
# QA based on visual examination.
# In most cases, the procedure was:
# look up the Fast Forward org name on http://www.guidestar.org/,
# find its EIN if one exists,
# then look up the corrected org name for that EIN in https://s3.amazonaws.com/irs-form-990/index_*.csv.
# Note that EINs in the S3 files have no dashes, while those on GuideStar do.
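# (Illustrative only, not used below: stripping the dash makes the two formats
# comparable, e.g. '12-3456789'.replace('-', '') -> '123456789', a placeholder EIN.)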
org_set = set(org_names_raw)

# each pair is (bad name as scraped from Fast Forward, good name as it appears in the IRS index)
to_replace = [
    ('AdoptTogether', 'HOPING HEARTS FOUNDATION INC'),
    ('Adopt a Classroom', 'ADOPT-A-CLASSROOM INC'),
    ('Benetech', 'BENEFICENT TECHNOLOGY'),
    ('CareerVillage.org', 'CareerVillage'),
    ('CareMessage', 'ANJNA PATIENT EDUCATION DBA CAREMESSAGE'),
    ('Classroom, Inc.', 'Classroom Inc'),
    ('Code.org', 'CODEORG'),
    ('Coworker.org', 'COWORKERORG'),
    ('Digital Democracy', 'DTWO LTD'),
    ('DonorsChoose.org', 'DONORSCHOOSEORG'),
    ('DoSomething.org', 'DO SOMETHING INC'),
    ('Elephant Action League/WildLeaks', 'ELEPHANT ACTION LEAGUE'),
    ('E The People', 'E-THE PEOPLE'),
    ('EveryoneOn', 'CONNECT TO COMPETE'),
    ('FreeCycle.org', 'Freecycle Network'),
    ('GiveDirectly', 'Give Direct'),
    ('Gooru', 'Ednovo'),
    ('GreatNonprofits', 'Great Nonprofits'),
    ('Hollaback!', 'Hollaback'),
    ('ioby', 'IN OUR BACKYARDS'),
    ('Kiva', 'KIVA FOUNDATION'),
    ('Laborlink by Good World Solutions', 'Good World Solutions'),
    ('Learn Fresh Education Co.', 'Learn Fresh Education Co'),
    ('The Lunchbox Fund', 'Lunchbox Fund'),
    ('Made In a Free World', 'FAIR TRADE FUND INC'),
    ('New Classrooms Innovation Partners for Learning', 'NEW CLASSROOMS INNOVATION PARTNERS'),
    ('Samaschool', 'Samasource'),
    ('Scratch Foundation', 'CODE-TO-LEARN FOUNDATION'),
    ('Social Interest Solutions', 'CENTER TO PROMOTE HEALTHCARE ACCESS INC'),
    ('Stellar.org', 'STELLAR FOUNDATION'),
    ('The Global Lives Project', 'Global Lives Project'),
    ('The Freecycle Network', 'Freecycle Network'),
    ('UNCOMMEN', 'MEN OF COURAGE FOUNDATION DBA UNCOMMEN'),
    ('Wishbone', 'WISHBONEORG'),
    ('WITNESS', 'WITNESS INC'),
    ('Wordnik Society', 'PLANETWORK NGO INC'),
    ('YTH', 'INTERNET SEXUALITY INFORMATION SERVICES'),
]

for bad_name, good_name in to_replace:
    org_set.remove(bad_name)
    org_set.add(good_name)
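# (set.remove raises KeyError if a bad name is absent, which doubles as a check
# that the replacement list stays in sync with the scraped names)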
    
# orgs whose names do match rows in the index, but the matched filer is a
# different organization, and no correct match exists
to_remove = set([
    'Pennies',
    'Quill',
    'UltraViolet',
])
org_set -= to_remove

# back to a sorted list
org_names = sorted(org_set)

In [20]:
# collect each org's filing rows (EINs, object IDs, etc.) from the yearly metadata files
years = [2011, 2012, 2013, 2014, 2015, 2016]
pieces = []
orgs_not_found = set(org_names)
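# every org starts out "not found"; names are discarded from this set as they match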

path = '.'  # local copies of the index files from https://s3.amazonaws.com/

for year in years:
    # pull the full metadata file for the year
    # (in the local copy of the 2014 index, line 39569 was deleted because it contains an extra comma)
    year_df = pd.read_csv(os.path.join(path, 'irs-form-990', 'index_%s.csv' % year),
                          header=0)
    print 'For year %d, got %d lines with columns %s' % (year, len(year_df), year_df.columns)
    
    # store rows matching each org name
    for name in org_names:
        # search for rows where TAXPAYER_NAME starts with the org name;
        # escape the name, since some contain regex metacharacters like '.' or '('
        regex = r'^' + re.escape(name)
        criterion = year_df['TAXPAYER_NAME'].str.contains(regex, case=False)
        org_rows = year_df[criterion]

        # store if found a single org
        if len(org_rows['TAXPAYER_NAME'].unique()) == 1:
            # one or more submissions for a single org, store all
            pieces.append(org_rows)
            orgs_not_found.discard(name)
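        # names that match no rows, or that match several distinct taxpayer
        # names (ambiguous), are left in orgs_not_found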

# combine all matched rows into a single frame
df = pd.concat(pieces)


For year 2011, got 203074 lines with columns Index([u'RETURN_ID', u'FILING_TYPE', u'EIN', u'TAX_PERIOD', u'SUB_DATE',
       u'TAXPAYER_NAME', u'RETURN_TYPE', u'DLN', u'OBJECT_ID'],
      dtype='object')
For year 2012, got 261622 lines with columns Index([u'RETURN_ID', u'FILING_TYPE', u'EIN', u'TAX_PERIOD', u'SUB_DATE',
       u'TAXPAYER_NAME', u'RETURN_TYPE', u'DLN', u'OBJECT_ID'],
      dtype='object')
For year 2013, got 261449 lines with columns Index([u'RETURN_ID', u'FILING_TYPE', u'EIN', u'TAX_PERIOD', u'SUB_DATE',
       u'TAXPAYER_NAME', u'RETURN_TYPE', u'DLN', u'OBJECT_ID'],
      dtype='object')
For year 2014, got 387528 lines with columns Index([u'RETURN_ID', u'FILING_TYPE', u'EIN', u'TAX_PERIOD', u'SUB_DATE',
       u'TAXPAYER_NAME', u'RETURN_TYPE', u'DLN', u'OBJECT_ID'],
      dtype='object')
For year 2015, got 261032 lines with columns Index([u'RETURN_ID', u'FILING_TYPE', u'EIN', u'TAX_PERIOD', u'SUB_DATE',
       u'TAXPAYER_NAME', u'RETURN_TYPE', u'DLN', u'OBJECT_ID'],
      dtype='object')
For year 2016, got 205043 lines with columns Index([u'RETURN_ID', u'FILING_TYPE', u'EIN', u'TAX_PERIOD', u'SUB_DATE',
       u'TAXPAYER_NAME', u'RETURN_TYPE', u'DLN', u'OBJECT_ID'],
      dtype='object')

In [21]:
# log found and missing orgs
orgs_found = set(org_names) - orgs_not_found
print 'found %d orgs: %s' % (len(orgs_found), sorted(orgs_found))
print
print 'failed to find %d orgs: %s' % (len(orgs_not_found), sorted(orgs_not_found))


found 104 orgs: [u'1947 Partition Archive', 'ADOPT-A-CLASSROOM INC', 'ANJNA PATIENT EDUCATION DBA CAREMESSAGE', 'BENEFICENT TECHNOLOGY', u'Blue Planet Network', u'Brackets For Good', 'CENTER TO PROMOTE HEALTHCARE ACCESS INC', 'CODE-TO-LEARN FOUNDATION', 'CODEORG', 'CONNECT TO COMPETE', 'COWORKERORG', u'Cancer Commons', 'CareerVillage', u'Case Commons', u'Center for Student Opportunity', 'Classroom Inc', u'Code for America', u'Common Sense Media', u'Crisis Text Line', u'D-Rev', 'DO SOMETHING INC', 'DONORSCHOOSEORG', 'DTWO LTD', u'Democracy Works', u'Design that Matters', u'Digital Green', 'E-THE PEOPLE', 'ELEPHANT ACTION LEAGUE', 'Ednovo', u'Elephant Action League', 'FAIR TRADE FUND INC', u'Families Empowered', 'Freecycle Network', u'Get Schooled', 'Give Direct', u'GiveWell', 'Global Lives Project', 'Good World Solutions', 'Great Nonprofits', 'HOPING HEARTS FOUNDATION INC', u'Harmony Institute', 'Hollaback', u'HopeLab', u'Humanitarian OpenStreetMap Team', 'IN OUR BACKYARDS', 'INTERNET SEXUALITY INFORMATION SERVICES', u'Imaging the World', u'Impact Network International', u'InSTEDD', u'International Bridges to Justice', u'Internews', 'KIVA FOUNDATION', u'Kangu', u'Khan Academy', 'Learn Fresh Education Co', u'Lifebox Foundation', u'Literacy Bridge', u'Literacy Lab', 'Lunchbox Fund', 'MEN OF COURAGE FOUNDATION DBA UNCOMMEN', u'MIND Research Institute', u'MakeSense', u'Medic Mobile', u'Moneythink', u'Mozilla', 'NEW CLASSROOMS INNOVATION PARTNERS', u'National Institute on Money in State Politics', u'Newborn Foundation', u'Nexleaf Analytics', u'Npower', u'OCEARCH', u'One Laptop per Child', u'Open Media Foundation', u'OpenGov Foundation', u'Operation ASHA', 'PLANETWORK NGO INC', u'Participatory Culture Foundation', u'Participatory Politics Foundation', u'PulsePoint Foundation', u'ReAllocate', u'ReadWorks', u'Reasoning Mind', 'STELLAR FOUNDATION', u'Samasource', u'Science Buddies', u'Single Stop', u'SkyTruth', u'TeachAIDS', u'TechSoup', u'Tidepool', u'Transparency Toolkit', u'UniversalGiving', u'Video Volunteers', 'WISHBONEORG', 'WITNESS INC', u'Watsi', u'WeThrive', u'Wikimedia Foundation', u'Worldreader', u'Zearn', u'Zidisha', u'edX', u'iCivics', u'myAgro']

failed to find 184 orgs: [u'#IGottaMakeIt', u'38 Degrees', u'80,000 Hours', u'AbleThrive', u'AdvanceNet Labs', u'Aliim', u'AmmiTips', u'AmpleHarvest.org', u'Apps for Good', u'ArtSlooth', u'Atma Connect', u'Bayes Impact', u'Beehive', u'Ben\u2019s Friends', u'Better World Ed', u'Beyond12', u'BitGive', u'Bloode.org', u'Blueprint International', u'Br\u0101v', u'CASH Music', u'Cadasta Foundation', u'Carbon Offsets to Alleviate Poverty (COTAP.org)', u'Cell-Ed', u'Center for Technology and Civic Life', u'Chayn', u'Civic Hall Labs', u'Climate Cents', u'Code to Inspire', u'Color of Change', u'CommonLit', u'Concrn', u'Contratados', u'CoreTech Foundation', u'CriticaLink', u'Culvee', u'Curious Learning', u'CyclePhilly', u'DNA Digest', u'Dat', u'DataKind', u'Democracy Earth Foundation', u'DemocracyOS', u'Dev4X', u'Disaster Recovery 360', u'ENVenture', u'Enlearn', u'Falling Fruit', u'Fcancer Project', u'Feeding Forward', u'Field Ready', u'FrontlineSMS', u'FunDza Literacy Trust', u'Fuppy', u'GIVMED', u'Give A Day Global', u'GiveMob', u'Glucosio', u'GradGuru', u'Grassroot', u'GreatSchools', u'Green Impact Campaign', u'Green Our Planet', u'Gun by Gun', u'Hack Club', u'Harassmap', u'Heat Seek', u'Help from Home', u'Hermes Center for Transparency and Digital Human Rights', u'History Project', u'Immigration Advocates Network', u'InPlay', u'InfoxChange', u'Intelehealth', u'Internet Archive', u'Janaagraha', u'Jompeame', u'JustFix.nyc', u'K-12 OER Collaborative', u'Karmonize', u'Key Conservation', u'Khushi Baby', u'KoBo Toolbox', u'Library For All', u'LibriVox', u'Lifelites', u'Loanwise', u'LocalWiki', u'MAMA (Mobile Alliance for Maternal Action)', u'MEANS Database', u'Majal', u'Mealshare', u'Mealshare Aid Society', u'Meedan', u'Mifos Initiative', u'MindRight', u'Miracle Messages', u'Miti Health', u'New Incentives', u'New Story Charity', u'Noora Health', u'ORGANIZE', u'OSET Foundation', u'Odisi', u'On Our Radar', u'One Brick', u'One Degree', u'Open Food Foundation', u'Open Policing', u'Open mHealth', u'OpenCurriculum', u'Pentorship', u'Philanthropy University', u'PowerMyLearning', u'Praekelt Foundation', u'Pro Bono Net', u'ProjectExplorer', u'RE-volv', u'Rainforest Connection (RFCx)', u'RateMyFosterHome.com', u'Red Dot Foundation (Safecity)', u'Refugees United (REFUNITE)', u'Refugees Welcome', u'Rent-a-Student', u'Rescuing Leftover Cuisine', u'RootIO', u'Rumie Initiative', u'SIRUM', u'SOAR', u'STEAM Initiative', u'Safecast', u'Sahana Software Foundation', u'Samasta Health Foundation', u'Satellite Assisted Pastoral Resource Management (SAPARM) Program', u'SaveOhno', u'Security First', u'Selfless', u'Serval Project', u'Sexual Health Innovations', u'Share for Cures', u'Simprints Technology', u'Social Heroes (SOZIALHELDEN)', u'Solstice Initiative', u'Solve Education!', u'SpellAfrica Initiative', u'Swinfen Charitable Trust', u'TalkingPoints', u'The Collaborative Knowledge Foundation', u'The Guardian Project', u'The MAVEN Project', u'The Ocean Cleanup', u'The Organ Preservation Alliance', u'The Saylor Academy', u'The School Fund', u'The Wild Neighbors Database Project', u'Think of Us', u'TimeHeroes', u'Tinder Foundation', u'Trek Medics International', u'Tulalens', u'Union Capital Boston', u'Unsung.org', u'Up to Global', u'Verdentum', u'Virtual Learning Academy', u'Visual Learners, \u201cAprendices Visuales\u201d', u'Vote.org', u'Voxe.org', u'WE CARE Solar', u'WattTime', u'Wayfindr', u'We The Protesters', u'WellDone International', u'Womentum', u'Young Invincibles', u'Zariya', u'Zooniverse', u'Zoonk', u'billionBricks', 
u'e-NABLE', u'embraceKulture', u'iProbono', u'mRelief', u'more good deeds']

In [22]:
# write the collected metadata, creating the output directory if needed
if not os.path.isdir('output'):
    os.makedirs('output')
df.to_csv('output/metadata.csv', index=False)
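
In [ ]:
# Hedged sketch, not part of the original run: filings in this public dataset
# conventionally live in the same bucket under their OBJECT_ID, at
# https://s3.amazonaws.com/irs-form-990/<OBJECT_ID>_public.xml, so the rows
# collected above can be turned into download URLs:
filing_urls = ['https://s3.amazonaws.com/irs-form-990/%s_public.xml' % oid
               for oid in df['OBJECT_ID']]
print 'e.g. %s' % filing_urls[0]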