In [8]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

In [28]:
# pull 990 xmls
metadata_df = pd.read_csv('output/metadata.csv', header=0)
rows = []

for xmlid in metadata_df['OBJECT_ID']:
    # pull from S3
    url = 'https://s3.amazonaws.com/irs-form-990/%d_public.xml' % xmlid
    print url
    response = requests.get(url)
    
    # parse to soup
    soup = BeautifulSoup(response.text[3:], 'xml')
    
    # get org data
    ein = soup.find('EIN').contents[0]
    try:
        org_name = soup.find('Filer').Name.BusinessNameLine1.contents[0]
    except AttributeError:
        try:
            org_name = soup.find('Filer').BusinessName.BusinessNameLine1.contents[0]
        except AttributeError:
            org_name = soup.find('Filer').BusinessName.BusinessNameLine1Txt.contents[0]
    try:
        city = soup.find('Filer').USAddress.City.contents[0]
    except AttributeError:
        city = soup.find('Filer').USAddress.CityNm.contents[0]
    try:
        state = soup.find('Filer').USAddress.State.contents[0]
    except AttributeError:
        state = soup.find('Filer').USAddress.StateAbbreviationCd.contents[0]
    try:
        tax_year = soup.find('TaxYear').contents[0]
    except AttributeError:
        tax_year = soup.find('TaxYr').contents[0]
    try:
        submission_time = soup.find('Timestamp').contents[0]
    except AttributeError:
        submission_time = soup.find('ReturnTs').contents[0]        
    try:
        year_formed = soup.find('YearFormation').contents[0]
    except AttributeError:
        try:
            year_formed = soup.find('FormationYr').contents[0]
        except:
            print 'year formed'
            year_formed = None
    try:
        n_employees = soup.find('TotalNbrEmployees').contents[0]
    except AttributeError:
        try:
            n_employees = soup.find('TotalEmployeeCnt').contents[0]
        except:
            print 'n employees'
            n_employees = None

    # get revenue
    total_revenue = None
    revenue_fields = ['TotalRevenueCurrentYear', 'TotalRevenue', 'TotalRevenueAmt',
                      'TotalRevenueAndExpenses', 'CYTotalRevenueAmt', 'TotalRevAndExpnssAmt']
    for revenue_field in revenue_fields:
        try:
            total_revenue = soup.find(revenue_field).contents[0]
            break
        except AttributeError:
            pass
    if total_revenue in [None, '\n']:
        break
        
    base_row = [
        org_name, city, state, ein, url,
        tax_year, year_formed, submission_time,
        n_employees, total_revenue,
    ]

    # get compensation data
    n_people_found = 0
    for employee in soup.find_all('Form990PartVIISectionA'):
        try:
            title = employee.Title.contents[0]
            comp = employee.ReportableCompFromOrganization.contents[0]
        except AttributeError:
            continue
        rows.append(base_row + [title, comp])
        n_people_found += 1
    for employee in soup.find_all('Form990PartVIISectionAGrp'):
        try:
            title = employee.TitleTxt.contents[0]
            comp = employee.ReportableCompFromOrgAmt.contents[0]
        except AttributeError:
            continue
        rows.append(base_row + [title, comp])
        n_people_found += 1
    for employee in soup.find_all('OfficerDirTrstKeyEmplGrp') + soup.find_all('OfficerDirectorTrusteeEmplGrp'):
        try:
            title = employee.TitleTxt.contents[0]
            comp = employee.CompensationAmt.contents[0]
        except AttributeError:
            continue
        rows.append(base_row + [title, comp])
        n_people_found += 1
    for employee in soup.find_all('OfficerDirectorTrusteeKeyEmpl') + soup.find_all('OfcrDirTrusteesOrKeyEmployee') + soup.find_all('CompensationOfHighestPaidEmpl'):
        try:
            title = employee.Title.contents[0]
            comp = employee.Compensation.contents[0]
        except AttributeError:
            continue
        rows.append(base_row + [title, comp])
        n_people_found += 1

    print org_name, n_people_found

df = pd.DataFrame(rows)


https://s3.amazonaws.com/irs-form-990/201120429349301812_public.xml
The Center to Promote Healthcare Access Inc 11
https://s3.amazonaws.com/irs-form-990/201102279349304550_public.xml
CENTER FOR STUDENT OPPORTUNITY INC 3
https://s3.amazonaws.com/irs-form-990/201132989349300003_public.xml
CLASSROOM INC 15
https://s3.amazonaws.com/irs-form-990/201103139349300945_public.xml
CODE FOR AMERICA LABS INC 6
https://s3.amazonaws.com/irs-form-990/201143199349310864_public.xml
D-REV DESIGN FOR THE OTHER 90 8
https://s3.amazonaws.com/irs-form-990/201122989349300912_public.xml
FAMILIES EMPOWERED 2
https://s3.amazonaws.com/irs-form-990/201100329349200010_public.xml
year formed
n employees
THE FREECYCLE NETWORK 3
https://s3.amazonaws.com/irs-form-990/201133189349303268_public.xml
HARMONY INSTITUTE INC 3
https://s3.amazonaws.com/irs-form-990/201133199349101553_public.xml
year formed
n employees
HopeLab Foundation Inc 9
https://s3.amazonaws.com/irs-form-990/201113199349301711_public.xml
IN OUR BACKYARDS INC 10
https://s3.amazonaws.com/irs-form-990/201123199349309522_public.xml
IMAGING THE WORLD CORPORATION 15
https://s3.amazonaws.com/irs-form-990/201143119349100554_public.xml
year formed
n employees
THE KIVA FOUNDATION 3
https://s3.amazonaws.com/irs-form-990/201103189349307545_public.xml
LITERACY BRIDGE 4
https://s3.amazonaws.com/irs-form-990/201113199349309251_public.xml
ONE LAPTOP PER CHILD ASSOCIATION INC 6
https://s3.amazonaws.com/irs-form-990/201131959349300623_public.xml
OPEN MEDIA FOUNDATION 11
https://s3.amazonaws.com/irs-form-990/201112249349300801_public.xml
PARTICIPATORY CULTURE FOUNDATION INC 10
https://s3.amazonaws.com/irs-form-990/201103199349309475_public.xml
PARTICIPATORY POLITICS FOUNDATION 4
https://s3.amazonaws.com/irs-form-990/201123199349305262_public.xml
SAMASOURCE INC 7
https://s3.amazonaws.com/irs-form-990/201121349349300517_public.xml
SkyTruth 7
https://s3.amazonaws.com/irs-form-990/201121369349305607_public.xml
TECHSOUP GLOBAL 21
https://s3.amazonaws.com/irs-form-990/201120469349300637_public.xml
Video Volunteers 11
https://s3.amazonaws.com/irs-form-990/201130919349300623_public.xml
Wikimedia Foundation Inc 14
https://s3.amazonaws.com/irs-form-990/201103189349303710_public.xml
iCivics Inc 17
https://s3.amazonaws.com/irs-form-990/201120469349301227_public.xml
iCivics Inc 13
https://s3.amazonaws.com/irs-form-990/201200689349301005_public.xml
ADOPT-A-CLASSROOM INC 13
https://s3.amazonaws.com/irs-form-990/201243159349300019_public.xml
Blue Planet Network 9
https://s3.amazonaws.com/irs-form-990/201232929349300613_public.xml
CLASSROOM INC 16
https://s3.amazonaws.com/irs-form-990/201202569349300710_public.xml
COMMON SENSE MEDIA 33
https://s3.amazonaws.com/irs-form-990/201211329349302176_public.xml
Do Something Inc 11
https://s3.amazonaws.com/irs-form-990/201221329349301047_public.xml
Do Something Inc 11
https://s3.amazonaws.com/irs-form-990/201202849349300825_public.xml
DTWO LTD 3
https://s3.amazonaws.com/irs-form-990/201201369349201200_public.xml
year formed
n employees
DEMOCRACY WORKS INC 3
https://s3.amazonaws.com/irs-form-990/201241359349303439_public.xml
DESIGN THAT MATTERS INC 2
https://s3.amazonaws.com/irs-form-990/201203199349302890_public.xml
DIGITAL GREEN FOUNDATION 5
https://s3.amazonaws.com/irs-form-990/201223199349302697_public.xml
DIGITAL GREEN FOUNDATION 3
https://s3.amazonaws.com/irs-form-990/201202419349300615_public.xml
EDNOVO 1
https://s3.amazonaws.com/irs-form-990/201213189349301046_public.xml
FAMILIES EMPOWERED 3
https://s3.amazonaws.com/irs-form-990/201233199349202593_public.xml
year formed
n employees
GLOBAL LIVES PROJECT INC 14
https://s3.amazonaws.com/irs-form-990/201233079349300323_public.xml
Great NonProfits 11
https://s3.amazonaws.com/irs-form-990/201213199349101501_public.xml
year formed
n employees
HopeLab Foundation Inc 10
https://s3.amazonaws.com/irs-form-990/201233209349308583_public.xml
IMAGING THE WORLD CORPORATION 13
https://s3.amazonaws.com/irs-form-990/201213219349300501_public.xml
InSTEDD 8
https://s3.amazonaws.com/irs-form-990/201232549349100103_public.xml
year formed
n employees
THE KIVA FOUNDATION 3
https://s3.amazonaws.com/irs-form-990/201223209349305537_public.xml
LITERACY BRIDGE 4
https://s3.amazonaws.com/irs-form-990/201231249349300743_public.xml
LITERACY LAB 6
https://s3.amazonaws.com/irs-form-990/201212409349300021_public.xml
MIND RESEARCH INSTITUTE 35
https://s3.amazonaws.com/irs-form-990/201240629349301609_public.xml
National Institute on Money in State 10
https://s3.amazonaws.com/irs-form-990/201243189349304369_public.xml
NEWBORN FOUNDATION 3
https://s3.amazonaws.com/irs-form-990/201212209349301091_public.xml
NEXLEAF ANALYTICS 5
https://s3.amazonaws.com/irs-form-990/201233209349305723_public.xml
ONE LAPTOP PER CHILD ASSOCIATION INC 7
https://s3.amazonaws.com/irs-form-990/201222999349300912_public.xml
OPEN MEDIA FOUNDATION 8
https://s3.amazonaws.com/irs-form-990/201242149349300724_public.xml
PARTICIPATORY CULTURE FOUNDATION INC 10
https://s3.amazonaws.com/irs-form-990/201203479349300040_public.xml
PARTICIPATORY POLITICS FOUNDATION 4
https://s3.amazonaws.com/irs-form-990/201223209349303937_public.xml
SAMASOURCE INC 6
https://s3.amazonaws.com/irs-form-990/201232769349300703_public.xml
Single Stop USA Inc 18
https://s3.amazonaws.com/irs-form-990/201221369349305462_public.xml
SKYTRUTH 6
https://s3.amazonaws.com/irs-form-990/201211369349307016_public.xml
TECHSOUP GLOBAL 22
https://s3.amazonaws.com/irs-form-990/201233209349307078_public.xml
UNIVERSALGIVING 7
https://s3.amazonaws.com/irs-form-990/201240869349300634_public.xml
Video Volunteers 13
https://s3.amazonaws.com/irs-form-990/201241369349302854_public.xml
WISHBONEORG 3
https://s3.amazonaws.com/irs-form-990/201241309349302274_public.xml
Wikimedia Foundation Inc 17
https://s3.amazonaws.com/irs-form-990/201242139349300439_public.xml
WORLDREADERORG 3
https://s3.amazonaws.com/irs-form-990/201223079349301162_public.xml
iCivics Inc 16
https://s3.amazonaws.com/irs-form-990/201312269349201931_public.xml
year formed
n employees
THE 1947 PARTITION ARCHIVE 2
https://s3.amazonaws.com/irs-form-990/201311359349305906_public.xml
ADOPT-A-CLASSROOM INC 14
https://s3.amazonaws.com/irs-form-990/201332229349300303_public.xml
Blue Planet Network 8
https://s3.amazonaws.com/irs-form-990/201341309349301729_public.xml
THE CENTER TO PROMOTE HEALTHCARE ACCESS INC 11
https://s3.amazonaws.com/irs-form-990/201322279349100507_public.xml
year formed
n employees
CASE COMMONS INC 12
https://s3.amazonaws.com/irs-form-990/201330429349301363_public.xml
CENTER FOR STUDENT OPPORTUNITY INC 4
https://s3.amazonaws.com/irs-form-990/201330449349300513_public.xml
CODE FOR AMERICA LABS INC 8
https://s3.amazonaws.com/irs-form-990/201333189349304218_public.xml
COMMON SENSE MEDIA 38
https://s3.amazonaws.com/irs-form-990/201343179349302349_public.xml
D-Rev Design for the Other 90 7
https://s3.amazonaws.com/irs-form-990/201331289349301198_public.xml
Do Something Inc 13
https://s3.amazonaws.com/irs-form-990/201301709349300320_public.xml
DEMOCRACY WORKS INC 6
https://s3.amazonaws.com/irs-form-990/201341709349300114_public.xml
DESIGN THAT MATTERS INC 2
https://s3.amazonaws.com/irs-form-990/201303049349300500_public.xml
EDNOVO 3
https://s3.amazonaws.com/irs-form-990/201343189349302609_public.xml
FAIR TRADE FUND INC 6
https://s3.amazonaws.com/irs-form-990/201302559349300700_public.xml
FAMILIES EMPOWERED 3
https://s3.amazonaws.com/irs-form-990/201321359349305357_public.xml
THE FREECYCLE NETWORK 3
https://s3.amazonaws.com/irs-form-990/201311289349301131_public.xml
THE GET SCHOOLED FOUNDATION 9
https://s3.amazonaws.com/irs-form-990/201310159349301326_public.xml
GIVE DIRECT INC 5
https://s3.amazonaws.com/irs-form-990/201322059349300132_public.xml
GOOD WORLD SOLUTIONS INC 3
https://s3.amazonaws.com/irs-form-990/201332269349301263_public.xml
Great NonProfits 14
https://s3.amazonaws.com/irs-form-990/201313189349306691_public.xml
HARMONY INSTITUTE INC 6
https://s3.amazonaws.com/irs-form-990/201330179349300133_public.xml
HARMONY INSTITUTE INC 4
https://s3.amazonaws.com/irs-form-990/201341359349203334_public.xml
year formed
n employees
HOLLABACK INC 12
https://s3.amazonaws.com/irs-form-990/201333179349100853_public.xml
year formed
n employees
HopeLab Foundation Inc 10
https://s3.amazonaws.com/irs-form-990/201343169349303194_public.xml
HUMANITARIAN OPENSTREETMAP TEAM UNITED STATES INC 8
https://s3.amazonaws.com/irs-form-990/201330329349300703_public.xml
IN OUR BACKYARDS INC 16
https://s3.amazonaws.com/irs-form-990/201303199349302740_public.xml
IN OUR BACKYARDS INC 16
https://s3.amazonaws.com/irs-form-990/201313479349200316_public.xml
year formed
n employees
IMAGING THE WORLD CORPORATION 11
https://s3.amazonaws.com/irs-form-990/201302629349300710_public.xml
INTERNEWS NETWORK INC 21
https://s3.amazonaws.com/irs-form-990/201311309349100016_public.xml
year formed
n employees
THE KIVA FOUNDATION 3
https://s3.amazonaws.com/irs-form-990/201303179349306290_public.xml
KHAN ACADEMY INC 14
https://s3.amazonaws.com/irs-form-990/201342249349300249_public.xml
THE LITERACY LAB 6
https://s3.amazonaws.com/irs-form-990/201342249349301364_public.xml
year formed
The Lunchbox Fund 4
https://s3.amazonaws.com/irs-form-990/201341929349300524_public.xml
MIND RESEARCH INSTITUTE 26
https://s3.amazonaws.com/irs-form-990/201323509349300747_public.xml
Moneythink 6
https://s3.amazonaws.com/irs-form-990/201300819349300900_public.xml
NEW CLASSROOMS INNOVATION PARTNERSINC 7
https://s3.amazonaws.com/irs-form-990/201340439349301859_public.xml
National Institute on Money in State 10
https://s3.amazonaws.com/irs-form-990/201332329349300233_public.xml
NEWBORN FOUNDATION 3
https://s3.amazonaws.com/irs-form-990/201332359349300708_public.xml
NEXLEAF ANALYTICS 7
https://s3.amazonaws.com/irs-form-990/201303029349300400_public.xml
NPOWERNY INC 26
https://s3.amazonaws.com/irs-form-990/201342709349300039_public.xml
OPEN MEDIA FOUNDATION 8
https://s3.amazonaws.com/irs-form-990/201312249349300731_public.xml
PARTICIPATORY CULTURE FOUNDATION INC 10
https://s3.amazonaws.com/irs-form-990/201303179349303935_public.xml
PARTICIPATORY POLITICS FOUNDATION 4
https://s3.amazonaws.com/irs-form-990/201341369349200709_public.xml
year formed
n employees
REALLOCATE INC 5
https://s3.amazonaws.com/irs-form-990/201332319349300013_public.xml
Single Stop USA Inc 20
https://s3.amazonaws.com/irs-form-990/201322269349302302_public.xml
SkyTruth 7
https://s3.amazonaws.com/irs-form-990/201301349349307710_public.xml
TECHSOUP GLOBAL 21
https://s3.amazonaws.com/irs-form-990/201310359349300441_public.xml
Video Volunteers 13
https://s3.amazonaws.com/irs-form-990/201331309349303228_public.xml
WISHBONEORG 3
https://s3.amazonaws.com/irs-form-990/201321069349300127_public.xml
Wikimedia Foundation Inc 18
https://s3.amazonaws.com/irs-form-990/201312219349200756_public.xml
year formed
n employees
ZIDISHA INC 4
https://s3.amazonaws.com/irs-form-990/201342979349300624_public.xml
iCivics Inc 17
https://s3.amazonaws.com/irs-form-990/201442249349201164_public.xml
year formed
n employees
THE 1947 PARTITION ARCHIVE 2
https://s3.amazonaws.com/irs-form-990/201400229349300440_public.xml
ADOPT-A-CLASSROOM INC 13
https://s3.amazonaws.com/irs-form-990/201403189349303940_public.xml
ADOPT-A-CLASSROOM INC 16
https://s3.amazonaws.com/irs-form-990/201402259349303355_public.xml
Anjna Patient Education 6
https://s3.amazonaws.com/irs-form-990/201421329349300247_public.xml
THE CENTER TO PROMOTE HEALTHCARE ACCESS INC 14
https://s3.amazonaws.com/irs-form-990/201443219349306104_public.xml
CODE-TO-LEARN FOUNDATION 5
https://s3.amazonaws.com/irs-form-990/201442259349302539_public.xml
CODEORG 9
https://s3.amazonaws.com/irs-form-990/201413379349300421_public.xml
COWORKERORG 5
https://s3.amazonaws.com/irs-form-990/201421189349300932_public.xml
CANCER COMMONS 7
https://s3.amazonaws.com/irs-form-990/201441049349200834_public.xml
year formed
n employees
CAREERVILLAGE INC 5
https://s3.amazonaws.com/irs-form-990/201421029349200507_public.xml
year formed
n employees
CAREERVILLAGE INC 5
https://s3.amazonaws.com/irs-form-990/201413179349100046_public.xml
year formed
n employees
CASE COMMONS INC 7
https://s3.amazonaws.com/irs-form-990/201420439349301802_public.xml
CENTER FOR STUDENT OPPORTUNITY INC 4
https://s3.amazonaws.com/irs-form-990/201403089349300020_public.xml
CLASSROOM INC 19
https://s3.amazonaws.com/irs-form-990/201322909349300512_public.xml
CLASSROOM INC 14
https://s3.amazonaws.com/irs-form-990/201400439349301310_public.xml
CODE FOR AMERICA LABS INC 12
https://s3.amazonaws.com/irs-form-990/201422169349300032_public.xml
COMMON SENSE MEDIA 38
https://s3.amazonaws.com/irs-form-990/201403219349304515_public.xml
D-REV DESIGN FOR THE OTHER 90 9
https://s3.amazonaws.com/irs-form-990/201442119349300934_public.xml
DO SOMETHING INC 14
https://s3.amazonaws.com/irs-form-990/201403149349301740_public.xml
DONORSCHOOSEORG 23
https://s3.amazonaws.com/irs-form-990/201443219349310934_public.xml
DTWO LTD 4
https://s3.amazonaws.com/irs-form-990/201313199349309261_public.xml
DTWO LTD 3
https://s3.amazonaws.com/irs-form-990/201443169349303409_public.xml
DEMOCRACY WORKS INC 7
https://s3.amazonaws.com/irs-form-990/201430939349300718_public.xml
DESIGN THAT MATTERS INC 4
https://s3.amazonaws.com/irs-form-990/201333199349305408_public.xml
DIGITAL GREEN FOUNDATION 7
https://s3.amazonaws.com/irs-form-990/201413219349304766_public.xml
DIGITAL GREEN FOUNDATION 7
https://s3.amazonaws.com/irs-form-990/201443079349300704_public.xml
EDNOVO 2
https://s3.amazonaws.com/irs-form-990/201442209349301109_public.xml
FAIR TRADE FUND INC 7
https://s3.amazonaws.com/irs-form-990/201423039349301302_public.xml
FAMILIES EMPOWERED 3
https://s3.amazonaws.com/irs-form-990/201420139349300007_public.xml
THE FREECYCLE NETWORK 3
https://s3.amazonaws.com/irs-form-990/201441349349302264_public.xml
THE GET SCHOOLED FOUNDATION 9
https://s3.amazonaws.com/irs-form-990/201410869349300616_public.xml
GIVE DIRECT INC 0
https://s3.amazonaws.com/irs-form-990/201433189349200323_public.xml
year formed
n employees
GLOBAL LIVES PROJECT INC 12
https://s3.amazonaws.com/irs-form-990/201422819349300747_public.xml
GOOD WORLD SOLUTIONS INC 3
https://s3.amazonaws.com/irs-form-990/201411989349300021_public.xml
Great NonProfits 12
https://s3.amazonaws.com/irs-form-990/201303199349308180_public.xml
HOPING HEARTS FOUNDATION INC 4
https://s3.amazonaws.com/irs-form-990/201433169349304393_public.xml
HOPING HEARTS FOUNDATION INC 4
https://s3.amazonaws.com/irs-form-990/201423229349300622_public.xml
HARMONY INSTITUTE INC 2
https://s3.amazonaws.com/irs-form-990/201430889349300413_public.xml
HOLLABACK INC 16
https://s3.amazonaws.com/irs-form-990/201413189349102001_public.xml
year formed
n employees
HopeLab Foundation Inc 5
https://s3.amazonaws.com/irs-form-990/201443219349305954_public.xml
IN OUR BACKYARDS INC 19
https://s3.amazonaws.com/irs-form-990/201423179349201142_public.xml
year formed
n employees
IMAGING THE WORLD CORP 10
https://s3.amazonaws.com/irs-form-990/201403219349303840_public.xml
InSTEDD 9
https://s3.amazonaws.com/irs-form-990/201333199349310773_public.xml
InSTEDD 9
https://s3.amazonaws.com/irs-form-990/201433179349305208_public.xml
INTERNEWS NETWORK INC 22
https://s3.amazonaws.com/irs-form-990/201441259349100734_public.xml
year formed
n employees
THE KIVA FOUNDATION 3
https://s3.amazonaws.com/irs-form-990/201413189349302501_public.xml
KHAN ACADEMY INC 17
https://s3.amazonaws.com/irs-form-990/201441329349201729_public.xml
year formed
n employees
LIFEBOX FOUNDATION INC 4
https://s3.amazonaws.com/irs-form-990/201413189349309466_public.xml
Literacy Bridge 6
https://s3.amazonaws.com/irs-form-990/201323199349310847_public.xml
LITERACY BRIDGE 4
https://s3.amazonaws.com/irs-form-990/201412239349301626_public.xml
THE LITERACY LAB 7
https://s3.amazonaws.com/irs-form-990/201433159349301688_public.xml
year formed
The Lunchbox Fund 4
https://s3.amazonaws.com/irs-form-990/201432359349300303_public.xml
MIND RESEARCH INSTITUTE 27
https://s3.amazonaws.com/irs-form-990/201333229349300448_public.xml
MEDIC MOBILE INC 7
https://s3.amazonaws.com/irs-form-990/201403229349301375_public.xml
MEDIC MOBILE INC 7
https://s3.amazonaws.com/irs-form-990/201401349349305055_public.xml
NEW CLASSROOMS INNOVATION PARTNERSINC 13
https://s3.amazonaws.com/irs-form-990/201343299349300619_public.xml
NATIONAL INSTITUTE ON MONEY IN STATE POLITICS 11
https://s3.amazonaws.com/irs-form-990/201402259349302940_public.xml
NEXLEAF ANALYTICS 7
https://s3.amazonaws.com/irs-form-990/201403219349309225_public.xml
OCEARCH 10
https://s3.amazonaws.com/irs-form-990/201313199349311756_public.xml
OCEARCH 11
https://s3.amazonaws.com/irs-form-990/201343199349309234_public.xml
ONE LAPTOP PER CHILD ASSOCIATION INC 9
https://s3.amazonaws.com/irs-form-990/201412259349302466_public.xml
OPEN MEDIA FOUNDATION 9
https://s3.amazonaws.com/irs-form-990/201421329349300447_public.xml
THE OPENGOV FOUNDATION 7
https://s3.amazonaws.com/irs-form-990/201413009349300026_public.xml
OPERATION ASHA NFP 13
https://s3.amazonaws.com/irs-form-990/201403119349301295_public.xml
PARTICIPATORY CULTURE FOUNDATION INC 12
https://s3.amazonaws.com/irs-form-990/201423179349306547_public.xml
PARTICIPATORY POLITICS FOUNDATION 4
https://s3.amazonaws.com/irs-form-990/201410359349200726_public.xml
year formed
n employees
PULSEPOINT FOUNDATION 6
https://s3.amazonaws.com/irs-form-990/201442809349300529_public.xml
PULSEPOINT FOUNDATION 6
https://s3.amazonaws.com/irs-form-990/201411889349300766_public.xml
ReadworksInc 23
https://s3.amazonaws.com/irs-form-990/201401069349300205_public.xml
REASONING MIND INC 29
https://s3.amazonaws.com/irs-form-990/201341489349100414_public.xml
year formed
n employees
THE STELLAR FOUNDATION 5
https://s3.amazonaws.com/irs-form-990/201401649349100200_public.xml
year formed
n employees
THE STELLAR FOUNDATION 5
https://s3.amazonaws.com/irs-form-990/201433219349310743_public.xml
SAMASOURCE INC 10
https://s3.amazonaws.com/irs-form-990/201442249349301454_public.xml
SCIENCE BUDDIES 7
https://s3.amazonaws.com/irs-form-990/201402139349300835_public.xml
Single Stop USA Inc 20
https://s3.amazonaws.com/irs-form-990/201402759349300605_public.xml
SkyTruth 7
https://s3.amazonaws.com/irs-form-990/201421359349308847_public.xml
TECHSOUP GLOBAL 19
https://s3.amazonaws.com/irs-form-990/201413219349308226_public.xml
UNIVERSALGIVING 9
https://s3.amazonaws.com/irs-form-990/201343199349309289_public.xml
UNIVERSALGIVING 8
https://s3.amazonaws.com/irs-form-990/201421819349300017_public.xml
WISHBONEORG 6
https://s3.amazonaws.com/irs-form-990/201402279349301040_public.xml
WATSI INC 5
https://s3.amazonaws.com/irs-form-990/201411139349300116_public.xml
Wikimedia Foundation Inc 8
https://s3.amazonaws.com/irs-form-990/201402309349300725_public.xml
WORLDREADERORG 10
https://s3.amazonaws.com/irs-form-990/201421709349300012_public.xml
ZIDISHA INC 4
https://s3.amazonaws.com/irs-form-990/201413169349300111_public.xml
iCivics Inc 16
https://s3.amazonaws.com/irs-form-990/201413179349302161_public.xml
myAgro Farms 4
https://s3.amazonaws.com/irs-form-990/201532099349200018_public.xml
year formed
n employees
THE 1947 PARTITION ARCHIVE 2
https://s3.amazonaws.com/irs-form-990/201502249349302085_public.xml
BENEFICENT TECHNOLOGY INC DBA Benetech 16
https://s3.amazonaws.com/irs-form-990/201501749349300500_public.xml
BRACKETS FOR GOOD INC 7
https://s3.amazonaws.com/irs-form-990/201511279349302026_public.xml
THE CENTER TO PROMOTE HEALTHCARE ACCESS INC 18
https://s3.amazonaws.com/irs-form-990/201512469349300951_public.xml
CODEORG 13
https://s3.amazonaws.com/irs-form-990/201502739349300730_public.xml
CONNECT TO COMPETE INC 6
https://s3.amazonaws.com/irs-form-990/201502539349200750_public.xml
year formed
n employees
COWORKERORG 5
https://s3.amazonaws.com/irs-form-990/201520479349302657_public.xml
CANCER COMMONS 9
https://s3.amazonaws.com/irs-form-990/201500799349200305_public.xml
year formed
n employees
CAREERVILLAGE INC 5
https://s3.amazonaws.com/irs-form-990/201540429349300024_public.xml
CENTER FOR STUDENT OPPORTUNITY INC 4
https://s3.amazonaws.com/irs-form-990/201542379349200034_public.xml
year formed
n employees
CRISIS TEXT LINE INC 6
https://s3.amazonaws.com/irs-form-990/201542219349200209_public.xml
year formed
n employees
CRISIS TEXT LINE INC 6
https://s3.amazonaws.com/irs-form-990/201542219349300409_public.xml
CRISIS TEXT LINE INC 7
https://s3.amazonaws.com/irs-form-990/201541409349300039_public.xml
DO SOMETHING INC 16
https://s3.amazonaws.com/irs-form-990/201521049349300137_public.xml
DESIGN THAT MATTERS INC 6
https://s3.amazonaws.com/irs-form-990/201501319349302305_public.xml
THE GET SCHOOLED FOUNDATION 10
https://s3.amazonaws.com/irs-form-990/201530309349300318_public.xml
HARMONY INSTITUTE INC 6
https://s3.amazonaws.com/irs-form-990/201531279349300963_public.xml
HOLLABACK INC 20
https://s3.amazonaws.com/irs-form-990/201521559349300607_public.xml
Impact Network International Inc 10
https://s3.amazonaws.com/irs-form-990/201531359349101253_public.xml
year formed
n employees
THE KIVA FOUNDATION 3
https://s3.amazonaws.com/irs-form-990/201532179349200013_public.xml
year formed
n employees
KANGU INC 3
https://s3.amazonaws.com/irs-form-990/201532179349300428_public.xml
KANGU INC 3
https://s3.amazonaws.com/irs-form-990/201532249349302008_public.xml
KHAN ACADEMY INC 18
https://s3.amazonaws.com/irs-form-990/201522249349300707_public.xml
LEARN FRESH EDUCATION CO 4
https://s3.amazonaws.com/irs-form-990/201512529349300111_public.xml
Lifebox Foundation Inc 6
https://s3.amazonaws.com/irs-form-990/201531759349300018_public.xml
THE LITERACY LAB 8
https://s3.amazonaws.com/irs-form-990/201532649349300053_public.xml
MEN OF COURAGE FOUNDATION DBA UNCOMMEN 4
https://s3.amazonaws.com/irs-form-990/201511469349200336_public.xml
year formed
n employees
The MakeSense Foundation 6
https://s3.amazonaws.com/irs-form-990/201510689349300626_public.xml
Moneythink 8
https://s3.amazonaws.com/irs-form-990/201540369349300029_public.xml
NEW CLASSROOMS INNOVATION PARTNERSINC 17
https://s3.amazonaws.com/irs-form-990/201520449349302512_public.xml
NATIONAL INSTITUTE ON MONEY IN STATE POLITICS 10
https://s3.amazonaws.com/irs-form-990/201522619349300817_public.xml
OCEARCH 11
https://s3.amazonaws.com/irs-form-990/201501359349303045_public.xml
THE OPENGOV FOUNDATION 8
https://s3.amazonaws.com/irs-form-990/201531349349307138_public.xml
PARTICIPATORY POLITICS FOUNDATION 4
https://s3.amazonaws.com/irs-form-990/201500759349201000_public.xml
year formed
n employees
REALLOCATE INC 5
https://s3.amazonaws.com/irs-form-990/201531629349300818_public.xml
ReadworksInc 23
https://s3.amazonaws.com/irs-form-990/201511039349300831_public.xml
REASONING MIND INC 26
https://s3.amazonaws.com/irs-form-990/201512119349100301_public.xml
year formed
n employees
THE STELLAR FOUNDATION 5
https://s3.amazonaws.com/irs-form-990/201512299349302171_public.xml
SCIENCE BUDDIES 7
https://s3.amazonaws.com/irs-form-990/201531359349306948_public.xml
TECHSOUP GLOBAL 22
https://s3.amazonaws.com/irs-form-990/201530239349300148_public.xml
TIDEPOOL PROJECT 7
https://s3.amazonaws.com/irs-form-990/201532239349201173_public.xml
year formed
n employees
TRANSPARENCY TOOLKIT INC 3
https://s3.amazonaws.com/irs-form-990/201510359349300431_public.xml
Video Volunteers 10
https://s3.amazonaws.com/irs-form-990/201532299349302913_public.xml
WISHBONEORG 5
https://s3.amazonaws.com/irs-form-990/201522269349303777_public.xml
WATSI INC 4
https://s3.amazonaws.com/irs-form-990/201530429349200523_public.xml
year formed
n employees
weTHRIVE INC 5
https://s3.amazonaws.com/irs-form-990/201511199349300201_public.xml
Wikimedia Foundation Inc 18
https://s3.amazonaws.com/irs-form-990/201511419349300016_public.xml
WORLDREADERORG 9
https://s3.amazonaws.com/irs-form-990/201522179349301142_public.xml
ZIDISHA INC 3
https://s3.amazonaws.com/irs-form-990/201541339349303074_public.xml
EDX INC 19
https://s3.amazonaws.com/irs-form-990/201640419349301824_public.xml
ADOPT-A-CLASSROOM INC 15
https://s3.amazonaws.com/irs-form-990/201600929349300610_public.xml
THE CENTER TO PROMOTE HEALTHCARE ACCESS INC 20
https://s3.amazonaws.com/irs-form-990/201503179349300200_public.xml
CODE-TO-LEARN FOUNDATION 6
https://s3.amazonaws.com/irs-form-990/201621069349301122_public.xml
CANCER COMMONS 8
https://s3.amazonaws.com/irs-form-990/201630159349200408_public.xml
year formed
n employees
CAREERVILLAGE INC 6
https://s3.amazonaws.com/irs-form-990/201513209349101051_public.xml
year formed
n employees
CASE COMMONS INC 7
https://s3.amazonaws.com/irs-form-990/201503489349301155_public.xml
CENTER FOR STUDENT OPPORTUNITY INC 4
https://s3.amazonaws.com/irs-form-990/201513149349303286_public.xml
CLASSROOM INC 17
https://s3.amazonaws.com/irs-form-990/201513179349305516_public.xml
CODE FOR AMERICA LABS INC 17
https://s3.amazonaws.com/irs-form-990/201503209349314685_public.xml
COMMON SENSE MEDIA 35
https://s3.amazonaws.com/irs-form-990/201533209349311808_public.xml
D-REV DESIGN FOR THE OTHER 90 12
https://s3.amazonaws.com/irs-form-990/201523089349301407_public.xml
DONORSCHOOSEORG 25
https://s3.amazonaws.com/irs-form-990/201543179349202514_public.xml
year formed
n employees
DTWO LTD 5
https://s3.amazonaws.com/irs-form-990/201513209349303006_public.xml
DEMOCRACY WORKS INC 12
https://s3.amazonaws.com/irs-form-990/201630479349301353_public.xml
DIGITAL GREEN FOUNDATION 7
https://s3.amazonaws.com/irs-form-990/201543209349306209_public.xml
E-THE PEOPLE 4
https://s3.amazonaws.com/irs-form-990/201513249349200521_public.xml
year formed
n employees
Elephant Action League 3
https://s3.amazonaws.com/irs-form-990/201513249349200521_public.xml
year formed
n employees
Elephant Action League 3
https://s3.amazonaws.com/irs-form-990/201542809349300109_public.xml
FAMILIES EMPOWERED 2
https://s3.amazonaws.com/irs-form-990/201503149349301110_public.xml
THE FREECYCLE NETWORK 3
https://s3.amazonaws.com/irs-form-990/201502859349301150_public.xml
GIVEWELL COMMUNITY FOUNDATION INC 22
https://s3.amazonaws.com/irs-form-990/201523209349206167_public.xml
year formed
n employees
GLOBAL LIVES PROJECT INC 11
https://s3.amazonaws.com/irs-form-990/201513169349306556_public.xml
GOOD WORLD SOLUTIONS INC 7
https://s3.amazonaws.com/irs-form-990/201523229349300127_public.xml
Great NonProfits 12
https://s3.amazonaws.com/irs-form-990/201513209349306931_public.xml
HOPING HEARTS FOUNDATION INC 4
https://s3.amazonaws.com/irs-form-990/201543159349303854_public.xml
HARMONY INSTITUTE INC 7
https://s3.amazonaws.com/irs-form-990/201611169349301116_public.xml
HOLLABACK INC 10
https://s3.amazonaws.com/irs-form-990/201543169349100614_public.xml
year formed
n employees
HopeLab Foundation Inc 6
https://s3.amazonaws.com/irs-form-990/201503209349311725_public.xml
IN OUR BACKYARDS INC 14
https://s3.amazonaws.com/irs-form-990/201543159349303544_public.xml
year formed
INTERNET SEXUALITY INFORMATION SERVICES 9
https://s3.amazonaws.com/irs-form-990/201513179349201336_public.xml
year formed
n employees
IMAGING THE WORLD CORP 10
https://s3.amazonaws.com/irs-form-990/201523209349311202_public.xml
InSTEDD 6
https://s3.amazonaws.com/irs-form-990/201503209349316710_public.xml
International Bridges to Justice Inc 13
https://s3.amazonaws.com/irs-form-990/201503009349301060_public.xml
INTERNEWS NETWORK INC 22
https://s3.amazonaws.com/irs-form-990/201513209349312361_public.xml
Literacy Bridge 7
https://s3.amazonaws.com/irs-form-990/201543149349302334_public.xml
year formed
The Lunchbox Fund 4
https://s3.amazonaws.com/irs-form-990/201620259349301152_public.xml
MIND Research Institute 33
https://s3.amazonaws.com/irs-form-990/201513209349310671_public.xml
MEDIC MOBILE INC 8
https://s3.amazonaws.com/irs-form-990/201533209349311223_public.xml
MEDIC MOBILE INC 7
https://s3.amazonaws.com/irs-form-990/201513079349301056_public.xml
Moneythink 7
https://s3.amazonaws.com/irs-form-990/201523209349312592_public.xml
MOZILLA FOUNDATION 18
https://s3.amazonaws.com/irs-form-990/201630749349300613_public.xml
NEW CLASSROOMS INNOVATION PARTNERSINC 17
https://s3.amazonaws.com/irs-form-990/201513559349300081_public.xml
NATIONAL INSTITUTE ON MONEY IN STATE POLITICS 10
https://s3.amazonaws.com/irs-form-990/201523209349312412_public.xml
year formed
NEXLEAF ANALYTICS 9
https://s3.amazonaws.com/irs-form-990/201523219349300537_public.xml
NPOWER INC 27
https://s3.amazonaws.com/irs-form-990/201503209349310235_public.xml
OCEARCH 9
https://s3.amazonaws.com/irs-form-990/201543209349314394_public.xml
OPEN MEDIA FOUNDATION 11
https://s3.amazonaws.com/irs-form-990/201503139349302785_public.xml
OPERATION ASHA NFP 12
https://s3.amazonaws.com/irs-form-990/201513159349201371_public.xml
year formed
n employees
Planetwork NGO Inc 4
https://s3.amazonaws.com/irs-form-990/201640509349200504_public.xml
year formed
n employees
PLANETWORK NGO INC 4
https://s3.amazonaws.com/irs-form-990/201512589349300801_public.xml
PARTICIPATORY CULTURE FOUNDATION INC 7
https://s3.amazonaws.com/irs-form-990/201543169349302424_public.xml
PULSEPOINT FOUNDATION 6
https://s3.amazonaws.com/irs-form-990/201503209349206655_public.xml
year formed
n employees
REALLOCATE INC 5
https://s3.amazonaws.com/irs-form-990/201610559349300016_public.xml
REASONING MIND INC 25
https://s3.amazonaws.com/irs-form-990/201512949349300511_public.xml
Single Stop USA Inc 20
https://s3.amazonaws.com/irs-form-990/201502799349300705_public.xml
SkyTruth 8
https://s3.amazonaws.com/irs-form-990/201503209349304545_public.xml
TEACHAIDS 5
https://s3.amazonaws.com/irs-form-990/201601379349310585_public.xml
TECHSOUP GLOBAL 23
https://s3.amazonaws.com/irs-form-990/201630129349300623_public.xml
TIDEPOOL PROJECT 6
https://s3.amazonaws.com/irs-form-990/201533219349300518_public.xml
UNIVERSALGIVING 10
https://s3.amazonaws.com/irs-form-990/201610149349300801_public.xml
Video Volunteers 13
https://s3.amazonaws.com/irs-form-990/201631249349300518_public.xml
WITNESS INC 19
https://s3.amazonaws.com/irs-form-990/201620419349200622_public.xml
year formed
n employees
weTHRIVE INC 5
https://s3.amazonaws.com/irs-form-990/201621269349301537_public.xml
Wikimedia Foundation Inc 19
https://s3.amazonaws.com/irs-form-990/201610929349300631_public.xml
ZEARN INC 14
https://s3.amazonaws.com/irs-form-990/201611279349300401_public.xml
edX Inc 21
https://s3.amazonaws.com/irs-form-990/201513219349301356_public.xml
iCivics Inc 14
https://s3.amazonaws.com/irs-form-990/201620539349300532_public.xml
myAgro Farms 9

In [29]:
# label and output
df.columns = [
    'org_name', 'city', 'state', 'ein', 'url',
    'tax_year', 'year_formed', 'submission_time',
    'n_employees', 'total_revenue',
    'title', 'comp',
]
df.to_csv('output/comp_data_raw.csv')

In [30]:
# full list of job titles
print df['title'].unique()


[u'Executive Director' u'Dir of App Solutions' u'director'
 u'Product Manager' u'asst dir of app solution' u'Dir Fin & Admin Thru 4/6'
 u'Exec Director/CEO' u'Board Member' u'CHAIRMAN' u'TREASURER' u'SECRETARY'
 u'DIRECTOR' u'PRESIDENT' u'VP AND COO' u'VP OF CURRICULUM & ASSESSMENT'
 u'DIRECTOR OF FINANCE' u'SENIOR DIRECTOR OF BUSINESS DEVELOPMENT'
 u'Director' u'Executive Direc' u'CHAIRMAN, TREASURER'
 u'CHIEF EXECUTIVE OFFICER' u'Founder' u'Board Chair' u'VICE CHAIR'
 u'CHAIRPERSON' u'VICE PRESIDENT' u'President & CEO' u'Dir of Prod Dev'
 u'VP Impact & Adm' u'VP Staff Dev' u'VP Strat. P/S' u'VP Research'
 u'COFOUNDER & EXECUTIVE DIRECTOR' u'COFOUNDER & CHIEF OPERATING OFFICER'
 u'COFOUNDER, DIRECTOR' u'CHAIR, BOARD OF DIRECTOR'
 u'SECRETARY, BOARD OF DIRECTORS' u'TRESURER, BOARD OF DIRECTORS'
 u'BOARD OF DIRECTORS' u'Trustee' u'President' u'Secretary' u'Treasurer'
 u'TRUSTEE' u'PRESIDENT &' u'VICE PRES' u'CHAIRMAN/CEO' u'CFO'
 u'VP AFRICA MKTG & OPERATIONS' u'VP OF LEARNING' u'VICE-PRESIDENT'
 u'TREASURER & EXEC. DIR.' u'EXECUTIVE DIRECTOR' u'BOARD CHAIR'
 u'BOARD MEMBER' u'CHIEF OPERATING OFFICER' u'CHIEF DELIVERY OFFICER'
 u'Director, President' u'Director, Chairman'
 u'Director, Chief Technology Officer' u'Secretary, Treasurer'
 u'PRESIDENT & CO-CEO' u'CO-CEO' u'CONTROLLER'
 u'SENIOR DIR. BUS. DEVELOPMENT' u'COO - IT AND OPERATIONS'
 u'VP, DEVELOPMENT' u'VP, TECHSOUP PRODUCTS' u'CEO, Treasurer, Director'
 u'Director/Board chair' u'Chairman' u'Treasurer/Director' u'Chair'
 u'Vice-Chair' u'Executive Secretary' u'Trustee (Founder)' u'CFOO'
 u'General Counsel' u'President; Former Executive Director'
 u'Treasurer/Secretary' u'CHAIR' u'CEO' u'Vice Chairman'
 u'SENIOR DIRECTOR OF BUSINESS' u'PRESIDENT/COO' u'INTERIM PRESIDENT/COO'
 u'EDITOR-IN-CHIEF' u'CHIEF EDUCATION & STRATEGY OFFICER'
 u'VP OF MARKETING' u'VP OF EDUCATION CONTENT' u'CHIEF DEVELOPMENT OFFICER'
 u'MANAGING DIR, EDUCATION & RESEARCH' u'DIRECTOR OF PRODUCT DEVELOPMENT'
 u'VICE PRESIDE' u'FORMER PRESIDENT' u'PRESIDENT/FORMER VICE PRESIDENT'
 u'SECRETARY/TREASURER' u'Vice Chair' u'Program Manager'
 u'Board Chair and Director' u'Secretary and Director'
 u'VP of Communications, Treasurer, and Director' u'Vice President'
 u'VP Comm & Mktg' u'PRESIDENT/CPO' u'CTO' u'COO' u'SENIOR PROGRAM MGR'
 u'Executive Director and Officer' u'Deputy Director' u'CHIEF TECH. OFF'
 u'Pres-Education' u'Pres-Research' u'VP-ED SVCS' u'VP-MARKETING'
 u'VP-SALES' u'VP-DEVELOPMENT' u'CIO' u"Sec'y/Treasurer" u'SALES ASSOCIATE'
 u'SALES DIRECTOR' u'PRESIDENT/DIRECTOR' u'TREASURER/SECRETARY/DIRECTOR'
 u'TRUSTEE/DIRECTOR' u'PRESIDENT, DIRECTOR' u'SECRETARY, TREAS, DIRECTOR'
 u'VP AFRICA MKTG & OPERATION' u'SERVER ARCHITECT'
 u'DIR LEARNING SOUTH AFRICA' u'CHAIRMAN OF THE BOARD' u'VP Fin & Admin'
 u'Chief Tech Officer' u'Senior VP' u'VP Strategy' u'Dir. Software Dev.'
 u'PRESIDENT AND CO-CEO' u'CHIEF FINANCIAL OFFICER' u'VICE PRESIDENT-TSP'
 u'COO-PLANNING AND DEPLOYMENT' u'SR. DIRECTOR, BIZ DEV CBDO'
 u'CEO, Director' u'PRESIDENT AND BOARD MEMBER'
 u'Vice-Chair (Thru July 2010)' u'Treasurer & Vice Chair'
 u'Chief Technology Officer' u'Chief Community Officer' u'Project Director'
 u'CEO AND CO-FOUNDER' u'MANAGING DIRECTOR AND CO-FOUNDER'
 u'BOARD DIRECTOR' u'Director; Former Executiv' u'Executive Dir.'
 u'DIRECTOR (THRU 6/30/12)' u'CO-EXECUTIVE DIRECTOR'
 u'DIR OF APP SOLUTIONS' u'ASST DIR OF APPLICATION' u'CHAIR OF THE BOARD'
 u'CEO/BOARD MEMBER' u'SECRETARY/BOARD MEMBER' u'TREASURER/BOARD MEMBER'
 u"GEN'L COUNSEL & DIR OF POL" u'DIR/PRODUCT DEVELOPM'
 u'DIRECTOR OF ENGINEER' u'PROJECT DESIGN LEAD' u'SR. SOFTWARE ENGINEE'
 u'PROJECT EXECUTIVE' u'PROGRAM DIR.' u'PRGM DIR/DIR.'
 u'VP OF FINANCE AND ADMINISTRATION'
 u'CHIEF EDUCATION AND STRATEGY OFFICER' u'GENERAL MANAGER OF EDUCATION'
 u'CHIEF TECHNOLOGY OFFICER' u'VP OF PRODUCT DEVELOPMENT' u'VP OF POLICY'
 u'MANAGING DIR., EDUCATION AND RESEARCH' u'SENIOR ENGINEER'
 u'VP OF COMMUNICATION' u'FOUNDER' u'DIRECTOR OF OPERATIONS'
 u'DIRECTOR OF ANALYTICS' u'SECRETARY OF CORPORATION' u'CEO and Member'
 u'Member' u'DEPUTY DIRECTOR' u'MEMBER' u'Dir Prod Dev' u'VP Strat Ptrshps'
 u'COFOUNDER & BOARD MEMBER' u'COFOUNDER & EXECUTIVE DIRE'
 u'COFOUNDER & CHIEF OPERATIN' u'COFOUNDER & CHIEF PARTNERSHIPS OFFICER'
 u'Executive Vice President' u'VP of Bus. Strategy' u'Co-chair'
 u'Sr. Vice President' u'Country Director' u'FOUNDER & CEO'
 u'PRESIDENT & COO' u'HEAD OF FINANCE' u'LEAD DEVELOPER' u'LEAD DESIGNER'
 u'HEAD OF STRATEGY & PROGRAMS' u'LEAD EXERCISES DEVELOPER'
 u'SOFTWARE ENGINEER' u'DEAN OF COMPUTER SCIENCE' u'VP SALES'
 u'CHEIF PROGRAM OFFICER' u'PEXECUTIVE DIRECTOR' u'VICE CHAIRMAN'
 u'DIRECTOR OF FINANCE & ADMINISTRATION' u'SR. MANAGER'
 u'FOUNDER/BOARD CHAIR' u'CFO/BOARD MEMBER' u'CFAO' u'VP Programs'
 u'SR. DIR & CHIEF BIZ DVL' u'VICE PRESIDENT DEVELOPMENT'
 u'SR DIR. INFRASTRUCTURE' u'CFA' u'Chief Revenue Officer'
 u'Chief Global Development OFC' u'Chief Talent and Culture OFC'
 u'PRESIDENT AND FOUNDER' u'CEO/Co-Founder'
 u'BOARD MEMBER/SEE SCH O FOR COMP INFOP' u'BOARD MEMBER/INTERIM CEO'
 u'DIRECTOR/DIR APPLICATION SOLUTIONS' u'ASST DIR APPLICATION SOLUTIONS'
 u'SR SOLUTIONS LEAD' u'PRESIDENT/CHAIR' u'VICE-CHAIR'
 u'ASSISTANT SECRETARY' u'PRESIDENT/CEO' u'BOARD SECRETARY/COO'
 u'TREASURER/DIRECTOR OF FINANCE & ADMINISTRATION' u'DIRECTOR OF EDUCATION'
 u'CHIEF EVANGELIST' u'DIRECTOR, PAST PRESIDENT' u'DIRECTOR SECRETARY'
 u'DIRECTOR, TREASURER' u'PROGRAM DIRECTOR' u'PRESIDENT AND TREASURER'
 u'EXECUTIVE DIRECTOR AND CLERK' u'TERM ENDED OCT. 2013'
 u'VP OF FINANCE & ADMINISTRATION' u'SENIOR EDUCATION ADVISOR'
 u'VP OF DEVELOPMENT' u'VP OF BUSINESS DEVELOPMENT' u'CO EXEC DIR'
 u'BOARD MEMBER (THRU 12/31/2013)' u'BOARD MEMBER (THRU 5/31/2013)'
 u'EDUCATION VP AND GENERAL MANAGER' u'CO DIRECTOR OF ENGINEERING'
 u'BOARD CHAIRMAN' u'DIRECTOR OF TECHNICAL OPERATIONS'
 u'CHIEF PARTNERSHIPS OFFICER' u'CHIEF MARKETING OFFICER'
 u'SENIOR VICE PRESIDENT' u'DIRECTOR OF ENGINEERING'
 u'PRINCIPAL SOFTWARE ENGINEER' u'EXEC DIRECTO' u'SECRETARY, T'
 u'chairperson' u'FORMER EXECUTIVE DIRECTOR' u'BD CHAIR / ED'
 u'President and CEO' u'PRESIDENT/EXECUTIVE DIRECTOR'
 u'PRESIDENT/EXECUTIVE DIRECT' u'COFOUNDER & CHIEF PARTNERS'
 u'BOARD MEMBER - THRU 3/2013' u'TREASURER/FINANCE CHAIR'
 u'BOARD MEMBER/FUNDRAISING CHAIR' u'SECRETARY - THRU 3/2013'
 u'EXECUTIVE DI' u'CHAIR AND V.' u'President, CEO' u'Board Member, COO'
 u'Board Member. Chief Tech Officer' u'Dir. of Engineering'
 u'Project Manager' u'President Emeritus/Founder' u'President/CEO'
 u'Senior Vice President' u'ED - ICCIL' u'VP - Asia and Environment'
 u'VP - ICT Policy and Programs' u'COP - South Sudan'
 u'Vice President - Finance' u'FOUNDER & EXECUTIVE DIRECTOR'
 u'HEAD OF STRATEGY AND PROGRAMS' u'PROGRAM MANAGER'
 u'PRESIDENT / DIRECTOR' u'TRESURER / DIRECTOR' u'CLERK / DIRECTOR'
 u'CO DIRECTOR' u'VP Product Dvlp' u'EMPLOYEE' u'BOARD OF DIRECTOR'
 u'CHIEF STRATEGIST' u'CHIEF PROGRAM OFFICER' u'THROUGH 2/1/2013'
 u'CHIEF ACADEMIC OFFICER' u'DIRECTOR OF ARCHITECTURE'
 u'DIR. OF CENTRAL PROG. OPS.' u'VICE CHAIRMAN OF THE BOARD'
 u'VP BUSINESS DEVELOPMENT' u'VP HARDWARE' u'CHAIRMAN ON THE BOARD'
 u'DIRECTOR FORMER EXECUTIVE DIRECTOR & TREASURER'
 u'CURRENT EXECUTIVE DIRECTOR & TREASURER' u'CEO & PRESIDENT'
 u'SENIOR VP OF NATIONAL EXPANSION' u'VP OF SOFTWARE DEVELOPMENT' u'VP-CFO'
 u'VP OF PROGRAM' u'HEAD OF MOSCOW BRANCH' u'VP OF CURRICULUM DEVELOPMENT'
 u'VP OF SALES' u'SR. SOFTWARE ENGINEER' u'CHIEF SOFTWARE ARC'
 u'QC/QA MANAGER' u'Chief Prog Officer' u'VP FINANCE'
 u'VP, ALLIANCES & GLOBAL MEDIA' u'SR. CUSTOM PRO SPECIALIST'
 u'VP COMMUNITY & PLATFORMS' u'Treasurer and CFA'
 u'Secretary and General Counsel' u'Chief Talent & Culture Officer'
 u'dir of engineering operations' u'Deputy General Counsel'
 u'PRESIDENT & CEO' u'VICE PRESIDENT & SECRETARY'
 u'BOARD DIRECTOR (FROM 10/13)' u'BOARD DIRECTOR (UNTIL 10/13)'
 u'DIRECTOR, FINANCE & CONTROL' u'E.D. through July 2013'
 u'President and E.D.' u'Treasurer & Secretary' u'Pres & Dir'
 u'Treas & Dir' u'Sec & Dir' u'Intl Fin Dir' u'VP & GM, LITERACY'
 u'VP HR & ADMIN' u'DIRECTOR OF OPS' u'DIR PRODUCT STGY'
 u'DIR OF MARKETING' u'DIR CONTENT ACQUIS' u'DIRECTOR OF PROGRAMMING'
 u'CORPORATE SECRETARY' u'DIRECTOR OF INFRASTRUCTURE'
 u'DIR APPLICATION SOLUTIONS' u'ASST DIR APPLICATION SOLUT'
 u'CHIEF TECHNICAL ARCHITECT' u'PROJECT MANAGER'
 u'OPERATIONS SUPPORT MANAGER' u'SPECIAL PROJECTS MANAGER'
 u'CHIEF DATA ARCHITECT' u'TREASURER/VP OF FINANCE & ADMIN'
 u'VP OF EDUCATION' u'VP OF PRODUCT' u'ENGINEER' u'PRESIDENT AND CEO'
 u'CHAIR, DIRECTOR' u'DIRECTOR, SECRETARY' u'LEAD DIRECTOR'
 u'ENGINEERING DIRECTOR' u'CHIEF SCIENTIST'
 u'Board Member and Executive Director' u'DIRECTOR - 2014'
 u'EXEC DIRECTOR TO NOV 2014' u'DIRECTOR OF TECHNICAL PRODUCT' u'CMO'
 u'TREASURER - FORMER' u'TREASURER - CURRENT'
 u'BOARD CHAIR - FROM APRIL 2014' u'BOARD VICE-CHAIR - FROM APRIL 2014'
 u'TREASURER - FROM APRIL 2014' u'SECRETARY - FROM APRIL 2014'
 u'BOARD MEMBER - FROM APRIL 2014' u'BOARD MEMBER - THROUGH APRIL 2014'
 u'Director (from 07/15)' u'DIRECTOR (THRU 4/7/2014)'
 u'LEAD PRODUCT MANAGER' u'EXECUTIVE CH' u'VP-LIC & BUS' u'CO-DIRECTOR'
 u'Director thru Spring 2014' u'DIRECTOR/CHIEF EXECUTIVE OFFICER'
 u'DIRECTOR/CHIEF PROGRAM OFFICER' u'THROUGH APRIL 2013'
 u'DIRECTOR OF SITE OPERATIONS' u'DIRECTOR OF BASE OPERATIONS'
 u'DIRECTOR OF PROGRAM INITIATIVES' u'SOFTWARE DEVELOPER'
 u'EXECUTIVE VICE PRESIDENT' u'CHIEF SYSTEMS ARCHITECT'
 u'VP OF CURRICULUM DEVELOPME' u'FOUNDER & SECRETARY' u'VP - FINANCE'
 u'CEO - CARAVAN STUDIOS' u'VP - ALLIANCES & GLOBAL MED'
 u'SR. DIR. BIZ DEV. CBDO' u'VP - TECHSOUP GLOBAL NETWORK'
 u'VP - TECHNOLOGY SOLUTIONS' u'VP - STRATEGY & IMPACT' u'LEAD MEDICAL'
 u'VP PRODUCT &' u'VP ENGINEERI' u'SOFTWARE ENG' u'CLERK' u'founder'
 u'chair' u'trustee' u'Dir of Engineering Features'
 u'Dir of Platform Engineer' u'DIRECTOR, HUMAN & ORG.'
 u'Co-Chair & Director' u'Treasurer/CFO' u'VP and General Counsel'
 u'President, COO' u'VP of Strategic Partnerships' u'edX Architect'
 u'Software Architect' u'Director of PR and Communications'
 u'VP for Engineering' u'DIRECTOR OF HUMAN RESOURCES'
 u'DIRECTOR OF BUSINESS & PARTNERSHIP DEVELOPMENT' u'IT MANAGER'
 u'SR. NETWORK ADMINISTRATOR' u'TERM ENDED SEPT. 2014'
 u'CHIEF OF EDUCATION PARTNER' u'VP OF FINANCE & ADMINISTRA' u'CAO'
 u'CRO & CMO' u'PROGRAM DIRCTOR' u'PROGAM DIRECTOR' u'DESIGNER'
 u'BOARD MEMBER (THRU 11/2014)' u'VP OF MARKETING AND GM CONSUMER'
 u'VP EDITORIAL DIRECTOR' u'NEW YORK DIRECTOR'
 u'DIRECTOR OF TECHNICAL OPER' u'BOARD TREASU' u'SENIOR SOFTWARE ENGINEER'
 u'Director and President' u'Director and Treasurer'
 u'Director and Secretary' u'DIRECTOR OF FINANCE AND OPERATIONS'
 u'EXEC COMMITT' u'PRESIDENT/CE' u'IMMEDIATE PA' u'VICE CHAIRMA'
 u'Executive Director, Secretary' u'DIRECTOR/CHAIRMAN' u'PRES/TREAS'
 u'BOARD VICE-CHAIR' u'Treas,Board Mbr' u'Secretary, COO'
 u'COFOUNDER & CHIEF OPERATING' u'BOARD MEMBER-THRU 11/2014'
 u'BOARD MEMBER/SECRETARY' u'BOARD MEMBER/GOVERNANCE CHAIR'
 u'Interm Exec Dir' u'Sec./Treasurer, COO, CFO' u'Acting Chair'
 u'Board Member. CEO' u'BOARD MEMBER AND CLERK'
 u'Chief of Party - South Sudan' u'Sr. Vice President - IRL'
 u'Country Director - Afghanistan' u'VP Prog. - Europe, Eurasia & Asia'
 u'Director through May 2015' u'Director through June 2015'
 u'Director/CEO Emeritus' u'President, Research Division'
 u'Chief Strategist' u'Chief Executive Officer'
 u'CFO/Secretary & Treasurer' u'VP, Educ. Services' u'Regional VP'
 u'Chief Partnerships Officer' u'VP, Philanthropic Partnerships'
 u'VP, Emerging Markets' u'Chief of Staff' u'Dir., Strategic Partnerships'
 u'Strategic Partnerships Mgr.' u'Educ. Partnerships Mgr.'
 u'Executive Director/Secretary' u'EXECUTIVE DIRECTOR/PRESIDENT'
 u'SECRETARY/ VP OPERATIONS' u'VP LEARNING' u'VP PRODUCTS' u'VP ENGAGEMENT'
 u'DIRECTOR, PARTNERSHIPS' u'DIRECTOR, OPENNEWS'
 u'ONLINE ORGANIZING & FUNDRAISING LEAD' u'SR. DIRECTOR BADGES'
 u'DIRECTOR, HIVE CHICAGO' u'CHIEF GROWTH OFFICER'
 u'DATA SOLUTIONS ARCHITECT' u'VP OF PRODUCTION VISION'
 u'SENIOR DIRECTOR OF TALENT' u'SENIOR DIRECTOR OF PROGRAM INITIATIVES'
 u'Pres., Director' u'CFO/Sec/Direct' u'Chief Science Officer'
 u'VICE CHAIR & HEAD OF DEVELOPMENT COMMITTEE'
 u'TREASURER & HEAD OF AUDIT COMMITTEE'
 u'HEAD OF NOMINATING & GOVERNANCE COMMITTEE' u'DIRECTOR OF FINANCE & ADMI'
 u'DIRECTOR OF DEVELOPMENT' u'DIRECTOR-TECHNOLOGY SERVICE CORPS'
 u'DIRECTOR-THE COMMUNITY CORPS'
 u'REGIONAL DIRECTOR - TECHNOLOGY SERVICE CORPS'
 u'DIRECTOR FOUNDATION CONNECT' u'TREASURERCFO' u'CHAIRMAN-ELECT'
 u'COO AND VP OF FINANCE' u'COO/Gen Counsel' u'CDO' u'CSO'
 u'Managing Dir, NYC' u'VP - ALLIANCES & GLOBAL ME'
 u'VP - TECHSOUP GLOBAL NETWO' u'VP - TECHNOLOGY'
 u'VP - STRATEGIC ALLIANCES & GENERAL COUNSEL' u'vice chair'
 u'VP of Product' u'Special Advisor' u'Director of Engineering'
 u'Sr. dir. community engagement' u'CEO/PRESIDENT, DIRECTOR THRU 6/30/15'
 u'CPO/ED/SECRETARY' u'CHAIR AND DIRECTOR' u'TREASURER AND DIRECTOR'
 u'ACADEMIC DIRECTOR' u'INSTRUCTIONAL DESIGNER' u'PRODUCT DIRECTOR'
 u'President & COO' u'Clerk and General Counsel'
 u'Vice President of Strategic Partnerships'
 u'Vice President of Education Services' u'VP, Business Development'
 u'VP, Marketing' u'edX Senior Director, Engineering'
 u'President and E.D. to August 2014' u'Treasurer and Operations Manager'
 u'Secretary and Chief of Staff' u'Exec Director']

In [31]:
# job title regex

# CEO
ceo_label = 'ceo'
ceo_regex = ['exec', 'ceo', 'e\.d\.', 'ed$']
ceo_antiregex = []

# CTO
cto_label = 'cto'
cto_regex = ['tech[n. ]', '^cto', 'software', 'engin', 'product[s ]', 'product$',
             'app', 'developer', 'archit', 'prod dev']
cto_antiregex = ['regional', 'service']

# COO
coo_label = 'coo'
coo_regex = ['operat', 'coo', 'ops']
coo_antiregex = ['africa', 'engin']

# CFO
cfo_label = 'cfo'
cfo_regex = ['fin', 'cfo', 'treas', 'reven', 'cfa', 'tres']  # sic
cfo_antiregex = []

# BD/partnerships
bd_label = 'bd'
bd_regex = ['partner', 'business', 'bus\.', 'growth', 'cpo',
            'biz', 'sales', 'ptr', 'p/s']
bd_antiregex = []

# president
pres_label = 'pres'
pres_regex = ['pres']
pres_antiregex = ['vice']

# marketing
cmo_label = 'cmo'
cmo_regex = ['marketing', 'cmo', 'pr ', 'communications']
cmo_antiregex = []

# development
cdo_label = 'cdo'
cdo_regex = ['development', 'cdo']
cdo_antiregex = ['bus', 'software', 'product', 'curriculum', 'global']

# board
board_label = 'board'
board_regex = ['board', 'chair', 'secretary', 'director$', 'trustee']
board_antiregex = ['project', 'program', 'exec', 'deputy', 'academic',
                   'engin', 'site', 'base', 'new york', 'sales', 'editorial',
                  'progam',  # sic
                  ]

# founder
founder_label = 'founder'
founder_regex = ['founder']
founder_antiregex = []

# list of all labels
labels = [ceo_label, cto_label, coo_label, cfo_label, bd_label,
          pres_label, cmo_label, cdo_label, board_label, founder_label]

In [32]:
# set titles: 1 if match, 0 if not
for label, regex_list, antiregex_list in [
    (ceo_label, ceo_regex, ceo_antiregex),
    (cto_label, cto_regex, cto_antiregex),
    (coo_label, coo_regex, coo_antiregex),
    (cfo_label, cfo_regex, cfo_antiregex),
    (bd_label, bd_regex, bd_antiregex),
    (pres_label, pres_regex, pres_antiregex),
    (cmo_label, cmo_regex, cmo_antiregex),
    (cdo_label, cdo_regex, cdo_antiregex),
    (board_label, board_regex, board_antiregex),
    (founder_label, founder_regex, founder_antiregex),
]:
    # column name is_label
    colname = 'is_%s' % label

    # default is not the title
    df[colname] = 0
    
    # if matches good regex, it is the title
    for regex in regex_list:
        idx = df['title'].str.contains(regex, case=False)
        df[colname].loc[idx] = 1
        
    # if matches anti-regex, override and make it not the title
    for regex in antiregex_list:
        idx = df['title'].str.contains(regex, case=False)
        df[colname].loc[idx] = 0

    # log
    rows = df[df[colname] == 1]
    print 'for', label, 'found', len(rows), 'rows at', len(rows['ein'].unique()), 'orgs with', len(rows['title'].unique()), 'titles'
    # print rows['title'].unique()


for ceo found 241 rows at 105 orgs with 69 titles
for cto found 129 rows at 34 orgs with 69 titles
for coo found 74 rows at 37 orgs with 29 titles
for cfo found 250 rows at 104 orgs with 66 titles
for bd found 48 rows at 19 orgs with 33 titles
for pres found 183 rows at 87 orgs with 45 titles
for cmo found 13 rows at 9 orgs with 10 titles
for cdo found 12 rows at 9 orgs with 8 titles
for board found 2027 rows at 137 orgs with 131 titles
for founder found 39 rows at 16 orgs with 22 titles

In [33]:
# number of titles held by this individual
df['n_indiv_titles'] = df[['is_%s' % label for label in labels]].sum(axis=1)
print df['n_indiv_titles'].value_counts()


1    2678
0     304
2     166
3       2
Name: n_indiv_titles, dtype: int64

In [34]:
# log uncategorized
uncategorized_rows = df[df['n_indiv_titles'] == 0]
print 'for uncategorized found', len(uncategorized_rows), 'rows at', len(uncategorized_rows['ein'].unique()), 'orgs with', len(uncategorized_rows['title'].unique()), 'titles'
# print uncategorized_rows['title'].unique()


for uncategorized found 304 rows at 71 orgs with 162 titles

In [35]:
# set up features for what's happening across the org in that year
for group_tuple, group in df.groupby(['org_name', 'tax_year']):
    print group_tuple
    for label in labels:
        # number of people with this title
        n_title_in_org_year = group['is_%s' % label].sum()
        
        # set number on df
        df.loc[group.index, 'n_%s_in_org_year' % label] = n_title_in_org_year

        # set boolean for whether title is present (0/1)
        df.loc[group.index, 'has_%s_in_org_year' % label] = 1 if n_title_in_org_year > 0 else 0
        
        # total compensation for people with this title
        total_comp_for_title = group[group['is_%s' % label] == 1]['comp'].sum()
        
        # set compensation on df
        df.loc[group.index, 'total_comp_%s_in_org_year' % label] = total_comp_for_title

        # set boolean for whether title is compensated (0/1)
        df.loc[group.index, 'has_comp_%s_in_org_year' % label] = 1 if total_comp_for_title > 0 else 0


(u'ADOPT-A-CLASSROOM INC', u'2010')
(u'ADOPT-A-CLASSROOM INC', u'2011')
(u'ADOPT-A-CLASSROOM INC', u'2012')
(u'ADOPT-A-CLASSROOM INC', u'2013')
(u'ADOPT-A-CLASSROOM INC', u'2014')
(u'Anjna Patient Education', u'2013')
(u'BENEFICENT TECHNOLOGY INC DBA Benetech', u'2014')
(u'BRACKETS FOR GOOD INC', u'2014')
(u'Blue Planet Network', u'2011')
(u'Blue Planet Network', u'2012')
(u'CANCER COMMONS', u'2012')
(u'CANCER COMMONS', u'2013')
(u'CANCER COMMONS', u'2014')
(u'CAREERVILLAGE INC', u'2012')
(u'CAREERVILLAGE INC', u'2013')
(u'CAREERVILLAGE INC', u'2014')
(u'CASE COMMONS INC', u'2012')
(u'CASE COMMONS INC', u'2013')
(u'CASE COMMONS INC', u'2014')
(u'CENTER FOR STUDENT OPPORTUNITY INC', u'2010')
(u'CENTER FOR STUDENT OPPORTUNITY INC', u'2011')
(u'CENTER FOR STUDENT OPPORTUNITY INC', u'2012')
(u'CENTER FOR STUDENT OPPORTUNITY INC', u'2013')
(u'CENTER FOR STUDENT OPPORTUNITY INC', u'2014')
(u'CLASSROOM INC', u'2010')
(u'CLASSROOM INC', u'2011')
(u'CLASSROOM INC', u'2012')
(u'CLASSROOM INC', u'2013')
(u'CLASSROOM INC', u'2014')
(u'CODE FOR AMERICA LABS INC', u'2010')
(u'CODE FOR AMERICA LABS INC', u'2011')
(u'CODE FOR AMERICA LABS INC', u'2012')
(u'CODE FOR AMERICA LABS INC', u'2014')
(u'CODE-TO-LEARN FOUNDATION', u'2013')
(u'CODE-TO-LEARN FOUNDATION', u'2014')
(u'CODEORG', u'2013')
(u'CODEORG', u'2014')
(u'COMMON SENSE MEDIA', u'2011')
(u'COMMON SENSE MEDIA', u'2012')
(u'COMMON SENSE MEDIA', u'2013')
(u'COMMON SENSE MEDIA', u'2014')
(u'CONNECT TO COMPETE INC', u'2014')
(u'COWORKERORG', u'2013')
(u'COWORKERORG', u'2014')
(u'CRISIS TEXT LINE INC', u'2012')
(u'CRISIS TEXT LINE INC', u'2013')
(u'CRISIS TEXT LINE INC', u'2014')
(u'D-REV DESIGN FOR THE OTHER 90', u'2010')
(u'D-REV DESIGN FOR THE OTHER 90', u'2013')
(u'D-REV DESIGN FOR THE OTHER 90', u'2014')
(u'D-Rev Design for the Other 90', u'2012')
(u'DEMOCRACY WORKS INC', u'2011')
(u'DEMOCRACY WORKS INC', u'2012')
(u'DEMOCRACY WORKS INC', u'2013')
(u'DEMOCRACY WORKS INC', u'2014')
(u'DESIGN THAT MATTERS INC', u'2011')
(u'DESIGN THAT MATTERS INC', u'2012')
(u'DESIGN THAT MATTERS INC', u'2013')
(u'DESIGN THAT MATTERS INC', u'2014')
(u'DIGITAL GREEN FOUNDATION', u'2010')
(u'DIGITAL GREEN FOUNDATION', u'2011')
(u'DIGITAL GREEN FOUNDATION', u'2012')
(u'DIGITAL GREEN FOUNDATION', u'2013')
(u'DIGITAL GREEN FOUNDATION', u'2014')
(u'DO SOMETHING INC', u'2013')
(u'DO SOMETHING INC', u'2014')
(u'DONORSCHOOSEORG', u'2013')
(u'DONORSCHOOSEORG', u'2014')
(u'DTWO LTD', u'2011')
(u'DTWO LTD', u'2012')
(u'DTWO LTD', u'2013')
(u'DTWO LTD', u'2014')
(u'Do Something Inc', u'2011')
(u'Do Something Inc', u'2012')
(u'E-THE PEOPLE', u'2014')
(u'EDNOVO', u'2011')
(u'EDNOVO', u'2012')
(u'EDNOVO', u'2013')
(u'EDX INC', u'2013')
(u'Elephant Action League', u'2014')
(u'FAIR TRADE FUND INC', u'2012')
(u'FAIR TRADE FUND INC', u'2013')
(u'FAMILIES EMPOWERED', u'2010')
(u'FAMILIES EMPOWERED', u'2011')
(u'FAMILIES EMPOWERED', u'2012')
(u'FAMILIES EMPOWERED', u'2013')
(u'FAMILIES EMPOWERED', u'2014')
(u'GIVE DIRECT INC', u'2011')
(u'GIVEWELL COMMUNITY FOUNDATION INC', u'2014')
(u'GLOBAL LIVES PROJECT INC', u'2011')
(u'GLOBAL LIVES PROJECT INC', u'2013')
(u'GLOBAL LIVES PROJECT INC', u'2014')
(u'GOOD WORLD SOLUTIONS INC', u'2012')
(u'GOOD WORLD SOLUTIONS INC', u'2013')
(u'GOOD WORLD SOLUTIONS INC', u'2014')
(u'Great NonProfits', u'2011')
(u'Great NonProfits', u'2012')
(u'Great NonProfits', u'2013')
(u'Great NonProfits', u'2014')
(u'HARMONY INSTITUTE INC', u'2010')
(u'HARMONY INSTITUTE INC', u'2011')
(u'HARMONY INSTITUTE INC', u'2012')
(u'HARMONY INSTITUTE INC', u'2013')
(u'HARMONY INSTITUTE INC', u'2014')
(u'HOLLABACK INC', u'2011')
(u'HOLLABACK INC', u'2012')
(u'HOLLABACK INC', u'2013')
(u'HOLLABACK INC', u'2014')
(u'HOPING HEARTS FOUNDATION INC', u'2012')
(u'HOPING HEARTS FOUNDATION INC', u'2013')
(u'HOPING HEARTS FOUNDATION INC', u'2014')
(u'HUMANITARIAN OPENSTREETMAP TEAM UNITED STATES INC', u'2012')
(u'HopeLab Foundation Inc', u'2010')
(u'HopeLab Foundation Inc', u'2011')
(u'HopeLab Foundation Inc', u'2012')
(u'HopeLab Foundation Inc', u'2013')
(u'HopeLab Foundation Inc', u'2014')
(u'IMAGING THE WORLD CORP', u'2013')
(u'IMAGING THE WORLD CORP', u'2014')
(u'IMAGING THE WORLD CORPORATION', u'2010')
(u'IMAGING THE WORLD CORPORATION', u'2011')
(u'IMAGING THE WORLD CORPORATION', u'2012')
(u'IN OUR BACKYARDS INC', u'2010')
(u'IN OUR BACKYARDS INC', u'2011')
(u'IN OUR BACKYARDS INC', u'2012')
(u'IN OUR BACKYARDS INC', u'2013')
(u'IN OUR BACKYARDS INC', u'2014')
(u'INTERNET SEXUALITY INFORMATION SERVICES', u'2014')
(u'INTERNEWS NETWORK INC', u'2012')
(u'INTERNEWS NETWORK INC', u'2013')
(u'INTERNEWS NETWORK INC', u'2014')
(u'Impact Network International Inc', u'2014')
(u'InSTEDD', u'2011')
(u'InSTEDD', u'2012')
(u'InSTEDD', u'2013')
(u'InSTEDD', u'2014')
(u'International Bridges to Justice Inc', u'2014')
(u'KANGU INC', u'2013')
(u'KANGU INC', u'2014')
(u'KHAN ACADEMY INC', u'2012')
(u'KHAN ACADEMY INC', u'2013')
(u'KHAN ACADEMY INC', u'2014')
(u'LEARN FRESH EDUCATION CO', u'2014')
(u'LIFEBOX FOUNDATION INC', u'2013')
(u'LITERACY BRIDGE', u'2010')
(u'LITERACY BRIDGE', u'2011')
(u'LITERACY BRIDGE', u'2012')
(u'LITERACY LAB', u'2011')
(u'Lifebox Foundation Inc', u'2014')
(u'Literacy Bridge', u'2013')
(u'Literacy Bridge', u'2014')
(u'MEDIC MOBILE INC', u'2012')
(u'MEDIC MOBILE INC', u'2013')
(u'MEDIC MOBILE INC', u'2014')
(u'MEN OF COURAGE FOUNDATION DBA UNCOMMEN', u'2014')
(u'MIND RESEARCH INSTITUTE', u'2011')
(u'MIND RESEARCH INSTITUTE', u'2012')
(u'MIND RESEARCH INSTITUTE', u'2013')
(u'MIND Research Institute', u'2014')
(u'MOZILLA FOUNDATION', u'2014')
(u'Moneythink', u'2012')
(u'Moneythink', u'2013')
(u'Moneythink', u'2014')
(u'NATIONAL INSTITUTE ON MONEY IN STATE POLITICS', u'2012')
(u'NATIONAL INSTITUTE ON MONEY IN STATE POLITICS', u'2013')
(u'NATIONAL INSTITUTE ON MONEY IN STATE POLITICS', u'2014')
(u'NEW CLASSROOMS INNOVATION PARTNERSINC', u'2011')
(u'NEW CLASSROOMS INNOVATION PARTNERSINC', u'2012')
(u'NEW CLASSROOMS INNOVATION PARTNERSINC', u'2013')
(u'NEW CLASSROOMS INNOVATION PARTNERSINC', u'2014')
(u'NEWBORN FOUNDATION', u'2011')
(u'NEWBORN FOUNDATION', u'2012')
(u'NEXLEAF ANALYTICS', u'2011')
(u'NEXLEAF ANALYTICS', u'2012')
(u'NEXLEAF ANALYTICS', u'2013')
(u'NEXLEAF ANALYTICS', u'2014')
(u'NPOWER INC', u'2014')
(u'NPOWERNY INC', u'2012')
(u'National Institute on Money in State', u'2010')
(u'National Institute on Money in State', u'2011')
(u'OCEARCH', u'2012')
(u'OCEARCH', u'2013')
(u'OCEARCH', u'2014')
(u'ONE LAPTOP PER CHILD ASSOCIATION INC', u'2010')
(u'ONE LAPTOP PER CHILD ASSOCIATION INC', u'2011')
(u'ONE LAPTOP PER CHILD ASSOCIATION INC', u'2012')
(u'OPEN MEDIA FOUNDATION', u'2010')
(u'OPEN MEDIA FOUNDATION', u'2011')
(u'OPEN MEDIA FOUNDATION', u'2012')
(u'OPEN MEDIA FOUNDATION', u'2013')
(u'OPEN MEDIA FOUNDATION', u'2014')
(u'OPERATION ASHA NFP', u'2013')
(u'OPERATION ASHA NFP', u'2014')
(u'PARTICIPATORY CULTURE FOUNDATION INC', u'2010')
(u'PARTICIPATORY CULTURE FOUNDATION INC', u'2011')
(u'PARTICIPATORY CULTURE FOUNDATION INC', u'2012')
(u'PARTICIPATORY CULTURE FOUNDATION INC', u'2013')
(u'PARTICIPATORY CULTURE FOUNDATION INC', u'2014')
(u'PARTICIPATORY POLITICS FOUNDATION', u'2010')
(u'PARTICIPATORY POLITICS FOUNDATION', u'2011')
(u'PARTICIPATORY POLITICS FOUNDATION', u'2012')
(u'PARTICIPATORY POLITICS FOUNDATION', u'2013')
(u'PARTICIPATORY POLITICS FOUNDATION', u'2014')
(u'PLANETWORK NGO INC', u'2015')
(u'PULSEPOINT FOUNDATION', u'2012')
(u'PULSEPOINT FOUNDATION', u'2013')
(u'PULSEPOINT FOUNDATION', u'2014')
(u'Planetwork NGO Inc', u'2014')
(u'REALLOCATE INC', u'2012')
(u'REALLOCATE INC', u'2013')
(u'REALLOCATE INC', u'2014')
(u'REASONING MIND INC', u'2012')
(u'REASONING MIND INC', u'2013')
(u'REASONING MIND INC', u'2014')
(u'ReadworksInc', u'2012')
(u'ReadworksInc', u'2013')
(u'SAMASOURCE INC', u'2010')
(u'SAMASOURCE INC', u'2011')
(u'SAMASOURCE INC', u'2013')
(u'SCIENCE BUDDIES', u'2013')
(u'SCIENCE BUDDIES', u'2014')
(u'SKYTRUTH', u'2011')
(u'Single Stop USA Inc', u'2011')
(u'Single Stop USA Inc', u'2012')
(u'Single Stop USA Inc', u'2013')
(u'Single Stop USA Inc', u'2014')
(u'SkyTruth', u'2010')
(u'SkyTruth', u'2012')
(u'SkyTruth', u'2013')
(u'SkyTruth', u'2014')
(u'TEACHAIDS', u'2014')
(u'TECHSOUP GLOBAL', u'2009')
(u'TECHSOUP GLOBAL', u'2010')
(u'TECHSOUP GLOBAL', u'2011')
(u'TECHSOUP GLOBAL', u'2012')
(u'TECHSOUP GLOBAL', u'2013')
(u'TECHSOUP GLOBAL', u'2014')
(u'THE 1947 PARTITION ARCHIVE', u'2012')
(u'THE 1947 PARTITION ARCHIVE', u'2013')
(u'THE 1947 PARTITION ARCHIVE', u'2014')
(u'THE CENTER TO PROMOTE HEALTHCARE ACCESS INC', u'2011')
(u'THE CENTER TO PROMOTE HEALTHCARE ACCESS INC', u'2012')
(u'THE CENTER TO PROMOTE HEALTHCARE ACCESS INC', u'2013')
(u'THE CENTER TO PROMOTE HEALTHCARE ACCESS INC', u'2014')
(u'THE FREECYCLE NETWORK', u'2009')
(u'THE FREECYCLE NETWORK', u'2011')
(u'THE FREECYCLE NETWORK', u'2012')
(u'THE FREECYCLE NETWORK', u'2014')
(u'THE GET SCHOOLED FOUNDATION', u'2012')
(u'THE GET SCHOOLED FOUNDATION', u'2013')
(u'THE GET SCHOOLED FOUNDATION', u'2014')
(u'THE KIVA FOUNDATION', u'2010')
(u'THE KIVA FOUNDATION', u'2011')
(u'THE KIVA FOUNDATION', u'2012')
(u'THE KIVA FOUNDATION', u'2013')
(u'THE KIVA FOUNDATION', u'2014')
(u'THE LITERACY LAB', u'2012')
(u'THE LITERACY LAB', u'2013')
(u'THE LITERACY LAB', u'2014')
(u'THE OPENGOV FOUNDATION', u'2013')
(u'THE OPENGOV FOUNDATION', u'2014')
(u'THE STELLAR FOUNDATION', u'2012')
(u'THE STELLAR FOUNDATION', u'2013')
(u'THE STELLAR FOUNDATION', u'2014')
(u'TIDEPOOL PROJECT', u'2013')
(u'TIDEPOOL PROJECT', u'2014')
(u'TRANSPARENCY TOOLKIT INC', u'2014')
(u'The Center to Promote Healthcare Access Inc', u'2009')
(u'The Lunchbox Fund', u'2012')
(u'The Lunchbox Fund', u'2013')
(u'The Lunchbox Fund', u'2014')
(u'The MakeSense Foundation', u'2014')
(u'UNIVERSALGIVING', u'2011')
(u'UNIVERSALGIVING', u'2012')
(u'UNIVERSALGIVING', u'2013')
(u'UNIVERSALGIVING', u'2014')
(u'Video Volunteers', u'2009')
(u'Video Volunteers', u'2010')
(u'Video Volunteers', u'2011')
(u'Video Volunteers', u'2013')
(u'Video Volunteers', u'2014')
(u'WATSI INC', u'2012')
(u'WATSI INC', u'2013')
(u'WISHBONEORG', u'2011')
(u'WISHBONEORG', u'2012')
(u'WISHBONEORG', u'2013')
(u'WISHBONEORG', u'2014')
(u'WITNESS INC', u'2014')
(u'WORLDREADERORG', u'2011')
(u'WORLDREADERORG', u'2013')
(u'WORLDREADERORG', u'2014')
(u'Wikimedia Foundation Inc', u'2009')
(u'Wikimedia Foundation Inc', u'2010')
(u'Wikimedia Foundation Inc', u'2011')
(u'Wikimedia Foundation Inc', u'2012')
(u'Wikimedia Foundation Inc', u'2013')
(u'Wikimedia Foundation Inc', u'2014')
(u'ZEARN INC', u'2014')
(u'ZIDISHA INC', u'2012')
(u'ZIDISHA INC', u'2013')
(u'ZIDISHA INC', u'2014')
(u'edX Inc', u'2014')
(u'iCivics Inc', u'2009')
(u'iCivics Inc', u'2010')
(u'iCivics Inc', u'2011')
(u'iCivics Inc', u'2012')
(u'iCivics Inc', u'2013')
(u'iCivics Inc', u'2014')
(u'myAgro Farms', u'2013')
(u'myAgro Farms', u'2014')
(u'weTHRIVE INC', u'2013')
(u'weTHRIVE INC', u'2014')

In [36]:
# other features
df['org_age'] = pd.to_numeric(df['tax_year'], errors='coerce') - pd.to_numeric(df['year_formed'], errors='coerce')

In [37]:
# convert categorical features (city and state) to boolean
from sklearn import preprocessing

# integer code for cities
df['city'] = df['city'].str.upper()
cities = sorted(df['city'].unique())
df['city_int'] = df['city'].map(lambda x: cities.index(x))

# integer code for states
states = sorted(df['state'].unique())
df['state_int'] = df['state'].map(lambda x: states.index(x))

# perform one-hot enconding
enc = preprocessing.OneHotEncoder()
enc.fit(df[['city_int', 'state_int']])
transformed = enc.transform(df[['city_int', 'state_int']]).toarray()

# add features to df
header = ['in_%s' % city for city in cities] + ['in_%s' % state for state in states]
transformed_df = pd.DataFrame(transformed, columns=header)
df = pd.concat([df, transformed_df], axis=1)
print 'have features', df.columns.values


have features ['org_name' 'city' 'state' 'ein' 'url' 'tax_year' 'year_formed'
 'submission_time' 'n_employees' 'total_revenue' 'title' 'comp' 'is_ceo'
 'is_cto' 'is_coo' 'is_cfo' 'is_bd' 'is_pres' 'is_cmo' 'is_cdo' 'is_board'
 'is_founder' 'n_indiv_titles' 'n_ceo_in_org_year' 'has_ceo_in_org_year'
 'total_comp_ceo_in_org_year' 'has_comp_ceo_in_org_year'
 'n_cto_in_org_year' 'has_cto_in_org_year' 'total_comp_cto_in_org_year'
 'has_comp_cto_in_org_year' 'n_coo_in_org_year' 'has_coo_in_org_year'
 'total_comp_coo_in_org_year' 'has_comp_coo_in_org_year'
 'n_cfo_in_org_year' 'has_cfo_in_org_year' 'total_comp_cfo_in_org_year'
 'has_comp_cfo_in_org_year' 'n_bd_in_org_year' 'has_bd_in_org_year'
 'total_comp_bd_in_org_year' 'has_comp_bd_in_org_year' 'n_pres_in_org_year'
 'has_pres_in_org_year' 'total_comp_pres_in_org_year'
 'has_comp_pres_in_org_year' 'n_cmo_in_org_year' 'has_cmo_in_org_year'
 'total_comp_cmo_in_org_year' 'has_comp_cmo_in_org_year'
 'n_cdo_in_org_year' 'has_cdo_in_org_year' 'total_comp_cdo_in_org_year'
 'has_comp_cdo_in_org_year' 'n_board_in_org_year' 'has_board_in_org_year'
 'total_comp_board_in_org_year' 'has_comp_board_in_org_year'
 'n_founder_in_org_year' 'has_founder_in_org_year'
 'total_comp_founder_in_org_year' 'has_comp_founder_in_org_year' 'org_age'
 'city_int' 'state_int' u'in_ARCATA' u'in_BARRINGTON' u'in_BELLVALE'
 u'in_BERKELEY' u'in_BETHESDA' u'in_BETHSEDA' u'in_BOSTON' u'in_BROOKLYN'
 u'in_CAMBRIDGE' u'in_CARMEL' u'in_CHARLOTTE' u'in_CHICAGO' u'in_DANVERS'
 u'in_DENVER' u'in_ENCINO' u'in_HELENA' u'in_HOUSTON' u'in_HYANNIS'
 u'in_IRVINE' u'in_JACKSONVILLE' u'in_LAKELAND' u'in_LOS ANGELES'
 u'in_MIAMI' u'in_MINNEAPOLIS' u'in_MOUNTAIN VIEW' u'in_NEW YORK'
 u'in_OAKLAND' u'in_PALO ALTO' u'in_PARK CITY' u'in_PLEASANTON'
 u'in_REDWOOD CITY' u'in_SALEM' u'in_SAN FRANCISCO' u'in_SANTA ANA'
 u'in_SEATTLE' u'in_SHEPHERDSTOWN' u'in_STANFORD' u'in_STERLING'
 u'in_SUNNYVALE' u'in_TUCSON' u'in_WALTHAM' u'in_WARWICK' u'in_WASHINGTON'
 u'in_WAYNESVILLE' u'in_WEST SACRAMENTO' u'in_ZIONSVILLE' u'in_AZ' u'in_CA'
 u'in_CO' u'in_DC' u'in_FL' u'in_IL' u'in_IN' u'in_MA' u'in_MD' u'in_MN'
 u'in_MT' u'in_NC' u'in_NY' u'in_OH' u'in_RI' u'in_TX' u'in_UT' u'in_VA'
 u'in_VT' u'in_WA' u'in_WV']

In [38]:
# output
df.to_csv('output/comp_data_features.csv', index=False)