In [29]:
import pandas as pd
startups = pd.read_csv('data/startups_1.csv', index_col=0)
startups[:3]


Out[29]:
name homepage_url category_list funding_total_usd status country_code state_code region city funding_rounds ... product_crowdfunding_funding_total_usd product_crowdfunding_funding_rounds secondary_market_funding_total_usd secondary_market_funding_rounds seed_funding_total_usd seed_funding_rounds undisclosed_funding_total_usd undisclosed_funding_rounds venture_funding_total_usd venture_funding_rounds
/organization/-qounter :Qounter http://www.qounter.com Application Platforms|Real Time|Social Network... 700000 operating USA DE DE - Other Delaware City 2 ... 0.0 0.0 0.0 0.0 700000.0 1.0 0.0 0.0 0.0 0.0
/organization/004-technologies 004 Technologies http://004gmbh.de/en/004-interact Software - operating USA IL Springfield, Illinois Champaign 1 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
/organization/0xdata H2O.ai http://h2o.ai/ Analytics 33600000 operating USA CA SF Bay Area Mountain View 4 ... 0.0 0.0 0.0 0.0 3000000.0 1.0 0.0 0.0 30600000.0 3.0

3 rows × 91 columns

With the variables we found so far here, we achieved a maximum performance of 75% (ROC AUC), so let's try to extract some more features in order to increase the model performance

Let's find the # of acquisitons made by each company


In [30]:
#I'm considering only Acquisitions made in USA, with USD (dollars)
acquisitions = pd.read_csv('data/acquisitions.csv')
acquisitions = acquisitions[acquisitions['acquirer_country_code'] == 'USA']
acquisitions[:3]


Out[30]:
company_permalink company_name company_category_list company_country_code company_state_code company_region company_city acquirer_permalink acquirer_name acquirer_category_list acquirer_country_code acquirer_state_code acquirer_region acquirer_city acquired_at acquired_month price_amount price_currency_code
1 /organization/0958572-b-c-ltd 0958572 B.C. Ltd. NaN NaN NaN NaN NaN /organization/atlas-intellectual-property-mana... ATLAS Intellectual Property Management Co. Finance|FinTech|Mobile|Telecommunications USA WA Seattle Seattle 2012-02-02 2012-02 9000000.0 USD
2 /organization/1-800-communications 1-800 Communications NaN USA NY Long Island Hicksville /organization/carsdirect-com CarsDirect.com E-Commerce USA CA Los Angeles El Segundo 2005-06-12 2005-06 NaN USD
3 /organization/1-800-contacts 1-800 Contacts Consumer Goods|Retail USA UT Salt Lake City Draper /organization/thomas-h-lee-partners Thomas H. Lee Partners Business Development|Impact Investing|Investme... USA MA Boston Boston 2014-01-07 2014-01 NaN USD

In [31]:
#acquirer_permalink
#rounds_agg = df_rounds.groupby(['company_permalink', 'funding_round_type'])['raised_amount_usd'].agg({'amount': [ pd.Series.sum, pd.Series.count]})
number_of_acquisitions = acquisitions.groupby(['acquirer_permalink'])['acquirer_permalink'].agg({'amount': [ pd.Series.count]}).reset_index()
number_of_acquisitions.columns = number_of_acquisitions.columns.droplevel()
number_of_acquisitions.columns = ['permalink', 'number_of_acquisitions']
number_of_acquisitions = number_of_acquisitions.set_index('permalink')
number_of_acquisitions[:3]


Out[31]:
number_of_acquisitions
permalink
/organization/1-800-flowers-com 1
/organization/1-800-we-answer 1
/organization/11i-solutions 2

Let's find the # of investments made by each company


In [32]:
investments = pd.read_csv('data/investments.csv')
investments = investments[investments['investor_country_code'] == 'USA']
investments[:3]


Out[32]:
company_permalink company_name company_category_list company_country_code company_state_code company_region company_city investor_permalink investor_name investor_country_code investor_state_code investor_region investor_city funding_round_permalink funding_round_type funding_round_code funded_at raised_amount_usd
3 /organization/0xdata H2O.ai Analytics USA CA SF Bay Area Mountain View /organization/capital-one Capital One USA VA Washington, D.C. Mclean /funding-round/3bb2ee4a2d89251a10aaa735b1180e44 venture B 2015-11-09 20000000.0
4 /organization/0xdata H2O.ai Analytics USA CA SF Bay Area Mountain View /organization/nexus-venture-partners Nexus Venture Partners USA CA SF Bay Area Menlo Park /funding-round/383a9bd2c04f7038bb543ccef5ba3eae seed NaN 2013-05-22 3000000.0
5 /organization/0xdata H2O.ai Analytics USA CA SF Bay Area Mountain View /organization/nexus-venture-partners Nexus Venture Partners USA CA SF Bay Area Menlo Park /funding-round/3bb2ee4a2d89251a10aaa735b1180e44 venture B 2015-11-09 20000000.0

In [33]:
#acquirer_permalink
#rounds_agg = df_rounds.groupby(['company_permalink', 'funding_round_type'])['raised_amount_usd'].agg({'amount': [ pd.Series.sum, pd.Series.count]})
number_of_investments = investments.groupby(['investor_permalink'])['investor_permalink'].agg({'amount': [ pd.Series.count]}).reset_index()
number_of_investments.columns = number_of_investments.columns.droplevel()
number_of_investments.columns = ['permalink', 'number_of_investments']
number_of_investments = number_of_investments.set_index('permalink')
number_of_investments[:3]


Out[33]:
number_of_investments
permalink
/organization/1-800-flowers-com 1
/organization/10-10-10 1
/organization/10x-venture-partners 3

In [34]:
#Number of different companies in which each company have invested in
number_of_unique_investments = investments.groupby(['investor_permalink'])['company_permalink'].agg({'amount': [ pd.Series.nunique]}).reset_index()
number_of_unique_investments.columns = number_of_unique_investments.columns.droplevel()
number_of_unique_investments.columns = ['permalink', 'number_of_unique_investments']
number_of_unique_investments = number_of_unique_investments.set_index('permalink')
number_of_unique_investments[:3]


Out[34]:
number_of_unique_investments
permalink
/organization/1-800-flowers-com 1
/organization/10-10-10 1
/organization/10x-venture-partners 2

In [35]:
number_of_investors_per_round = investments.groupby(['company_permalink', 'funding_round_permalink'])['investor_permalink'].agg({'investor_permalink': [ pd.Series.count]}).reset_index()
number_of_investors_per_round.columns = number_of_investors_per_round.columns.droplevel(0)
number_of_investors_per_round.columns = ['company_permalink', 'funding_round_permalink', 'count']
number_of_investors_per_round = number_of_investors_per_round.groupby(['company_permalink']).agg({'count': [ pd.Series.mean]}).reset_index()
number_of_investors_per_round.columns = number_of_investors_per_round.columns.droplevel(0)
number_of_investors_per_round.columns = ['company_permalink', 'number_of_investors_per_round']
number_of_investors_per_round = number_of_investors_per_round.set_index('company_permalink')
number_of_investors_per_round[:3]


Out[35]:
number_of_investors_per_round
company_permalink
/organization/0xdata 2.500000
/organization/1 3.666667
/organization/1-800-dentist 2.000000

In [36]:
from numpy import nanmean
#investments['raised_amount_usd'].dtype()
investments['raised_amount_usd'] = investments['raised_amount_usd'].astype(float)
avg_amount_invested_per_round = investments.groupby(['company_permalink', 'funding_round_permalink'])['raised_amount_usd'].agg({'raised_amount_usd': [ pd.Series.mean]}).reset_index()
avg_amount_invested_per_round.columns = avg_amount_invested_per_round.columns.droplevel(0)
avg_amount_invested_per_round.columns = ['company_permalink', 'funding_round_permalink', 'mean']
avg_amount_invested_per_round = avg_amount_invested_per_round.groupby(['company_permalink']).agg({'mean': [ pd.Series.mean]}).reset_index()
avg_amount_invested_per_round.columns = avg_amount_invested_per_round.columns.droplevel(0)
avg_amount_invested_per_round.columns = ['company_permalink', 'avg_amount_invested_per_round']
avg_amount_invested_per_round = avg_amount_invested_per_round.set_index('company_permalink')
avg_amount_invested_per_round = avg_amount_invested_per_round.fillna(0)
avg_amount_invested_per_round[:3]


Out[36]:
avg_amount_invested_per_round
company_permalink
/organization/0xdata 8400000.0
/organization/1 575025.0
/organization/1-800-dentist 0.0

In [10]:



Out[10]:
name homepage_url category_list funding_total_usd status country_code state_code region city funding_rounds ... product_crowdfunding_funding_total_usd product_crowdfunding_funding_rounds secondary_market_funding_total_usd secondary_market_funding_rounds seed_funding_total_usd seed_funding_rounds undisclosed_funding_total_usd undisclosed_funding_rounds venture_funding_total_usd venture_funding_rounds
/organization/-qounter :Qounter http://www.qounter.com Application Platforms|Real Time|Social Network... 700000 operating USA DE DE - Other Delaware City 2 ... 0.0 0.0 0.0 0.0 700000.0 1.0 0.0 0.0 0.0 0.0
/organization/004-technologies 004 Technologies http://004gmbh.de/en/004-interact Software - operating USA IL Springfield, Illinois Champaign 1 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
/organization/0xdata H2O.ai http://h2o.ai/ Analytics 33600000 operating USA CA SF Bay Area Mountain View 4 ... 0.0 0.0 0.0 0.0 3000000.0 1.0 0.0 0.0 30600000.0 3.0

3 rows × 91 columns


In [37]:
startups = startups.join(number_of_acquisitions).join(number_of_investments).join(number_of_unique_investments).join(number_of_investors_per_round).join(avg_amount_invested_per_round)

In [48]:
startups[['number_of_acquisitions', 'number_of_investments', 'number_of_unique_investments','number_of_investors_per_round', 'avg_amount_invested_per_round']] = startups[['number_of_acquisitions', 'number_of_investments', 'number_of_unique_investments','number_of_investors_per_round', 'avg_amount_invested_per_round']].fillna(value=0)

In [49]:
startups[:3]


Out[49]:
name homepage_url category_list funding_total_usd status country_code state_code region city funding_rounds ... seed_funding_rounds undisclosed_funding_total_usd undisclosed_funding_rounds venture_funding_total_usd venture_funding_rounds number_of_acquisitions number_of_investments number_of_unique_investments number_of_investors_per_round avg_amount_invested_per_round
/organization/-qounter :Qounter http://www.qounter.com Application Platforms|Real Time|Social Network... 700000 operating USA DE DE - Other Delaware City 2 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
/organization/004-technologies 004 Technologies http://004gmbh.de/en/004-interact Software - operating USA IL Springfield, Illinois Champaign 1 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
/organization/0xdata H2O.ai http://h2o.ai/ Analytics 33600000 operating USA CA SF Bay Area Mountain View 4 ... 1.0 0.0 0.0 30600000.0 3.0 0.0 0.0 0.0 2.5 8400000.0

3 rows × 96 columns


In [50]:
startups.to_csv('data/startups_1_1.csv')