In [1]:
import numpy as np
import pandas as pd
Let's import the data
In [2]:
#Load the companies dataset from CSV
companies = pd.read_csv('data/companies.csv')
In [3]:
#Preview the structure of the companies data (first three rows)
companies.head(3)
Out[3]:
In [4]:
#Restrict to USA companies first — non-USA rows usually carry a lot of missing data
companies_USA = companies.loc[companies['country_code'].eq('USA')]
In [5]:
#Check whether state_code has any missing values for USA-based companies
companies_USA['state_code'].unique()
#companies_USA['state_code'].value_counts()
#No NaN values were found for state_code
#Keep region and city in the dataset, although they will probably go unused
#companies_USA['city'].value_counts()
Out[5]:
In [6]:
from operator import methodcaller
#Let's analyze category_list and probably expand it as dummy variables
#get the unique category_list strings
categories = list(companies_USA['category_list'].astype('str').unique())
#split each category string by |
categories = map(methodcaller("split", "|"), categories)
#flatten the list of sub categories
categories = [item for sublist in categories for item in sublist]
#the flattened list still contains duplicates (the same category can occur in
#many distinct category_list strings), so count the DISTINCT categories here
#categories
len(set(categories))
Out[6]:
In [7]:
#We'll need to select the most important categories (those that appear most often, and use Other for the rest)
companies_series = companies_USA['category_list'].astype('str')
#Count occurrences of each category across all companies.
#Flattening once and calling value_counts() gives the same totals as the
#previous per-row apply(pd.Series(...).value_counts()).sum(), but avoids
#constructing one temporary Series per row, which was very slow.
categories_splitted_count = pd.Series(
    [cat for cats in companies_series.str.split('|') for cat in cats]
).value_counts()
In [8]:
#One-hot encode the pipe-separated category_list into one indicator column per category
dummies = companies_series.str.get_dummies(sep='|')
In [9]:
########### Keep only the 50 most frequent categories ###########
top50categories = categories_splitted_count.sort_values(ascending=False).index[:50].tolist()
##### Build a dataframe holding just the top-50 category dummies, to be concatenated to the full dataframe later
categories_df = dummies[top50categories].add_prefix('Category_')
In [10]:
#Let's start by comparing and understanding the difference between investments.csv and rounds.csv
df_investments = pd.read_csv('data/investments.csv')
df_investments = df_investments.loc[df_investments['company_country_code'] == 'USA']
df_rounds = pd.read_csv('data/rounds.csv')
df_rounds = df_rounds.loc[df_rounds['company_country_code'] == 'USA']
In [193]:
#Spot-check one company ('/organization/0xdata') across the three dataframes to compare their structure
#companies_USA[companies_USA['permalink'] == '/organization/0xdata']
#df_investments[df_investments['company_permalink'] == '/organization/0xdata' ]
#df_rounds[df_rounds['company_permalink'] == '/organization/0xdata' ]
Out[193]:
The difference between investments and rounds is that investments records where the money came from: it lists which investors participated in each funding round. Rounds, on the other hand, groups and totals that information per round.
In [12]:
#df_rounds
#Prepare an aggregated rounds dataframe grouped by company and funding type
rounds_agg = df_rounds.groupby(['company_permalink', 'funding_round_type'])['raised_amount_usd'].agg({'amount': [ pd.Series.sum, pd.Series.count]})
In [13]:
#List the unique funding types present in the aggregated index
funding_types = rounds_agg.index.levels[1].tolist()
funding_types
Out[13]:
In [14]:
#Prepare the dataframe where all the dummy features for each funding type will be added (number of rounds and total sum for each type)
rounds_df = companies_USA[['permalink']].rename(columns={'permalink': 'company_permalink'})
#Iterate over each kind of funding type, and add two new features for each into the dataframe
def add_dummy_for_funding_type(df, aggr_rounds, funding_type):
    """Merge the totals for one funding type into df as two new columns.

    Parameters
    ----------
    df : DataFrame with a 'company_permalink' column.
    aggr_rounds : DataFrame indexed by (company_permalink, funding_round_type)
        with ('amount', 'sum') and ('amount', 'count') MultiIndex columns.
    funding_type : str, the funding round type to extract.

    Returns a new dataframe with '<funding_type>_funding_total_usd' and
    '<funding_type>_funding_rounds' added; companies without rounds of that
    type get 0 for both.
    """
    mask = aggr_rounds.index.get_level_values('funding_round_type') == funding_type
    funding_df = aggr_rounds.iloc[mask].reset_index()
    #flatten the ('amount', 'sum'/'count') MultiIndex columns before renaming
    funding_df.columns = funding_df.columns.droplevel()
    funding_df.columns = ['company_permalink', funding_type, funding_type + '_funding_total_usd', funding_type + '_funding_rounds']
    #the positional axis argument to drop() was removed in pandas 2.0 — use columns=
    funding_df = funding_df.drop(columns=funding_type)
    new_df = pd.merge(df, funding_df, on='company_permalink', how='left')
    #companies with no rounds of this type get NaN from the left merge; zero them
    return new_df.fillna(0)
#rounds_agg was generated a few steps above
for funding_type in funding_types:
    rounds_df = add_dummy_for_funding_type(rounds_df, rounds_agg, funding_type)
#remove the company_permalink variable, since it's already available in the companies dataframe
#(the positional axis argument to drop() was removed in pandas 2.0 — use columns=)
rounds_df = rounds_df.drop(columns='company_permalink')
#set rounds_df to have the same index as the other dataframes
rounds_df.index = companies_USA.index
rounds_df[:3]
Out[14]:
In [15]:
#Assemble the final frame: company attributes, top-category dummies and per-funding-type features, side by side
frames = [companies_USA, categories_df, rounds_df]
startups_df = pd.concat(frames, axis=1, ignore_index=False)
startups_df[:3]
Out[15]:
In [212]:
#Use each company's permalink as the row index, then drop it as a column
startups_df.index = list(startups_df['permalink'])
#the positional axis argument to drop() was removed in pandas 2.0 — use columns=
startups_df = startups_df.drop(columns='permalink')
In [211]:
#Persist the assembled dataframe for the next stage of the analysis
startups_df.to_csv('data/startups_1.csv')
#startups_df
In [ ]: