Dataset Preparation

Here we'll be removing nan's, normalizing numerical features, converting date features to numerical normalized features, and so on...

Importing the dataset



In [24]:

    
import pandas as pd
startups = pd.read_csv('data/startups_1_1.csv', index_col=0)
startups[:3]









    Out[24]:






  
    
      
      name
      homepage_url
      category_list
      funding_total_usd
      status
      country_code
      state_code
      region
      city
      funding_rounds
      ...
      seed_funding_rounds
      undisclosed_funding_total_usd
      undisclosed_funding_rounds
      venture_funding_total_usd
      venture_funding_rounds
      number_of_acquisitions
      number_of_investments
      number_of_unique_investments
      number_of_investors_per_round
      avg_amount_invested_per_round
    
  
  
    
      /organization/-qounter
      :Qounter
      http://www.qounter.com
      Application Platforms|Real Time|Social Network...
      700000
      operating
      USA
      DE
      DE - Other
      Delaware City
      2
      ...
      1.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      /organization/004-technologies
      004 Technologies
      http://004gmbh.de/en/004-interact
      Software
      -
      operating
      USA
      IL
      Springfield, Illinois
      Champaign
      1
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
    
    
      /organization/0xdata
      H2O.ai
      http://h2o.ai/
      Analytics
      33600000
      operating
      USA
      CA
      SF Bay Area
      Mountain View
      4
      ...
      1.0
      0.0
      0.0
      30600000.0
      3.0
      0.0
      0.0
      0.0
      2.5
      8400000.0
    
  

3 rows × 96 columns

Droping some features

We'll drop homepage_url, category_list, region, city, country_code We'll also move status to the end of the dataframe



In [25]:

    
#drop features
startups_dropped_features = startups.drop(['name','homepage_url', 'category_list', 'region', 'city', 'country_code'], 1)


#move status to the end
cols = list(startups_dropped_features)
cols.append(cols.pop(cols.index('status')))
startups_dropped_features = startups_dropped_features.ix[:, cols]

startups_dropped_features[:3]









    Out[25]:






  
    
      
      funding_total_usd
      state_code
      funding_rounds
      founded_at
      first_funding_at
      last_funding_at
      Category_Software
      Category_Biotechnology
      Category_Mobile
      Category_Enterprise Software
      ...
      undisclosed_funding_total_usd
      undisclosed_funding_rounds
      venture_funding_total_usd
      venture_funding_rounds
      number_of_acquisitions
      number_of_investments
      number_of_unique_investments
      number_of_investors_per_round
      avg_amount_invested_per_round
      status
    
  
  
    
      /organization/-qounter
      700000
      DE
      2
      2014-09-04
      2014-03-01
      2014-10-14
      0
      0
      0
      0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      operating
    
    
      /organization/004-technologies
      -
      IL
      1
      2010-01-01
      2014-07-24
      2014-07-24
      1
      0
      0
      0
      ...
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      0.0
      operating
    
    
      /organization/0xdata
      33600000
      CA
      4
      2011-01-01
      2013-01-03
      2015-11-09
      0
      0
      0
      0
      ...
      0.0
      0.0
      30600000.0
      3.0
      0.0
      0.0
      0.0
      2.5
      8400000.0
      operating
    
  

3 rows × 90 columns

Normalizing numeric variables



In [26]:

    
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()

startups_normalized = startups_dropped_features.copy()

#Convert '-' to zeros in funding_total_usd
startups_normalized['funding_total_usd'] = startups_normalized['funding_total_usd'].replace('-', 0)

columns_to_scale = list(startups_normalized.filter(regex=(".*(funding_rounds|funding_total_usd)|(number_of|avg_).*")).columns)
startups_normalized[columns_to_scale] = min_max_scaler.fit_transform(startups_normalized[columns_to_scale])
startups_normalized[:3]









    Out[26]:






  
    
      
      funding_total_usd
      state_code
      funding_rounds
      founded_at
      first_funding_at
      last_funding_at
      Category_Software
      Category_Biotechnology
      Category_Mobile
      Category_Enterprise Software
      ...
      undisclosed_funding_total_usd
      undisclosed_funding_rounds
      venture_funding_total_usd
      venture_funding_rounds
      number_of_acquisitions
      number_of_investments
      number_of_unique_investments
      number_of_investors_per_round
      avg_amount_invested_per_round
      status
    
  
  
    
      /organization/-qounter
      0.000023
      DE
      0.055556
      2014-09-04
      2014-03-01
      2014-10-14
      0
      0
      0
      0
      ...
      0.0
      0.0
      0.000000
      0.000000
      0.0
      0.0
      0.0
      0.000000
      0.000000
      operating
    
    
      /organization/004-technologies
      0.000000
      IL
      0.000000
      2010-01-01
      2014-07-24
      2014-07-24
      1
      0
      0
      0
      ...
      0.0
      0.0
      0.000000
      0.000000
      0.0
      0.0
      0.0
      0.000000
      0.000000
      operating
    
    
      /organization/0xdata
      0.001117
      CA
      0.166667
      2011-01-01
      2013-01-03
      2015-11-09
      0
      0
      0
      0
      ...
      0.0
      0.0
      0.001739
      0.214286
      0.0
      0.0
      0.0
      0.104167
      0.000477
      operating
    
  

3 rows × 90 columns

Normalizing date variables



In [28]:

    
from datetime import datetime
from dateutil import relativedelta



def date_to_age_in_months(date):
    if date != date or date == 0: #is NaN
        return 0
    date1 = datetime.strptime(date, '%Y-%m-%d')
    date2 = datetime.strptime('2017-01-01', '%Y-%m-%d') #get age until 01/01/2017
    delta =  relativedelta.relativedelta(date2, date1)
    return delta.years * 12 + delta.months
    
    
startups_dates_normalized = startups_normalized.copy()
startups_dates_normalized['founded_at'] = startups_dates_normalized['founded_at'].map(date_to_age_in_months)
startups_dates_normalized['first_funding_at'] = startups_dates_normalized['first_funding_at'].map(date_to_age_in_months)
startups_dates_normalized['last_funding_at'] = startups_dates_normalized['last_funding_at'].map(date_to_age_in_months)

startups_dates_normalized[:3]









    Out[28]:






  
    
      
      funding_total_usd
      state_code
      funding_rounds
      founded_at
      first_funding_at
      last_funding_at
      Category_Software
      Category_Biotechnology
      Category_Mobile
      Category_Enterprise Software
      ...
      undisclosed_funding_total_usd
      undisclosed_funding_rounds
      venture_funding_total_usd
      venture_funding_rounds
      number_of_acquisitions
      number_of_investments
      number_of_unique_investments
      number_of_investors_per_round
      avg_amount_invested_per_round
      status
    
  
  
    
      /organization/-qounter
      0.000023
      DE
      0.055556
      27
      34
      26
      0
      0
      0
      0
      ...
      0.0
      0.0
      0.000000
      0.000000
      0.0
      0.0
      0.0
      0.000000
      0.000000
      operating
    
    
      /organization/004-technologies
      0.000000
      IL
      0.000000
      84
      29
      29
      1
      0
      0
      0
      ...
      0.0
      0.0
      0.000000
      0.000000
      0.0
      0.0
      0.0
      0.000000
      0.000000
      operating
    
    
      /organization/0xdata
      0.001117
      CA
      0.166667
      72
      47
      13
      0
      0
      0
      0
      ...
      0.0
      0.0
      0.001739
      0.214286
      0.0
      0.0
      0.0
      0.104167
      0.000477
      operating
    
  

3 rows × 90 columns



In [29]:

    
startups_dates_normalized[['founded_at', 'first_funding_at', 'last_funding_at']] = min_max_scaler.fit_transform(startups_dates_normalized[['founded_at', 'first_funding_at', 'last_funding_at']])
startups_dates_normalized[:3]









    Out[29]:






  
    
      
      funding_total_usd
      state_code
      funding_rounds
      founded_at
      first_funding_at
      last_funding_at
      Category_Software
      Category_Biotechnology
      Category_Mobile
      Category_Enterprise Software
      ...
      undisclosed_funding_total_usd
      undisclosed_funding_rounds
      venture_funding_total_usd
      venture_funding_rounds
      number_of_acquisitions
      number_of_investments
      number_of_unique_investments
      number_of_investors_per_round
      avg_amount_invested_per_round
      status
    
  
  
    
      /organization/-qounter
      0.000023
      DE
      0.055556
      0.005917
      0.002825
      0.027344
      0
      0
      0
      0
      ...
      0.0
      0.0
      0.000000
      0.000000
      0.0
      0.0
      0.0
      0.000000
      0.000000
      operating
    
    
      /organization/004-technologies
      0.000000
      IL
      0.000000
      0.018409
      0.002410
      0.033203
      1
      0
      0
      0
      ...
      0.0
      0.0
      0.000000
      0.000000
      0.0
      0.0
      0.0
      0.000000
      0.000000
      operating
    
    
      /organization/0xdata
      0.001117
      CA
      0.166667
      0.015779
      0.003906
      0.001953
      0
      0
      0
      0
      ...
      0.0
      0.0
      0.001739
      0.214286
      0.0
      0.0
      0.0
      0.104167
      0.000477
      operating
    
  

3 rows × 90 columns

Analzying types of status



In [30]:

    
startups_dates_normalized['status'].unique()









    Out[30]:





array(['operating', 'acquired', 'closed', 'ipo'], dtype=object)



In [31]:

    
startups_dates_normalized['status'].value_counts()









    Out[31]:





operating    29429
acquired      4156
closed        2907
ipo           1109
Name: status, dtype: int64

Saved prepared database to csv



In [32]:

    
startups_dates_normalized.to_csv('data/startups_2.csv')

	name	homepage_url	category_list	funding_total_usd	status	country_code	state_code	region	city	funding_rounds	...	seed_funding_rounds	venture_funding_total_usd	venture_funding_rounds	number_of_investors_per_round	avg_amount_invested_per_round
/organization/-qounter	:Qounter	http://www.qounter.com	Application Platforms\|Real Time\|Social Network...	700000	operating	USA	DE	DE - Other	Delaware City	2	...	1.0	0.0	0.0	0.0	0.0
/organization/004-technologies	004 Technologies	http://004gmbh.de/en/004-interact	Software	-	operating	USA	IL	Springfield, Illinois	Champaign	1	...	0.0	0.0	0.0	0.0	0.0
/organization/0xdata	H2O.ai	http://h2o.ai/	Analytics	33600000	operating	USA	CA	SF Bay Area	Mountain View	4	...	1.0	30600000.0	3.0	2.5	8400000.0

	funding_total_usd	state_code	funding_rounds	founded_at	first_funding_at	last_funding_at	Category_Software	...	venture_funding_total_usd	venture_funding_rounds	number_of_investors_per_round	avg_amount_invested_per_round	status
/organization/-qounter	700000	DE	2	2014-09-04	2014-03-01	2014-10-14	0	...	0.0	0.0	0.0	0.0	operating
/organization/004-technologies	-	IL	1	2010-01-01	2014-07-24	2014-07-24	1	...	0.0	0.0	0.0	0.0	operating
/organization/0xdata	33600000	CA	4	2011-01-01	2013-01-03	2015-11-09	0	...	30600000.0	3.0	2.5	8400000.0	operating

	funding_total_usd	state_code	funding_rounds	founded_at	first_funding_at	last_funding_at	Category_Software	...	venture_funding_total_usd	venture_funding_rounds	number_of_investors_per_round	avg_amount_invested_per_round	status
/organization/-qounter	0.000023	DE	0.055556	2014-09-04	2014-03-01	2014-10-14	0	...	0.000000	0.000000	0.000000	0.000000	operating
/organization/004-technologies	0.000000	IL	0.000000	2010-01-01	2014-07-24	2014-07-24	1	...	0.000000	0.000000	0.000000	0.000000	operating
/organization/0xdata	0.001117	CA	0.166667	2011-01-01	2013-01-03	2015-11-09	0	...	0.001739	0.214286	0.104167	0.000477	operating