Dataset Preparation

Here we'll be removing nan's, normalizing numerical features, converting date features to numerical normalized features, and so on...

Importing the dataset


In [24]:
import pandas as pd
startups = pd.read_csv('data/startups_1_1.csv', index_col=0)
startups[:3]


Out[24]:
name homepage_url category_list funding_total_usd status country_code state_code region city funding_rounds ... seed_funding_rounds undisclosed_funding_total_usd undisclosed_funding_rounds venture_funding_total_usd venture_funding_rounds number_of_acquisitions number_of_investments number_of_unique_investments number_of_investors_per_round avg_amount_invested_per_round
/organization/-qounter :Qounter http://www.qounter.com Application Platforms|Real Time|Social Network... 700000 operating USA DE DE - Other Delaware City 2 ... 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
/organization/004-technologies 004 Technologies http://004gmbh.de/en/004-interact Software - operating USA IL Springfield, Illinois Champaign 1 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
/organization/0xdata H2O.ai http://h2o.ai/ Analytics 33600000 operating USA CA SF Bay Area Mountain View 4 ... 1.0 0.0 0.0 30600000.0 3.0 0.0 0.0 0.0 2.5 8400000.0

3 rows × 96 columns

Droping some features

We'll drop homepage_url, category_list, region, city, country_code We'll also move status to the end of the dataframe


In [25]:
#drop features
startups_dropped_features = startups.drop(['name','homepage_url', 'category_list', 'region', 'city', 'country_code'], 1)


#move status to the end
cols = list(startups_dropped_features)
cols.append(cols.pop(cols.index('status')))
startups_dropped_features = startups_dropped_features.ix[:, cols]

startups_dropped_features[:3]


Out[25]:
funding_total_usd state_code funding_rounds founded_at first_funding_at last_funding_at Category_Software Category_Biotechnology Category_Mobile Category_Enterprise Software ... undisclosed_funding_total_usd undisclosed_funding_rounds venture_funding_total_usd venture_funding_rounds number_of_acquisitions number_of_investments number_of_unique_investments number_of_investors_per_round avg_amount_invested_per_round status
/organization/-qounter 700000 DE 2 2014-09-04 2014-03-01 2014-10-14 0 0 0 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 operating
/organization/004-technologies - IL 1 2010-01-01 2014-07-24 2014-07-24 1 0 0 0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 operating
/organization/0xdata 33600000 CA 4 2011-01-01 2013-01-03 2015-11-09 0 0 0 0 ... 0.0 0.0 30600000.0 3.0 0.0 0.0 0.0 2.5 8400000.0 operating

3 rows × 90 columns

Normalizing numeric variables


In [26]:
from sklearn import preprocessing
min_max_scaler = preprocessing.MinMaxScaler()

startups_normalized = startups_dropped_features.copy()

#Convert '-' to zeros in funding_total_usd
startups_normalized['funding_total_usd'] = startups_normalized['funding_total_usd'].replace('-', 0)

columns_to_scale = list(startups_normalized.filter(regex=(".*(funding_rounds|funding_total_usd)|(number_of|avg_).*")).columns)
startups_normalized[columns_to_scale] = min_max_scaler.fit_transform(startups_normalized[columns_to_scale])
startups_normalized[:3]


Out[26]:
funding_total_usd state_code funding_rounds founded_at first_funding_at last_funding_at Category_Software Category_Biotechnology Category_Mobile Category_Enterprise Software ... undisclosed_funding_total_usd undisclosed_funding_rounds venture_funding_total_usd venture_funding_rounds number_of_acquisitions number_of_investments number_of_unique_investments number_of_investors_per_round avg_amount_invested_per_round status
/organization/-qounter 0.000023 DE 0.055556 2014-09-04 2014-03-01 2014-10-14 0 0 0 0 ... 0.0 0.0 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.000000 operating
/organization/004-technologies 0.000000 IL 0.000000 2010-01-01 2014-07-24 2014-07-24 1 0 0 0 ... 0.0 0.0 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.000000 operating
/organization/0xdata 0.001117 CA 0.166667 2011-01-01 2013-01-03 2015-11-09 0 0 0 0 ... 0.0 0.0 0.001739 0.214286 0.0 0.0 0.0 0.104167 0.000477 operating

3 rows × 90 columns

Normalizing date variables


In [28]:
from datetime import datetime
from dateutil import relativedelta



def date_to_age_in_months(date):
    if date != date or date == 0: #is NaN
        return 0
    date1 = datetime.strptime(date, '%Y-%m-%d')
    date2 = datetime.strptime('2017-01-01', '%Y-%m-%d') #get age until 01/01/2017
    delta =  relativedelta.relativedelta(date2, date1)
    return delta.years * 12 + delta.months
    
    
startups_dates_normalized = startups_normalized.copy()
startups_dates_normalized['founded_at'] = startups_dates_normalized['founded_at'].map(date_to_age_in_months)
startups_dates_normalized['first_funding_at'] = startups_dates_normalized['first_funding_at'].map(date_to_age_in_months)
startups_dates_normalized['last_funding_at'] = startups_dates_normalized['last_funding_at'].map(date_to_age_in_months)

startups_dates_normalized[:3]


Out[28]:
funding_total_usd state_code funding_rounds founded_at first_funding_at last_funding_at Category_Software Category_Biotechnology Category_Mobile Category_Enterprise Software ... undisclosed_funding_total_usd undisclosed_funding_rounds venture_funding_total_usd venture_funding_rounds number_of_acquisitions number_of_investments number_of_unique_investments number_of_investors_per_round avg_amount_invested_per_round status
/organization/-qounter 0.000023 DE 0.055556 27 34 26 0 0 0 0 ... 0.0 0.0 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.000000 operating
/organization/004-technologies 0.000000 IL 0.000000 84 29 29 1 0 0 0 ... 0.0 0.0 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.000000 operating
/organization/0xdata 0.001117 CA 0.166667 72 47 13 0 0 0 0 ... 0.0 0.0 0.001739 0.214286 0.0 0.0 0.0 0.104167 0.000477 operating

3 rows × 90 columns


In [29]:
startups_dates_normalized[['founded_at', 'first_funding_at', 'last_funding_at']] = min_max_scaler.fit_transform(startups_dates_normalized[['founded_at', 'first_funding_at', 'last_funding_at']])
startups_dates_normalized[:3]


Out[29]:
funding_total_usd state_code funding_rounds founded_at first_funding_at last_funding_at Category_Software Category_Biotechnology Category_Mobile Category_Enterprise Software ... undisclosed_funding_total_usd undisclosed_funding_rounds venture_funding_total_usd venture_funding_rounds number_of_acquisitions number_of_investments number_of_unique_investments number_of_investors_per_round avg_amount_invested_per_round status
/organization/-qounter 0.000023 DE 0.055556 0.005917 0.002825 0.027344 0 0 0 0 ... 0.0 0.0 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.000000 operating
/organization/004-technologies 0.000000 IL 0.000000 0.018409 0.002410 0.033203 1 0 0 0 ... 0.0 0.0 0.000000 0.000000 0.0 0.0 0.0 0.000000 0.000000 operating
/organization/0xdata 0.001117 CA 0.166667 0.015779 0.003906 0.001953 0 0 0 0 ... 0.0 0.0 0.001739 0.214286 0.0 0.0 0.0 0.104167 0.000477 operating

3 rows × 90 columns

Analzying types of status


In [30]:
startups_dates_normalized['status'].unique()


Out[30]:
array(['operating', 'acquired', 'closed', 'ipo'], dtype=object)

In [31]:
startups_dates_normalized['status'].value_counts()


Out[31]:
operating    29429
acquired      4156
closed        2907
ipo           1109
Name: status, dtype: int64

Saved prepared database to csv


In [32]:
startups_dates_normalized.to_csv('data/startups_2.csv')