In [24]:
import pandas as pd
# Load the startups table, using the first CSV column as the row index.
startups_csv_path = 'data/startups_1_1.csv'
startups = pd.read_csv(startups_csv_path, index_col=0)
# Preview the first three rows.
startups.head(3)
Out[24]:
In [25]:
# Drop the columns that are not carried forward as features.
# `columns=` is used instead of a positional axis argument: pandas 2.0 no
# longer accepts `drop(labels, 1)`.
startups_dropped_features = startups.drop(
    columns=['name', 'homepage_url', 'category_list', 'region', 'city', 'country_code']
)
# Move status to the end so it is the last column of the frame.
cols = list(startups_dropped_features)
cols.append(cols.pop(cols.index('status')))
# `.loc` replaces the `.ix` indexer, which was removed in pandas 1.0.
startups_dropped_features = startups_dropped_features.loc[:, cols]
startups_dropped_features[:3]
Out[25]:
In [26]:
from sklearn import preprocessing
# Work on a copy so the frame from the previous step stays untouched.
startups_normalized = startups_dropped_features.copy()

# Convert the '-' placeholder in funding_total_usd to zero before scaling.
funding_total = startups_normalized['funding_total_usd']
startups_normalized['funding_total_usd'] = funding_total.replace(to_replace='-', value=0)

# Select the columns to scale by name pattern: funding_rounds,
# funding_total_usd, and anything containing "number_of" or "avg_".
scale_pattern = ".*(funding_rounds|funding_total_usd)|(number_of|avg_).*"
columns_to_scale = startups_normalized.filter(regex=scale_pattern).columns.tolist()

# Fit the min-max scaler on those columns and write the scaled values back.
min_max_scaler = preprocessing.MinMaxScaler()
scaled_values = min_max_scaler.fit_transform(startups_normalized[columns_to_scale])
startups_normalized[columns_to_scale] = scaled_values
startups_normalized.head(3)
Out[26]:
In [28]:
from datetime import datetime
from dateutil import relativedelta
def date_to_age_in_months(date, reference_date='2017-01-01'):
    """Return the number of whole months between ``date`` and ``reference_date``.

    Parameters
    ----------
    date : str, float, int or None
        A date formatted as ``'%Y-%m-%d'``. ``None``, NaN (detected through
        ``date != date``) and ``0`` are treated as missing.
    reference_date : str, optional
        The ``'%Y-%m-%d'`` date the age is measured up to. Defaults to
        ``'2017-01-01'``.

    Returns
    -------
    int
        ``years * 12 + months`` of the relativedelta from ``date`` to
        ``reference_date``; ``0`` when ``date`` is missing.

    Raises
    ------
    ValueError
        If ``date`` or ``reference_date`` is a string that does not match
        ``'%Y-%m-%d'`` (raised by ``datetime.strptime``).
    """
    # Missing markers: None, NaN (the only value not equal to itself) and 0.
    if date is None or date != date or date == 0:
        return 0
    start = datetime.strptime(date, '%Y-%m-%d')
    end = datetime.strptime(reference_date, '%Y-%m-%d')
    delta = relativedelta.relativedelta(end, start)
    return delta.years * 12 + delta.months
# Copy the scaled frame, then replace each date column with its age in months.
startups_dates_normalized = startups_normalized.copy()
for date_column in ('founded_at', 'first_funding_at', 'last_funding_at'):
    column_values = startups_dates_normalized[date_column]
    startups_dates_normalized[date_column] = column_values.map(date_to_age_in_months)
startups_dates_normalized.head(3)
Out[28]:
In [29]:
# Min-max scale the three month-age columns in place.
# Note: fit_transform re-fits the scaler object on these columns.
date_age_columns = ['founded_at', 'first_funding_at', 'last_funding_at']
scaled_date_ages = min_max_scaler.fit_transform(startups_dates_normalized[date_age_columns])
startups_dates_normalized[date_age_columns] = scaled_date_ages
startups_dates_normalized.head(3)
Out[29]:
In [30]:
# List the distinct values found in the status column.
pd.unique(startups_dates_normalized['status'])
Out[30]:
In [31]:
# Count how many rows carry each status value.
status_column = startups_dates_normalized['status']
status_column.value_counts()
Out[31]:
In [32]:
# Write the processed frame to disk.
startups_dates_normalized.to_csv(path_or_buf='data/startups_2.csv')