In [1]:
import numpy as np
import pandas as pd
Let's import the data
In [2]:
#Load the companies dataset from CSV
companies = pd.read_csv('data/companies.csv')
In [3]:
#Preview the structure of the companies data (first three rows)
companies.head(3)
Out[3]:
In [4]:
#Restrict to USA companies first — non-USA rows usually carry a lot of missing data
companies_USA = companies.loc[companies['country_code'].eq('USA')]
In [5]:
#Check whether state_code has any missing values for USA-based companies
companies_USA['state_code'].unique()
#companies_USA['state_code'].value_counts()
#No NaN values were found for state_code
#Keep region and city in the dataset, although they will probably go unused
#companies_USA['city'].value_counts()
Out[5]:
In [6]:
from operator import methodcaller
#Let's analyze category_list and probably expand it as dummy variables
#get the unique category_list strings
categories = list(companies_USA['category_list'].astype('str').unique())
#split each category string by |
categories = map(methodcaller("split", "|"), categories)
#flatten the list of sub categories
categories = [item for sublist in categories for item in sublist]
#the flattened list still contains duplicates (the same category can occur in
#many distinct category_list strings), so count the DISTINCT categories here
#categories
len(set(categories))
Out[6]:
In [7]:
#We'll need to select the most important categories (those that appear most often, and use Other for the rest)
companies_series = companies_USA['category_list'].astype('str')
#Count occurrences of each category across all companies.
#Flattening once and calling value_counts() gives the same totals as the
#previous per-row apply(pd.Series(...).value_counts()).sum(), but avoids
#constructing one temporary Series per row, which was very slow.
categories_splitted_count = pd.Series(
    [cat for cats in companies_series.str.split('|') for cat in cats]
).value_counts()
In [8]:
#One-hot encode the pipe-separated category_list into one indicator column per category
dummies = companies_series.str.get_dummies(sep='|')
In [9]:
########### Keep only the 50 most frequent categories ###########
top50categories = categories_splitted_count.sort_values(ascending=False).index[:50].tolist()
##### Build a dataframe holding just the top-50 category dummies, to be concatenated to the full dataframe later
categories_df = dummies[top50categories].add_prefix('Category_')
In [10]:
#Let's start by comparing and understanding the difference between investments.csv and rounds.csv
df_investments = pd.read_csv('data/investments.csv')
df_investments = df_investments.loc[df_investments['company_country_code'] == 'USA']
df_rounds = pd.read_csv('data/rounds.csv')
df_rounds = df_rounds.loc[df_rounds['company_country_code'] == 'USA']
In [193]:
#Spot-check one company ('/organization/0xdata') across the three dataframes to compare their structure
#companies_USA[companies_USA['permalink'] == '/organization/0xdata']
#df_investments[df_investments['company_permalink'] == '/organization/0xdata' ]
#df_rounds[df_rounds['company_permalink'] == '/organization/0xdata' ]
Out[193]:
The difference between investments and rounds is that investments records where the money came from: it lists which investors participated in each funding round. Rounds, on the other hand, groups and totals that information per round.
In [12]:
#df_rounds
#Prepare an aggregated rounds dataframe grouped by company and funding type
rounds_agg = df_rounds.groupby(['company_permalink', 'funding_round_type'])['raised_amount_usd'].agg({'amount': [ pd.Series.sum, pd.Series.count]})
In [13]:
#List the unique funding types present in the aggregated index
funding_types = rounds_agg.index.levels[1].tolist()
funding_types
Out[13]:
In [14]:
#Prepare the dataframe where all the dummy features for each funding type will be added (number of rounds and total sum for each type)
rounds_df = companies_USA[['permalink']].rename(columns={'permalink': 'company_permalink'})
#Iterate over each kind of funding type, and add two new features for each into the dataframe
def add_dummy_for_funding_type(df, aggr_rounds, funding_type):
    """Merge the totals for one funding type into df as two new columns.

    Parameters
    ----------
    df : DataFrame with a 'company_permalink' column.
    aggr_rounds : DataFrame indexed by (company_permalink, funding_round_type)
        with ('amount', 'sum') and ('amount', 'count') MultiIndex columns.
    funding_type : str, the funding round type to extract.

    Returns a new dataframe with '<funding_type>_funding_total_usd' and
    '<funding_type>_funding_rounds' added; companies without rounds of that
    type get 0 for both.
    """
    mask = aggr_rounds.index.get_level_values('funding_round_type') == funding_type
    funding_df = aggr_rounds.iloc[mask].reset_index()
    #flatten the ('amount', 'sum'/'count') MultiIndex columns before renaming
    funding_df.columns = funding_df.columns.droplevel()
    funding_df.columns = ['company_permalink', funding_type, funding_type + '_funding_total_usd', funding_type + '_funding_rounds']
    #the positional axis argument to drop() was removed in pandas 2.0 — use columns=
    funding_df = funding_df.drop(columns=funding_type)
    new_df = pd.merge(df, funding_df, on='company_permalink', how='left')
    #companies with no rounds of this type get NaN from the left merge; zero them
    return new_df.fillna(0)
#rounds_agg was generated a few steps above
for funding_type in funding_types:
    rounds_df = add_dummy_for_funding_type(rounds_df, rounds_agg, funding_type)
#remove the company_permalink variable, since it's already available in the companies dataframe
#(the positional axis argument to drop() was removed in pandas 2.0 — use columns=)
rounds_df = rounds_df.drop(columns='company_permalink')
#set rounds_df to have the same index as the other dataframes
rounds_df.index = companies_USA.index
rounds_df[:3]
Out[14]:
In [15]:
#Assemble the final frame: company attributes, top-category dummies and per-funding-type features, side by side
frames = [companies_USA, categories_df, rounds_df]
startups_df = pd.concat(frames, axis=1, ignore_index=False)
startups_df[:3]
Out[15]:
In [212]:
#Use each company's permalink as the row index, then drop it as a column
startups_df.index = list(startups_df['permalink'])
#the positional axis argument to drop() was removed in pandas 2.0 — use columns=
startups_df = startups_df.drop(columns='permalink')
In [211]:
#Persist the assembled dataframe for the next stage of the analysis
startups_df.to_csv('data/startups_1.csv')
#startups_df
In [ ]: