In [1]:
# You should be running python3
import sys
print(sys.version)
In [2]:
import pandas as pd # http://pandas.pydata.org/
import numpy as np # http://numpy.org/
import statsmodels.api as sm # http://statsmodels.sourceforge.net/stable/index.html
import statsmodels.formula.api as smf
import statsmodels
print("Pandas Version: {}".format(pd.__version__)) # pandas version
print("StatsModels Version: {}".format(statsmodels.__version__)) # StatsModels version
In [3]:
# load the pwc dataset from azure
from azureml import Workspace
ws = Workspace()
ds = ws.datasets['pwc_moneytree.csv']
frame = ds.to_dataframe()
In [4]:
frame.head()
Out[4]:
In [5]:
del frame['Grand Total']
frame.columns = ['year', 'type', 'q1', 'q2', 'q3', 'q4']
frame['year'] = frame['year'].fillna(method='ffill')
frame.head()
Out[5]:
Deals and investments are in alternating rows of frame, let's separate them
In [6]:
deals_df = frame.iloc[0::2]
investments_df = frame.iloc[1::2]
In [7]:
# once separated, 'type' field is identical within each df
# let's delete it
del deals_df['type']
del investments_df['type']
In [8]:
deals_df.head()
Out[8]:
In [9]:
investments_df.head()
Out[9]:
In [10]:
def unstack_to_series(df):
"""
Takes q1-q4 in a dataframe and converts it to a series
input: a dataframe containing ['q1', 'q2', 'q3', 'q4']
ouput: a pandas series
"""
quarters = ['q1', 'q2', 'q3', 'q4']
d = dict()
for i, row in df.iterrows():
for q in quarters:
key = str(int(row['year'])) + q
d[key] = row[q]
# print(key, q, row[q])
return pd.Series(d)
In [11]:
deals = unstack_to_series(deals_df ).dropna()
investments = unstack_to_series(investments_df).dropna()
In [12]:
def string_to_int(money_string):
numerals = [c if c.isnumeric() else '' for c in money_string]
return int(''.join(numerals))
In [13]:
# convert deals from string to integers
deals = deals.apply(string_to_int)
deals.tail()
Out[13]:
In [14]:
# investment in billions USD
# converts to integers - which is ok, since data is in dollars
investments_b = investments.apply(string_to_int)
# in python3 division automatically converts numbers to floats, we don't loose precicion
investments_b = investments_b / 10**9
# round data to 2 decimals
investments_b = investments_b.apply(round, ndigits=2)
investments_b.tail()
Out[14]:
In [15]:
import matplotlib.pyplot as plt # http://matplotlib.org/
import matplotlib.patches as mpatches
import matplotlib.ticker as ticker
%matplotlib inline
# change matplotlib inline display size
# import matplotlib.pylab as pylab
# pylab.rcParams['figure.figsize'] = (8, 6) # that's default image size for this interactive session
fig, ax1 = plt.subplots()
ax1.set_title("VC historical trend (US Data)")
t = range(len(investments_b)) # need to substitute tickers for years later
width = t[1]-t[0]
y1 = investments_b
# create filled step chart for investment amount
ax1.bar(t, y1, width=width, facecolor='0.80', edgecolor='', label = 'Investment ($ Bln.)')
ax1.set_ylabel('Investment ($ Bln.)')
# set up xlabels with years
years = [str(year)[:-2] for year in deals.index][::4] # get years without quarter
ax1.set_xticks(t[::4]) # set 1 tick per year
ax1.set_xticklabels(years, rotation=50) # set tick names
ax1.set_xlabel('Year') # name X axis
# format Y1 tickers to $ billions
formatter = ticker.FormatStrFormatter('$%1.0f Bil.')
ax1.yaxis.set_major_formatter(formatter)
for tick in ax1.yaxis.get_major_ticks():
tick.label1On = False
tick.label2On = True
# create second Y2 axis for Num of Deals
ax2 = ax1.twinx()
y2 = deals
ax2.plot(t, y2, color = 'k', ls = '-', label = 'Num. of Deals')
ax2.set_ylabel('Num. of Deals')
# add annotation bubbles
ax2.annotate('1997-2000 dot-com bubble', xy=(23, 2100), xytext=(6, 1800),
bbox=dict(boxstyle="round4", fc="w"),
arrowprops=dict(arrowstyle="-|>",
connectionstyle="arc3,rad=0.2",
fc="w"),
)
ax2.annotate('2007-08 Financial Crisis', xy=(57, 800), xytext=(40, 1300),
bbox=dict(boxstyle="round4", fc="w"),
arrowprops=dict(arrowstyle="-|>",
connectionstyle="arc3,rad=-0.2",
fc="w"),
)
# add legend
ax1.legend(loc="best")
ax2.legend(bbox_to_anchor=(0.95, 0.88))
fig.tight_layout() # solves cropping problems when saving png
fig.savefig('vc_trend_3.png', dpi=250)
plt.show()
In [18]:
def tex(df):
"""
Print dataframe contents in latex-ready format
"""
for line in df.to_latex().split('\n'):
print(line)
In [34]:
ds = ws.datasets['ipo_mna.csv']
frame = ds.to_dataframe()
frame.tail()
Out[34]:
In [35]:
frame = frame.iloc[:-2]
frame = frame.set_index('q')
frame
Out[35]:
In [36]:
ds = ws.datasets['wsj_unicorns.csv']
frame = ds.to_dataframe()
frame.tail()
Out[36]:
In [37]:
# data from Founder Collective
# http://www.foundercollective.com/
ds = ws.datasets['most_funded_ipo.csv']
frame = ds.to_dataframe()
most_funded = frame.copy()
most_funded.tail()
Out[37]:
In [38]:
from datetime import datetime
most_funded['Firm age'] = datetime.now().year - most_funded['Founded']
most_funded['Years to IPO'] = most_funded['IPO Year'] - most_funded['Founded']
In [39]:
# extract all funding rounds
# R1, R2, ... are funding rounds (Raising VC)
most_funded.iloc[:,2:22:2].tail()
Out[39]:
In [40]:
# [axis = 1] to sum by row instead of by-column
most_funded['VC'] = most_funded.iloc[:,2:22:2].sum(axis=1)
# VC data is in MILLIONS of $
In [41]:
most_funded['IPO Raise'].head(3)
Out[41]:
In [42]:
# convert IPO string to MILLIONS of $
converter = lambda x: round(int((x.replace(',',''))[1:])/10**6, 2)
most_funded['IPO Raise'] = most_funded['IPO Raise' ].apply(converter)
most_funded['Current Market Cap'] = most_funded['Current Market Cap '].apply(converter)
del most_funded['Current Market Cap ']
In [43]:
most_funded['IPO Raise'].head(3)
Out[43]:
In [44]:
# MILLIONS of $
most_funded['VC and IPO'] = most_funded['VC'] + most_funded['IPO Raise']
In [45]:
# Price in ordinary $
most_funded['$ Price change'] = most_funded['Current Share Price'] - most_funded['IPO Share Price']
In [46]:
most_funded['% Price change'] = round(most_funded['$ Price change'] / most_funded['IPO Share Price'], 2)
Facebook is an extreme outlier in venture capital, let's exclude it from our analysis
In [47]:
mask = most_funded['Firm'] == 'Facebook'
most_funded[mask]
Out[47]:
In [48]:
# removing Facebook
most_funded = most_funded[~mask]
In [49]:
# look at all the columns
[print(c) for c in most_funded.columns]
None
In [50]:
cols = most_funded.columns[:2].append(most_funded.columns[22:])
cols
Out[50]:
In [51]:
# remove individual funding rounds - we'll only analyze aggregates
most_funded = most_funded[cols]
In [52]:
from matplotlib.ticker import FuncFormatter
x = most_funded['Firm']
y = sorted(most_funded['VC'], reverse=True)
def millions(x, pos):
'The two args are the value and tick position'
return '$%1.0fM' % (x)
formatter = FuncFormatter(millions)
fig, ax = plt.subplots(figsize=(6,4), dpi=200)
ax.yaxis.set_major_formatter(formatter)
#plt.figure(figsize=(6,4), dpi=200)
# Create a new subplot from a grid of 1x1
# plt.subplot(111)
plt.title("Total VC raised for unicorns")
plt.bar(range(len(x)+2), [0,0]+y, width = 1, facecolor='0.80', edgecolor='k', linewidth=0.3)
plt.ylabel('VC raised per firm\n(before IPO)')
# plt.set_xticks(x) # set 1 tick per year
plt.xlabel('Firms')
plt.xticks([])
plt.show()
In [53]:
cols = ['Firm', 'Sector', 'VC', 'Current Market Cap']
df = most_funded[cols]
df.set_index('Firm', inplace = True)
df.head(2)
Out[53]:
In [54]:
tmp = df.groupby('Sector').sum().applymap(int)
tmp.index += ' Total'
tmp.sort_index(ascending=False, inplace = True)
tmp
Out[54]:
In [55]:
tmp2 = df.groupby('Sector').mean().applymap(int)
tmp2.index += ' Average'
tmp2.sort_index(ascending=False, inplace = True)
tmp2
Out[55]:
In [56]:
tmp.append(tmp2).applymap(lambda x: "${:,}".format(x))
Out[56]:
In [57]:
tex(tmp.append(tmp2).applymap(lambda x: "${:,}".format(x)))
In [58]:
most_funded['Mult'] = (most_funded['Current Market Cap'] / most_funded['VC']).replace([np.inf, -np.inf], np.nan)
most_funded.head()
Out[58]:
In [59]:
tex(most_funded.iloc[:,list(range(8))+list(range(11,20))].head().T)
In [60]:
most_funded.head()
most_funded['Current Market Cap']
Out[60]:
In [61]:
least_20 = most_funded.dropna().sort_values('VC')[1:21]
least_20 = least_20[['VC', 'Current Market Cap','Mult']].mean()
least_20
Out[61]:
In [62]:
most_20 = most_funded.dropna().sort_values('VC')[-20:]
most_20 = most_20[['VC', 'Current Market Cap','Mult']].mean()
most_20
Out[62]:
In [63]:
pd.DataFrame([most_20, least_20], index=['most_20', 'least_20']).applymap(lambda x: round(x, 2))
Out[63]:
In [64]:
tex(pd.DataFrame([most_20, least_20], index=['most_20', 'least_20']).applymap(lambda x: round(x, 2)))
In [65]:
cols = ['Sector', 'VC', '% Price change', 'Firm age', 'Years to IPO', 'Current Market Cap', 'Mult']
df = most_funded[cols]
df.columns = ['Sector', 'VC', 'Growth', 'Age', 'yearsIPO', 'marketCAP', 'Mult']
df.head(2)
Out[65]:
In [66]:
res = smf.ols(formula='Growth ~ VC + yearsIPO + C(Sector)', data=df).fit()
print(res.summary())
In [67]:
res = smf.ols(formula='Growth ~ VC + Age + yearsIPO + C(Sector)', data=df).fit()
print(res.summary())
In [68]:
print(res.summary().as_latex())
In [69]:
res = smf.ols(formula='Mult ~ VC + yearsIPO + C(Sector)', data=df).fit()
print(res.summary())
In [ ]: