This is a Jupyter notebook for David Dobrinskiy's HSE Thesis

How Venture Capital Affects Startups' Success


In [7]:
# You should be running python3

import sys
print(sys.version)


3.5.2 | packaged by conda-forge | (default, Jul 26 2016, 01:37:38) 
[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.54)]

In [8]:
import pandas as pd  # http://pandas.pydata.org/
import numpy as np   # http://numpy.org/

import statsmodels.api as sm  # http://statsmodels.sourceforge.net/stable/index.html
import statsmodels.formula.api as smf
import statsmodels

print("Pandas Version: {}".format(pd.__version__))          # pandas version
print("StatsModels Version: {}".format(statsmodels.__version__))  # StatsModels version


Pandas Version: 0.19.0
StatsModels Version: 0.6.1

Let us look at the dynamics of total US VC investment


In [9]:
# load the pwc dataset from azure



frame = pd.read_csv('pwc_moneytree.csv')

In [10]:
frame.head()


Out[10]:
Quarter Unnamed: 1 1 2 3 4 Grand Total
0 1995.0 # of Deals 503 466 434 494 1,897
1 NaN Amount Invested $1,690,040,500 $2,546,374,500 $1,709,754,500 $2,070,353,900 $8,016,523,400
2 1996.0 # of Deals 585 671 604 775 2,635
3 NaN Amount Invested $2,437,964,900 $3,017,298,000 $2,606,841,300 $3,223,318,800 $11,285,423,000
4 1997.0 # of Deals 773 778 762 919 3,232

In [11]:
del frame['Grand Total']
frame.columns = ['year', 'type', 'q1', 'q2', 'q3', 'q4']
frame['year'] = frame['year'].fillna(method='ffill')
frame.head()


Out[11]:
year type q1 q2 q3 q4
0 1995.0 # of Deals 503 466 434 494
1 1995.0 Amount Invested $1,690,040,500 $2,546,374,500 $1,709,754,500 $2,070,353,900
2 1996.0 # of Deals 585 671 604 775
3 1996.0 Amount Invested $2,437,964,900 $3,017,298,000 $2,606,841,300 $3,223,318,800
4 1997.0 # of Deals 773 778 762 919

Deals and investments are in alternating rows of frame, let's separate them


In [12]:
deals_df = frame.iloc[0::2]
investments_df = frame.iloc[1::2]

In [13]:
# once separated, 'type' field is identical within each df
# let's delete it

del       deals_df['type']
del investments_df['type']

In [14]:
deals_df.head()


Out[14]:
year q1 q2 q3 q4
0 1995.0 503 466 434 494
2 1996.0 585 671 604 775
4 1997.0 773 778 762 919
6 1998.0 871 912 931 1,030
8 1999.0 936 1,330 1,448 1,892

In [15]:
investments_df.head()


Out[15]:
year q1 q2 q3 q4
1 1995.0 $1,690,040,500 $2,546,374,500 $1,709,754,500 $2,070,353,900
3 1996.0 $2,437,964,900 $3,017,298,000 $2,606,841,300 $3,223,318,800
5 1997.0 $3,104,430,200 $3,691,566,400 $3,775,959,100 $4,499,598,900
7 1998.0 $4,199,545,200 $5,784,400,500 $5,412,295,700 $6,165,090,000
9 1999.0 $6,648,622,700 $11,315,434,200 $13,654,453,100 $23,340,087,500

In [16]:
def unstack_to_series(df):
    """
    Takes q1-q4 in a dataframe and converts it to a series
    
    input: a dataframe containing ['q1', 'q2', 'q3', 'q4']
    
    ouput: a pandas series
    """
    quarters = ['q1', 'q2', 'q3', 'q4']
    d = dict()
    for i, row in df.iterrows():
        for q in quarters:
            key = str(int(row['year'])) + q
            d[key] = row[q]

            # print(key, q, row[q])
    return pd.Series(d)

In [17]:
deals = unstack_to_series(deals_df      ).dropna()
investments = unstack_to_series(investments_df).dropna()

In [18]:
def string_to_int(money_string):
    numerals = [c if c.isnumeric() else '' for c in money_string]
    return int(''.join(numerals))

In [19]:
# convert deals from string to integers
deals = deals.apply(string_to_int)
deals.tail()


Out[19]:
2015q3    1189
2015q4    1047
2016q1    1021
2016q2     999
2016q3     891
dtype: int64

In [20]:
# investment in billions USD
# converts to integers - which is ok, since data is in dollars
investments_b = investments.apply(string_to_int) 
# in python3 division automatically converts numbers to floats, we don't loose precicion
investments_b = investments_b / 10**9
# round data to 2 decimals
investments_b = investments_b.apply(round, ndigits=2)
investments_b.tail()


Out[20]:
2015q3    16.71
2015q4    12.34
2016q1    12.64
2016q2    15.57
2016q3    10.63
dtype: float64

Plot data from MoneyTree report

http://www.pwcmoneytree.com


In [23]:
import matplotlib.pyplot  as plt  # http://matplotlib.org/
import matplotlib.patches as mpatches
import matplotlib.ticker  as ticker
%matplotlib inline

# change matplotlib inline display size
# import matplotlib.pylab as pylab
# pylab.rcParams['figure.figsize'] = (8, 6)  # that's default image size for this interactive session

fig, ax1 = plt.subplots()
ax1.set_title("VC historical trend (US Data)")
t = range(len(investments_b))      # need to substitute tickers for years later
width = t[1]-t[0]
y1 = investments_b

# create filled step chart for investment amount
ax1.bar(t, y1, width=width, facecolor='0.80', edgecolor='', label = 'Investment ($ Bln.)')
ax1.set_ylabel('Investment ($ Bln.)')

# set up xlabels with years
years = [str(year)[:-2] for year in deals.index][::4]  # get years without quarter 
ax1.set_xticks(t[::4])                   # set 1 tick per year
ax1.set_xticklabels(years, rotation=50)  # set tick names
ax1.set_xlabel('Year')                   # name X axis

# format Y1 tickers to $ billions
formatter = ticker.FormatStrFormatter('$%1.0f Bil.')
ax1.yaxis.set_major_formatter(formatter)
for tick in ax1.yaxis.get_major_ticks():
    tick.label1On = False
    tick.label2On = True

# create second Y2 axis for Num of Deals
ax2 = ax1.twinx()
y2 = deals
ax2.plot(t, y2, color = 'k', ls = '-', label = 'Num. of Deals')
ax2.set_ylabel('Num. of Deals')

# add annotation bubbles
ax2.annotate('1997-2000 dot-com bubble', xy=(23, 2100), xytext=(3, 1800),
                bbox=dict(boxstyle="round4", fc="w"),
                arrowprops=dict(arrowstyle="-|>",
                                connectionstyle="arc3,rad=0.2",
                                fc="w"),
            )

ax2.annotate('2007-08 Financial Crisis', xy=(57, 800), xytext=(40, 1300),
                bbox=dict(boxstyle="round4", fc="w"),
                arrowprops=dict(arrowstyle="-|>",
                                connectionstyle="arc3,rad=-0.2",
                                fc="w"), 
            )


# add legend
ax1.legend(loc="best")
ax2.legend(bbox_to_anchor=(0.95, 0.88))

fig.tight_layout()  # solves cropping problems when saving png
fig.savefig('vc_trend_3.png', dpi=250)


plt.show()



In [22]:
# load countries dataset from azure
ds = ws.datasets['country_data.csv']


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-22-3291b42f67a8> in <module>()
      1 # load countries dataset from azure
----> 2 ds = ws.datasets['country_data.csv']

NameError: name 'ws' is not defined

In [ ]:
# data for 2015
country_data = ds.to_dataframe()
country_data

In [ ]:
def tex(df):
    """
    Print dataframe contents in latex-ready format
    """
    for line in df.to_latex().split('\n'):
        print(line)

In [ ]:
params = pd.DataFrame(country_data['Criteria'])
params.index = ['y'] + ['X'+str(i) for i in range(1, len(country_data))]
tex(params)

In [ ]:
# set index
country_data = country_data.set_index('Criteria')
# convert values to floats (note: comas need to be replaced by dots for python conversion to work)
country_data.index = ['y'] + ['X'+str(i) for i in range(1, len(country_data))]
country_data

prepare data for ols


In [ ]:
const = pd.Series([1]*len(country_data.columns), index = country_data.columns, name = 'X0')
const

In [ ]:
country_data = pd.concat([pd.DataFrame(const).T, country_data])
country_data = country_data.sort_index()
country_data

In [ ]:
tex(country_data)

In [ ]:
y = country_data.iloc[-1,:]
y

In [ ]:
X = country_data.iloc[:-1, :].T
X

In [ ]:
# Fit regression model
results = sm.OLS(y, X).fit()

# Inspect the results in latex doc, {tab:vc_ols_1}
print(results.summary())

In [ ]:
# Inspect the results in latex doc, {tab:vc_ols_1}
print(results.summary().as_latex())

In [ ]:
# equation for eq:OLS_1_coeffs in LaTeX

equation = 'Y ='
for i, coeff in results.params.iteritems():
    sign = '+' if coeff >= 0 else '-'
    equation += ' ' + sign + str(abs(round(coeff,2))) + '*' + i
print(equation)

In [ ]:
# correlation table
corr = country_data.T.corr().iloc[1:,1:]
corr = corr.applymap(lambda x: round(x, 2))
corr

In [ ]:
# corr table to latex
tex(corr)

In [ ]:
import itertools
# set of unique parameter pairs
pairs = set([frozenset(pair) for pair in itertools.permutations(list(corr.index), 2)]) 
for pair in pairs:
    pair = sorted(list(pair))
    corr_pair = corr.loc[pair[0],pair[1]]
    if corr_pair > 0.7:
        print(pair, round(corr_pair, 2))
        print('-'*40)

In [ ]:
print('a')

In [ ]:
for i in corr.columns:
    for j in corr.columns:
        if abs(corr.loc[i, j]) > 0.7 and i != j:
            print(i+'~'+j, corr.loc[i, j])

In [ ]: