This is a Jupyter notebook for David Dobrinskiy's HSE Thesis

How Venture Capital Affects Startups' Success



In [7]:

    
# You should be running python3

import sys
print(sys.version)









    



3.5.2 | packaged by conda-forge | (default, Jul 26 2016, 01:37:38) 
[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.54)]



In [8]:

    
import pandas as pd  # http://pandas.pydata.org/
import numpy as np   # http://numpy.org/

import statsmodels.api as sm  # http://statsmodels.sourceforge.net/stable/index.html
import statsmodels.formula.api as smf
import statsmodels

print("Pandas Version: {}".format(pd.__version__))          # pandas version
print("StatsModels Version: {}".format(statsmodels.__version__))  # StatsModels version









    



Pandas Version: 0.19.0
StatsModels Version: 0.6.1

Let us look at the dynamics of total US VC investment



In [9]:

    
# load the pwc dataset from azure



frame = pd.read_csv('pwc_moneytree.csv')



In [10]:

    
frame.head()









    Out[10]:






  
    
      
      Quarter
      Unnamed: 1
      1
      2
      3
      4
      Grand Total
    
  
  
    
      0
      1995.0
      # of Deals
      503
      466
      434
      494
      1,897
    
    
      1
      NaN
      Amount Invested
      $1,690,040,500
      $2,546,374,500
      $1,709,754,500
      $2,070,353,900
      $8,016,523,400
    
    
      2
      1996.0
      # of Deals
      585
      671
      604
      775
      2,635
    
    
      3
      NaN
      Amount Invested
      $2,437,964,900
      $3,017,298,000
      $2,606,841,300
      $3,223,318,800
      $11,285,423,000
    
    
      4
      1997.0
      # of Deals
      773
      778
      762
      919
      3,232



In [11]:

    
del frame['Grand Total']
frame.columns = ['year', 'type', 'q1', 'q2', 'q3', 'q4']
frame['year'] = frame['year'].fillna(method='ffill')
frame.head()









    Out[11]:






  
    
      
      year
      type
      q1
      q2
      q3
      q4
    
  
  
    
      0
      1995.0
      # of Deals
      503
      466
      434
      494
    
    
      1
      1995.0
      Amount Invested
      $1,690,040,500
      $2,546,374,500
      $1,709,754,500
      $2,070,353,900
    
    
      2
      1996.0
      # of Deals
      585
      671
      604
      775
    
    
      3
      1996.0
      Amount Invested
      $2,437,964,900
      $3,017,298,000
      $2,606,841,300
      $3,223,318,800
    
    
      4
      1997.0
      # of Deals
      773
      778
      762
      919

Deals and investments are in alternating rows of frame, let's separate them



In [12]:

    
deals_df = frame.iloc[0::2]
investments_df = frame.iloc[1::2]



In [13]:

    
# once separated, 'type' field is identical within each df
# let's delete it

del       deals_df['type']
del investments_df['type']



In [14]:

    
deals_df.head()



In [15]:

    
investments_df.head()









    Out[15]:






  
    
      
      year
      q1
      q2
      q3
      q4
    
  
  
    
      1
      1995.0
      $1,690,040,500
      $2,546,374,500
      $1,709,754,500
      $2,070,353,900
    
    
      3
      1996.0
      $2,437,964,900
      $3,017,298,000
      $2,606,841,300
      $3,223,318,800
    
    
      5
      1997.0
      $3,104,430,200
      $3,691,566,400
      $3,775,959,100
      $4,499,598,900
    
    
      7
      1998.0
      $4,199,545,200
      $5,784,400,500
      $5,412,295,700
      $6,165,090,000
    
    
      9
      1999.0
      $6,648,622,700
      $11,315,434,200
      $13,654,453,100
      $23,340,087,500



In [16]:

    
def unstack_to_series(df):
    """
    Takes q1-q4 in a dataframe and converts it to a series
    
    input: a dataframe containing ['q1', 'q2', 'q3', 'q4']
    
    ouput: a pandas series
    """
    quarters = ['q1', 'q2', 'q3', 'q4']
    d = dict()
    for i, row in df.iterrows():
        for q in quarters:
            key = str(int(row['year'])) + q
            d[key] = row[q]

            # print(key, q, row[q])
    return pd.Series(d)



In [17]:

    
deals = unstack_to_series(deals_df      ).dropna()
investments = unstack_to_series(investments_df).dropna()



In [18]:

    
def string_to_int(money_string):
    numerals = [c if c.isnumeric() else '' for c in money_string]
    return int(''.join(numerals))



In [19]:

    
# convert deals from string to integers
deals = deals.apply(string_to_int)
deals.tail()









    Out[19]:





2015q3    1189
2015q4    1047
2016q1    1021
2016q2     999
2016q3     891
dtype: int64



In [20]:

    
# investment in billions USD
# converts to integers - which is ok, since data is in dollars
investments_b = investments.apply(string_to_int) 
# in python3 division automatically converts numbers to floats, we don't loose precicion
investments_b = investments_b / 10**9
# round data to 2 decimals
investments_b = investments_b.apply(round, ndigits=2)
investments_b.tail()









    Out[20]:





2015q3    16.71
2015q4    12.34
2016q1    12.64
2016q2    15.57
2016q3    10.63
dtype: float64

Plot data from MoneyTree report

http://www.pwcmoneytree.com



In [23]:

    
import matplotlib.pyplot  as plt  # http://matplotlib.org/
import matplotlib.patches as mpatches
import matplotlib.ticker  as ticker
%matplotlib inline

# change matplotlib inline display size
# import matplotlib.pylab as pylab
# pylab.rcParams['figure.figsize'] = (8, 6)  # that's default image size for this interactive session

fig, ax1 = plt.subplots()
ax1.set_title("VC historical trend (US Data)")
t = range(len(investments_b))      # need to substitute tickers for years later
width = t[1]-t[0]
y1 = investments_b

# create filled step chart for investment amount
ax1.bar(t, y1, width=width, facecolor='0.80', edgecolor='', label = 'Investment ($ Bln.)')
ax1.set_ylabel('Investment ($ Bln.)')

# set up xlabels with years
years = [str(year)[:-2] for year in deals.index][::4]  # get years without quarter 
ax1.set_xticks(t[::4])                   # set 1 tick per year
ax1.set_xticklabels(years, rotation=50)  # set tick names
ax1.set_xlabel('Year')                   # name X axis

# format Y1 tickers to $ billions
formatter = ticker.FormatStrFormatter('$%1.0f Bil.')
ax1.yaxis.set_major_formatter(formatter)
for tick in ax1.yaxis.get_major_ticks():
    tick.label1On = False
    tick.label2On = True

# create second Y2 axis for Num of Deals
ax2 = ax1.twinx()
y2 = deals
ax2.plot(t, y2, color = 'k', ls = '-', label = 'Num. of Deals')
ax2.set_ylabel('Num. of Deals')

# add annotation bubbles
ax2.annotate('1997-2000 dot-com bubble', xy=(23, 2100), xytext=(3, 1800),
                bbox=dict(boxstyle="round4", fc="w"),
                arrowprops=dict(arrowstyle="-|>",
                                connectionstyle="arc3,rad=0.2",
                                fc="w"),
            )

ax2.annotate('2007-08 Financial Crisis', xy=(57, 800), xytext=(40, 1300),
                bbox=dict(boxstyle="round4", fc="w"),
                arrowprops=dict(arrowstyle="-|>",
                                connectionstyle="arc3,rad=-0.2",
                                fc="w"), 
            )


# add legend
ax1.legend(loc="best")
ax2.legend(bbox_to_anchor=(0.95, 0.88))

fig.tight_layout()  # solves cropping problems when saving png
fig.savefig('vc_trend_3.png', dpi=250)


plt.show()



In [22]:

    
# load countries dataset from azure
ds = ws.datasets['country_data.csv']









    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-22-3291b42f67a8> in <module>()
      1 # load countries dataset from azure
----> 2 ds = ws.datasets['country_data.csv']

NameError: name 'ws' is not defined



In [ ]:

    
# data for 2015
country_data = ds.to_dataframe()
country_data



In [ ]:

    
def tex(df):
    """
    Print dataframe contents in latex-ready format
    """
    for line in df.to_latex().split('\n'):
        print(line)



In [ ]:

    
params = pd.DataFrame(country_data['Criteria'])
params.index = ['y'] + ['X'+str(i) for i in range(1, len(country_data))]
tex(params)



In [ ]:

    
# set index
country_data = country_data.set_index('Criteria')
# convert values to floats (note: comas need to be replaced by dots for python conversion to work)
country_data.index = ['y'] + ['X'+str(i) for i in range(1, len(country_data))]
country_data

prepare data for ols



In [ ]:

    
const = pd.Series([1]*len(country_data.columns), index = country_data.columns, name = 'X0')
const



In [ ]:

    
country_data = pd.concat([pd.DataFrame(const).T, country_data])
country_data = country_data.sort_index()
country_data



In [ ]:

    
tex(country_data)



In [ ]:

    
y = country_data.iloc[-1,:]
y



In [ ]:

    
X = country_data.iloc[:-1, :].T
X



In [ ]:

    
# Fit regression model
results = sm.OLS(y, X).fit()

# Inspect the results in latex doc, {tab:vc_ols_1}
print(results.summary())



In [ ]:

    
# Inspect the results in latex doc, {tab:vc_ols_1}
print(results.summary().as_latex())



In [ ]:

    
# equation for eq:OLS_1_coeffs in LaTeX

equation = 'Y ='
for i, coeff in results.params.iteritems():
    sign = '+' if coeff >= 0 else '-'
    equation += ' ' + sign + str(abs(round(coeff,2))) + '*' + i
print(equation)



In [ ]:

    
# correlation table
corr = country_data.T.corr().iloc[1:,1:]
corr = corr.applymap(lambda x: round(x, 2))
corr



In [ ]:

    
# corr table to latex
tex(corr)



In [ ]:

    
import itertools
# set of unique parameter pairs
pairs = set([frozenset(pair) for pair in itertools.permutations(list(corr.index), 2)]) 
for pair in pairs:
    pair = sorted(list(pair))
    corr_pair = corr.loc[pair[0],pair[1]]
    if corr_pair > 0.7:
        print(pair, round(corr_pair, 2))
        print('-'*40)



In [ ]:

    
print('a')



In [ ]:

    
for i in corr.columns:
    for j in corr.columns:
        if abs(corr.loc[i, j]) > 0.7 and i != j:
            print(i+'~'+j, corr.loc[i, j])



In [ ]:

	Quarter	Unnamed: 1	1	2	3	4	Grand Total
0	1995.0	# of Deals	503	466	434	494	1,897
1	NaN	Amount Invested	$1,690,040,500	$2,546,374,500	$1,709,754,500	$2,070,353,900	$8,016,523,400
2	1996.0	# of Deals	585	671	604	775	2,635
3	NaN	Amount Invested	$2,437,964,900	$3,017,298,000	$2,606,841,300	$3,223,318,800	$11,285,423,000
4	1997.0	# of Deals	773	778	762	919	3,232