In [7]:
# You should be running python3
import sys
print(sys.version)
In [8]:
import pandas as pd # http://pandas.pydata.org/
import numpy as np # http://numpy.org/
import statsmodels.api as sm # http://statsmodels.sourceforge.net/stable/index.html
import statsmodels.formula.api as smf
import statsmodels
print("Pandas Version: {}".format(pd.__version__)) # pandas version
print("StatsModels Version: {}".format(statsmodels.__version__)) # StatsModels version
In [9]:
# load the pwc dataset from azure
frame = pd.read_csv('pwc_moneytree.csv')
In [10]:
frame.head()
Out[10]:
In [11]:
del frame['Grand Total']
frame.columns = ['year', 'type', 'q1', 'q2', 'q3', 'q4']
frame['year'] = frame['year'].fillna(method='ffill')
frame.head()
Out[11]:
Deals and investments are in alternating rows of frame, let's separate them
In [12]:
deals_df = frame.iloc[0::2]
investments_df = frame.iloc[1::2]
In [13]:
# once separated, 'type' field is identical within each df
# let's delete it
del deals_df['type']
del investments_df['type']
In [14]:
deals_df.head()
Out[14]:
In [15]:
investments_df.head()
Out[15]:
In [16]:
def unstack_to_series(df):
"""
Takes q1-q4 in a dataframe and converts it to a series
input: a dataframe containing ['q1', 'q2', 'q3', 'q4']
ouput: a pandas series
"""
quarters = ['q1', 'q2', 'q3', 'q4']
d = dict()
for i, row in df.iterrows():
for q in quarters:
key = str(int(row['year'])) + q
d[key] = row[q]
# print(key, q, row[q])
return pd.Series(d)
In [17]:
deals = unstack_to_series(deals_df ).dropna()
investments = unstack_to_series(investments_df).dropna()
In [18]:
def string_to_int(money_string):
numerals = [c if c.isnumeric() else '' for c in money_string]
return int(''.join(numerals))
In [19]:
# convert deals from string to integers
deals = deals.apply(string_to_int)
deals.tail()
Out[19]:
In [20]:
# investment in billions USD
# converts to integers - which is ok, since data is in dollars
investments_b = investments.apply(string_to_int)
# in python3 division automatically converts numbers to floats, we don't loose precicion
investments_b = investments_b / 10**9
# round data to 2 decimals
investments_b = investments_b.apply(round, ndigits=2)
investments_b.tail()
Out[20]:
In [23]:
import matplotlib.pyplot as plt # http://matplotlib.org/
import matplotlib.patches as mpatches
import matplotlib.ticker as ticker
%matplotlib inline
# change matplotlib inline display size
# import matplotlib.pylab as pylab
# pylab.rcParams['figure.figsize'] = (8, 6) # that's default image size for this interactive session
fig, ax1 = plt.subplots()
ax1.set_title("VC historical trend (US Data)")
t = range(len(investments_b)) # need to substitute tickers for years later
width = t[1]-t[0]
y1 = investments_b
# create filled step chart for investment amount
ax1.bar(t, y1, width=width, facecolor='0.80', edgecolor='', label = 'Investment ($ Bln.)')
ax1.set_ylabel('Investment ($ Bln.)')
# set up xlabels with years
years = [str(year)[:-2] for year in deals.index][::4] # get years without quarter
ax1.set_xticks(t[::4]) # set 1 tick per year
ax1.set_xticklabels(years, rotation=50) # set tick names
ax1.set_xlabel('Year') # name X axis
# format Y1 tickers to $ billions
formatter = ticker.FormatStrFormatter('$%1.0f Bil.')
ax1.yaxis.set_major_formatter(formatter)
for tick in ax1.yaxis.get_major_ticks():
tick.label1On = False
tick.label2On = True
# create second Y2 axis for Num of Deals
ax2 = ax1.twinx()
y2 = deals
ax2.plot(t, y2, color = 'k', ls = '-', label = 'Num. of Deals')
ax2.set_ylabel('Num. of Deals')
# add annotation bubbles
ax2.annotate('1997-2000 dot-com bubble', xy=(23, 2100), xytext=(3, 1800),
bbox=dict(boxstyle="round4", fc="w"),
arrowprops=dict(arrowstyle="-|>",
connectionstyle="arc3,rad=0.2",
fc="w"),
)
ax2.annotate('2007-08 Financial Crisis', xy=(57, 800), xytext=(40, 1300),
bbox=dict(boxstyle="round4", fc="w"),
arrowprops=dict(arrowstyle="-|>",
connectionstyle="arc3,rad=-0.2",
fc="w"),
)
# add legend
ax1.legend(loc="best")
ax2.legend(bbox_to_anchor=(0.95, 0.88))
fig.tight_layout() # solves cropping problems when saving png
fig.savefig('vc_trend_3.png', dpi=250)
plt.show()
In [22]:
# load countries dataset from azure
ds = ws.datasets['country_data.csv']
In [ ]:
# data for 2015
country_data = ds.to_dataframe()
country_data
In [ ]:
def tex(df):
"""
Print dataframe contents in latex-ready format
"""
for line in df.to_latex().split('\n'):
print(line)
In [ ]:
params = pd.DataFrame(country_data['Criteria'])
params.index = ['y'] + ['X'+str(i) for i in range(1, len(country_data))]
tex(params)
In [ ]:
# set index
country_data = country_data.set_index('Criteria')
# convert values to floats (note: comas need to be replaced by dots for python conversion to work)
country_data.index = ['y'] + ['X'+str(i) for i in range(1, len(country_data))]
country_data
In [ ]:
const = pd.Series([1]*len(country_data.columns), index = country_data.columns, name = 'X0')
const
In [ ]:
country_data = pd.concat([pd.DataFrame(const).T, country_data])
country_data = country_data.sort_index()
country_data
In [ ]:
tex(country_data)
In [ ]:
y = country_data.iloc[-1,:]
y
In [ ]:
X = country_data.iloc[:-1, :].T
X
In [ ]:
# Fit regression model
results = sm.OLS(y, X).fit()
# Inspect the results in latex doc, {tab:vc_ols_1}
print(results.summary())
In [ ]:
# Inspect the results in latex doc, {tab:vc_ols_1}
print(results.summary().as_latex())
In [ ]:
# equation for eq:OLS_1_coeffs in LaTeX
equation = 'Y ='
for i, coeff in results.params.iteritems():
sign = '+' if coeff >= 0 else '-'
equation += ' ' + sign + str(abs(round(coeff,2))) + '*' + i
print(equation)
In [ ]:
# correlation table
corr = country_data.T.corr().iloc[1:,1:]
corr = corr.applymap(lambda x: round(x, 2))
corr
In [ ]:
# corr table to latex
tex(corr)
In [ ]:
import itertools
# set of unique parameter pairs
pairs = set([frozenset(pair) for pair in itertools.permutations(list(corr.index), 2)])
for pair in pairs:
pair = sorted(list(pair))
corr_pair = corr.loc[pair[0],pair[1]]
if corr_pair > 0.7:
print(pair, round(corr_pair, 2))
print('-'*40)
In [ ]:
print('a')
In [ ]:
for i in corr.columns:
for j in corr.columns:
if abs(corr.loc[i, j]) > 0.7 and i != j:
print(i+'~'+j, corr.loc[i, j])
In [ ]: