It is possible for pandas to directly read the data from xls .
In [2]:
import pandas as pd
import numpy as np
We are going to load the Company profile data
In [3]:
xl = pd.io.excel.read_excel('crunchbase_monthly_export_d43b44299ade53.xlsx', sheetname='Companies', index_col='permalink')
In [4]:
xl.info()
In [7]:
xl=xl[xl.founded_year >= 2000]
In [8]:
xl.info()
In [9]:
len(xl.market.value_counts())
Out[9]:
That is way too many. So lets look at the top 20.
In [10]:
xl.market.value_counts().head(20)
Out[10]:
Software seems to be a catch all phrase, when they couldn't place it in a specific category or when there are just way too many products produced by the company. The surprise is 'Biotechnology'. Who knew they were so many?
Ok, now let's create a graph!
In [11]:
from matplotlib import pyplot as plt
%matplotlib inline
In [12]:
from mpltools import style
style.use('ggplot')
In [13]:
xl.market.value_counts().head(30).plot(kind='bar', figsize=(15,8))
Out[13]:
In [14]:
xl.country_code.value_counts().head(15)
Out[14]:
In [15]:
xl.country_code.value_counts().head(15).plot(kind='bar', figsize=(15,8))
Out[15]:
In [16]:
xl.founded_year.value_counts(sort=False).tail(25).plot(kind='bar', figsize=(15,8))
Out[16]:
In [17]:
xl.status.value_counts()
Out[17]:
In [18]:
xl.status.value_counts()* 100/len(xl)
Out[18]:
In [19]:
year_gp = xl.groupby('founded_year')
In [20]:
type(year_gp)
Out[20]:
In [21]:
for year, group in year_gp:
print year
print group.info()
break
In [22]:
year_gp = xl[xl.founded_year>=2000].groupby('founded_year')
In [23]:
for year, group in year_gp:
print year, len(group)
We see that we get a single record for the year 2015. A startup getting funding before it is founded? Lets see what it is.
In [24]:
xl[xl.founded_year==2015]
Out[24]:
The company has had funding this year but the founded_year is set wrong. May be it is just bad data. Let us try to ignore this. So we need to add another criteria saying founded_year
should be less than 2015. but the above simple filtering syntax will not be enough. We need to create a mask.
In [25]:
mask = (xl.founded_year>=2000) & (xl.founded_year<2015)
In [26]:
year_gp = xl[(xl.founded_year>=2000) & (xl.founded_year<2015)].groupby('founded_year')
In [27]:
xl.status.value_counts()
Out[27]:
In [28]:
type(xl.status.value_counts())
Out[28]:
In [29]:
xl.status.value_counts().operating2
In [14]:
for year, group in year_gp:
print year, len(group)
print group.status.value_counts() *100/len(group)
In [30]:
index, closed, operating, acquired, unknown = [],[],[],[],[]
for year, group in year_gp:
index.append(year)
status = group.status.value_counts() *100/len(group)
operating.append(status.operating)
try:
acquired.append(status.acquired)
except AttributeError:
acquired.append(0)
try:
closed.append(status.closed )
except AttributeError:
closed.append(0)
unknown.append(100 - operating[-1] -acquired[-1] -closed[-1])
In [31]:
closed, operating, acquired, unknown = np.array(closed), np.array(operating), np.array(acquired), np.array(unknown)
In [111]:
width = 0.35
plt.figure(figsize=(15,5))
p1 = plt.bar(index, operating, width, color='g')
p2 = plt.bar(index, acquired, width, bottom=operating, color='b')
p3 = plt.bar(index, closed, width, bottom=acquired+operating, color='r')
p4 = plt.bar(index, unknown, width, bottom=closed+acquired+operating, color='y')
plt.ylabel('Success %')
plt.title('Percentage of Startup Success by Year')
#plt.xticks([int(i) for i in index])
#plt.yticks(xrange(0,101,10))
plt.legend( (p1[0], p2[0], p3[0], p4[0]), ('Operating', 'Acquired','Closed', 'Unknown'), bbox_to_anchor=(1.1, 1.05) )
Out[111]:
We can see that in 2000, almost 20% of the companies are acquired. We can also see that between 5-10% of the companies are closed, peaking around 2006-2008.
We are fortunate that the data has both founding date and first funding date. But there are a few unspecified values. So let us select only the records that have the founding date and funding date.
In [32]:
xl.info()
In [37]:
import datetime
first_funding_at = pd.to_datetime(xl.first_funding_at, dayfirst=True, errors='ignore')
print len(first_funding_at)
first_funding_at = first_funding_at[[isinstance(value, datetime.datetime) for index, value in first_funding_at.iteritems() ]]
print len(first_funding_at)
In [38]:
founded_at = pd.to_datetime(xl.founded_at, dayfirst=True)
founded_at = founded_at[[isinstance(value, datetime.datetime) for index, value in founded_at.iteritems()]]
In [39]:
daysdiff = first_funding_at - founded_at
daysdiff = daysdiff[pd.notnull(daysdiff)]
dayslist=[value.days for index, value in daysdiff.iteritems()]
In [40]:
plt.figure(figsize=(15,8))
graph = plt.hist(dayslist, bins=range(0, 6000, 365))
In [169]:
plt.figure(figsize=(15,8))
graph = plt.hist(dayslist, bins=[0, 30, 60, 90, 180, 365, 365+180, 2*365, 2*365 + 180, 3*365])
In [41]:
daysdiff[daysdiff<30]
Out[41]:
In [42]:
daysdiff[(daysdiff<30)&(daysdiff>0)]
Out[42]:
More than 5000 companies have acquired funds before starting. Zynga is one of them.
Now let us read in the investments data.
In [118]:
rounds = pd.io.excel.read_excel('crunchbase_monthly_export_d43b44299ade53.xlsx', sheetname='Rounds', index_col='funding_round_permalink')
In [44]:
rounds.info()
In [45]:
fund_type = rounds.funding_round_type.value_counts()
In [46]:
fund_type
Out[46]:
In [49]:
fund_type.plot(kind='pie', figsize=(10,10))
Out[49]:
In [50]:
recent_funding = rounds[rounds.funded_year >= 2000]
funding_sum = recent_funding[['funded_quarter','raised_amount_usd']].groupby('funded_quarter').sum()
In [51]:
funding_sum.plot(kind='bar', figsize=(15, 8))
Out[51]:
We can see that over all the funding keeps increasing, and there is more interest in Q1, peaking at Q2 and dipping in Q3 and Q4. So what is the story about 2006-Q3?
In [52]:
q3_2006 = recent_funding[recent_funding.funded_quarter=='2006-Q3'].sort("raised_amount_usd", ascending=False)
q3_2006[['company_name', 'company_market', 'raised_amount_usd']].head()
Out[52]:
In [53]:
venture_funding = rounds[(rounds.funded_year >= 2000) & (rounds.funding_round_type == 'venture')]
venture_sum = venture_funding[['funded_quarter','raised_amount_usd']].groupby('funded_quarter').sum()
venture_sum.plot(kind='bar', figsize=(15, 8))
Out[53]:
In [54]:
seed_funding = rounds[(rounds.funded_year >= 2000) & ((rounds.funding_round_type == 'seed') | (rounds.funding_round_type=='angel'))]
seed_sum = seed_funding[['funded_quarter','raised_amount_usd']].groupby('funded_quarter').sum()
seed_sum.plot(kind='bar', figsize=(15, 8))
Out[54]:
For this we need the next sheet of data.
In [55]:
investments = pd.io.excel.read_excel('crunchbase_monthly_export_d43b44299ade53.xlsx', sheetname='Investments', index_col='funding_round_permalink')
To see the behaviours of investment firms, let us group the data by investment name.
In [56]:
investments.info()
In [57]:
invest_gp = investments[['investor_permalink', 'raised_amount_usd']]\
.groupby('investor_permalink').sum()
In [60]:
type(invest_gp)
Out[60]:
In [62]:
invest_gp.sort('raised_amount_usd', ascending=False).head()
Out[62]:
In [70]:
investments[investments.investor_country_code == 'IND']
Out[70]:
In [76]:
invest_ind = investments[investments.company_country_code=='IND']\
[['investor_permalink', 'raised_amount_usd']]\
.groupby('investor_permalink').sum()
In [78]:
len(invest_ind)
Out[78]:
In [79]:
invest_us.sort('raised_amount_usd', ascending=False).head(10)
Out[79]:
In [92]:
cross_country_investing = investments[(investments.company_country_code != investments.investor_country_code) & pd.notnull(investments.company_country_code) & pd.notnull(investments.investor_country_code)]
cross_country_investing = cross_country_investing[['investor_country_code', 'company_country_code', 'company_permalink']]
cross_country_investing.info()
In [94]:
invest_count = cross_country_investing.groupby(['investor_country_code','company_country_code']).count()
In [98]:
invest_count.ix['IND']
Out[98]:
In [99]:
invest_count.ix['USA']
Out[99]:
In [101]:
invest_count.ix[('USA','IND')]
Out[101]:
In [111]:
investments[['raised_amount_usd','company_market']]\
.groupby('company_market')\
.sum()\
.sort('raised_amount_usd',ascending=False)
Out[111]:
In [113]:
investments[investments.company_country_code == 'IND']\
[['raised_amount_usd','company_market']]\
.groupby('company_market')\
.sum()\
.sort('raised_amount_usd',ascending=False).head(20)
Out[113]:
In [112]:
investments[(investments.company_country_code == 'IND') | (investments.funded_year >=2013)]\
[['raised_amount_usd','company_market']]\
.groupby('company_market')\
.sum()\
.sort('raised_amount_usd',ascending=False).head(20)
Out[112]:
In [114]:
investments.ix[1]
Out[114]:
We have to load yet another excel sheet to answer this question.
In [117]:
acquisitions = pd.io.excel.read_excel('crunchbase_monthly_export_d43b44299ade53.xlsx', sheetname='Acquisitions', index_col='company_permalink')
In [119]:
acquisitions.info()
In [120]:
acquisitions = acquisitions[acquisitions.acquired_year>=2000]
In [127]:
acquisitions[['acquirer_permalink','company_name']]\
.groupby("acquirer_permalink")\
.count()\
.sort('company_name',ascending=False)
Out[127]:
In [128]:
investments.info()
In [132]:
yearly_invest = investments[investments.funded_year>=2000][['funded_year','company_market']].groupby('funded_year')
In [142]:
index = []
from collections import defaultdict
plotpoints = defaultdict(dict)
for year, group in yearly_invest:
index.append(year)
rank = 1
for key, value in group.company_market.value_counts().head(10).iteritems():
plotpoints[key][year] = rank
rank = rank + 1
In [144]:
with plt.style.context('fivethirtyeight'):
plt.figure(figsize=(15,8))
plt.gca().invert_yaxis()
for market,points in plotpoints.iteritems():
values =[points.get(year, 11) for year in index]
plt.plot(index, values)
#plt.plot(x, np.sin(x) + x + np.random.randn(50))
#plt.plot(x, np.sin(x) + 0.5 * x + np.random.randn(50))
#plt.plot(x, np.sin(x) + 2 * x + np.random.randn(50))
In [ ]: