In [ ]:
collaboration_data = pd.DataFrame(columns=['Year', 'Country', 'Internal'])
i = 0
for year, graph in zip(years, graphs):
for country in graph.nodes():
counts = Counter()
total = sum([graph[country][neighbor]['weight']
for neighbor in graph.neighbors(country)])
external = sum([graph[country][neighbor]['weight']
for neighbor in graph.neighbors(country)
if neighbor != country])
internal = 1. - (external/total)
if np.isnan(internal):
continue
collaboration_data.loc[i] = [year, country, internal]
i += 1
In [ ]:
plt.scatter(collaboration_data.Year, collaboration_data.Internal, alpha=0.25)
plt.boxplot([collaboration_data[collaboration_data.Year == year].Internal for year in years], positions=years)
plt.ylabel('Internal collaborations')
plt.show()
In [ ]:
import pymc
rates = pymc.Container([pymc.Gamma('rate_%i' % year,
alpha=1.,
beta=1.,
value=5.)
for year in years])
@pymc.deterministic
def rate(rates=rates):
return np.array([rates[years.index(year)]
for year in collaboration_data.Year])
internal = pymc.Gamma('internal',
alpha=1., beta=rate,
value=collaboration_data.Internal,
observed=True)
In [ ]:
model = pymc.Model({
'rates': rates,
'internal': internal
})
In [ ]:
M = pymc.MCMC(model)
M.sample(10000, burn=200, thin=20)
In [ ]:
plt.figure(figsize=(10, 4))
plt.scatter(collaboration_data.Year, collaboration_data.Internal, alpha=0.25)
# Plot the 95% credible interval as box/whiskers.
plt.boxplot([1./dist.trace()[:] for dist in M.rates],
positions=years,
whis=[2.5, 97.5],
showfliers=False)
plt.ylim(0, 1.0)
plt.legend(loc='best')
plt.show()