This notebook plots how the number of datasets on datahub.io with a certain tag (e.g. "lod") or a certain resource format (e.g. "api/sparql") has developed over time.
In [1]:
# Display images inline and as SVG
%matplotlib inline
%config InlineBackend.figure_format = 'svg'
# Import the libraries
import json # For loading and converting JSON
import matplotlib.pyplot as plt # For plotting
import pandas as pd # For the statistical stuff
import seaborn as sns # For prettier plotting
import urllib2 # For loading the data from the API
In [2]:
def fetch_datahub(fq, start=0, rows=1000):
    """Query the datahub.io CKAN package_search API and return the parsed JSON.

    fq    -- CKAN filter query, e.g. 'tags:lod' or 'res_format:api%2Fsparql'
    start -- offset into the result set; the API returns at most `rows`
             packages per request, so larger result sets must be paged
    rows  -- page size (1000 appears to be the API maximum)
    """
    url = ('http://datahub.io/api/3/action/package_search'
           '?fq={0}&rows={1}&start={2}'.format(fq, rows, start))
    return json.load(urllib2.urlopen(url))

# The result set for tags:lod exceeds one page, so fetch two pages
data_lod_0000 = fetch_datahub('tags:lod', start=0)
data_lod_1000 = fetch_datahub('tags:lod', start=1000)
In [3]:
# Turn the loaded JSON payloads into Pandas dataframes.
# pd.json_normalize replaced pd.io.json.json_normalize in pandas 1.0;
# fall back for older installations.
try:
    json_normalize = pd.json_normalize
except AttributeError:
    json_normalize = pd.io.json.json_normalize
df_lod_0000 = json_normalize(data_lod_0000['result']['results'])
df_lod_1000 = json_normalize(data_lod_1000['result']['results'])
# Concatenate the two pages into a single dataframe
df_lod = pd.concat([df_lod_0000, df_lod_1000])
# Extract and sort the 'metadata_created' column in order to use it as an
# index.  The in-place Series.sort() used here originally was removed from
# pandas; sort_values() is the supported replacement and returns the
# sorted series.
dti_lod = pd.to_datetime(df_lod['metadata_created']).sort_values()
# Create a TimeSeries with the prepared index and set the value of each entry to 1
ts_lod = pd.Series(1, index=dti_lod)
# Count datasets per month (yearly = A, monthly = M).  The how='sum'
# keyword was removed from resample(); .resample(...).sum() is equivalent.
ts_lod = ts_lod.resample('M').sum()
# If there is no value for a given period, fill with 0
ts_lod = ts_lod.fillna(0)
# Use a PeriodIndex, equivalent to the removed kind='period' argument
ts_lod.index = ts_lod.index.to_period('M')
# Create a cumulated TimeSeries, print and plot it
ts_lod_cumsum = ts_lod.cumsum()
ts_lod_cumsum.plot()
ts_lod_cumsum
Out[3]:
In [4]:
# Fetch all datasets whose resources have the format 'api/sparql'
# (%2F is the URL-encoded '/').  Only the first page is requested;
# presumably the result set fits within the 1000-row cap — verify
# data_sparql['result']['count'] if that assumption breaks.
data_sparql = json.load(urllib2.urlopen('http://datahub.io/api/3/action/package_search?fq=res_format:api%2Fsparql&rows=1000&start=0'))
In [5]:
# Turn the loaded JSON payload into a Pandas dataframe.
# pd.json_normalize replaced pd.io.json.json_normalize in pandas 1.0;
# fall back for older installations.
try:
    json_normalize = pd.json_normalize
except AttributeError:
    json_normalize = pd.io.json.json_normalize
df_sparql = json_normalize(data_sparql['result']['results'])
# Extract and sort the 'metadata_created' column in order to use it as an
# index.  The in-place Series.sort() used here originally was removed from
# pandas; sort_values() is the supported replacement and returns the
# sorted series.
dti_sparql = pd.to_datetime(df_sparql['metadata_created']).sort_values()
# Create a TimeSeries with the prepared index and set the value of each entry to 1
ts_sparql = pd.Series(1, index=dti_sparql)
# Count datasets per month (yearly = A, monthly = M).  The how='sum'
# keyword was removed from resample(); .resample(...).sum() is equivalent.
ts_sparql = ts_sparql.resample('M').sum()
# If there is no value for a given period, fill with 0
ts_sparql = ts_sparql.fillna(0)
# Use a PeriodIndex, equivalent to the removed kind='period' argument
ts_sparql.index = ts_sparql.index.to_period('M')
# Create a cumulated TimeSeries, print and plot it
ts_sparql_cumsum = ts_sparql.cumsum()
ts_sparql_cumsum.plot()
ts_sparql_cumsum
Out[5]:
In [6]:
# Put the two cumulated TimeSeries side by side as columns; months that
# appear in only one of the series get 0 in the other
ts_combined_cumsum = pd.concat([ts_lod_cumsum, ts_sparql_cumsum], axis=1).fillna(0)
# Name the index (rendered as the x-axis label) and the columns (legend entries)
ts_combined_cumsum.index.name = 'Time'
ts_combined_cumsum.columns = ['LOD', 'SPARQL']
# Draw the combined plot and label the y-axis
ax = ts_combined_cumsum.plot()
ax.set_ylabel('Number of datasets')
# Anchor the legend in the upper-left corner instead of the default spot
ax.legend(loc='upper left')
# Left-align every x tick label
for tick_label in ax.get_xticklabels():
    tick_label.set_ha('left')
# Save the plot as PDF and PNG (funny file suffixes because of authorea.com figure workaround)
plt.savefig('../figures/datahubio_datasets.png.pdf')
plt.savefig('../figures/datahubio_datasets.pdf.png', dpi=200)