In [1]:
from tqdm import tqdm
import pandas as pd
from mdf_forge.forge import Forge
In [2]:
# Create the Forge client; the cells below run all of their searches through `mdf`.
mdf = Forge()
In [3]:
# Begin by searching for every dataset entry. Fewer than 10,000 exist at the moment, so a plain `search()` call is enough.
res = mdf.search("mdf.resource_type:dataset", advanced=True)

# For each dataset, collect its source_name, its title, and the number of records that match it.
mdf_resources = []
for dataset in tqdm(res):
    source_name = dataset["mdf"]["source_name"]
    record_query = "mdf.resource_type:record AND mdf.source_name:" + source_name
    # Only the returned info dict is used below; the result list itself is discarded.
    _records, query_info = mdf.search(record_query, advanced=True, info=True, limit=0)
    mdf_resources.append(
        (source_name, dataset["dc"]["titles"][0]["title"], query_info["total_query_matches"])
    )

# One row per dataset, built in a single step from the collected tuples.
df = pd.DataFrame(mdf_resources, columns=["source_name", "title", "num_records"])
In [4]:
# Last step: report how many datasets were found, then show the 15 with the most records.
print("Number of data resources: {n_datasets}".format(n_datasets=df.shape[0]))
largest_first = df.sort_values("num_records", ascending=False)
largest_first.head(15)
Out[4]:
In [5]:
# Bonus: the total number of records in MDF, summed over every dataset found above.
total_records = df["num_records"].sum()
total_records
Out[5]:
In [ ]: