CouchDB Data Sampler notebook

This notebook provides a base for creating random samples of the documents in a CouchDB collection. For each run it creates a new database named `ucldc_samples_from_<cid>_<date>` in CouchDB (where `<cid>` is the collection id and `<date>` is the date of the run) for use in QA.


In [1]:
from __future__ import print_function
import os
import json
import random
import datetime
import couchdb

In [2]:
# Configuration for the sampler.
# Id of the collection to sample; used as the key when querying the view.
collection_id = '1678' # NOTE: must be a string, not a number, even though the value looks numeric
# Number of documents to draw for the sample.
sample_size = 3
# Name pattern for the sample database; create_samples_db fills the two
# slots with the collection id and today's date, in that order.
SAMPLE_DB_NAME_TEMPLATE = 'ucldc_samples_from_{}_{}'

In [3]:
# NOTE: the view queries made later in this notebook return the same results as
# https://54.84.142.143/couchdb/ucldc/_design/all_provider_docs/_view/by_provider_name?key="1678"
# NOTE(review): Server() is called without a URL, so the library's default
# address is used -- presumably a local or tunnelled CouchDB; confirm it is
# the same instance as the URL above.
server = couchdb.Server()
# Admin credentials; the password is read from the environment, so this line
# raises KeyError if COUCHDB_PASSWORD is not set.
server.resource.credentials = ('admin', os.environ['COUCHDB_PASSWORD'])
# Handle on the source database that the samples are drawn from.
cdb = server['ucldc']

In [4]:
def get_collection_ids(couchdb, collection_id):
    """Return the ids of all documents that belong to one collection.

    Queries the ``all_provider_docs/by_provider_name`` view of the given
    database with ``key=collection_id`` and collects the ``id`` of every
    row returned.

    Args:
        couchdb: database object to query; anything exposing a
            ``view(name, **options)`` method works. NOTE: inside this
            function the name shadows the imported ``couchdb`` module; it
            is kept unchanged so existing callers are not broken.
        collection_id: view key identifying the collection (the notebook
            passes it as a string, e.g. ``'1678'``).

    Returns:
        list of document ids, in the order the view yields its rows.
    """
    # Query the database that was passed in. Previously the argument was
    # ignored and the module-level ``cdb`` was always used, so passing any
    # other database silently had no effect.
    rows = couchdb.view('all_provider_docs/by_provider_name',
                        key=collection_id)
    return [row['id'] for row in rows]

In [5]:
def get_sample_ids(collection_ids, sample_size):
    """Pick a random set of distinct document ids from a collection.

    NOTE: Each call draws a different sample unless the ``random`` module
    has been seeded. Remember that notebooks cache results!

    Args:
        collection_ids: sequence of candidate document ids; duplicates are
            allowed and are only counted once.
        sample_size: number of distinct ids wanted.

    Returns:
        set of ids drawn without replacement. It holds ``sample_size`` ids
        when that many distinct ids are available; otherwise it holds every
        distinct id. It is empty for an empty population or a non-positive
        ``sample_size``.
    """
    # De-duplicate while keeping the incoming order, so a seeded run is
    # reproducible regardless of set/hash ordering.
    seen = set()
    unique_ids = []
    for doc_id in collection_ids:
        if doc_id not in seen:
            seen.add(doc_id)
            unique_ids.append(doc_id)
    # Cap the request at what is available. The previous rejection loop
    # never terminated when sample_size exceeded the number of distinct ids,
    # and raised ValueError from randint(0, -1) on an empty population.
    draw_count = max(0, min(sample_size, len(unique_ids)))
    return set(random.sample(unique_ids, draw_count))

In [6]:
def create_samples_db(sourcedb, collection_id, sample_size=3):
    """Copy a random sample of one collection's documents into a new database.

    The new database is created on the module-level ``server`` and is named
    by filling ``SAMPLE_DB_NAME_TEMPLATE`` with the collection id and
    today's date.

    Args:
        sourcedb: database the collection's documents are read from.
        collection_id: id of the collection to sample.
        sample_size: number of documents to copy (default 3).

    Returns:
        A human-readable message naming the created database and the ids of
        the documents copied into it.
    """
    # Choose which documents to copy before touching the server.
    chosen_ids = get_sample_ids(
        get_collection_ids(sourcedb, collection_id), sample_size)
    target_name = SAMPLE_DB_NAME_TEMPLATE.format(
        collection_id, datetime.date.today())
    # NOTE(review): presumably fails if a database with this name already
    # exists (e.g. a second run on the same day) -- confirm.
    target_db = server.create(target_name)
    for chosen_id in chosen_ids:
        target_db[chosen_id] = sourcedb[chosen_id]
    return 'Created db {} with docs {}'.format(target_db, chosen_ids)

In [7]:
# Sample the collection selected in the configuration cell, so that editing
# `collection_id` / `sample_size` there actually changes what is sampled
# (previously the id was hard-coded here and the config values were unused).
print(create_samples_db(cdb, collection_id, sample_size=sample_size))


Created db <Database 'ucldc_samples_from_1678_2014-12-17'> with docs set([u'1678--http://ark.cdlib.org/ark:/13030/tf9w10119r', u'1678--http://ark.cdlib.org/ark:/13030/tf138nb26t', u'1678--http://ark.cdlib.org/ark:/13030/tf400007gw'])

In [8]:
# Draw a larger sample (50 documents) from collection '26094' into its own
# new sample database. Takes a while: big collections will be slow.
print(create_samples_db(cdb, '26094', sample_size=50))


Created db <Database 'ucldc_samples_from_26094_2014-12-17'> with docs set([u'26094--LAPL00070090', u'26094--00084182', u'26094--LAPL00053978', u'26094--00025816', u'26094--LAPL00028451', u'26094--LAPL00037037', u'26094--LAPL00055687', u'26094--LAPL00000582', u'26094--00090992', u'26094--LAPL00005454', u'26094--00097011', u'26094--LAPL00027857', u'26094--LAPL00005744', u'26094--LAPL00038956', u'26094--LAPL00007966', u'26094--00097852', u'26094--LAPL00038307', u'26094--00100899', u'26094--LAPL00000824', u'26094--LAPL00064236', u'26094--00079228', u'26094--LAPL00083467', u'26094--LAPL00049524', u'26094--LAPL00036890', u'26094--LAPL00042577', u'26094--LAPL00020854', u'26094--00003281', u'26094--LAPL00040203', u'26094--00094225', u'26094--LAPL00022400', u'26094--00058528', u'26094--LAPL00021635', u'26094--00097775', u'26094--00103225', u'26094--00100460', u'26094--LAPL00035762', u'26094--LAPL00074876', u'26094--00089516', u'26094--LAPL00055414', u'26094--LAPL00057642', u'26094--LAPL00035802', u'26094--LAPL00040181', u'26094--LAPL00084340', u'26094--00092013', u'26094--LAPL00056018', u'26094--00093407', u'26094--001012807', u'26094--LAPL00061860', u'26094--LAPL00008706', u'26094--00084752'])

In [ ]: