In [1]:
# Run api.py from the sibling opencontext directory inside this kernel.
# %run executes the file and leaves the names it defines, including the
# OpenContextAPI class used in the next cell, in the notebook namespace.
%run '../opencontext/api.py'

In [2]:
import os

import numpy as np
import pandas as pd

# Create the API client and set the prefix used for its local cache files.
oc_api = OpenContextAPI()
oc_api.set_cache_file_prefix('murlo-objs')

# Multiple non-numeric values on a record are handled with the 'concat'
# option by default. The three attributes named here are instead handled
# with the 'json' option, so their multiple values are expressed as JSON
# formatted strings.
oc_api.multi_value_handle_non_number = 'concat'
oc_api.multi_value_handle_keyed_attribs = dict.fromkeys(
    ('Motif', 'Decorative Technique', 'Fabric Category'),
    'json',
)

# Start from a clean slate: remove previously cached records.
oc_api.clear_api_cache()

# Search url for object records (prop=oc-gen-cat-object) from the Murlo
# project (proj=24-murlo) in Italy. The part after '#' is taken verbatim
# from the Open Context search page.
url = 'https://opencontext.org/subjects-search/Italy?proj=24-murlo&prop=oc-gen-cat-object#14/43.1610/11.3961/18/any/Google-Satellite'

# 'Standard' (linked data identified) attributes in use with the data
# at the url, as (slug, label) tuples.
stnd_attribs_tuples = oc_api.get_standard_attributes(url)

# Project-specific attributes for the same url, also (slug, label) tuples.
# NOTE(review): min_portion presumably is the minimum share of records that
# must use an attribute for it to be listed -- confirm in api.py.
proj_attribs_tuples = oc_api.get_common_attributes(url, min_portion=0.001)

# List every attribute found in this search / query result: the standard
# attributes first, then the project attributes. Each entry pairs the
# message template with the (slug, label) tuples it reports on.
report_plan = (
    ('Standard: {}, identified by slug: {}', stnd_attribs_tuples),
    ('Proj attribute: {}, identified by slug: {}', proj_attribs_tuples),
)
for template, attrib_pairs in report_plan:
    for slug, label in attrib_pairs:
        print(template.format(label, slug))


Standard: Temporal Coverage, identified by slug: dc-terms-temporal
Standard: Consists of, identified by slug: cidoc-crm-p45-consists-of
Standard: Has type, identified by slug: cidoc-crm-p2-has-type
Proj attribute: Fabric Category, identified by slug: 24-fabric-category
Proj attribute: Object Type, identified by slug: 24-object-type
Proj attribute: Director, identified by slug: 24-director
Proj attribute: Fragment Noted, identified by slug: 24-fragment-noted
Proj attribute: Has Related Trench Book Entry, identified by slug: 24-has-related-trench-book-entry
Proj attribute: Decorative Technique, identified by slug: 24-decorative-technique
Proj attribute: Conservation Material, identified by slug: 24-conservation-material
Proj attribute: Conservation Action, identified by slug: 24-conservation-action
Proj attribute: Vessel Form, identified by slug: 24-vessel-form
Proj attribute: Chronology, identified by slug: 24-chronology
Proj attribute: Motif, identified by slug: 24-motif
Proj attribute: Catalogued by, identified by slug: catalogued-by
Proj attribute: reference, identified by slug: 24-reference
Proj attribute: Conserved by, identified by slug: 24-conserved-by
Proj attribute: Record Type, identified by slug: 24-record-type
Proj attribute: Initially documented as, identified by slug: 24-initially-documented-as
Proj attribute: Has specialist record, identified by slug: 24-has-specialist-record
Proj attribute: absorbed by, identified by slug: 24-absorbed-by
Proj attribute: absorbs, identified by slug: 24-absorbs
Proj attribute: Vessel Part Present, identified by slug: 24-vessel-part-present
Proj attribute: joins, identified by slug: 24-joins
Proj attribute: Vessel Part, identified by slug: 24-vessel-part
Proj attribute: Comparanda Form, identified by slug: 24-comparanda-form
Proj attribute: Size, identified by slug: 24-size
Proj attribute: Object Type (notes), identified by slug: 24-object-type-notes
Proj attribute: Condition, identified by slug: 24-condition
Proj attribute: Fabric Description, identified by slug: 24-fabric-description
Proj attribute: Description, identified by slug: 24-description
Proj attribute: Object Master Grid, identified by slug: 24-object-master-grid
Proj attribute: Local Grid (Y), identified by slug: 24-local-grid-y
Proj attribute: Munsell Color, identified by slug: 24-munsell-color
Proj attribute: Catalog ID Note, identified by slug: 24-catalog-id-note
Proj attribute: Conservation People, identified by slug: 24-conservation-people
Proj attribute: Conservation Treatment Notes, identified by slug: 24-conservation-treatment-notes
Proj attribute: Object Coordinate Notes, identified by slug: 24-object-coordinate-notes
Proj attribute: Local Grid (X) (Note), identified by slug: 24-local-grid-x-note
Proj attribute: Trench, identified by slug: 24-trench
Proj attribute: Depth Notes, identified by slug: 24-depth-notes
Proj attribute: General Notes, identified by slug: 24-general-notes
Proj attribute: Grid Notes, identified by slug: 24-grid-notes
Proj attribute: Specialist Description Note, identified by slug: 24-specialist-description-note
Proj attribute: Supplemental Find Identification Note, identified by slug: 24-supplemental-find-identification-note
Proj attribute: Conserved in Group, identified by slug: 24-conserved-in-group
Proj attribute: Object Relationship Note, identified by slug: 24-object-relationship-note
Proj attribute: References, identified by slug: 24-references
Proj attribute: Supplement Note, identified by slug: 24-supplement-note
Proj attribute: Local Grid (X), identified by slug: 24-local-grid-x
Proj attribute: Conservation Treatment Year, identified by slug: 24-conservation-treatment-year
Proj attribute: Legacy Database Artifact ID, identified by slug: 24-legacy-database-artifact-id
Proj attribute: Year Excavated, identified by slug: 24-year-excavated-1
Proj attribute: Year Cataloged, identified by slug: 24-year-cataloged
Proj attribute: Page, identified by slug: 24-page
Proj attribute: Year, identified by slug: 24-year
Proj attribute: Elevation, identified by slug: 24-elevation
Proj attribute: Grid (X), identified by slug: 24-grid-x
Proj attribute: Grid (Y), identified by slug: 24-grid-y
Proj attribute: Grid X Uncertainty (+/- cm), identified by slug: 24-grid-x-uncertainty-cm
Proj attribute: Grid Y Uncertainty (+/- cm), identified by slug: 24-grid-y-uncertainty-cm
Proj attribute: Elevation Uncertainty (+/- cm), identified by slug: 24-elevation-uncertainty-cm
Proj attribute: Conservation Treatment Date, identified by slug: 24-conservation-treatment-date
Proj attribute: Date Cataloged, identified by slug: 24-date-cataloged

In [3]:
# Keep only the slug from each (slug, label) tuple, standard attributes
# first and project attributes after them.
attribs_for_records = [
    attrib_slug
    for attrib_slug, _label in stnd_attribs_tuples + proj_attribs_tuples
]

# Fetch the result records from Open Context into a dataframe, requesting
# the attributes listed above. The download is slow until improvements to
# Open Context's API are finished, but the results are cached as local
# files, which makes iterating on this notebook much less painful.
df = oc_api.url_to_dataframe(url, attribs_for_records)


Got records 12401 to 12584 of 12584 from: http://opencontext.org/subjects-search/Italy?attributes=dc-terms-temporal%2Ccidoc-crm-p45-consists-of%2Ccidoc-crm-p2-has-type%2C24-fabric-category%2C24-object-type%2C24-director%2C24-fragment-noted%2C24-has-related-trench-book-entry%2C24-decorative-technique%2C24-conservation-material%2C24-conservation-action%2C24-vessel-form%2C24-chronology%2C24-motif%2Ccatalogued-by%2C24-reference%2C24-conserved-by%2C24-record-type%2C24-initially-documented-as%2C24-has-specialist-record%2C24-absorbed-by%2C24-absorbs%2C24-vessel-part-present%2C24-joins%2C24-vessel-part%2C24-comparanda-form%2C24-size%2C24-object-type-notes%2C24-condition%2C24-fabric-description%2C24-description%2C24-object-master-grid%2C24-local-grid-y%2C24-munsell-color%2C24-catalog-id-note%2C24-conservation-people%2C24-conservation-treatment-notes%2C24-object-coordinate-notes%2C24-local-grid-x-note%2C24-trench%2C24-depth-notes%2C24-general-notes%2C24-grid-notes%2C24-specialist-description-note%2C24-supplemental-find-identification-note%2C24-conserved-in-group%2C24-object-relationship-note%2C24-references%2C24-supplement-note%2C24-local-grid-x%2C24-conservation-treatment-year%2C24-legacy-database-artifact-id%2C24-year-excavated-1%2C24-year-cataloged%2C24-page%2C24-year%2C24-elevation%2C24-grid-x%2C24-grid-y%2C24-grid-x-uncertainty-cm%2C24-grid-y-uncertainty-cm%2C24-elevation-uncertainty-cm%2C24-conservation-treatment-date%2C24-date-cataloged&proj=24-murlo&prop=oc-gen-cat-object&response=metadata%2Curi-meta&rows=200&start=12400

In this particular dataset, there are long (sometimes HTML) descriptions of objects. We're caching these locally in the JSON results from the API requests to Open Context. However, for our purposes of making analysis-friendly dataframes, we don't need these long free-text attributes, so we'll drop them from the dataframe.


In [7]:
# Long free-text columns that are not needed in the analysis-friendly table.
drop_cols = [
    'Fragment Noted',
    'Depth Notes',
    'Supplement Note',
    'Fabric Description',
    'Description',
    'Size',
]
# Rebind rather than drop in place, and tolerate columns that are already
# gone, so this cell can be re-run without raising a KeyError.
df = df.drop(columns=drop_cols, errors='ignore')
# The API returns 'False' if a citation URI is not defined, it's better
# practice to make this a null.
df.loc[(df['citation uri'] == False), 'citation uri'] = np.nan

# Now save the results of all of this as a CSV file in the 'files'
# directory that sits next to the current working directory.
repo_path = os.path.dirname(os.path.abspath(os.getcwd()))
csv_path = os.path.join(
    repo_path,
    'files',
    'oc-api-murlo-objects-multivalue-as-json.csv'
)
# Make sure the output directory exists (it may be absent in a fresh
# checkout); to_csv does not create missing directories.
os.makedirs(os.path.dirname(csv_path), exist_ok=True)
df.to_csv(csv_path, index=False)
print('Saved this example as a CSV table at: {}'.format(csv_path))


Saved this example as a CSV table at: /home/ekansa/github/open-context-jupyter/files/oc-api-murlo-objects-multivalue-as-json.csv

Using the already cached JSON obtained from the Open Context API, we can make a second dataframe that is "wider" (has many more columns). This wide dataframe will express multiple values for "Motif", "Decorative Technique", and "Fabric Category" in different columns. We set the dictionary oc_api.multi_value_handle_keyed_attribs to do this.


In [5]:
# Keep 'concat' for other non-numeric attributes, and switch the three
# multi-valued attributes to the 'column_val' option so that their values
# are spread over separate columns.
oc_api.multi_value_handle_non_number = 'concat'
oc_api.multi_value_handle_keyed_attribs = {
    attrib_label: 'column_val'
    for attrib_label in ('Motif', 'Decorative Technique', 'Fabric Category')
}

# Build the wide dataframe from the same query and attribute list.
df_wide = oc_api.url_to_dataframe(url, attribs_for_records)


Got records 12401 to 12584 of 12584 from: http://opencontext.org/subjects-search/Italy?attributes=dc-terms-temporal%2Ccidoc-crm-p45-consists-of%2Ccidoc-crm-p2-has-type%2C24-fabric-category%2C24-object-type%2C24-director%2C24-fragment-noted%2C24-has-related-trench-book-entry%2C24-decorative-technique%2C24-conservation-material%2C24-conservation-action%2C24-vessel-form%2C24-chronology%2C24-motif%2Ccatalogued-by%2C24-reference%2C24-conserved-by%2C24-record-type%2C24-initially-documented-as%2C24-has-specialist-record%2C24-absorbed-by%2C24-absorbs%2C24-vessel-part-present%2C24-joins%2C24-vessel-part%2C24-comparanda-form%2C24-size%2C24-object-type-notes%2C24-condition%2C24-fabric-description%2C24-description%2C24-object-master-grid%2C24-local-grid-y%2C24-munsell-color%2C24-catalog-id-note%2C24-conservation-people%2C24-conservation-treatment-notes%2C24-object-coordinate-notes%2C24-local-grid-x-note%2C24-trench%2C24-depth-notes%2C24-general-notes%2C24-grid-notes%2C24-specialist-description-note%2C24-supplemental-find-identification-note%2C24-conserved-in-group%2C24-object-relationship-note%2C24-references%2C24-supplement-note%2C24-local-grid-x%2C24-conservation-treatment-year%2C24-legacy-database-artifact-id%2C24-year-excavated-1%2C24-year-cataloged%2C24-page%2C24-year%2C24-elevation%2C24-grid-x%2C24-grid-y%2C24-grid-x-uncertainty-cm%2C24-grid-y-uncertainty-cm%2C24-elevation-uncertainty-cm%2C24-conservation-treatment-date%2C24-date-cataloged&proj=24-murlo&prop=oc-gen-cat-object&response=metadata%2Curi-meta&rows=200&start=12400

The df_wide dataframe handles multiple values for some attributes by making many boolean columns, with each column noting the presence of a given attribute value on a row for an artifact. For example, True values in the column "Motif :: Panther" indicate the presence of a "Panther" motif observed on an artifact, and True values in the column "Motif :: Potnia Theron" indicate a "Potnia Theron" motif on an artifact.


In [6]:
# Remove the long free-text columns listed in drop_cols. Rebind rather than
# drop in place, and tolerate columns that are already gone, so this cell
# can be re-run without raising a KeyError.
df_wide = df_wide.drop(columns=drop_cols, errors='ignore')
# The API returns 'False' if a citation URI is not defined, it's better
# practice to make this a null.
df_wide.loc[(df_wide['citation uri'] == False), 'citation uri'] = np.nan
csv_wide_path = os.path.join(
    repo_path,
    'files',
    'oc-api-murlo-objects-multivalue-as-cols.csv'
)
# Make sure the output directory exists (it may be absent in a fresh
# checkout); to_csv does not create missing directories.
os.makedirs(os.path.dirname(csv_wide_path), exist_ok=True)
df_wide.to_csv(csv_wide_path, index=False)
print('Saved this example wide as a CSV table at: {}'.format(csv_wide_path))


Saved this example wide as a CSV table at: /home/ekansa/github/open-context-jupyter/files/oc-api-murlo-objects-multivalue-as-cols.csv