In [9]:
from __future__ import print_function
import os
import sys

In [10]:
# Echo the SparkContext to confirm one is available in this kernel.
# NOTE(review): `sc` and `sqlContext` are not created anywhere in this
# notebook, so they are presumably injected by the PySpark kernel/launcher -
# confirm for your environment. The commented-out lines below show how to
# construct them by hand when they are not provided.
#sc = SparkContext()
#sqlContext = SQLContext(sc)
sc


Out[10]:
<pyspark.context.SparkContext at 0x7fb4f473c7f0>

In [11]:
# Location of the iDigBio media-record snapshot used for the comparison below.
media_parquet_path = "/guoda/data/idigbio-media-20170611T154222.parquet"

# No explicit format is given, so the reader's default data source is used.
df = sqlContext.read.load(media_parquet_path)

In [12]:
# Print the total number of rows explicitly (only the cell's last expression
# is echoed automatically), then show the first two rows as a sample.
total_rows = df.count()
print(total_rows)
df.head(2)


21241288
Out[12]:
[Row(accessuri='http://www.pnwherbaria.org/images/jpeg.php?Image=SRP052949.jpg', data=Row(Iptc4xmpExt:CountryCode=None, Iptc4xmpExt:CountryName=None, Iptc4xmpExt:ProvinceState=None, Iptc4xmpExt:WorldRegion=None, ac:accessURI='http://www.pnwherbaria.org/images/jpeg.php?Image=SRP052949.jpg', ac:associatedSpecimenReference=None, ac:attributionLogoURL=None, ac:bestQualityAccessURI=None, ac:bestQualityFormat=None, ac:caption=None, ac:captureDevice=None, ac:comments=None, ac:digitizationDate='2015-10-13T15:48:52-0800', ac:furtherInformationURL=None, ac:goodQualityAccessURI=None, ac:hashFunction=None, ac:hashValue=None, ac:licenseLogoURL='https://i.creativecommons.org/l/by-nc-sa/3.0/us/80x15.png', ac:metadataCreator=None, ac:metadataLanguage='en', ac:metadataLanguageLiteral=None, ac:metadataProvider=None, ac:metadataProviderLiteral=None, ac:provider=None, ac:providerID=None, ac:providerLiteral=None, ac:providerManagedID=None, ac:resourceCreationTechnique=None, ac:serviceExpectation=None, ac:subjectCategoryVocabulary=None, ac:subjectOrientation=None, ac:subjectPart=None, ac:subtype='Photograph', ac:subtypeLiteral=None, ac:tag=None, ac:taxonCoverage='Caprifoliaceae', ac:thumbnailAccessURI=None, ac:variant='ac:Best Quality', ac:variantLiteral=None, coreid='2706948', dc:creator=None, dc:format='image/jpeg', dc:language=None, dc:rights=None, dc:source=None, dc:type='StillImage', dcterms:accessRights=None, dcterms:available=None, dcterms:created=None, dcterms:creator='Jim Smith', dcterms:description=None, dcterms:format=None, dcterms:identifier='61d3ce8d-f2cd-41c9-a026-baec4b3f496e', dcterms:language=None, dcterms:license=None, dcterms:modified='2015-11-03T20:03:39-0800', dcterms:publisher=None, dcterms:references=None, dcterms:rights='© 2017 Boise State University, Snake River Plains Herbarium', dcterms:rightsHolder=None, dcterms:source=None, dcterms:title='SRP 52949', dcterms:type=None, dwc:basisOfRecord=None, dwc:catalogNumber=None, dwc:collectionCode=None, 
dwc:collectionID=None, dwc:coordinateUncertaintyInMeters=None, dwc:country=None, dwc:county=None, dwc:day=None, dwc:decimalLatitude=None, dwc:decimalLongitude=None, dwc:eventDate=None, dwc:family=None, dwc:genus=None, dwc:geodeticDatum=None, dwc:georeferencedBy=None, dwc:habitat=None, dwc:identificationQualifier=None, dwc:institutionCode=None, dwc:kingdom=None, dwc:locality=None, dwc:month=None, dwc:municipality=None, dwc:occurrenceID=None, dwc:occurrenceRemarks=None, dwc:order=None, dwc:otherCatalogNumbers=None, dwc:phylum=None, dwc:recordedBy=None, dwc:scientificName='Symphoricarpos oreophilus', dwc:scientificNameAuthorship=None, dwc:specificEpithet=None, dwc:startDayOfYear=None, dwc:stateProvince=None, dwc:year=None, exif:PixelXDimension=None, exif:PixelYDimension=None, id=None, idigbio:OriginalFileName=None, idigbio:associatedRecordReference=None, idigbio:associatedRecordsetReference=None, idigbio:mediaStatus=None, idigbio:mediaStatusDate=None, idigbio:recordId=None, photoshop:Credit=None, symbiota:recordEnteredBy=None, xmp:CreateDate=None, xmp:MetadataDate=None, xmpRights:Owner='Boise State University, Snake River Plains Herbarium', xmpRights:UsageTerms='http://creativecommons.org/licenses/by-nc-sa/3.0/us/', xmpRights:WebStatement='http://creativecommons.org/licenses/by-nc-sa/3.0/us/'), datemodified=datetime.datetime(2017, 2, 10, 10, 13, 49, 421000), dqs=-1.2727272510528564, etag='b6b5505d344990c158a4b5cae9818fb36b261ff5', format='image/jpeg', hasSpecimen=True, licenselogourl=None, mediatype='images', modified=datetime.datetime(2015, 11, 3, 23, 3, 39), recordset='f84d528a-7d08-467e-b532-ace707316f1d', rights=None, tag=None, type='stillimage', uuid='5e915b8f-67d0-4e66-9bca-bbee0d94b968', version=None, webstatement=None, xpixels=None, ypixels=None),
 Row(accessuri='http://www.pnwherbaria.org/images/jpeg.php?Image=SRP052950.jpg', data=Row(Iptc4xmpExt:CountryCode=None, Iptc4xmpExt:CountryName=None, Iptc4xmpExt:ProvinceState=None, Iptc4xmpExt:WorldRegion=None, ac:accessURI='http://www.pnwherbaria.org/images/jpeg.php?Image=SRP052950.jpg', ac:associatedSpecimenReference=None, ac:attributionLogoURL=None, ac:bestQualityAccessURI=None, ac:bestQualityFormat=None, ac:caption=None, ac:captureDevice=None, ac:comments=None, ac:digitizationDate='2015-10-13T15:48:32-0800', ac:furtherInformationURL=None, ac:goodQualityAccessURI=None, ac:hashFunction=None, ac:hashValue=None, ac:licenseLogoURL='https://i.creativecommons.org/l/by-nc-sa/3.0/us/80x15.png', ac:metadataCreator=None, ac:metadataLanguage='en', ac:metadataLanguageLiteral=None, ac:metadataProvider=None, ac:metadataProviderLiteral=None, ac:provider=None, ac:providerID=None, ac:providerLiteral=None, ac:providerManagedID=None, ac:resourceCreationTechnique=None, ac:serviceExpectation=None, ac:subjectCategoryVocabulary=None, ac:subjectOrientation=None, ac:subjectPart=None, ac:subtype='Photograph', ac:subtypeLiteral=None, ac:tag=None, ac:taxonCoverage='Caprifoliaceae', ac:thumbnailAccessURI=None, ac:variant='ac:Best Quality', ac:variantLiteral=None, coreid='2706949', dc:creator=None, dc:format='image/jpeg', dc:language=None, dc:rights=None, dc:source=None, dc:type='StillImage', dcterms:accessRights=None, dcterms:available=None, dcterms:created=None, dcterms:creator='Jim Smith', dcterms:description=None, dcterms:format=None, dcterms:identifier='a9994d14-dad5-4348-aa76-84626a00bd86', dcterms:language=None, dcterms:license=None, dcterms:modified='2015-11-03T20:05:07-0800', dcterms:publisher=None, dcterms:references=None, dcterms:rights='© 2017 Boise State University, Snake River Plains Herbarium', dcterms:rightsHolder=None, dcterms:source=None, dcterms:title='SRP 52950', dcterms:type=None, dwc:basisOfRecord=None, dwc:catalogNumber=None, dwc:collectionCode=None, 
dwc:collectionID=None, dwc:coordinateUncertaintyInMeters=None, dwc:country=None, dwc:county=None, dwc:day=None, dwc:decimalLatitude=None, dwc:decimalLongitude=None, dwc:eventDate=None, dwc:family=None, dwc:genus=None, dwc:geodeticDatum=None, dwc:georeferencedBy=None, dwc:habitat=None, dwc:identificationQualifier=None, dwc:institutionCode=None, dwc:kingdom=None, dwc:locality=None, dwc:month=None, dwc:municipality=None, dwc:occurrenceID=None, dwc:occurrenceRemarks=None, dwc:order=None, dwc:otherCatalogNumbers=None, dwc:phylum=None, dwc:recordedBy=None, dwc:scientificName='Symphoricarpos oreophilus', dwc:scientificNameAuthorship=None, dwc:specificEpithet=None, dwc:startDayOfYear=None, dwc:stateProvince=None, dwc:year=None, exif:PixelXDimension=None, exif:PixelYDimension=None, id=None, idigbio:OriginalFileName=None, idigbio:associatedRecordReference=None, idigbio:associatedRecordsetReference=None, idigbio:mediaStatus=None, idigbio:mediaStatusDate=None, idigbio:recordId=None, photoshop:Credit=None, symbiota:recordEnteredBy=None, xmp:CreateDate=None, xmp:MetadataDate=None, xmpRights:Owner='Boise State University, Snake River Plains Herbarium', xmpRights:UsageTerms='http://creativecommons.org/licenses/by-nc-sa/3.0/us/', xmpRights:WebStatement='http://creativecommons.org/licenses/by-nc-sa/3.0/us/'), datemodified=datetime.datetime(2017, 2, 10, 10, 13, 49, 421000), dqs=-1.2727272510528564, etag='ca3399289363cf9451127de936bbec1c13ae9e7d', format='image/jpeg', hasSpecimen=True, licenselogourl=None, mediatype='images', modified=datetime.datetime(2015, 11, 3, 23, 5, 7), recordset='f84d528a-7d08-467e-b532-ace707316f1d', rights=None, tag=None, type='stillimage', uuid='9fc60d4a-845e-48eb-9e1e-61ea6d7fde04', version=None, webstatement=None, xpixels=None, ypixels=None)]

In [13]:
# Print the DataFrame's schema as a tree; the field names and types shown
# here are what the field-set comparison further down is built from.
df.printSchema()


root
 |-- accessuri: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- Iptc4xmpExt:CountryCode: string (nullable = true)
 |    |-- Iptc4xmpExt:CountryName: string (nullable = true)
 |    |-- Iptc4xmpExt:ProvinceState: string (nullable = true)
 |    |-- Iptc4xmpExt:WorldRegion: string (nullable = true)
 |    |-- ac:accessURI: string (nullable = true)
 |    |-- ac:associatedSpecimenReference: string (nullable = true)
 |    |-- ac:attributionLogoURL: string (nullable = true)
 |    |-- ac:bestQualityAccessURI: string (nullable = true)
 |    |-- ac:bestQualityFormat: string (nullable = true)
 |    |-- ac:caption: string (nullable = true)
 |    |-- ac:captureDevice: string (nullable = true)
 |    |-- ac:comments: string (nullable = true)
 |    |-- ac:digitizationDate: string (nullable = true)
 |    |-- ac:furtherInformationURL: string (nullable = true)
 |    |-- ac:goodQualityAccessURI: string (nullable = true)
 |    |-- ac:hashFunction: string (nullable = true)
 |    |-- ac:hashValue: string (nullable = true)
 |    |-- ac:licenseLogoURL: string (nullable = true)
 |    |-- ac:metadataCreator: string (nullable = true)
 |    |-- ac:metadataLanguage: string (nullable = true)
 |    |-- ac:metadataLanguageLiteral: string (nullable = true)
 |    |-- ac:metadataProvider: string (nullable = true)
 |    |-- ac:metadataProviderLiteral: string (nullable = true)
 |    |-- ac:provider: string (nullable = true)
 |    |-- ac:providerID: string (nullable = true)
 |    |-- ac:providerLiteral: string (nullable = true)
 |    |-- ac:providerManagedID: string (nullable = true)
 |    |-- ac:resourceCreationTechnique: string (nullable = true)
 |    |-- ac:serviceExpectation: string (nullable = true)
 |    |-- ac:subjectCategoryVocabulary: string (nullable = true)
 |    |-- ac:subjectOrientation: string (nullable = true)
 |    |-- ac:subjectPart: string (nullable = true)
 |    |-- ac:subtype: string (nullable = true)
 |    |-- ac:subtypeLiteral: string (nullable = true)
 |    |-- ac:tag: string (nullable = true)
 |    |-- ac:taxonCoverage: string (nullable = true)
 |    |-- ac:thumbnailAccessURI: string (nullable = true)
 |    |-- ac:variant: string (nullable = true)
 |    |-- ac:variantLiteral: string (nullable = true)
 |    |-- coreid: string (nullable = true)
 |    |-- dc:creator: string (nullable = true)
 |    |-- dc:format: string (nullable = true)
 |    |-- dc:language: string (nullable = true)
 |    |-- dc:rights: string (nullable = true)
 |    |-- dc:source: string (nullable = true)
 |    |-- dc:type: string (nullable = true)
 |    |-- dcterms:accessRights: string (nullable = true)
 |    |-- dcterms:available: string (nullable = true)
 |    |-- dcterms:created: string (nullable = true)
 |    |-- dcterms:creator: string (nullable = true)
 |    |-- dcterms:description: string (nullable = true)
 |    |-- dcterms:format: string (nullable = true)
 |    |-- dcterms:identifier: string (nullable = true)
 |    |-- dcterms:language: string (nullable = true)
 |    |-- dcterms:license: string (nullable = true)
 |    |-- dcterms:modified: string (nullable = true)
 |    |-- dcterms:publisher: string (nullable = true)
 |    |-- dcterms:references: string (nullable = true)
 |    |-- dcterms:rights: string (nullable = true)
 |    |-- dcterms:rightsHolder: string (nullable = true)
 |    |-- dcterms:source: string (nullable = true)
 |    |-- dcterms:title: string (nullable = true)
 |    |-- dcterms:type: string (nullable = true)
 |    |-- dwc:basisOfRecord: string (nullable = true)
 |    |-- dwc:catalogNumber: string (nullable = true)
 |    |-- dwc:collectionCode: string (nullable = true)
 |    |-- dwc:collectionID: string (nullable = true)
 |    |-- dwc:coordinateUncertaintyInMeters: string (nullable = true)
 |    |-- dwc:country: string (nullable = true)
 |    |-- dwc:county: string (nullable = true)
 |    |-- dwc:day: string (nullable = true)
 |    |-- dwc:decimalLatitude: string (nullable = true)
 |    |-- dwc:decimalLongitude: string (nullable = true)
 |    |-- dwc:eventDate: string (nullable = true)
 |    |-- dwc:family: string (nullable = true)
 |    |-- dwc:genus: string (nullable = true)
 |    |-- dwc:geodeticDatum: string (nullable = true)
 |    |-- dwc:georeferencedBy: string (nullable = true)
 |    |-- dwc:habitat: string (nullable = true)
 |    |-- dwc:identificationQualifier: string (nullable = true)
 |    |-- dwc:institutionCode: string (nullable = true)
 |    |-- dwc:kingdom: string (nullable = true)
 |    |-- dwc:locality: string (nullable = true)
 |    |-- dwc:month: string (nullable = true)
 |    |-- dwc:municipality: string (nullable = true)
 |    |-- dwc:occurrenceID: string (nullable = true)
 |    |-- dwc:occurrenceRemarks: string (nullable = true)
 |    |-- dwc:order: string (nullable = true)
 |    |-- dwc:otherCatalogNumbers: string (nullable = true)
 |    |-- dwc:phylum: string (nullable = true)
 |    |-- dwc:recordedBy: string (nullable = true)
 |    |-- dwc:scientificName: string (nullable = true)
 |    |-- dwc:scientificNameAuthorship: string (nullable = true)
 |    |-- dwc:specificEpithet: string (nullable = true)
 |    |-- dwc:startDayOfYear: string (nullable = true)
 |    |-- dwc:stateProvince: string (nullable = true)
 |    |-- dwc:year: string (nullable = true)
 |    |-- exif:PixelXDimension: string (nullable = true)
 |    |-- exif:PixelYDimension: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- idigbio:OriginalFileName: string (nullable = true)
 |    |-- idigbio:associatedRecordReference: string (nullable = true)
 |    |-- idigbio:associatedRecordsetReference: string (nullable = true)
 |    |-- idigbio:mediaStatus: string (nullable = true)
 |    |-- idigbio:mediaStatusDate: string (nullable = true)
 |    |-- idigbio:recordId: string (nullable = true)
 |    |-- photoshop:Credit: string (nullable = true)
 |    |-- symbiota:recordEnteredBy: string (nullable = true)
 |    |-- xmp:CreateDate: string (nullable = true)
 |    |-- xmp:MetadataDate: string (nullable = true)
 |    |-- xmpRights:Owner: string (nullable = true)
 |    |-- xmpRights:UsageTerms: string (nullable = true)
 |    |-- xmpRights:WebStatement: string (nullable = true)
 |-- datemodified: timestamp (nullable = true)
 |-- dqs: float (nullable = true)
 |-- etag: string (nullable = true)
 |-- format: string (nullable = true)
 |-- hasSpecimen: boolean (nullable = true)
 |-- licenselogourl: string (nullable = true)
 |-- mediatype: string (nullable = true)
 |-- modified: timestamp (nullable = true)
 |-- recordset: string (nullable = true)
 |-- rights: string (nullable = true)
 |-- tag: string (nullable = true)
 |-- type: string (nullable = true)
 |-- uuid: string (nullable = true)
 |-- version: integer (nullable = true)
 |-- webstatement: string (nullable = true)
 |-- xpixels: integer (nullable = true)
 |-- ypixels: integer (nullable = true)


In [14]:
from pyspark.sql.types import *
def mk_field_set_from_df(df_schema):
    """Create a set of fieldname::type strings from a df schema.

    Parameters
    ----------
    df_schema : iterable of fields
        Typically ``df.schema``. Each field must expose ``name`` and
        ``dataType``; each data type must expose ``typeName()``, and struct
        types must additionally be iterable over their own fields.

    Returns
    -------
    set of str
        ``"name::type"`` for every non-struct top-level field and
        ``"parent.child::type"`` for every field directly inside a
        top-level struct. Type names are the lower-case names reported by
        ``typeName()`` (e.g. ``string``, ``integer``, ``timestamp``).
    """
    field_set = set()
    for field in df_schema:
        dtype = field.dataType
        # Detect structs explicitly instead of treating "anything that is not
        # one of five known scalar types" as a struct: other scalar types
        # (long, double, date, ...) are not iterable and would raise.
        if dtype.typeName() == "struct":
            # This only accommodates one level of nesting, which is
            # consistent with our parquet-building technique.
            for sub_field in dtype:
                field_set.add("{0}.{1}::{2}".format(
                    field.name, sub_field.name, sub_field.dataType.typeName()))
        else:
            # typeName() gives the lower-case type name directly, which keeps
            # the result consistent with the JSON returned by the API without
            # slicing str(dataType), whose format is not stable (it can carry
            # parameters or a "()" suffix).
            field_set.add("{0}::{1}".format(field.name, dtype.typeName()))
    return field_set
# Build the fieldname::type set for the loaded DataFrame's schema; it is
# compared against the meta endpoint's field set in a later cell.
df_field_set = mk_field_set_from_df(df.schema)
# Uncomment to inspect the full set:
#print(df_field_set)

In [15]:
import requests

def mk_field_set_from_api(
        url="http://search.idigbio.org/v2/meta/fields/mediarecords",
        timeout=30):
    """Create a set of fieldname::type strings from the meta API endpoint.

    Parameters
    ----------
    url : str, optional
        Meta-fields endpoint to query. Defaults to the iDigBio mediarecords
        endpoint.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` so that an
        unresponsive server cannot hang the notebook indefinitely.

    Returns
    -------
    set of str
        ``"fieldName::type"`` for every top-level entry that carries a
        ``fieldName`` and for every such entry nested under ``"data"``;
        ``"key::COMPLEX"`` for any other top-level entry.

    Raises
    ------
    requests.exceptions.HTTPError
        If the endpoint answers with an error status.
    requests.exceptions.Timeout
        If the endpoint does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of parsing an error payload as if
    # it were the field listing.
    response.raise_for_status()
    meta_fields_records = response.json()

    field_set = set()
    for key, spec in meta_fields_records.items():
        if key == "data":
            # "data" holds one level of nested field descriptions; entries
            # without a fieldName are skipped.
            for nested_spec in spec.values():
                if nested_spec.get("fieldName", False):
                    field_set.add("{0}::{1}".format(nested_spec["fieldName"],
                                                    nested_spec["type"]))
        elif spec.get("fieldName", False):
            field_set.add("{0}::{1}".format(spec["fieldName"], spec["type"]))
        else:
            # non-data nested structures, only add top level key
            field_set.add("{0}::COMPLEX".format(key))

    return field_set
# Fetch the field set advertised by the meta endpoint (performs an HTTP
# request); it is compared against the DataFrame's field set in the next cell.
meta_field_set = mk_field_set_from_api()
# Uncomment to inspect the full set:
#print(meta_field_set)

In [16]:
# The loaded df is a known-working data frame, so the interesting question is
# which fields the meta endpoint reports that the df does not have: are they
# currently excluded by the parquet generation code, or has Alex snuck in new
# structures lately?
#
# Note that ES calls things strings even if they're lists of strings, so even
# when something doesn't show up here, watch for scalar -> vector changes in
# the way data is represented in a field.
comparisons = (
    ("In meta endpoint, not df", meta_field_set - df_field_set),
    ("\nIn df, not meta endpoint", df_field_set - meta_field_set),
)
for heading, differing_fields in comparisons:
    print(heading)
    print("\n".join(sorted(differing_fields)))


In meta endpoint, not df
datemodified::date
flags::string
indexData::COMPLEX
modified::date
recordids::string
records::string

In df, not meta endpoint
datemodified::timestamp
modified::timestamp