In [111]:
from __future__ import print_function
import os
import sys

In [1]:
4 + 4


Out[1]:
8

In [112]:
# Someday this has to go in a config or kernel

#spark_home = '/opt/spark/latest'

#os.environ['SPARK_HOME'] = spark_home
#sys.path.insert(0, spark_home + "/python")
#sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.1-src.zip'))

#from pyspark.context import SparkContext
#from pyspark.context import SparkConf
#from pyspark.sql import SQLContext

In [113]:
#sc = SparkContext()
#sqlContext = SQLContext(sc)

In [1]:
df = sqlContext.read.load("/guoda/data/idigbio-20170607T183747.parquet")


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-1-3c965626df9f> in <module>()
----> 1 df = sqlContext.read.load("/guoda/data/idigbio-20170607T183747.parquet")

NameError: name 'sqlContext' is not defined

In [115]:
print(df.count())
df.head(2)


104661524
Out[115]:
[Row(barcodevalue=None, basisofrecord='preservedspecimen', bed=None, canonicalname='bombus ashtoni', catalognumber='cuic_ent 00035073', class='insecta', collectioncode=None, collectionid=None, collectionname=None, collector='b. taraday', commonname=None, continent='north america', coordinateuncertainty=None, country='united states', countrycode='usa', county='tompkins', data=Row(coreid=None, dc:rights=None, dcterms:accessRights=None, dcterms:bibliographicCitation=None, dcterms:language=None, dcterms:license=None, dcterms:modified='2014-03-18', dcterms:references='Digital Bee Collections Network, 2014 (and updates). Version: 2016-03-08. National Science Foundation grant DBI 0956388', dcterms:rights=None, dcterms:rightsHolder=None, dcterms:source=None, dcterms:type=None, dwc:VerbatimEventDate=None, dwc:acceptedNameUsage=None, dwc:accessRights=None, dwc:associatedMedia=None, dwc:associatedOccurrences=None, dwc:associatedReferences=None, dwc:associatedSequences=None, dwc:associatedTaxa=None, dwc:basisOfRecord='PreservedSpecimen', dwc:bed=None, dwc:behavior=None, dwc:catalogNumber='CUIC_ENT 00035073', dwc:class=None, dwc:collectionCode=None, dwc:collectionID=None, dwc:continent=None, dwc:coordinatePrecision=None, dwc:coordinateUncertaintyInMeters=None, dwc:country='UNITED STATES', dwc:countryCode=None, dwc:county='Tompkins', dwc:dataGeneralizations=None, dwc:datasetID='urn:uuid:13674fa4-8611-11e4-8259-0026552be7ea', dwc:datasetName='Collaborative databasing of North American bee collections within a global informatics network project', dwc:dateIdentified=None, dwc:day=None, dwc:decimalLatitude='42.44923', dwc:decimalLongitude='-76.48226', dwc:disposition=None, dwc:dynamicProperties=None, dwc:earliestAgeOrLowestStage=None, dwc:earliestEonOrLowestEonothem=None, dwc:earliestEpochOrLowestSeries=None, dwc:earliestEraOrLowestErathem=None, dwc:earliestPeriodOrLowestSystem=None, dwc:endDayOfYear=None, dwc:establishmentMeans=None, dwc:eventDate='1980-06-20', dwc:eventID=None, dwc:eventRemarks=None, dwc:eventTime=None, dwc:family='Apidae', dwc:fieldNotes=None, dwc:fieldNumber=None, dwc:footprintSRS=None, dwc:footprintSpatialFit=None, dwc:footprintWKT=None, dwc:formation=None, dwc:genus='Bombus', dwc:geodeticDatum=None, dwc:geologicalContextID=None, dwc:georeferenceProtocol=None, dwc:georeferenceRemarks=None, dwc:georeferenceSources=None, dwc:georeferenceVerificationStatus=None, dwc:georeferencedBy=None, dwc:georeferencedDate=None, dwc:group=None, dwc:habitat=None, dwc:higherClassification='Animalia;Arthropoda;Apidae;Apinae;Bombini', dwc:higherGeography=None, dwc:higherGeographyID=None, dwc:highestBiostratigraphicZone=None, dwc:identificationID=None, dwc:identificationQualifier=None, dwc:identificationReferences=None, dwc:identificationRemarks=None, dwc:identificationVerificationStatus=None, dwc:identifiedBy='G. C. Eickwort', dwc:individualCount='1', dwc:informationWithheld=None, dwc:infraspecificEpithet=None, dwc:institutionCode='CUIC', dwc:institutionID=None, dwc:island=None, dwc:islandGroup=None, dwc:kingdom=None, dwc:latestAgeOrHighestStage=None, dwc:latestEonOrHighestEonothem=None, dwc:latestEpochOrHighestSeries=None, dwc:latestEraOrHighestErathem=None, dwc:latestPeriodOrHighestSystem=None, dwc:lifeStage='Adult', dwc:lithostratigraphicTerms=None, dwc:locality='Ithaca, Cornell Campus', dwc:locationAccordingTo='Label', dwc:locationID=None, dwc:locationRemarks=None, dwc:lowestBiostratigraphicZone=None, dwc:materialSampleID=None, dwc:maximumDepthInMeters=None, dwc:maximumElevationInMeters=None, dwc:member=None, dwc:minimumDepthInMeters=None, dwc:minimumElevationInMeters=None, dwc:month=None, dwc:municipality=None, dwc:nameAccordingTo=None, dwc:nameAccordingToID=None, dwc:namePublishedIn=None, dwc:namePublishedInID=None, dwc:namePublishedInYear=None, dwc:nomenclaturalCode=None, dwc:nomenclaturalStatus=None, dwc:occurrenceDetails=None, dwc:occurrenceID='urn:uuid:9372ac72-aeab-11e3-8259-0026552be7ea', dwc:occurrenceRemarks=None, dwc:occurrenceStatus=None, dwc:order=None, dwc:organismID=None, dwc:organismName=None, dwc:organismQuantity=None, dwc:organismQuantityType=None, dwc:originalNameUsage=None, dwc:otherCatalogNumbers=None, dwc:ownerInstitutionCode='Cornell University Insect Collection', dwc:parentNameUsage=None, dwc:phylum=None, dwc:pointRadiusSpatialFit=None, dwc:preparations=None, dwc:previousIdentifications=None, dwc:recordNumber=None, dwc:recordedBy='B. Taraday', dwc:reproductiveCondition=None, dwc:rights=None, dwc:rightsHolder=None, dwc:samplingEffort=None, dwc:samplingProtocol='Netting', dwc:scientificName='Bombus ashtoni', dwc:scientificNameAuthorship='(Cresson,1864)', dwc:scientificNameID=None, dwc:sex='Female', dwc:specificEpithet='ashtoni', dwc:startDayOfYear=None, dwc:stateProvince='New York', dwc:subgenus=None, dwc:taxonID=None, dwc:taxonRank=None, dwc:taxonRemarks=None, dwc:taxonomicStatus=None, dwc:typeStatus='None', dwc:verbatimCoordinateSystem=None, dwc:verbatimCoordinates=None, dwc:verbatimDepth=None, dwc:verbatimElevation='265 m', dwc:verbatimEventDate=None, dwc:verbatimLatitude=None, dwc:verbatimLocality=None, dwc:verbatimLongitude=None, dwc:verbatimSRS=None, dwc:verbatimTaxonRank=None, dwc:vernacularName=None, dwc:waterBody=None, dwc:year='1980', fcc:datePicked=None, fcc:pickedBy=None, id='urn:uuid:9372ac72-aeab-11e3-8259-0026552be7ea_RID', idigbio:preservative=None, idigbio:recordId=None, idigbio:subfamily=None, idigbio:substrate=None, idigbio:superfamily=None, symbiota:recordEnteredBy=None, symbiota:verbatimScientificName=None), datecollected=datetime.datetime(1980, 6, 19, 20, 0), datemodified=datetime.datetime(2016, 3, 9, 4, 17, 27, 393000), dqs=0.1617647111415863, earliestageorloweststage=None, earliesteonorlowesteonothem=None, earliestepochorlowestseries=None, earliesteraorlowesterathem=None, earliestperiodorlowestsystem=None, etag='b3985d7148b9f1a5243dc28ade2187c8a6df9c76', eventdate='1980-06-20', family='apidae', fieldnumber=None, formation=None, genus='bombus', geologicalcontextid=None, geopoint=Row(lat=42.44923, lon=-76.48226), group=None, hasImage=False, hasMedia=False, highertaxon='animalia;arthropoda;apidae;apinae;bombini', highestbiostratigraphiczone=None, individualcount=1.0, infraspecificepithet=None, institutioncode='cuic', institutionid=None, institutionname=None, kingdom='animalia', latestageorhigheststage=None, latesteonorhighesteonothem=None, latestepochorhighestseries=None, latesteraorhighesterathem=None, latestperiodorhighestsystem=None, lithostratigraphicterms=None, locality='ithaca, cornell campus', lowestbiostratigraphiczone=None, maxdepth=None, maxelevation=None, member=None, mindepth=None, minelevation=None, municipality=None, occurrenceid='urn:uuid:9372ac72-aeab-11e3-8259-0026552be7ea', order='hymenoptera', phylum='arthropoda', recordnumber=None, recordset='8919571f-205a-4aed-b9f2-96ccd0108e4c', scientificname='bombus ashtoni', specificepithet='ashtoni', startdayofyear=172, stateprovince='new york', taxonid='1340457', taxonomicstatus='accepted', taxonrank='species', typestatus='none', uuid='43c20745-590c-45f5-9e5c-ddc5153aa573', verbatimeventdate=None, verbatimlocality=None, version=None, waterbody=None),
 Row(barcodevalue=None, basisofrecord='preservedspecimen', bed=None, canonicalname='megachile sidalceae', catalognumber='amnh_bee 00140255', class='insecta', collectioncode=None, collectionid=None, collectionname=None, collector='j. s. ascher', commonname=None, continent='north america', coordinateuncertainty=10.0, country='united states', countrycode='usa', county='cochise', data=Row(coreid=None, dc:rights=None, dcterms:accessRights=None, dcterms:bibliographicCitation=None, dcterms:language=None, dcterms:license=None, dcterms:modified='2011-03-15', dcterms:references='Digital Bee Collections Network, 2014 (and updates). Version: 2016-03-08. National Science Foundation grant DBI 0956388', dcterms:rights=None, dcterms:rightsHolder=None, dcterms:source=None, dcterms:type=None, dwc:VerbatimEventDate=None, dwc:acceptedNameUsage=None, dwc:accessRights=None, dwc:associatedMedia=None, dwc:associatedOccurrences=None, dwc:associatedReferences=None, dwc:associatedSequences=None, dwc:associatedTaxa='associated with:Psorothamnus scoparius', dwc:basisOfRecord='PreservedSpecimen', dwc:bed=None, dwc:behavior=None, dwc:catalogNumber='AMNH_BEE 00140255', dwc:class=None, dwc:collectionCode=None, dwc:collectionID=None, dwc:continent=None, dwc:coordinatePrecision=None, dwc:coordinateUncertaintyInMeters='<10m', dwc:country='UNITED STATES', dwc:countryCode=None, dwc:county='Cochise', dwc:dataGeneralizations=None, dwc:datasetID='urn:uuid:13674fa4-8611-11e4-8259-0026552be7ea', dwc:datasetName='Collaborative databasing of North American bee collections within a global informatics network project', dwc:dateIdentified='2010', dwc:day=None, dwc:decimalLatitude='32.23915', dwc:decimalLongitude='-109.77285', dwc:disposition=None, dwc:dynamicProperties=None, dwc:earliestAgeOrLowestStage=None, dwc:earliestEonOrLowestEonothem=None, dwc:earliestEpochOrLowestSeries=None, dwc:earliestEraOrLowestErathem=None, dwc:earliestPeriodOrLowestSystem=None, dwc:endDayOfYear=None, dwc:establishmentMeans=None, dwc:eventDate='2009-09-01', dwc:eventID=None, dwc:eventRemarks=None, dwc:eventTime=None, dwc:family='Megachilidae', dwc:fieldNotes=None, dwc:fieldNumber=None, dwc:footprintSRS=None, dwc:footprintSpatialFit=None, dwc:footprintWKT=None, dwc:formation=None, dwc:genus='Megachile', dwc:geodeticDatum=None, dwc:geologicalContextID=None, dwc:georeferenceProtocol=None, dwc:georeferenceRemarks=None, dwc:georeferenceSources=None, dwc:georeferenceVerificationStatus=None, dwc:georeferencedBy=None, dwc:georeferencedDate=None, dwc:group=None, dwc:habitat=None, dwc:higherClassification='Animalia;Arthropoda;Megachilidae;Megachilinae;Megachilini', dwc:higherGeography=None, dwc:higherGeographyID=None, dwc:highestBiostratigraphicZone=None, dwc:identificationID=None, dwc:identificationQualifier=None, dwc:identificationReferences=None, dwc:identificationRemarks=None, dwc:identificationVerificationStatus=None, dwc:identifiedBy='J. S. Ascher', dwc:individualCount='1', dwc:informationWithheld=None, dwc:infraspecificEpithet=None, dwc:institutionCode='AMNH', dwc:institutionID=None, dwc:island=None, dwc:islandGroup=None, dwc:kingdom=None, dwc:latestAgeOrHighestStage=None, dwc:latestEonOrHighestEonothem=None, dwc:latestEpochOrHighestSeries=None, dwc:latestEraOrHighestErathem=None, dwc:latestPeriodOrHighestSystem=None, dwc:lifeStage='Adult', dwc:lithostratigraphicTerms=None, dwc:locality='4 mi E of Willcox', dwc:locationAccordingTo='Label', dwc:locationID=None, dwc:locationRemarks=None, dwc:lowestBiostratigraphicZone=None, dwc:materialSampleID=None, dwc:maximumDepthInMeters=None, dwc:maximumElevationInMeters=None, dwc:member=None, dwc:minimumDepthInMeters=None, dwc:minimumElevationInMeters=None, dwc:month=None, dwc:municipality=None, dwc:nameAccordingTo=None, dwc:nameAccordingToID=None, dwc:namePublishedIn=None, dwc:namePublishedInID=None, dwc:namePublishedInYear=None, dwc:nomenclaturalCode=None, dwc:nomenclaturalStatus=None, dwc:occurrenceDetails=None, dwc:occurrenceID='urn:uuid:854d29a6-d8e1-11e2-99a2-0026552be7ea', dwc:occurrenceRemarks=None, dwc:occurrenceStatus=None, dwc:order=None, dwc:organismID=None, dwc:organismName=None, dwc:organismQuantity=None, dwc:organismQuantityType=None, dwc:originalNameUsage=None, dwc:otherCatalogNumbers=None, dwc:ownerInstitutionCode='American Museum of Natural History', dwc:parentNameUsage=None, dwc:phylum=None, dwc:pointRadiusSpatialFit=None, dwc:preparations=None, dwc:previousIdentifications=None, dwc:recordNumber=None, dwc:recordedBy='J. S. Ascher', dwc:reproductiveCondition=None, dwc:rights=None, dwc:rightsHolder=None, dwc:samplingEffort=None, dwc:samplingProtocol='Netting', dwc:scientificName='Megachile sidalceae', dwc:scientificNameAuthorship='Cockerell,1897', dwc:scientificNameID=None, dwc:sex='Female', dwc:specificEpithet='sidalceae', dwc:startDayOfYear=None, dwc:stateProvince='Arizona', dwc:subgenus=None, dwc:taxonID=None, dwc:taxonRank=None, dwc:taxonRemarks=None, dwc:taxonomicStatus=None, dwc:typeStatus='None', dwc:verbatimCoordinateSystem=None, dwc:verbatimCoordinates=None, dwc:verbatimDepth=None, dwc:verbatimElevation='1273 m', dwc:verbatimEventDate=None, dwc:verbatimLatitude=None, dwc:verbatimLocality=None, dwc:verbatimLongitude=None, dwc:verbatimSRS=None, dwc:verbatimTaxonRank=None, dwc:vernacularName=None, dwc:waterBody=None, dwc:year='2009', fcc:datePicked=None, fcc:pickedBy=None, id='urn:uuid:854d29a6-d8e1-11e2-99a2-0026552be7ea_RID', idigbio:preservative=None, idigbio:recordId=None, idigbio:subfamily=None, idigbio:substrate=None, idigbio:superfamily=None, symbiota:recordEnteredBy=None, symbiota:verbatimScientificName=None), datecollected=datetime.datetime(2009, 8, 31, 20, 0), datemodified=datetime.datetime(2016, 3, 9, 4, 17, 27, 393000), dqs=0.19117647409439087, earliestageorloweststage=None, earliesteonorlowesteonothem=None, earliestepochorlowestseries=None, earliesteraorlowesterathem=None, earliestperiodorlowestsystem=None, etag='8a3f37c409d1f01463d4eafdb7c64c60a0040c4c', eventdate='2009-09-01', family='megachilidae', fieldnumber=None, formation=None, genus='megachile', geologicalcontextid=None, geopoint=Row(lat=32.23915, lon=-109.77285), group=None, hasImage=False, hasMedia=False, highertaxon='animalia;arthropoda;megachilidae;megachilinae;megachilini', highestbiostratigraphiczone=None, individualcount=1.0, infraspecificepithet=None, institutioncode='amnh', institutionid=None, institutionname=None, kingdom='animalia', latestageorhigheststage=None, latesteonorhighesteonothem=None, latestepochorhighestseries=None, latesteraorhighesterathem=None, latestperiodorhighestsystem=None, lithostratigraphicterms=None, locality='4 mi e of willcox', lowestbiostratigraphiczone=None, maxdepth=None, maxelevation=None, member=None, mindepth=None, minelevation=None, municipality=None, occurrenceid='urn:uuid:854d29a6-d8e1-11e2-99a2-0026552be7ea', order='hymenoptera', phylum='arthropoda', recordnumber=None, recordset='8919571f-205a-4aed-b9f2-96ccd0108e4c', scientificname='megachile sidalceae', specificepithet='sidalceae', startdayofyear=244, stateprovince='arizona', taxonid='1336076', taxonomicstatus='accepted', taxonrank='species', typestatus='none', uuid='d1f8c8b5-1456-45b8-ba65-b30b731f90a4', verbatimeventdate=None, verbatimlocality=None, version=None, waterbody=None)]

In [116]:
df.printSchema()


root
 |-- barcodevalue: string (nullable = true)
 |-- basisofrecord: string (nullable = true)
 |-- bed: string (nullable = true)
 |-- canonicalname: string (nullable = true)
 |-- catalognumber: string (nullable = true)
 |-- class: string (nullable = true)
 |-- collectioncode: string (nullable = true)
 |-- collectionid: string (nullable = true)
 |-- collectionname: string (nullable = true)
 |-- collector: string (nullable = true)
 |-- commonname: string (nullable = true)
 |-- continent: string (nullable = true)
 |-- coordinateuncertainty: float (nullable = true)
 |-- country: string (nullable = true)
 |-- countrycode: string (nullable = true)
 |-- county: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- coreid: string (nullable = true)
 |    |-- dc:rights: string (nullable = true)
 |    |-- dcterms:accessRights: string (nullable = true)
 |    |-- dcterms:bibliographicCitation: string (nullable = true)
 |    |-- dcterms:language: string (nullable = true)
 |    |-- dcterms:license: string (nullable = true)
 |    |-- dcterms:modified: string (nullable = true)
 |    |-- dcterms:references: string (nullable = true)
 |    |-- dcterms:rights: string (nullable = true)
 |    |-- dcterms:rightsHolder: string (nullable = true)
 |    |-- dcterms:source: string (nullable = true)
 |    |-- dcterms:type: string (nullable = true)
 |    |-- dwc:VerbatimEventDate: string (nullable = true)
 |    |-- dwc:acceptedNameUsage: string (nullable = true)
 |    |-- dwc:accessRights: string (nullable = true)
 |    |-- dwc:associatedMedia: string (nullable = true)
 |    |-- dwc:associatedOccurrences: string (nullable = true)
 |    |-- dwc:associatedReferences: string (nullable = true)
 |    |-- dwc:associatedSequences: string (nullable = true)
 |    |-- dwc:associatedTaxa: string (nullable = true)
 |    |-- dwc:basisOfRecord: string (nullable = true)
 |    |-- dwc:bed: string (nullable = true)
 |    |-- dwc:behavior: string (nullable = true)
 |    |-- dwc:catalogNumber: string (nullable = true)
 |    |-- dwc:class: string (nullable = true)
 |    |-- dwc:collectionCode: string (nullable = true)
 |    |-- dwc:collectionID: string (nullable = true)
 |    |-- dwc:continent: string (nullable = true)
 |    |-- dwc:coordinatePrecision: string (nullable = true)
 |    |-- dwc:coordinateUncertaintyInMeters: string (nullable = true)
 |    |-- dwc:country: string (nullable = true)
 |    |-- dwc:countryCode: string (nullable = true)
 |    |-- dwc:county: string (nullable = true)
 |    |-- dwc:dataGeneralizations: string (nullable = true)
 |    |-- dwc:datasetID: string (nullable = true)
 |    |-- dwc:datasetName: string (nullable = true)
 |    |-- dwc:dateIdentified: string (nullable = true)
 |    |-- dwc:day: string (nullable = true)
 |    |-- dwc:decimalLatitude: string (nullable = true)
 |    |-- dwc:decimalLongitude: string (nullable = true)
 |    |-- dwc:disposition: string (nullable = true)
 |    |-- dwc:dynamicProperties: string (nullable = true)
 |    |-- dwc:earliestAgeOrLowestStage: string (nullable = true)
 |    |-- dwc:earliestEonOrLowestEonothem: string (nullable = true)
 |    |-- dwc:earliestEpochOrLowestSeries: string (nullable = true)
 |    |-- dwc:earliestEraOrLowestErathem: string (nullable = true)
 |    |-- dwc:earliestPeriodOrLowestSystem: string (nullable = true)
 |    |-- dwc:endDayOfYear: string (nullable = true)
 |    |-- dwc:establishmentMeans: string (nullable = true)
 |    |-- dwc:eventDate: string (nullable = true)
 |    |-- dwc:eventID: string (nullable = true)
 |    |-- dwc:eventRemarks: string (nullable = true)
 |    |-- dwc:eventTime: string (nullable = true)
 |    |-- dwc:family: string (nullable = true)
 |    |-- dwc:fieldNotes: string (nullable = true)
 |    |-- dwc:fieldNumber: string (nullable = true)
 |    |-- dwc:footprintSRS: string (nullable = true)
 |    |-- dwc:footprintSpatialFit: string (nullable = true)
 |    |-- dwc:footprintWKT: string (nullable = true)
 |    |-- dwc:formation: string (nullable = true)
 |    |-- dwc:genus: string (nullable = true)
 |    |-- dwc:geodeticDatum: string (nullable = true)
 |    |-- dwc:geologicalContextID: string (nullable = true)
 |    |-- dwc:georeferenceProtocol: string (nullable = true)
 |    |-- dwc:georeferenceRemarks: string (nullable = true)
 |    |-- dwc:georeferenceSources: string (nullable = true)
 |    |-- dwc:georeferenceVerificationStatus: string (nullable = true)
 |    |-- dwc:georeferencedBy: string (nullable = true)
 |    |-- dwc:georeferencedDate: string (nullable = true)
 |    |-- dwc:group: string (nullable = true)
 |    |-- dwc:habitat: string (nullable = true)
 |    |-- dwc:higherClassification: string (nullable = true)
 |    |-- dwc:higherGeography: string (nullable = true)
 |    |-- dwc:higherGeographyID: string (nullable = true)
 |    |-- dwc:highestBiostratigraphicZone: string (nullable = true)
 |    |-- dwc:identificationID: string (nullable = true)
 |    |-- dwc:identificationQualifier: string (nullable = true)
 |    |-- dwc:identificationReferences: string (nullable = true)
 |    |-- dwc:identificationRemarks: string (nullable = true)
 |    |-- dwc:identificationVerificationStatus: string (nullable = true)
 |    |-- dwc:identifiedBy: string (nullable = true)
 |    |-- dwc:individualCount: string (nullable = true)
 |    |-- dwc:informationWithheld: string (nullable = true)
 |    |-- dwc:infraspecificEpithet: string (nullable = true)
 |    |-- dwc:institutionCode: string (nullable = true)
 |    |-- dwc:institutionID: string (nullable = true)
 |    |-- dwc:island: string (nullable = true)
 |    |-- dwc:islandGroup: string (nullable = true)
 |    |-- dwc:kingdom: string (nullable = true)
 |    |-- dwc:latestAgeOrHighestStage: string (nullable = true)
 |    |-- dwc:latestEonOrHighestEonothem: string (nullable = true)
 |    |-- dwc:latestEpochOrHighestSeries: string (nullable = true)
 |    |-- dwc:latestEraOrHighestErathem: string (nullable = true)
 |    |-- dwc:latestPeriodOrHighestSystem: string (nullable = true)
 |    |-- dwc:lifeStage: string (nullable = true)
 |    |-- dwc:lithostratigraphicTerms: string (nullable = true)
 |    |-- dwc:locality: string (nullable = true)
 |    |-- dwc:locationAccordingTo: string (nullable = true)
 |    |-- dwc:locationID: string (nullable = true)
 |    |-- dwc:locationRemarks: string (nullable = true)
 |    |-- dwc:lowestBiostratigraphicZone: string (nullable = true)
 |    |-- dwc:materialSampleID: string (nullable = true)
 |    |-- dwc:maximumDepthInMeters: string (nullable = true)
 |    |-- dwc:maximumElevationInMeters: string (nullable = true)
 |    |-- dwc:member: string (nullable = true)
 |    |-- dwc:minimumDepthInMeters: string (nullable = true)
 |    |-- dwc:minimumElevationInMeters: string (nullable = true)
 |    |-- dwc:month: string (nullable = true)
 |    |-- dwc:municipality: string (nullable = true)
 |    |-- dwc:nameAccordingTo: string (nullable = true)
 |    |-- dwc:nameAccordingToID: string (nullable = true)
 |    |-- dwc:namePublishedIn: string (nullable = true)
 |    |-- dwc:namePublishedInID: string (nullable = true)
 |    |-- dwc:namePublishedInYear: string (nullable = true)
 |    |-- dwc:nomenclaturalCode: string (nullable = true)
 |    |-- dwc:nomenclaturalStatus: string (nullable = true)
 |    |-- dwc:occurrenceDetails: string (nullable = true)
 |    |-- dwc:occurrenceID: string (nullable = true)
 |    |-- dwc:occurrenceRemarks: string (nullable = true)
 |    |-- dwc:occurrenceStatus: string (nullable = true)
 |    |-- dwc:order: string (nullable = true)
 |    |-- dwc:organismID: string (nullable = true)
 |    |-- dwc:organismName: string (nullable = true)
 |    |-- dwc:organismQuantity: string (nullable = true)
 |    |-- dwc:organismQuantityType: string (nullable = true)
 |    |-- dwc:originalNameUsage: string (nullable = true)
 |    |-- dwc:otherCatalogNumbers: string (nullable = true)
 |    |-- dwc:ownerInstitutionCode: string (nullable = true)
 |    |-- dwc:parentNameUsage: string (nullable = true)
 |    |-- dwc:phylum: string (nullable = true)
 |    |-- dwc:pointRadiusSpatialFit: string (nullable = true)
 |    |-- dwc:preparations: string (nullable = true)
 |    |-- dwc:previousIdentifications: string (nullable = true)
 |    |-- dwc:recordNumber: string (nullable = true)
 |    |-- dwc:recordedBy: string (nullable = true)
 |    |-- dwc:reproductiveCondition: string (nullable = true)
 |    |-- dwc:rights: string (nullable = true)
 |    |-- dwc:rightsHolder: string (nullable = true)
 |    |-- dwc:samplingEffort: string (nullable = true)
 |    |-- dwc:samplingProtocol: string (nullable = true)
 |    |-- dwc:scientificName: string (nullable = true)
 |    |-- dwc:scientificNameAuthorship: string (nullable = true)
 |    |-- dwc:scientificNameID: string (nullable = true)
 |    |-- dwc:sex: string (nullable = true)
 |    |-- dwc:specificEpithet: string (nullable = true)
 |    |-- dwc:startDayOfYear: string (nullable = true)
 |    |-- dwc:stateProvince: string (nullable = true)
 |    |-- dwc:subgenus: string (nullable = true)
 |    |-- dwc:taxonID: string (nullable = true)
 |    |-- dwc:taxonRank: string (nullable = true)
 |    |-- dwc:taxonRemarks: string (nullable = true)
 |    |-- dwc:taxonomicStatus: string (nullable = true)
 |    |-- dwc:typeStatus: string (nullable = true)
 |    |-- dwc:verbatimCoordinateSystem: string (nullable = true)
 |    |-- dwc:verbatimCoordinates: string (nullable = true)
 |    |-- dwc:verbatimDepth: string (nullable = true)
 |    |-- dwc:verbatimElevation: string (nullable = true)
 |    |-- dwc:verbatimEventDate: string (nullable = true)
 |    |-- dwc:verbatimLatitude: string (nullable = true)
 |    |-- dwc:verbatimLocality: string (nullable = true)
 |    |-- dwc:verbatimLongitude: string (nullable = true)
 |    |-- dwc:verbatimSRS: string (nullable = true)
 |    |-- dwc:verbatimTaxonRank: string (nullable = true)
 |    |-- dwc:vernacularName: string (nullable = true)
 |    |-- dwc:waterBody: string (nullable = true)
 |    |-- dwc:year: string (nullable = true)
 |    |-- fcc:datePicked: string (nullable = true)
 |    |-- fcc:pickedBy: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- idigbio:preservative: string (nullable = true)
 |    |-- idigbio:recordId: string (nullable = true)
 |    |-- idigbio:subfamily: string (nullable = true)
 |    |-- idigbio:substrate: string (nullable = true)
 |    |-- idigbio:superfamily: string (nullable = true)
 |    |-- symbiota:recordEnteredBy: string (nullable = true)
 |    |-- symbiota:verbatimScientificName: string (nullable = true)
 |-- datecollected: timestamp (nullable = true)
 |-- datemodified: timestamp (nullable = true)
 |-- dqs: float (nullable = true)
 |-- earliestageorloweststage: string (nullable = true)
 |-- earliesteonorlowesteonothem: string (nullable = true)
 |-- earliestepochorlowestseries: string (nullable = true)
 |-- earliesteraorlowesterathem: string (nullable = true)
 |-- earliestperiodorlowestsystem: string (nullable = true)
 |-- etag: string (nullable = true)
 |-- eventdate: string (nullable = true)
 |-- family: string (nullable = true)
 |-- fieldnumber: string (nullable = true)
 |-- formation: string (nullable = true)
 |-- genus: string (nullable = true)
 |-- geologicalcontextid: string (nullable = true)
 |-- geopoint: struct (nullable = true)
 |    |-- lat: double (nullable = true)
 |    |-- lon: double (nullable = true)
 |-- group: string (nullable = true)
 |-- hasImage: boolean (nullable = true)
 |-- hasMedia: boolean (nullable = true)
 |-- highertaxon: string (nullable = true)
 |-- highestbiostratigraphiczone: string (nullable = true)
 |-- individualcount: float (nullable = true)
 |-- infraspecificepithet: string (nullable = true)
 |-- institutioncode: string (nullable = true)
 |-- institutionid: string (nullable = true)
 |-- institutionname: string (nullable = true)
 |-- kingdom: string (nullable = true)
 |-- latestageorhigheststage: string (nullable = true)
 |-- latesteonorhighesteonothem: string (nullable = true)
 |-- latestepochorhighestseries: string (nullable = true)
 |-- latesteraorhighesterathem: string (nullable = true)
 |-- latestperiodorhighestsystem: string (nullable = true)
 |-- lithostratigraphicterms: string (nullable = true)
 |-- locality: string (nullable = true)
 |-- lowestbiostratigraphiczone: string (nullable = true)
 |-- maxdepth: float (nullable = true)
 |-- maxelevation: float (nullable = true)
 |-- member: string (nullable = true)
 |-- mindepth: float (nullable = true)
 |-- minelevation: float (nullable = true)
 |-- municipality: string (nullable = true)
 |-- occurrenceid: string (nullable = true)
 |-- order: string (nullable = true)
 |-- phylum: string (nullable = true)
 |-- recordnumber: string (nullable = true)
 |-- recordset: string (nullable = true)
 |-- scientificname: string (nullable = true)
 |-- specificepithet: string (nullable = true)
 |-- startdayofyear: integer (nullable = true)
 |-- stateprovince: string (nullable = true)
 |-- taxonid: string (nullable = true)
 |-- taxonomicstatus: string (nullable = true)
 |-- taxonrank: string (nullable = true)
 |-- typestatus: string (nullable = true)
 |-- uuid: string (nullable = true)
 |-- verbatimeventdate: string (nullable = true)
 |-- verbatimlocality: string (nullable = true)
 |-- version: integer (nullable = true)
 |-- waterbody: string (nullable = true)


In [117]:
from pyspark.sql.types import *
def mk_field_set_from_df(df_schema):
    """Create a set of fieldname::type strings from a df schema"""
    field_set = set()
    prefix = ""
    for i in df_schema:
        #print(i)
        t = type(i.dataType)
        if t is not StringType and \
           t is not FloatType and \
           t is not IntegerType and \
           t is not TimestampType and \
           t is not BooleanType:
            #print(i.dataType)
            prefix = i.name
            
            # This only accommodates one level of nesting which is
            # consistant with our parquet-building technique. The
            # string manipulation on type makes it consistant with
            # the JSON returned by the API.
            for j in i.dataType:
                field_set.add(("{0}.{1}::{2}".format(prefix, j.name, 
                                            str(j.dataType)[:-4].lower())))

        else:
            field_set.add(("{0}::{1}".format(i.name, 
                                            str(i.dataType)[:-4].lower())))
    return field_set
df_field_set = mk_field_set_from_df(df.schema)
#print(df_field_set)

In [118]:
import requests

def mk_field_set_from_api():
    """Create a set of fieldname::type strings from the meta API endpoint"""
    meta_fields_records = (requests                                                                                                 
        .get("http://search.idigbio.org/v2/meta/fields/records")                                                 
        .json()                                                                                                  
    ) 
    field_set = set()                                                                                                              
    for k,v in meta_fields_records.items():
        if k == "data":                                                                                                             
            for kd,vd in v.items():                                                                                                 
                if vd.get("fieldName", False):                                                                                      
                    field_set.add("{0}::{1}".format(vd["fieldName"],
                                                        vd["type"]))
        elif v.get("fieldName", False):                                                                                               
            field_set.add("{0}::{1}".format(v["fieldName"], v["type"]))
        else:
            # non-data nested structures, only add top level key
            field_set.add("{0}::COMPLEX".format(k))
        
    return field_set
meta_field_set = mk_field_set_from_api()
#print(meta_field_set)

In [119]:
# The loaded df is a working data frame of course, what fields are in the
# meta endpoint that the df does not have? Are those fields currently 
# excluded in the parquet generation code or has Alex snuck in new
# structures lately?
#
# Note that ES calls things strings if they're lists of strings so even if
# something doesn't show up here, watch for scaler -> vector changes in the
# way data is represented in a field.
print("In meta endpoint, not df")
print("\n".join(sorted(meta_field_set - df_field_set)))
print("\nIn df, not meta endpoint")
print("\n".join(sorted(df_field_set - meta_field_set)))


In meta endpoint, not df
associatedsequences::string
commonnames::string
datecollected::date
datemodified::date
flags::string
geopoint::geo_point
indexData::COMPLEX
mediarecords::string
recordids::string

In df, not meta endpoint
datecollected::timestamp
datemodified::timestamp
geopoint.lat::double
geopoint.lon::double