In [111]:
from __future__ import print_function
import os
import sys
In [1]:
4 + 4
Out[1]:
8
In [112]:
# Someday this has to go in a config or kernel
#spark_home = '/opt/spark/latest'
#os.environ['SPARK_HOME'] = spark_home
#sys.path.insert(0, spark_home + "/python")
#sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.1-src.zip'))
#from pyspark.context import SparkContext
#from pyspark.context import SparkConf
#from pyspark.sql import SQLContext
In [113]:
#sc = SparkContext()
#sqlContext = SQLContext(sc)
In [1]:
df = sqlContext.read.load("/guoda/data/idigbio-20170607T183747.parquet")
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-1-3c965626df9f> in <module>()
----> 1 df = sqlContext.read.load("/guoda/data/idigbio-20170607T183747.parquet")
NameError: name 'sqlContext' is not defined
In [115]:
print(df.count())
df.head(2)
104661524
Out[115]:
[Row(barcodevalue=None, basisofrecord='preservedspecimen', bed=None, canonicalname='bombus ashtoni', catalognumber='cuic_ent 00035073', class='insecta', collectioncode=None, collectionid=None, collectionname=None, collector='b. taraday', commonname=None, continent='north america', coordinateuncertainty=None, country='united states', countrycode='usa', county='tompkins', data=Row(coreid=None, dc:rights=None, dcterms:accessRights=None, dcterms:bibliographicCitation=None, dcterms:language=None, dcterms:license=None, dcterms:modified='2014-03-18', dcterms:references='Digital Bee Collections Network, 2014 (and updates). Version: 2016-03-08. National Science Foundation grant DBI 0956388', dcterms:rights=None, dcterms:rightsHolder=None, dcterms:source=None, dcterms:type=None, dwc:VerbatimEventDate=None, dwc:acceptedNameUsage=None, dwc:accessRights=None, dwc:associatedMedia=None, dwc:associatedOccurrences=None, dwc:associatedReferences=None, dwc:associatedSequences=None, dwc:associatedTaxa=None, dwc:basisOfRecord='PreservedSpecimen', dwc:bed=None, dwc:behavior=None, dwc:catalogNumber='CUIC_ENT 00035073', dwc:class=None, dwc:collectionCode=None, dwc:collectionID=None, dwc:continent=None, dwc:coordinatePrecision=None, dwc:coordinateUncertaintyInMeters=None, dwc:country='UNITED STATES', dwc:countryCode=None, dwc:county='Tompkins', dwc:dataGeneralizations=None, dwc:datasetID='urn:uuid:13674fa4-8611-11e4-8259-0026552be7ea', dwc:datasetName='Collaborative databasing of North American bee collections within a global informatics network project', dwc:dateIdentified=None, dwc:day=None, dwc:decimalLatitude='42.44923', dwc:decimalLongitude='-76.48226', dwc:disposition=None, dwc:dynamicProperties=None, dwc:earliestAgeOrLowestStage=None, dwc:earliestEonOrLowestEonothem=None, dwc:earliestEpochOrLowestSeries=None, dwc:earliestEraOrLowestErathem=None, dwc:earliestPeriodOrLowestSystem=None, dwc:endDayOfYear=None, dwc:establishmentMeans=None, dwc:eventDate='1980-06-20', dwc:eventID=None, dwc:eventRemarks=None, dwc:eventTime=None, dwc:family='Apidae', dwc:fieldNotes=None, dwc:fieldNumber=None, dwc:footprintSRS=None, dwc:footprintSpatialFit=None, dwc:footprintWKT=None, dwc:formation=None, dwc:genus='Bombus', dwc:geodeticDatum=None, dwc:geologicalContextID=None, dwc:georeferenceProtocol=None, dwc:georeferenceRemarks=None, dwc:georeferenceSources=None, dwc:georeferenceVerificationStatus=None, dwc:georeferencedBy=None, dwc:georeferencedDate=None, dwc:group=None, dwc:habitat=None, dwc:higherClassification='Animalia;Arthropoda;Apidae;Apinae;Bombini', dwc:higherGeography=None, dwc:higherGeographyID=None, dwc:highestBiostratigraphicZone=None, dwc:identificationID=None, dwc:identificationQualifier=None, dwc:identificationReferences=None, dwc:identificationRemarks=None, dwc:identificationVerificationStatus=None, dwc:identifiedBy='G. C. Eickwort', dwc:individualCount='1', dwc:informationWithheld=None, dwc:infraspecificEpithet=None, dwc:institutionCode='CUIC', dwc:institutionID=None, dwc:island=None, dwc:islandGroup=None, dwc:kingdom=None, dwc:latestAgeOrHighestStage=None, dwc:latestEonOrHighestEonothem=None, dwc:latestEpochOrHighestSeries=None, dwc:latestEraOrHighestErathem=None, dwc:latestPeriodOrHighestSystem=None, dwc:lifeStage='Adult', dwc:lithostratigraphicTerms=None, dwc:locality='Ithaca, Cornell Campus', dwc:locationAccordingTo='Label', dwc:locationID=None, dwc:locationRemarks=None, dwc:lowestBiostratigraphicZone=None, dwc:materialSampleID=None, dwc:maximumDepthInMeters=None, dwc:maximumElevationInMeters=None, dwc:member=None, dwc:minimumDepthInMeters=None, dwc:minimumElevationInMeters=None, dwc:month=None, dwc:municipality=None, dwc:nameAccordingTo=None, dwc:nameAccordingToID=None, dwc:namePublishedIn=None, dwc:namePublishedInID=None, dwc:namePublishedInYear=None, dwc:nomenclaturalCode=None, dwc:nomenclaturalStatus=None, dwc:occurrenceDetails=None, dwc:occurrenceID='urn:uuid:9372ac72-aeab-11e3-8259-0026552be7ea', dwc:occurrenceRemarks=None, dwc:occurrenceStatus=None, dwc:order=None, dwc:organismID=None, dwc:organismName=None, dwc:organismQuantity=None, dwc:organismQuantityType=None, dwc:originalNameUsage=None, dwc:otherCatalogNumbers=None, dwc:ownerInstitutionCode='Cornell University Insect Collection', dwc:parentNameUsage=None, dwc:phylum=None, dwc:pointRadiusSpatialFit=None, dwc:preparations=None, dwc:previousIdentifications=None, dwc:recordNumber=None, dwc:recordedBy='B. Taraday', dwc:reproductiveCondition=None, dwc:rights=None, dwc:rightsHolder=None, dwc:samplingEffort=None, dwc:samplingProtocol='Netting', dwc:scientificName='Bombus ashtoni', dwc:scientificNameAuthorship='(Cresson,1864)', dwc:scientificNameID=None, dwc:sex='Female', dwc:specificEpithet='ashtoni', dwc:startDayOfYear=None, dwc:stateProvince='New York', dwc:subgenus=None, dwc:taxonID=None, dwc:taxonRank=None, dwc:taxonRemarks=None, dwc:taxonomicStatus=None, dwc:typeStatus='None', dwc:verbatimCoordinateSystem=None, dwc:verbatimCoordinates=None, dwc:verbatimDepth=None, dwc:verbatimElevation='265 m', dwc:verbatimEventDate=None, dwc:verbatimLatitude=None, dwc:verbatimLocality=None, dwc:verbatimLongitude=None, dwc:verbatimSRS=None, dwc:verbatimTaxonRank=None, dwc:vernacularName=None, dwc:waterBody=None, dwc:year='1980', fcc:datePicked=None, fcc:pickedBy=None, id='urn:uuid:9372ac72-aeab-11e3-8259-0026552be7ea_RID', idigbio:preservative=None, idigbio:recordId=None, idigbio:subfamily=None, idigbio:substrate=None, idigbio:superfamily=None, symbiota:recordEnteredBy=None, symbiota:verbatimScientificName=None), datecollected=datetime.datetime(1980, 6, 19, 20, 0), datemodified=datetime.datetime(2016, 3, 9, 4, 17, 27, 393000), dqs=0.1617647111415863, earliestageorloweststage=None, earliesteonorlowesteonothem=None, earliestepochorlowestseries=None, earliesteraorlowesterathem=None, earliestperiodorlowestsystem=None, etag='b3985d7148b9f1a5243dc28ade2187c8a6df9c76', eventdate='1980-06-20', family='apidae', fieldnumber=None, formation=None, genus='bombus', geologicalcontextid=None, geopoint=Row(lat=42.44923, lon=-76.48226), group=None, hasImage=False, hasMedia=False, highertaxon='animalia;arthropoda;apidae;apinae;bombini', highestbiostratigraphiczone=None, individualcount=1.0, infraspecificepithet=None, institutioncode='cuic', institutionid=None, institutionname=None, kingdom='animalia', latestageorhigheststage=None, latesteonorhighesteonothem=None, latestepochorhighestseries=None, latesteraorhighesterathem=None, latestperiodorhighestsystem=None, lithostratigraphicterms=None, locality='ithaca, cornell campus', lowestbiostratigraphiczone=None, maxdepth=None, maxelevation=None, member=None, mindepth=None, minelevation=None, municipality=None, occurrenceid='urn:uuid:9372ac72-aeab-11e3-8259-0026552be7ea', order='hymenoptera', phylum='arthropoda', recordnumber=None, recordset='8919571f-205a-4aed-b9f2-96ccd0108e4c', scientificname='bombus ashtoni', specificepithet='ashtoni', startdayofyear=172, stateprovince='new york', taxonid='1340457', taxonomicstatus='accepted', taxonrank='species', typestatus='none', uuid='43c20745-590c-45f5-9e5c-ddc5153aa573', verbatimeventdate=None, verbatimlocality=None, version=None, waterbody=None),
Row(barcodevalue=None, basisofrecord='preservedspecimen', bed=None, canonicalname='megachile sidalceae', catalognumber='amnh_bee 00140255', class='insecta', collectioncode=None, collectionid=None, collectionname=None, collector='j. s. ascher', commonname=None, continent='north america', coordinateuncertainty=10.0, country='united states', countrycode='usa', county='cochise', data=Row(coreid=None, dc:rights=None, dcterms:accessRights=None, dcterms:bibliographicCitation=None, dcterms:language=None, dcterms:license=None, dcterms:modified='2011-03-15', dcterms:references='Digital Bee Collections Network, 2014 (and updates). Version: 2016-03-08. National Science Foundation grant DBI 0956388', dcterms:rights=None, dcterms:rightsHolder=None, dcterms:source=None, dcterms:type=None, dwc:VerbatimEventDate=None, dwc:acceptedNameUsage=None, dwc:accessRights=None, dwc:associatedMedia=None, dwc:associatedOccurrences=None, dwc:associatedReferences=None, dwc:associatedSequences=None, dwc:associatedTaxa='associated with:Psorothamnus scoparius', dwc:basisOfRecord='PreservedSpecimen', dwc:bed=None, dwc:behavior=None, dwc:catalogNumber='AMNH_BEE 00140255', dwc:class=None, dwc:collectionCode=None, dwc:collectionID=None, dwc:continent=None, dwc:coordinatePrecision=None, dwc:coordinateUncertaintyInMeters='<10m', dwc:country='UNITED STATES', dwc:countryCode=None, dwc:county='Cochise', dwc:dataGeneralizations=None, dwc:datasetID='urn:uuid:13674fa4-8611-11e4-8259-0026552be7ea', dwc:datasetName='Collaborative databasing of North American bee collections within a global informatics network project', dwc:dateIdentified='2010', dwc:day=None, dwc:decimalLatitude='32.23915', dwc:decimalLongitude='-109.77285', dwc:disposition=None, dwc:dynamicProperties=None, dwc:earliestAgeOrLowestStage=None, dwc:earliestEonOrLowestEonothem=None, dwc:earliestEpochOrLowestSeries=None, dwc:earliestEraOrLowestErathem=None, dwc:earliestPeriodOrLowestSystem=None, dwc:endDayOfYear=None, dwc:establishmentMeans=None, dwc:eventDate='2009-09-01', dwc:eventID=None, dwc:eventRemarks=None, dwc:eventTime=None, dwc:family='Megachilidae', dwc:fieldNotes=None, dwc:fieldNumber=None, dwc:footprintSRS=None, dwc:footprintSpatialFit=None, dwc:footprintWKT=None, dwc:formation=None, dwc:genus='Megachile', dwc:geodeticDatum=None, dwc:geologicalContextID=None, dwc:georeferenceProtocol=None, dwc:georeferenceRemarks=None, dwc:georeferenceSources=None, dwc:georeferenceVerificationStatus=None, dwc:georeferencedBy=None, dwc:georeferencedDate=None, dwc:group=None, dwc:habitat=None, dwc:higherClassification='Animalia;Arthropoda;Megachilidae;Megachilinae;Megachilini', dwc:higherGeography=None, dwc:higherGeographyID=None, dwc:highestBiostratigraphicZone=None, dwc:identificationID=None, dwc:identificationQualifier=None, dwc:identificationReferences=None, dwc:identificationRemarks=None, dwc:identificationVerificationStatus=None, dwc:identifiedBy='J. S. Ascher', dwc:individualCount='1', dwc:informationWithheld=None, dwc:infraspecificEpithet=None, dwc:institutionCode='AMNH', dwc:institutionID=None, dwc:island=None, dwc:islandGroup=None, dwc:kingdom=None, dwc:latestAgeOrHighestStage=None, dwc:latestEonOrHighestEonothem=None, dwc:latestEpochOrHighestSeries=None, dwc:latestEraOrHighestErathem=None, dwc:latestPeriodOrHighestSystem=None, dwc:lifeStage='Adult', dwc:lithostratigraphicTerms=None, dwc:locality='4 mi E of Willcox', dwc:locationAccordingTo='Label', dwc:locationID=None, dwc:locationRemarks=None, dwc:lowestBiostratigraphicZone=None, dwc:materialSampleID=None, dwc:maximumDepthInMeters=None, dwc:maximumElevationInMeters=None, dwc:member=None, dwc:minimumDepthInMeters=None, dwc:minimumElevationInMeters=None, dwc:month=None, dwc:municipality=None, dwc:nameAccordingTo=None, dwc:nameAccordingToID=None, dwc:namePublishedIn=None, dwc:namePublishedInID=None, dwc:namePublishedInYear=None, dwc:nomenclaturalCode=None, dwc:nomenclaturalStatus=None, dwc:occurrenceDetails=None, dwc:occurrenceID='urn:uuid:854d29a6-d8e1-11e2-99a2-0026552be7ea', dwc:occurrenceRemarks=None, dwc:occurrenceStatus=None, dwc:order=None, dwc:organismID=None, dwc:organismName=None, dwc:organismQuantity=None, dwc:organismQuantityType=None, dwc:originalNameUsage=None, dwc:otherCatalogNumbers=None, dwc:ownerInstitutionCode='American Museum of Natural History', dwc:parentNameUsage=None, dwc:phylum=None, dwc:pointRadiusSpatialFit=None, dwc:preparations=None, dwc:previousIdentifications=None, dwc:recordNumber=None, dwc:recordedBy='J. S. Ascher', dwc:reproductiveCondition=None, dwc:rights=None, dwc:rightsHolder=None, dwc:samplingEffort=None, dwc:samplingProtocol='Netting', dwc:scientificName='Megachile sidalceae', dwc:scientificNameAuthorship='Cockerell,1897', dwc:scientificNameID=None, dwc:sex='Female', dwc:specificEpithet='sidalceae', dwc:startDayOfYear=None, dwc:stateProvince='Arizona', dwc:subgenus=None, dwc:taxonID=None, dwc:taxonRank=None, dwc:taxonRemarks=None, dwc:taxonomicStatus=None, dwc:typeStatus='None', dwc:verbatimCoordinateSystem=None, dwc:verbatimCoordinates=None, dwc:verbatimDepth=None, dwc:verbatimElevation='1273 m', dwc:verbatimEventDate=None, dwc:verbatimLatitude=None, dwc:verbatimLocality=None, dwc:verbatimLongitude=None, dwc:verbatimSRS=None, dwc:verbatimTaxonRank=None, dwc:vernacularName=None, dwc:waterBody=None, dwc:year='2009', fcc:datePicked=None, fcc:pickedBy=None, id='urn:uuid:854d29a6-d8e1-11e2-99a2-0026552be7ea_RID', idigbio:preservative=None, idigbio:recordId=None, idigbio:subfamily=None, idigbio:substrate=None, idigbio:superfamily=None, symbiota:recordEnteredBy=None, symbiota:verbatimScientificName=None), datecollected=datetime.datetime(2009, 8, 31, 20, 0), datemodified=datetime.datetime(2016, 3, 9, 4, 17, 27, 393000), dqs=0.19117647409439087, earliestageorloweststage=None, earliesteonorlowesteonothem=None, earliestepochorlowestseries=None, earliesteraorlowesterathem=None, earliestperiodorlowestsystem=None, etag='8a3f37c409d1f01463d4eafdb7c64c60a0040c4c', eventdate='2009-09-01', family='megachilidae', fieldnumber=None, formation=None, genus='megachile', geologicalcontextid=None, geopoint=Row(lat=32.23915, lon=-109.77285), group=None, hasImage=False, hasMedia=False, highertaxon='animalia;arthropoda;megachilidae;megachilinae;megachilini', highestbiostratigraphiczone=None, individualcount=1.0, infraspecificepithet=None, institutioncode='amnh', institutionid=None, institutionname=None, kingdom='animalia', latestageorhigheststage=None, latesteonorhighesteonothem=None, latestepochorhighestseries=None, latesteraorhighesterathem=None, latestperiodorhighestsystem=None, lithostratigraphicterms=None, locality='4 mi e of willcox', lowestbiostratigraphiczone=None, maxdepth=None, maxelevation=None, member=None, mindepth=None, minelevation=None, municipality=None, occurrenceid='urn:uuid:854d29a6-d8e1-11e2-99a2-0026552be7ea', order='hymenoptera', phylum='arthropoda', recordnumber=None, recordset='8919571f-205a-4aed-b9f2-96ccd0108e4c', scientificname='megachile sidalceae', specificepithet='sidalceae', startdayofyear=244, stateprovince='arizona', taxonid='1336076', taxonomicstatus='accepted', taxonrank='species', typestatus='none', uuid='d1f8c8b5-1456-45b8-ba65-b30b731f90a4', verbatimeventdate=None, verbatimlocality=None, version=None, waterbody=None)]
In [116]:
df.printSchema()
root
|-- barcodevalue: string (nullable = true)
|-- basisofrecord: string (nullable = true)
|-- bed: string (nullable = true)
|-- canonicalname: string (nullable = true)
|-- catalognumber: string (nullable = true)
|-- class: string (nullable = true)
|-- collectioncode: string (nullable = true)
|-- collectionid: string (nullable = true)
|-- collectionname: string (nullable = true)
|-- collector: string (nullable = true)
|-- commonname: string (nullable = true)
|-- continent: string (nullable = true)
|-- coordinateuncertainty: float (nullable = true)
|-- country: string (nullable = true)
|-- countrycode: string (nullable = true)
|-- county: string (nullable = true)
|-- data: struct (nullable = true)
| |-- coreid: string (nullable = true)
| |-- dc:rights: string (nullable = true)
| |-- dcterms:accessRights: string (nullable = true)
| |-- dcterms:bibliographicCitation: string (nullable = true)
| |-- dcterms:language: string (nullable = true)
| |-- dcterms:license: string (nullable = true)
| |-- dcterms:modified: string (nullable = true)
| |-- dcterms:references: string (nullable = true)
| |-- dcterms:rights: string (nullable = true)
| |-- dcterms:rightsHolder: string (nullable = true)
| |-- dcterms:source: string (nullable = true)
| |-- dcterms:type: string (nullable = true)
| |-- dwc:VerbatimEventDate: string (nullable = true)
| |-- dwc:acceptedNameUsage: string (nullable = true)
| |-- dwc:accessRights: string (nullable = true)
| |-- dwc:associatedMedia: string (nullable = true)
| |-- dwc:associatedOccurrences: string (nullable = true)
| |-- dwc:associatedReferences: string (nullable = true)
| |-- dwc:associatedSequences: string (nullable = true)
| |-- dwc:associatedTaxa: string (nullable = true)
| |-- dwc:basisOfRecord: string (nullable = true)
| |-- dwc:bed: string (nullable = true)
| |-- dwc:behavior: string (nullable = true)
| |-- dwc:catalogNumber: string (nullable = true)
| |-- dwc:class: string (nullable = true)
| |-- dwc:collectionCode: string (nullable = true)
| |-- dwc:collectionID: string (nullable = true)
| |-- dwc:continent: string (nullable = true)
| |-- dwc:coordinatePrecision: string (nullable = true)
| |-- dwc:coordinateUncertaintyInMeters: string (nullable = true)
| |-- dwc:country: string (nullable = true)
| |-- dwc:countryCode: string (nullable = true)
| |-- dwc:county: string (nullable = true)
| |-- dwc:dataGeneralizations: string (nullable = true)
| |-- dwc:datasetID: string (nullable = true)
| |-- dwc:datasetName: string (nullable = true)
| |-- dwc:dateIdentified: string (nullable = true)
| |-- dwc:day: string (nullable = true)
| |-- dwc:decimalLatitude: string (nullable = true)
| |-- dwc:decimalLongitude: string (nullable = true)
| |-- dwc:disposition: string (nullable = true)
| |-- dwc:dynamicProperties: string (nullable = true)
| |-- dwc:earliestAgeOrLowestStage: string (nullable = true)
| |-- dwc:earliestEonOrLowestEonothem: string (nullable = true)
| |-- dwc:earliestEpochOrLowestSeries: string (nullable = true)
| |-- dwc:earliestEraOrLowestErathem: string (nullable = true)
| |-- dwc:earliestPeriodOrLowestSystem: string (nullable = true)
| |-- dwc:endDayOfYear: string (nullable = true)
| |-- dwc:establishmentMeans: string (nullable = true)
| |-- dwc:eventDate: string (nullable = true)
| |-- dwc:eventID: string (nullable = true)
| |-- dwc:eventRemarks: string (nullable = true)
| |-- dwc:eventTime: string (nullable = true)
| |-- dwc:family: string (nullable = true)
| |-- dwc:fieldNotes: string (nullable = true)
| |-- dwc:fieldNumber: string (nullable = true)
| |-- dwc:footprintSRS: string (nullable = true)
| |-- dwc:footprintSpatialFit: string (nullable = true)
| |-- dwc:footprintWKT: string (nullable = true)
| |-- dwc:formation: string (nullable = true)
| |-- dwc:genus: string (nullable = true)
| |-- dwc:geodeticDatum: string (nullable = true)
| |-- dwc:geologicalContextID: string (nullable = true)
| |-- dwc:georeferenceProtocol: string (nullable = true)
| |-- dwc:georeferenceRemarks: string (nullable = true)
| |-- dwc:georeferenceSources: string (nullable = true)
| |-- dwc:georeferenceVerificationStatus: string (nullable = true)
| |-- dwc:georeferencedBy: string (nullable = true)
| |-- dwc:georeferencedDate: string (nullable = true)
| |-- dwc:group: string (nullable = true)
| |-- dwc:habitat: string (nullable = true)
| |-- dwc:higherClassification: string (nullable = true)
| |-- dwc:higherGeography: string (nullable = true)
| |-- dwc:higherGeographyID: string (nullable = true)
| |-- dwc:highestBiostratigraphicZone: string (nullable = true)
| |-- dwc:identificationID: string (nullable = true)
| |-- dwc:identificationQualifier: string (nullable = true)
| |-- dwc:identificationReferences: string (nullable = true)
| |-- dwc:identificationRemarks: string (nullable = true)
| |-- dwc:identificationVerificationStatus: string (nullable = true)
| |-- dwc:identifiedBy: string (nullable = true)
| |-- dwc:individualCount: string (nullable = true)
| |-- dwc:informationWithheld: string (nullable = true)
| |-- dwc:infraspecificEpithet: string (nullable = true)
| |-- dwc:institutionCode: string (nullable = true)
| |-- dwc:institutionID: string (nullable = true)
| |-- dwc:island: string (nullable = true)
| |-- dwc:islandGroup: string (nullable = true)
| |-- dwc:kingdom: string (nullable = true)
| |-- dwc:latestAgeOrHighestStage: string (nullable = true)
| |-- dwc:latestEonOrHighestEonothem: string (nullable = true)
| |-- dwc:latestEpochOrHighestSeries: string (nullable = true)
| |-- dwc:latestEraOrHighestErathem: string (nullable = true)
| |-- dwc:latestPeriodOrHighestSystem: string (nullable = true)
| |-- dwc:lifeStage: string (nullable = true)
| |-- dwc:lithostratigraphicTerms: string (nullable = true)
| |-- dwc:locality: string (nullable = true)
| |-- dwc:locationAccordingTo: string (nullable = true)
| |-- dwc:locationID: string (nullable = true)
| |-- dwc:locationRemarks: string (nullable = true)
| |-- dwc:lowestBiostratigraphicZone: string (nullable = true)
| |-- dwc:materialSampleID: string (nullable = true)
| |-- dwc:maximumDepthInMeters: string (nullable = true)
| |-- dwc:maximumElevationInMeters: string (nullable = true)
| |-- dwc:member: string (nullable = true)
| |-- dwc:minimumDepthInMeters: string (nullable = true)
| |-- dwc:minimumElevationInMeters: string (nullable = true)
| |-- dwc:month: string (nullable = true)
| |-- dwc:municipality: string (nullable = true)
| |-- dwc:nameAccordingTo: string (nullable = true)
| |-- dwc:nameAccordingToID: string (nullable = true)
| |-- dwc:namePublishedIn: string (nullable = true)
| |-- dwc:namePublishedInID: string (nullable = true)
| |-- dwc:namePublishedInYear: string (nullable = true)
| |-- dwc:nomenclaturalCode: string (nullable = true)
| |-- dwc:nomenclaturalStatus: string (nullable = true)
| |-- dwc:occurrenceDetails: string (nullable = true)
| |-- dwc:occurrenceID: string (nullable = true)
| |-- dwc:occurrenceRemarks: string (nullable = true)
| |-- dwc:occurrenceStatus: string (nullable = true)
| |-- dwc:order: string (nullable = true)
| |-- dwc:organismID: string (nullable = true)
| |-- dwc:organismName: string (nullable = true)
| |-- dwc:organismQuantity: string (nullable = true)
| |-- dwc:organismQuantityType: string (nullable = true)
| |-- dwc:originalNameUsage: string (nullable = true)
| |-- dwc:otherCatalogNumbers: string (nullable = true)
| |-- dwc:ownerInstitutionCode: string (nullable = true)
| |-- dwc:parentNameUsage: string (nullable = true)
| |-- dwc:phylum: string (nullable = true)
| |-- dwc:pointRadiusSpatialFit: string (nullable = true)
| |-- dwc:preparations: string (nullable = true)
| |-- dwc:previousIdentifications: string (nullable = true)
| |-- dwc:recordNumber: string (nullable = true)
| |-- dwc:recordedBy: string (nullable = true)
| |-- dwc:reproductiveCondition: string (nullable = true)
| |-- dwc:rights: string (nullable = true)
| |-- dwc:rightsHolder: string (nullable = true)
| |-- dwc:samplingEffort: string (nullable = true)
| |-- dwc:samplingProtocol: string (nullable = true)
| |-- dwc:scientificName: string (nullable = true)
| |-- dwc:scientificNameAuthorship: string (nullable = true)
| |-- dwc:scientificNameID: string (nullable = true)
| |-- dwc:sex: string (nullable = true)
| |-- dwc:specificEpithet: string (nullable = true)
| |-- dwc:startDayOfYear: string (nullable = true)
| |-- dwc:stateProvince: string (nullable = true)
| |-- dwc:subgenus: string (nullable = true)
| |-- dwc:taxonID: string (nullable = true)
| |-- dwc:taxonRank: string (nullable = true)
| |-- dwc:taxonRemarks: string (nullable = true)
| |-- dwc:taxonomicStatus: string (nullable = true)
| |-- dwc:typeStatus: string (nullable = true)
| |-- dwc:verbatimCoordinateSystem: string (nullable = true)
| |-- dwc:verbatimCoordinates: string (nullable = true)
| |-- dwc:verbatimDepth: string (nullable = true)
| |-- dwc:verbatimElevation: string (nullable = true)
| |-- dwc:verbatimEventDate: string (nullable = true)
| |-- dwc:verbatimLatitude: string (nullable = true)
| |-- dwc:verbatimLocality: string (nullable = true)
| |-- dwc:verbatimLongitude: string (nullable = true)
| |-- dwc:verbatimSRS: string (nullable = true)
| |-- dwc:verbatimTaxonRank: string (nullable = true)
| |-- dwc:vernacularName: string (nullable = true)
| |-- dwc:waterBody: string (nullable = true)
| |-- dwc:year: string (nullable = true)
| |-- fcc:datePicked: string (nullable = true)
| |-- fcc:pickedBy: string (nullable = true)
| |-- id: string (nullable = true)
| |-- idigbio:preservative: string (nullable = true)
| |-- idigbio:recordId: string (nullable = true)
| |-- idigbio:subfamily: string (nullable = true)
| |-- idigbio:substrate: string (nullable = true)
| |-- idigbio:superfamily: string (nullable = true)
| |-- symbiota:recordEnteredBy: string (nullable = true)
| |-- symbiota:verbatimScientificName: string (nullable = true)
|-- datecollected: timestamp (nullable = true)
|-- datemodified: timestamp (nullable = true)
|-- dqs: float (nullable = true)
|-- earliestageorloweststage: string (nullable = true)
|-- earliesteonorlowesteonothem: string (nullable = true)
|-- earliestepochorlowestseries: string (nullable = true)
|-- earliesteraorlowesterathem: string (nullable = true)
|-- earliestperiodorlowestsystem: string (nullable = true)
|-- etag: string (nullable = true)
|-- eventdate: string (nullable = true)
|-- family: string (nullable = true)
|-- fieldnumber: string (nullable = true)
|-- formation: string (nullable = true)
|-- genus: string (nullable = true)
|-- geologicalcontextid: string (nullable = true)
|-- geopoint: struct (nullable = true)
| |-- lat: double (nullable = true)
| |-- lon: double (nullable = true)
|-- group: string (nullable = true)
|-- hasImage: boolean (nullable = true)
|-- hasMedia: boolean (nullable = true)
|-- highertaxon: string (nullable = true)
|-- highestbiostratigraphiczone: string (nullable = true)
|-- individualcount: float (nullable = true)
|-- infraspecificepithet: string (nullable = true)
|-- institutioncode: string (nullable = true)
|-- institutionid: string (nullable = true)
|-- institutionname: string (nullable = true)
|-- kingdom: string (nullable = true)
|-- latestageorhigheststage: string (nullable = true)
|-- latesteonorhighesteonothem: string (nullable = true)
|-- latestepochorhighestseries: string (nullable = true)
|-- latesteraorhighesterathem: string (nullable = true)
|-- latestperiodorhighestsystem: string (nullable = true)
|-- lithostratigraphicterms: string (nullable = true)
|-- locality: string (nullable = true)
|-- lowestbiostratigraphiczone: string (nullable = true)
|-- maxdepth: float (nullable = true)
|-- maxelevation: float (nullable = true)
|-- member: string (nullable = true)
|-- mindepth: float (nullable = true)
|-- minelevation: float (nullable = true)
|-- municipality: string (nullable = true)
|-- occurrenceid: string (nullable = true)
|-- order: string (nullable = true)
|-- phylum: string (nullable = true)
|-- recordnumber: string (nullable = true)
|-- recordset: string (nullable = true)
|-- scientificname: string (nullable = true)
|-- specificepithet: string (nullable = true)
|-- startdayofyear: integer (nullable = true)
|-- stateprovince: string (nullable = true)
|-- taxonid: string (nullable = true)
|-- taxonomicstatus: string (nullable = true)
|-- taxonrank: string (nullable = true)
|-- typestatus: string (nullable = true)
|-- uuid: string (nullable = true)
|-- verbatimeventdate: string (nullable = true)
|-- verbatimlocality: string (nullable = true)
|-- version: integer (nullable = true)
|-- waterbody: string (nullable = true)
In [117]:
from pyspark.sql.types import *
def mk_field_set_from_df(df_schema):
"""Create a set of fieldname::type strings from a df schema"""
field_set = set()
prefix = ""
for i in df_schema:
#print(i)
t = type(i.dataType)
if t is not StringType and \
t is not FloatType and \
t is not IntegerType and \
t is not TimestampType and \
t is not BooleanType:
#print(i.dataType)
prefix = i.name
# This only accommodates one level of nesting which is
# consistant with our parquet-building technique. The
# string manipulation on type makes it consistant with
# the JSON returned by the API.
for j in i.dataType:
field_set.add(("{0}.{1}::{2}".format(prefix, j.name,
str(j.dataType)[:-4].lower())))
else:
field_set.add(("{0}::{1}".format(i.name,
str(i.dataType)[:-4].lower())))
return field_set
df_field_set = mk_field_set_from_df(df.schema)
#print(df_field_set)
In [118]:
import requests
def mk_field_set_from_api():
"""Create a set of fieldname::type strings from the meta API endpoint"""
meta_fields_records = (requests
.get("http://search.idigbio.org/v2/meta/fields/records")
.json()
)
field_set = set()
for k,v in meta_fields_records.items():
if k == "data":
for kd,vd in v.items():
if vd.get("fieldName", False):
field_set.add("{0}::{1}".format(vd["fieldName"],
vd["type"]))
elif v.get("fieldName", False):
field_set.add("{0}::{1}".format(v["fieldName"], v["type"]))
else:
# non-data nested structures, only add top level key
field_set.add("{0}::COMPLEX".format(k))
return field_set
meta_field_set = mk_field_set_from_api()
#print(meta_field_set)
In [119]:
# The loaded df is a working data frame of course, what fields are in the
# meta endpoint that the df does not have? Are those fields currently
# excluded in the parquet generation code or has Alex snuck in new
# structures lately?
#
# Note that ES calls things strings if they're lists of strings so even if
# something doesn't show up here, watch for scaler -> vector changes in the
# way data is represented in a field.
print("In meta endpoint, not df")
print("\n".join(sorted(meta_field_set - df_field_set)))
print("\nIn df, not meta endpoint")
print("\n".join(sorted(df_field_set - meta_field_set)))
In meta endpoint, not df
associatedsequences::string
commonnames::string
datecollected::date
datemodified::date
flags::string
geopoint::geo_point
indexData::COMPLEX
mediarecords::string
recordids::string
In df, not meta endpoint
datecollected::timestamp
datemodified::timestamp
geopoint.lat::double
geopoint.lon::double
Content source: bio-guoda/guoda-datasets
Similar notebooks: