Unique Value Code

With the ES generated iDigBio dataframe, build a text list of all the unique values in each field.


In [1]:
import re
idb_df_version = "20161119"  # Hardcoded version of the idb parquet to use

In [2]:
df = sqlContext.read.load("/guoda/data/idigbio-{0}.parquet".format(idb_df_version))

Small subset

Start by making a small selection to work with


In [3]:
small_df = (df
            .where(df["stateprovince"] == "vermont")
            .where(df["genus"] == "acer")
            )
print(small_df.count())
small_df.printSchema()


447
root
 |-- barcodevalue: string (nullable = true)
 |-- basisofrecord: string (nullable = true)
 |-- bed: string (nullable = true)
 |-- canonicalname: string (nullable = true)
 |-- catalognumber: string (nullable = true)
 |-- class: string (nullable = true)
 |-- collectioncode: string (nullable = true)
 |-- collectionid: string (nullable = true)
 |-- collectionname: string (nullable = true)
 |-- collector: string (nullable = true)
 |-- commonname: string (nullable = true)
 |-- continent: string (nullable = true)
 |-- coordinateuncertainty: float (nullable = true)
 |-- country: string (nullable = true)
 |-- countrycode: string (nullable = true)
 |-- county: string (nullable = true)
 |-- data: struct (nullable = true)
 |    |-- coreid: string (nullable = true)
 |    |-- dc:rights: string (nullable = true)
 |    |-- dcterms:accessRights: string (nullable = true)
 |    |-- dcterms:bibliographicCitation: string (nullable = true)
 |    |-- dcterms:language: string (nullable = true)
 |    |-- dcterms:license: string (nullable = true)
 |    |-- dcterms:modified: string (nullable = true)
 |    |-- dcterms:references: string (nullable = true)
 |    |-- dcterms:rights: string (nullable = true)
 |    |-- dcterms:rightsHolder: string (nullable = true)
 |    |-- dcterms:source: string (nullable = true)
 |    |-- dcterms:type: string (nullable = true)
 |    |-- dwc:VerbatimEventDate: string (nullable = true)
 |    |-- dwc:acceptedNameUsage: string (nullable = true)
 |    |-- dwc:accessRights: string (nullable = true)
 |    |-- dwc:associatedMedia: string (nullable = true)
 |    |-- dwc:associatedOccurrences: string (nullable = true)
 |    |-- dwc:associatedReferences: string (nullable = true)
 |    |-- dwc:associatedSequences: string (nullable = true)
 |    |-- dwc:associatedTaxa: string (nullable = true)
 |    |-- dwc:basisOfRecord: string (nullable = true)
 |    |-- dwc:bed: string (nullable = true)
 |    |-- dwc:behavior: string (nullable = true)
 |    |-- dwc:catalogNumber: string (nullable = true)
 |    |-- dwc:class: string (nullable = true)
 |    |-- dwc:collectionCode: string (nullable = true)
 |    |-- dwc:collectionID: string (nullable = true)
 |    |-- dwc:continent: string (nullable = true)
 |    |-- dwc:coordinatePrecision: string (nullable = true)
 |    |-- dwc:coordinateUncertaintyInMeters: string (nullable = true)
 |    |-- dwc:country: string (nullable = true)
 |    |-- dwc:countryCode: string (nullable = true)
 |    |-- dwc:county: string (nullable = true)
 |    |-- dwc:dataGeneralizations: string (nullable = true)
 |    |-- dwc:datasetID: string (nullable = true)
 |    |-- dwc:datasetName: string (nullable = true)
 |    |-- dwc:dateIdentified: string (nullable = true)
 |    |-- dwc:day: string (nullable = true)
 |    |-- dwc:decimalLatitude: string (nullable = true)
 |    |-- dwc:decimalLongitude: string (nullable = true)
 |    |-- dwc:disposition: string (nullable = true)
 |    |-- dwc:dynamicProperties: string (nullable = true)
 |    |-- dwc:earliestAgeOrLowestStage: string (nullable = true)
 |    |-- dwc:earliestEonOrLowestEonothem: string (nullable = true)
 |    |-- dwc:earliestEpochOrLowestSeries: string (nullable = true)
 |    |-- dwc:earliestEraOrLowestErathem: string (nullable = true)
 |    |-- dwc:earliestPeriodOrLowestSystem: string (nullable = true)
 |    |-- dwc:endDayOfYear: string (nullable = true)
 |    |-- dwc:establishmentMeans: string (nullable = true)
 |    |-- dwc:eventDate: string (nullable = true)
 |    |-- dwc:eventID: string (nullable = true)
 |    |-- dwc:eventRemarks: string (nullable = true)
 |    |-- dwc:eventTime: string (nullable = true)
 |    |-- dwc:family: string (nullable = true)
 |    |-- dwc:fieldNotes: string (nullable = true)
 |    |-- dwc:fieldNumber: string (nullable = true)
 |    |-- dwc:footprintWKT: string (nullable = true)
 |    |-- dwc:formation: string (nullable = true)
 |    |-- dwc:genus: string (nullable = true)
 |    |-- dwc:geodeticDatum: string (nullable = true)
 |    |-- dwc:geologicalContextID: string (nullable = true)
 |    |-- dwc:georeferenceProtocol: string (nullable = true)
 |    |-- dwc:georeferenceRemarks: string (nullable = true)
 |    |-- dwc:georeferenceSources: string (nullable = true)
 |    |-- dwc:georeferenceVerificationStatus: string (nullable = true)
 |    |-- dwc:georeferencedBy: string (nullable = true)
 |    |-- dwc:georeferencedDate: string (nullable = true)
 |    |-- dwc:group: string (nullable = true)
 |    |-- dwc:habitat: string (nullable = true)
 |    |-- dwc:higherClassification: string (nullable = true)
 |    |-- dwc:higherGeography: string (nullable = true)
 |    |-- dwc:higherGeographyID: string (nullable = true)
 |    |-- dwc:highestBiostratigraphicZone: string (nullable = true)
 |    |-- dwc:identificationID: string (nullable = true)
 |    |-- dwc:identificationQualifier: string (nullable = true)
 |    |-- dwc:identificationReferences: string (nullable = true)
 |    |-- dwc:identificationRemarks: string (nullable = true)
 |    |-- dwc:identificationVerificationStatus: string (nullable = true)
 |    |-- dwc:identifiedBy: string (nullable = true)
 |    |-- dwc:individualCount: string (nullable = true)
 |    |-- dwc:individualID: string (nullable = true)
 |    |-- dwc:informationWithheld: string (nullable = true)
 |    |-- dwc:infraspecificEpithet: string (nullable = true)
 |    |-- dwc:institutionCode: string (nullable = true)
 |    |-- dwc:institutionID: string (nullable = true)
 |    |-- dwc:island: string (nullable = true)
 |    |-- dwc:islandGroup: string (nullable = true)
 |    |-- dwc:kingdom: string (nullable = true)
 |    |-- dwc:latestAgeOrHighestStage: string (nullable = true)
 |    |-- dwc:latestEonOrHighestEonothem: string (nullable = true)
 |    |-- dwc:latestEpochOrHighestSeries: string (nullable = true)
 |    |-- dwc:latestEraOrHighestErathem: string (nullable = true)
 |    |-- dwc:latestPeriodOrHighestSystem: string (nullable = true)
 |    |-- dwc:lifeStage: string (nullable = true)
 |    |-- dwc:lithostratigraphicTerms: string (nullable = true)
 |    |-- dwc:locality: string (nullable = true)
 |    |-- dwc:locationAccordingTo: string (nullable = true)
 |    |-- dwc:locationID: string (nullable = true)
 |    |-- dwc:locationRemarks: string (nullable = true)
 |    |-- dwc:lowestBiostratigraphicZone: string (nullable = true)
 |    |-- dwc:maximumDepthInMeters: string (nullable = true)
 |    |-- dwc:maximumElevationInMeters: string (nullable = true)
 |    |-- dwc:member: string (nullable = true)
 |    |-- dwc:minimumDepthInMeters: string (nullable = true)
 |    |-- dwc:minimumElevationInMeters: string (nullable = true)
 |    |-- dwc:month: string (nullable = true)
 |    |-- dwc:municipality: string (nullable = true)
 |    |-- dwc:nameAccordingTo: string (nullable = true)
 |    |-- dwc:nameAccordingToID: string (nullable = true)
 |    |-- dwc:namePublishedIn: string (nullable = true)
 |    |-- dwc:namePublishedInID: string (nullable = true)
 |    |-- dwc:namePublishedInYear: string (nullable = true)
 |    |-- dwc:nomenclaturalCode: string (nullable = true)
 |    |-- dwc:nomenclaturalStatus: string (nullable = true)
 |    |-- dwc:occurrenceDetails: string (nullable = true)
 |    |-- dwc:occurrenceID: string (nullable = true)
 |    |-- dwc:occurrenceRemarks: string (nullable = true)
 |    |-- dwc:occurrenceStatus: string (nullable = true)
 |    |-- dwc:order: string (nullable = true)
 |    |-- dwc:organismID: string (nullable = true)
 |    |-- dwc:originalNameUsage: string (nullable = true)
 |    |-- dwc:otherCatalogNumbers: string (nullable = true)
 |    |-- dwc:ownerInstitutionCode: string (nullable = true)
 |    |-- dwc:parentNameUsage: string (nullable = true)
 |    |-- dwc:phylum: string (nullable = true)
 |    |-- dwc:preparations: string (nullable = true)
 |    |-- dwc:previousIdentifications: string (nullable = true)
 |    |-- dwc:recordNumber: string (nullable = true)
 |    |-- dwc:recordedBy: string (nullable = true)
 |    |-- dwc:reproductiveCondition: string (nullable = true)
 |    |-- dwc:rights: string (nullable = true)
 |    |-- dwc:rightsHolder: string (nullable = true)
 |    |-- dwc:samplingEffort: string (nullable = true)
 |    |-- dwc:samplingProtocol: string (nullable = true)
 |    |-- dwc:scientificName: string (nullable = true)
 |    |-- dwc:scientificNameAuthorship: string (nullable = true)
 |    |-- dwc:scientificNameID: string (nullable = true)
 |    |-- dwc:sex: string (nullable = true)
 |    |-- dwc:specificEpithet: string (nullable = true)
 |    |-- dwc:startDayOfYear: string (nullable = true)
 |    |-- dwc:stateProvince: string (nullable = true)
 |    |-- dwc:subgenus: string (nullable = true)
 |    |-- dwc:taxonID: string (nullable = true)
 |    |-- dwc:taxonRank: string (nullable = true)
 |    |-- dwc:taxonRemarks: string (nullable = true)
 |    |-- dwc:taxonomicStatus: string (nullable = true)
 |    |-- dwc:typeStatus: string (nullable = true)
 |    |-- dwc:verbatimCoordinateSystem: string (nullable = true)
 |    |-- dwc:verbatimCoordinates: string (nullable = true)
 |    |-- dwc:verbatimDepth: string (nullable = true)
 |    |-- dwc:verbatimElevation: string (nullable = true)
 |    |-- dwc:verbatimEventDate: string (nullable = true)
 |    |-- dwc:verbatimLatitude: string (nullable = true)
 |    |-- dwc:verbatimLocality: string (nullable = true)
 |    |-- dwc:verbatimLongitude: string (nullable = true)
 |    |-- dwc:verbatimSRS: string (nullable = true)
 |    |-- dwc:verbatimTaxonRank: string (nullable = true)
 |    |-- dwc:vernacularName: string (nullable = true)
 |    |-- dwc:waterBody: string (nullable = true)
 |    |-- dwc:year: string (nullable = true)
 |    |-- fcc:datePicked: string (nullable = true)
 |    |-- fcc:pickedBy: string (nullable = true)
 |    |-- id: string (nullable = true)
 |    |-- idigbio:preservative: string (nullable = true)
 |    |-- idigbio:recordId: string (nullable = true)
 |    |-- idigbio:subfamily: string (nullable = true)
 |    |-- idigbio:substrate: string (nullable = true)
 |    |-- idigbio:superfamily: string (nullable = true)
 |    |-- symbiota:recordEnteredBy: string (nullable = true)
 |    |-- symbiota:verbatimScientificName: string (nullable = true)
 |-- datecollected: timestamp (nullable = true)
 |-- datemodified: timestamp (nullable = true)
 |-- dqs: float (nullable = true)
 |-- earliestageorloweststage: string (nullable = true)
 |-- earliesteonorlowesteonothem: string (nullable = true)
 |-- earliestepochorlowestseries: string (nullable = true)
 |-- earliesteraorlowesterathem: string (nullable = true)
 |-- earliestperiodorlowestsystem: string (nullable = true)
 |-- etag: string (nullable = true)
 |-- eventdate: string (nullable = true)
 |-- family: string (nullable = true)
 |-- fieldnumber: string (nullable = true)
 |-- formation: string (nullable = true)
 |-- genus: string (nullable = true)
 |-- geologicalcontextid: string (nullable = true)
 |-- geopoint: struct (nullable = true)
 |    |-- lat: double (nullable = true)
 |    |-- lon: double (nullable = true)
 |-- group: string (nullable = true)
 |-- hasImage: boolean (nullable = true)
 |-- hasMedia: boolean (nullable = true)
 |-- highertaxon: string (nullable = true)
 |-- highestbiostratigraphiczone: string (nullable = true)
 |-- individualcount: float (nullable = true)
 |-- infraspecificepithet: string (nullable = true)
 |-- institutioncode: string (nullable = true)
 |-- institutionid: string (nullable = true)
 |-- institutionname: string (nullable = true)
 |-- kingdom: string (nullable = true)
 |-- latestageorhigheststage: string (nullable = true)
 |-- latesteonorhighesteonothem: string (nullable = true)
 |-- latestepochorhighestseries: string (nullable = true)
 |-- latesteraorhighesterathem: string (nullable = true)
 |-- latestperiodorhighestsystem: string (nullable = true)
 |-- lithostratigraphicterms: string (nullable = true)
 |-- locality: string (nullable = true)
 |-- lowestbiostratigraphiczone: string (nullable = true)
 |-- maxdepth: float (nullable = true)
 |-- maxelevation: float (nullable = true)
 |-- member: string (nullable = true)
 |-- mindepth: float (nullable = true)
 |-- minelevation: float (nullable = true)
 |-- municipality: string (nullable = true)
 |-- occurrenceid: string (nullable = true)
 |-- order: string (nullable = true)
 |-- phylum: string (nullable = true)
 |-- recordnumber: string (nullable = true)
 |-- recordset: string (nullable = true)
 |-- scientificname: string (nullable = true)
 |-- specificepithet: string (nullable = true)
 |-- startdayofyear: integer (nullable = true)
 |-- stateprovince: string (nullable = true)
 |-- taxonid: string (nullable = true)
 |-- taxonomicstatus: string (nullable = true)
 |-- taxonrank: string (nullable = true)
 |-- typestatus: string (nullable = true)
 |-- uuid: string (nullable = true)
 |-- verbatimeventdate: string (nullable = true)
 |-- verbatimlocality: string (nullable = true)
 |-- version: integer (nullable = true)
 |-- waterbody: string (nullable = true)

Test set of fields to do this for

Set up the fields to iterate over to generate a file per field. Hard code for now. Just a few to make sure the summarization process works.


In [4]:
fields = ["stateprovince", "specificepithet"]
#, "data.dwc:specificepithet"]

In [5]:
p = re.compile('[\W_]+')
for field in fields:
    slug = p.sub("_", field)
    output_fn = "idigbio-{0}-unique-{1}".format(idb_df_version, slug)
    (small_df
     .groupBy(df[field])
     .count()
     .write
     .format("com.databricks.spark.csv")
     .mode("overwrite")
     .option("header", "false")
     .save("/outputs/{0}.csv".format(output_fn))
    )

Looks like the nested fields are pretty killer. They take a lot longer than the top level ones and it GC memory killed Spark in this notebook even for data.dwc:genus which should be smallish (50k).

Changed small_df to acer in vermont instead of vermont and the test summary to specificepithet.

Doesn't seem to matter, eats all memory even when the small dataset is 477 records.

Building field list to iterate over

Now build up the full list of fields, in corperate this into the real job


In [20]:
field_set = set()
for s in small_df.schema:
    #print(s.dataType)
    #if str(s.dataType) in ["StringType", "FloatType", "TimestampType", "InegertType", "BooleanType", "DoubleType"]:
    if not str(s.dataType).startswith("StructType"):
        field_set.add(s.name)
    else:
        for sub in s.dataType:
            field_set.add(".".join([s.name, sub.name]))
print(field_set)
print(len(field_set))


{'data.dwc:georeferenceVerificationStatus', 'data.dwc:earliestEraOrLowestErathem', 'data.dcterms:references', 'data.dwc:informationWithheld', 'data.dwc:verbatimElevation', 'data.dwc:geologicalContextID', 'data.dwc:phylum', 'data.dwc:coordinatePrecision', 'data.dwc:samplingEffort', 'genus', 'data.dwc:identificationRemarks', 'data.dwc:minimumDepthInMeters', 'data.dcterms:language', 'data.dwc:locationRemarks', 'data.dwc:originalNameUsage', 'datemodified', 'taxonid', 'data.dwc:verbatimTaxonRank', 'data.dwc:member', 'data.dwc:eventTime', 'data.dwc:typeStatus', 'data.dwc:accessRights', 'data.dwc:establishmentMeans', 'data.dwc:previousIdentifications', 'typestatus', 'data.dwc:collectionID', 'data.dwc:eventDate', 'data.dwc:georeferenceRemarks', 'data.dcterms:rights', 'data.dwc:behavior', 'data.dwc:bed', 'data.dwc:latestEonOrHighestEonothem', 'formation', 'data.dwc:class', 'data.dwc:earliestPeriodOrLowestSystem', 'catalognumber', 'data.dwc:associatedReferences', 'data.fcc:datePicked', 'data.dwc:nomenclaturalStatus', 'data.dcterms:type', 'institutionname', 'data.dwc:taxonRemarks', 'data.dwc:verbatimLongitude', 'data.dwc:locationID', 'data.idigbio:preservative', 'data.dwc:kingdom', 'data.dwc:preparations', 'data.dwc:highestBiostratigraphicZone', 'lithostratigraphicterms', 'data.dwc:samplingProtocol', 'data.dwc:waterBody', 'order', 'data.coreid', 'canonicalname', 'data.dwc:acceptedNameUsage', 'data.dwc:decimalLatitude', 'data.dwc:footprintWKT', 'data.dwc:associatedSequences', 'data.dwc:maximumElevationInMeters', 'data.dwc:georeferenceProtocol', 'data.dwc:day', 'data.dwc:namePublishedInYear', 'data.dwc:otherCatalogNumbers', 'data.id', 'data.dwc:occurrenceDetails', 'data.dwc:county', 'data.dwc:earliestEonOrLowestEonothem', 'maxdepth', 'data.dwc:startDayOfYear', 'country', 'data.dc:rights', 'data.dwc:recordedBy', 'data.symbiota:recordEnteredBy', 'data.dwc:basisOfRecord', 'data.dwc:identifiedBy', 'locality', 'recordnumber', 'data.dwc:verbatimCoordinateSystem', 'geopoint.lon', 'data.dwc:group', 'verbatimeventdate', 'data.dwc:latestAgeOrHighestStage', 'dqs', 'data.dwc:vernacularName', 'institutioncode', 'data.dwc:maximumDepthInMeters', 'data.dwc:infraspecificEpithet', 'data.dwc:month', 'collector', 'data.idigbio:substrate', 'data.dcterms:modified', 'data.dwc:identificationID', 'data.dwc:minimumElevationInMeters', 'etag', 'data.dwc:datasetName', 'data.dwc:taxonRank', 'data.dwc:georeferenceSources', 'countrycode', 'data.dwc:recordNumber', 'basisofrecord', 'data.dwc:locality', 'coordinateuncertainty', 'waterbody', 'data.dwc:higherGeography', 'data.dcterms:accessRights', 'data.dwc:associatedTaxa', 'data.dwc:island', 'earliestperiodorlowestsystem', 'latestperiodorhighestsystem', 'data.dwc:rights', 'data.dwc:verbatimSRS', 'data.dcterms:source', 'data.dwc:individualCount', 'institutionid', 'data.dwc:verbatimLatitude', 'data.dwc:year', 'data.dwc:eventRemarks', 'family', 'data.dwc:latestEpochOrHighestSeries', 'latesteraorhighesterathem', 'minelevation', 'data.dwc:catalogNumber', 'data.dwc:institutionCode', 'data.dwc:associatedMedia', 'data.dwc:specificEpithet', 'collectionid', 'data.dwc:higherClassification', 'data.dwc:identificationVerificationStatus', 'commonname', 'data.dwc:coordinateUncertaintyInMeters', 'data.dwc:islandGroup', 'data.dwc:scientificNameID', 'specificepithet', 'earliestageorloweststage', 'phylum', 'data.dcterms:rightsHolder', 'data.dwc:occurrenceRemarks', 'data.dwc:organismID', 'data.dwc:family', 'collectioncode', 'data.dwc:verbatimEventDate', 'data.dwc:ownerInstitutionCode', 'data.dwc:verbatimDepth', 'data.fcc:pickedBy', 'recordset', 'scientificname', 'earliesteonorlowesteonothem', 'data.dwc:nameAccordingTo', 'datecollected', 'startdayofyear', 'lowestbiostratigraphiczone', 'hasMedia', 'data.dwc:continent', 'data.dwc:associatedOccurrences', 'data.dwc:decimalLongitude', 'data.dwc:municipality', 'data.dwc:individualID', 'data.dwc:lowestBiostratigraphicZone', 'data.dcterms:license', 'data.idigbio:subfamily', 'data.dwc:sex', 'data.dcterms:bibliographicCitation', 'data.dwc:fieldNumber', 'data.dwc:namePublishedIn', 'mindepth', 'latestageorhigheststage', 'data.dwc:stateProvince', 'data.dwc:eventID', 'taxonrank', 'data.dwc:country', 'barcodevalue', 'data.symbiota:verbatimScientificName', 'bed', 'data.dwc:endDayOfYear', 'data.dwc:habitat', 'data.dwc:occurrenceStatus', 'latesteonorhighesteonothem', 'stateprovince', 'data.dwc:earliestEpochOrLowestSeries', 'data.idigbio:recordId', 'data.dwc:geodeticDatum', 'highertaxon', 'group', 'data.dwc:nameAccordingToID', 'municipality', 'data.dwc:dateIdentified', 'data.dwc:scientificName', 'geologicalcontextid', 'member', 'infraspecificepithet', 'data.dwc:genus', 'data.dwc:occurrenceID', 'data.dwc:fieldNotes', 'maxelevation', 'data.dwc:countryCode', 'data.dwc:VerbatimEventDate', 'county', 'data.dwc:locationAccordingTo', 'data.dwc:georeferencedDate', 'data.dwc:scientificNameAuthorship', 'data.dwc:verbatimLocality', 'data.dwc:reproductiveCondition', 'data.dwc:verbatimCoordinates', 'data.dwc:lifeStage', 'geopoint.lat', 'kingdom', 'data.dwc:namePublishedInID', 'data.dwc:formation', 'uuid', 'data.dwc:datasetID', 'data.dwc:order', 'data.dwc:dynamicProperties', 'data.dwc:georeferencedBy', 'data.dwc:nomenclaturalCode', 'data.dwc:taxonomicStatus', 'hasImage', 'individualcount', 'data.dwc:latestPeriodOrHighestSystem', 'data.dwc:disposition', 'latestepochorhighestseries', 'data.dwc:subgenus', 'data.dwc:lithostratigraphicTerms', 'continent', 'eventdate', 'verbatimlocality', 'version', 'highestbiostratigraphiczone', 'taxonomicstatus', 'earliestepochorlowestseries', 'data.dwc:higherGeographyID', 'fieldnumber', 'data.dwc:taxonID', 'data.dwc:identificationQualifier', 'occurrenceid', 'class', 'data.dwc:dataGeneralizations', 'data.dwc:identificationReferences', 'data.dwc:institutionID', 'data.idigbio:superfamily', 'collectionname', 'earliesteraorlowesterathem', 'data.dwc:parentNameUsage', 'data.dwc:rightsHolder', 'data.dwc:earliestAgeOrLowestStage', 'data.dwc:latestEraOrHighestErathem', 'data.dwc:collectionCode'}
246

In [ ]: