Unique collection codes — extract the distinct `collectionCode` values from the iDigBio occurrence dump and write them to a text file


In [14]:
from __future__ import print_function
import os
from pyspark import SQLContext

In [42]:
# Load the iDigBio occurrence dump (Parquet) into a Spark DataFrame.
# NOTE(review): `sc` is assumed to be a SparkContext provided by the kernel.
parquet_path = "../data/idigbio/occurrence.txt.parquet"
#parquet_path = "../data/idigbio-100k/occurrence.txt.parquet"  # 100k sample for quick iteration
sqlContext = SQLContext(sc)
idbdf = sqlContext.read.parquet(parquet_path)
#print(idbdf.schema)

In [43]:
# Project just the Darwin Core collectionCode term (the full column name is
# the term URI, so it needs backtick quoting) and deduplicate the values.
collection_code_col = idbdf['`http://rs.tdwg.org/dwc/terms/collectionCode`']
col_codes = idbdf.select(collection_code_col.alias('collectionCode')).distinct()

In [45]:
# Spark's DataFrameWriter.text chokes on these values, so collect the
# (small — see the distinct count below) set to the driver and write it
# out ourselves.
#col_codes.write.text("../data/unique_collection_codes.txt")

import io

print(idbdf.count())
print(col_codes.count())

# Use io.open with an explicit UTF-8 encoding: a plain open(..., 'w') in
# Python 2 text mode raises UnicodeEncodeError for non-ASCII collection
# codes, which is the unicode unhappiness noted above. io.open behaves the
# same on Python 2 and 3.
with io.open("../data/unique_collection_codes.txt", "w", encoding="utf-8") as f:
    for r in col_codes.collect():
        code = r["collectionCode"]
        if code is not None:
            f.write(code + u"\n")


15457224
144

In [ ]: