In [1]:
%matplotlib inline
from __future__ import print_function
import os
from pyspark import SQLContext
from pyspark.sql import Row
import pyspark.sql.functions as sql
import pyspark.sql.types as types
#from pyspark.sql.functions import udf, length
import matplotlib.pyplot as plt
import numpy
import math
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import pyspark.ml.feature as feature


:0: FutureWarning: IPython widgets are experimental and may change in the future.

In [2]:
# Load the iDigBio ENVO term results from JSON into a Spark DataFrame,
# then report how many records there are and what the inferred schema is.
# NOTE(review): `sc` is not defined in this notebook; presumably the
# SparkContext injected by the pyspark kernel -- confirm.
sqlContext = SQLContext(sc)
ols = sqlContext.read.format("json").load("../data/idigbio_envo_terms.json")

# Keep the record count in a variable so it is available after this cell.
total_records = ols.count()
print(total_records)

ols.printSchema()

# FIXME: Now have phrases in this file, need to filter them out!


84158
root
 |-- noun: string (nullable = true)
 |-- response: struct (nullable = true)
 |    |-- description: array (nullable = true)
 |    |    |-- element: string (containsNull = true)
 |    |-- id: string (nullable = true)
 |    |-- iri: string (nullable = true)
 |    |-- is_defining_ontology: boolean (nullable = true)
 |    |-- label: string (nullable = true)
 |    |-- obo_id: string (nullable = true)
 |    |-- ontology_name: string (nullable = true)
 |    |-- ontology_prefix: string (nullable = true)
 |    |-- short_form: string (nullable = true)
 |    |-- type: string (nullable = true)
 |-- term_id: string (nullable = true)
 |-- term_label: string (nullable = true)
 |-- time: string (nullable = true)


In [3]:
# Split the records by whether term_id is populated: first the number of rows
# with a non-empty term_id, then the number with an empty-string term_id.
# Rows whose term_id is null would satisfy neither comparison.
term_id_col = sql.col("term_id")
print(ols.where(term_id_col != "").count())
print(ols.where(term_id_col == "").count())


27411
56747

In [4]:
# Build a table for manual review: a sample of rows that have a term_id
# (positive results) stacked on top of a sample of rows with an empty
# term_id (negative results), showing only the noun and the term label.
# No seed is passed to sample(), so every run draws a different sample.
with_term = ols.filter(sql.col("term_id") != "")
without_term = ols.filter(sql.col("term_id") == "")

review_sample = (
    with_term
    .sample(withReplacement=False, fraction=0.01)
    .unionAll(without_term.sample(withReplacement=False, fraction=0.002))
    .select(sql.col("noun"), sql.col("term_label"))
)

# Print up to 1000 rows without truncating long cell values.
review_sample.show(1000, truncate=False)


+---------------------------------------------------+-------------------------------------------+
|noun                                               |term_label                                 |
+---------------------------------------------------+-------------------------------------------+
|estuary.                                           |estuary                                    |
|service                                            |national wildlife refuge                   |
|'sand                                              |sand                                       |
|mlb08/31/2007                                      |herb and fern layer                        |
|59-721                                             |stream                                     |
|also                                               |atmospheric carbon dioxide                 |
|1/27/79                                            |peak                                       |
|emergence                                          |hot spring                                 |
|given.                                             |biosphere reserve                          |
|pit-tagged                                         |pit                                        |
|viii-4-43                                          |marine reef                                |
|covered                                            |covered latrine                            |
|21:00h                                             |oceanic benthopelagic zone biome           |
|facility                                           |sports facility                            |
|m.b.200                                            |marine bathyal zone biome                  |
|-500m.                                             |Environment Ontology                       |
|48o23.5'w                                          |inheres in part of                         |
|13-20.i.2009                                       |loam                                       |
|5/2006                                             |karst field                                |
|230                                                |bayou                                      |
|8.v.91                                             |V-shaped                                   |
|home                                               |human dwelling                             |
|mud.                                               |mud                                        |
|ceiling.                                           |ceiling                                    |
|dead                                               |dead                                       |
|fraction                                           |mine tailing                               |
|cracked.                                           |glacial plucking                           |
|23:30                                              |saline hydrographic feature                |
|12.vi.73                                           |sisal plantation                           |
|10/12/80                                           |hyperthermophilic sediment                 |
|hk03-0221-1                                        |colloidal sediment                         |
|viii-24-30                                         |saline hydrographic feature                |
|'these                                             |storm surge                                |
|disturbed.                                         |disturbed area                             |
|cilia.                                             |ciliated epithelium                        |
|weather.                                           |storm surge                                |
|7/6-31/1982                                        |acidic water                               |
|w/purple                                           |inheres in part of                         |
|p.66                                               |NAD(P)                                     |
|52773-76                                           |tea plantation                             |
|relay                                              |dorsal horn of spinal cord                 |
|effective                                          |national geopolitical entity               |
|'mustard                                           |mustard                                    |
|alas.                                              |alas                                       |
|12-aug-08                                          |piece of gravel                            |
|date                                               |date                                       |
|rigid.                                             |lithosphere                                |
|drift                                              |tombolo                                    |
|26.iv.1987                                         |along-front current                        |
|bryophyte                                          |sphagnum bog                               |
|569                                                |first-order administrative region          |
|exchange.                                          |closed ecological system                   |
|1-dec-89                                           |colloidal sediment                         |
|7/31/00                                            |organic feature                            |
|green-yellow                                       |yellow bone marrow                         |
|10/14/2009.                                        |marine basaltic hydrothermal vent biome    |
|brownish-green                                     |Viridiplantae                              |
|rhr95-53.                                          |cotton plantation                          |
|68-22.                                             |plantation                                 |
|326.                                               |marsh                                      |
|skin                                               |skin environment                           |
|viii-ix.1982                                       |acoustico-facial VII-VIII ganglion complex |
|237.                                               |beach                                      |
|187.                                               |badland                                    |
|27-may-09                                          |scoria                                     |
|vent                                               |hydrothermal vent                          |
|6:37.                                              |plant derived beverage                     |
|1241                                               |seamount                                   |
|m.565                                              |marine bathyal zone biome                  |
|type                                               |biome                                      |
|27-viii-39                                         |conifer woodland                           |
|anchor                                             |irregular bone                             |
|21-28.vii.1977                                     |continental margin                         |
|fork                                               |watercourse                                |
|29.v.1986                                          |V-shaped                                   |
|n=60                                               |tobacco warehouse                          |
|pollinator                                         |land degradation                           |
|specimennumber:593                                 |stream                                     |
|hk9700203-1                                        |colloidal sediment                         |
|19.ii.1996                                         |national park                              |
|v:01                                               |V-shaped                                   |
|v:02                                               |V-shaped                                   |
|810.                                               |undersea feature                           |
|60-381                                             |liquefied natural gas                      |
|a=9                                                |alkaline environment                       |
|2x=33                                              |oak woodland                               |
|sharp                                              |saline wedge estuary                       |
|15/iv/2003                                         |mesophilic sediment                        |
|xii:1                                              |colloidal sediment                         |
|road-cut                                           |road cut                                   |
|-rhr                                               |Environment Ontology                       |
|7/21/1997                                          |acidic water                               |
|2'steep                                            |cliff                                      |
|6-5-25                                             |paraffin                                   |
|4-18-82                                            |canopy                                     |
|1/2000                                             |broadleaf forest                           |
|21.viii.1995                                       |legume                                     |
|burns.                                             |paraffin lamp                              |
|157                                                |wadi mouth                                 |
|'w'-marks                                          |intertidal zone                            |
|drainage-tracks                                    |drainage canal                             |
|marshes                                            |marine salt marsh biome                    |
|mountains.                                         |seamount                                   |
|tl=51                                              |siltstone                                  |
|1+2                                                |partially_surrounded_by                    |
|occipital                                          |head somite                                |
|kau-08-0815-1                                      |amictic lake                               |
|ti-83-20                                           |transport feature                          |
|lr-10                                              |respirable suspended particulate matter    |
|6-21-42                                            |building                                   |
|59-120                                             |western boundary current                   |
|id-54                                              |id                                         |
|5-dec-01                                           |karst field                                |
|15'x50'x4.5'.                                      |mesophilic sediment                        |
|iv-21-51                                           |siltstone                                  |
|mcpherson-rice                                     |rice straw                                 |
|look                                               |mineral material                           |
|n=18                                               |tobacco warehouse                          |
|blue-gray-green                                    |blue cheese                                |
|hk-133                                             |surface layer                              |
|465.                                               |marine channel                             |
|pallidum                                           |brainstem                                  |
|9-10-12-13                                         |subtropical                                |
|taste-                                             |gustatory system                           |
|504.                                               |sea shore                                  |
|11:50                                              |thermosphere                               |
|5.xi.74                                            |container glass                            |
|serves                                             |buffer zone                                |
|23.vii.1978                                        |must                                       |
|12/06/85                                           |intermittent wetland                       |
|169-175                                            |asphalt lake                               |
|re-collection                                      |anatomical collection                      |
|nodes.                                             |cervical lymph node                        |
|59-25                                              |compressed natural gas                     |
|10:55am                                            |respirable suspended particulate matter    |
|fleshy                                             |melon fruit product                        |
|deciduous                                          |deciduous (plant)                          |
|snow                                               |snow                                       |
|2w/w.                                              |inheres in part of                         |
|48o23.6'w                                          |inheres in part of                         |
|sensu                                              |fallopian tube                             |
|12:23.                                             |must                                       |
|xii:23                                             |must                                       |
|intermedium                                        |intermediate mesoderm                      |
|die-off                                            |cutoff                                     |
|3-viii-1977                                        |arsenate treated wood                      |
|10w/0.                                             |fatty acid anion 28:0                      |
|29.iii.74                                          |container glass                            |
|'sus                                               |pork                                       |
|people.                                            |vehicle                                    |
|botanical                                          |botanical garden                           |
|rotate                                             |volcanic dike                              |
|pathology.                                         |mpath_slim                                 |
|trs-data.                                          |mesotrophic lake                           |
|pairs.                                             |interacts with                             |
|63833-49                                           |nest of termite                            |
|230.                                               |bayou                                      |
|old                                                |old                                        |
|8-1-1946                                           |colloidal sediment                         |
|60-442                                             |liquefied natural gas                      |
|annotation                                         |forest process                             |
|rocky-sandy-clay                                   |clay                                       |
|saddle.                                            |saddle dam                                 |
|ammonia                                            |ammonia                                    |
|tt69-23                                            |must                                       |
|tt69-22                                            |highland subtropical                       |
|bottomland                                         |flood plain                                |
|spinach                                            |spinach                                    |
|1191                                               |heath                                      |
|acc:2007                                           |herb and fern layer                        |
|59-93                                              |farm                                       |
|7/18                                               |acidic water                               |
|steam                                              |fumarole                                   |
|rhyolite                                           |stratovolcano                              |
|17-dec-81                                          |buffer zone                                |
|hk07-1021-1                                        |ocean trench                               |
|made.                                              |man-made tunnel                            |
|02:00                                              |organic feature                            |
|robins/october                                     |turlough                                   |
|6:36.                                              |animal derived beverage                    |
|pinus-quercus.                                     |pinyon juniper woodland                    |
|15:15.                                             |nitrogen-15 atom                           |
|tests=blues                                        |calcareous ooze                            |
|2n=33                                              |oak woodland                               |
|exped. to brazil                                   |results in proliferation of                |
|polylepis/shrub litter mixed polylepis/shrub litter|plant litter                               |
|atrraction trap                                    |pour-flush pit latrine                     |
|polydomous nest                                    |nest of bird                               |
|oak-pine forest leaf litter                        |litter layer                               |
|contorta stand                                     |woodland                                   |
|soil. common                                       |soil erosion                               |
|section hispidoderma                               |section of aorta                           |
|strongly tuberculate                               |solonetz                                   |
|flower garden                                      |garden                                     |
|is now holst state forest                          |forest                                     |
|elev not lsited                                    |track                                      |
|labels. see uwf notes. -a varandas/july2004        |alpine                                     |
|mine ecosystem reconstruction                      |ecosystem-wide aerobic respiration         |
|coniferous grove                                   |subpolar coniferous forest biome           |
|barely attached                                    |attached to                                |
|collybia box                                       |neurocranium                               |
|pink petals                                        |Vacherin Mont d'Or                         |
|rico/penn state college                            |university campus                          |
|next to cowpens                                    |results in proliferation of                |
|gill net                                           |marine downwelling                         |
|lindgren funnel                                    |uterine tube infundibulum                  |
|field notes                                        |field                                      |
|purpureus xanthopus                                |hyacinth bean product                      |
|good condition                                     |altitudinal condition                      |
|noturus leptacanthus collected but not included    |hydrological condensation process          |
|see uwf notes. -g sheehy/july04                    |ecosystem-wide aerobic respiration         |
|id not verified                                    |id                                         |
|sugar/yeast trap                                   |sugar                                      |
|grazing present                                    |pasture                                    |
|shade form                                         |batholith                                  |
|flowers yellow. ne1/4                              |spice product                              |
|unable to fly                                      |saline pan                                 |
|purple. vine                                       |whole plant                                |
|museum no.8359                                     |flash flooding                             |
|white pine                                         |white smoker                               |
|united states washington                           |dairy                                      |
|ones white                                         |white smoker                               |
|page note. jcs/aug05                               |page                                       |
|eucalyptus oil                                     |oil                                        |
|leptophylla association                            |national geopolitical entity               |
|inner paper                                        |paper product                              |
|ssapling leaves 2-4m above running waters          |neritic sea surface microlayer biome       |
|southwest facing slope                             |slope                                      |
|throughout area                                    |vegetated area                             |
|brood patch                                        |Peyer's patch                              |
|pp879-886. see field notes                         |field                                      |
|pauw university                                    |university campus                          |
|museum no.11816                                    |flash flooding                             |
|univ. of washington                                |Environment Ontology                       |
|rotting grass trimmings                            |grass silage                               |
|under alnus                                        |understory                                 |
|tag see original tag                               |ecosystem-wide aerobic respiration         |
|survey of nebraska                                 |Environment Ontology                       |
|a decaying log                                     |watercraft                                 |
|just starting to open                              |open anatomical space                      |
|returned to tunisia                                |results in proliferation of                |
|bunch grass                                        |grass silage                               |
|commutata var. falcatum                            |velvet bean product                        |
|level very low                                     |low tide zone                              |
|of cave                                            |cave floor                                 |
|yellow syn=parmelia hypoleucina                    |Emmenthal                                  |
|abundant perennial                                 |ice field                                  |
|pink to purple                                     |Vacherin Mont d'Or                         |
|same jar. station=gv52-12                          |collection of organisms of the same species|
|died later                                         |anabranch                                  |
|center dorsally                                    |germinal center                            |
|deep red-brown                                     |poultry deep litter                        |
|cloud forest ridge litter                          |cloud                                      |
|exposed to photo-hypo                              |exposed                                    |
|a collector                                        |human house                                |
|brillantes con borde morado                        |multi-tissue plant structure               |
|to ge                                              |epididymis                                 |
|much lower than previous visit to site             |important site                             |
|formaldehyde method                                |chemical toilet fixture                    |
|cloacal protuberance                               |protuberance                               |
|annona                                             |                                           |
|peepee                                             |                                           |
|dove                                               |                                           |
|psilocarya                                         |                                           |
|920916                                             |                                           |
|silenes                                            |                                           |
|fatto                                              |                                           |
|ameroseius                                         |                                           |
|orson                                              |                                           |
|00095                                              |                                           |
|84mm                                               |                                           |
|2265                                               |                                           |
|su.                                                |                                           |
|41000040106                                        |                                           |
|dentro                                             |                                           |
|alpium                                             |                                           |
|races                                              |                                           |
|dryomys                                            |                                           |
|hawaiian                                           |                                           |
|1532                                               |                                           |
|13.7                                               |                                           |
|hannegan                                           |                                           |
|wiley                                              |                                           |
|hualien                                            |                                           |
|acantholapitha                                     |                                           |
|niobrara                                           |                                           |
|potrero                                            |                                           |
|thi                                                |                                           |
|5937                                               |                                           |
|henk                                               |                                           |
|m.f.742                                            |                                           |
|nordstrom                                          |                                           |
|51.2mm                                             |                                           |
|sl/tl                                              |                                           |
|jpd                                                |                                           |
|calabaria                                          |                                           |
|chrysopilus                                        |                                           |
|525.                                               |                                           |
|utms                                               |                                           |
|service\r\ndepartment                              |                                           |
|jorhat                                             |                                           |
|igapo                                              |                                           |
|nigrifrons                                         |                                           |
|fencerows                                          |                                           |
|2991                                               |                                           |
|jobertina                                          |                                           |
|2x5                                                |                                           |
|painter                                            |                                           |
|borgmeier                                          |                                           |
|108th                                              |                                           |
|dara                                               |                                           |
|ravines.                                           |                                           |
|sportsman                                          |                                           |
|1649                                               |                                           |
|no.rec.inf.                                        |                                           |
|65.5mm                                             |                                           |
|51302060305                                        |                                           |
|castanhal                                          |                                           |
|2816                                               |                                           |
|235897                                             |                                           |
|garcete                                            |                                           |
|4463-4491                                          |                                           |
|30670                                              |                                           |
|96898                                              |                                           |
|peasley                                            |                                           |
|41000040401                                        |                                           |
|varandas/june2003                                  |                                           |
|dendrocopus                                        |                                           |
|headlands.                                         |                                           |
|8203                                               |                                           |
|mescalero                                          |                                           |
|26.5                                               |                                           |
|f.h.                                               |                                           |
|50400020801                                        |                                           |
|wildfire.                                          |                                           |
|arroyos                                            |                                           |
|d.o.                                               |                                           |
|bulky                                              |                                           |
|50.5                                               |                                           |
|lattke                                             |                                           |
|serviceberry                                       |                                           |
|232134                                             |                                           |
|screening                                          |                                           |
|jtl                                                |                                           |
|*nigropinnatus*                                    |                                           |
|scarletbush                                        |                                           |
|50600030603                                        |                                           |
|sta                                                |                                           |
|1.70                                               |                                           |
|41100010501                                        |                                           |
|wingate                                            |                                           |
|daz                                                |                                           |
|densities                                          |                                           |
|janet                                              |                                           |
|quernoa                                            |                                           |
|*lampanyctus*                                      |                                           |
|passando                                           |                                           |
|flouresces                                         |                                           |
|utk                                                |                                           |
|1.5'ave.                                           |                                           |
|cantharellus                                       |                                           |
|trilobate                                          |                                           |
|1864                                               |                                           |
|rafters                                            |                                           |
|1754                                               |                                           |
|p273                                               |                                           |
|illinois                                           |                                           |
|tartarea frigida                                   |                                           |
+---------------------------------------------------+-------------------------------------------+


In [5]:
# Need to filter out terms that were produced by mapping a numeric string.
# The criterion could perhaps be "more than one digit in the string"? (Remember,
# nouns had to be >2 chars before we tried to map them.)
def is_digit(s):
    """Return True if ``s`` contains at least one ASCII digit (0-9).

    Despite the name, this does not check that ``s`` is entirely numeric:
    a single digit anywhere in the string is enough.

    Parameters
    ----------
    s : str or None
        The string to inspect. ``None`` is treated as containing no digits,
        so a missing value does not raise inside a UDF.

    Returns
    -------
    bool
        True when any of the characters ``0``-``9`` occurs in ``s``,
        otherwise False (including for the empty string and ``None``).
    """
    if s is None:
        return False
    # any() short-circuits on the first digit found and avoids building a list.
    return any(digit in s for digit in "1234567890")

# Expose the Python predicate to Spark SQL as a UDF returning a boolean column.
is_digit_udf = sql.udf(is_digit, types.BooleanType())

# Keep only the rows whose noun was flagged as digit-free by the UDF.
noun_has_digit = is_digit_udf(sql.col("noun"))
ols_no_digits = ols.filter(~noun_has_digit)

ols_no_digits.show(10)


+---------------+--------------------+--------------------+----------------+--------------------+
|           noun|            response|             term_id|      term_label|                time|
+---------------+--------------------+--------------------+----------------+--------------------+
|           fawn|[null,null,null,n...|                    |                |2016-04-16T22:55:...|
|      biennials|[null,null,null,n...|                    |                |2016-04-16T22:55:...|
|    vexillifera|[null,null,null,n...|                    |                |2016-04-16T22:55:...|
|          tasok|[null,null,null,n...|                    |                |2016-04-16T22:55:...|
|          peas.|[WrappedArray(A p...|envo:http://purl....|dry peas product|2016-04-16T22:55:...|
|cylindrosporium|[null,null,null,n...|                    |                |2016-04-16T22:55:...|
|      maderista|[null,null,null,n...|                    |                |2016-04-16T22:58:...|
|       dothidea|[null,null,null,n...|                    |                |2016-04-16T22:58:...|
|           fms.|[null,null,null,n...|                    |                |2016-04-16T22:58:...|
|        invert.|[null,null,null,n...|                    |                |2016-04-16T23:01:...|
+---------------+--------------------+--------------------+----------------+--------------------+
only showing top 10 rows


In [6]:
# What kinds of things were we finding?
# Step 1: keep only rows that actually matched a term (non-null, non-empty id).
matched = ols_no_digits\
    .dropna(subset=["term_id"])\
    .where(sql.col("term_id") != "")

# Step 2: tally rows per (term_id, term_label) pair, most frequent first.
term_counts = matched.groupBy("term_id", "term_label").count()
terms = term_counts.orderBy(sql.col("count").desc())

print(terms.count())
terms.show(20, truncate=False)


2497
+--------------------------------------------------+---------------------------+-----+
|term_id                                           |term_label                 |count|
+--------------------------------------------------+---------------------------+-----+
|envo:                                             |Environment Ontology       |230  |
|envo:http://purl.obolibrary.org/obo/ENVO_01000257 |white smoker               |152  |
|envo:http://purl.obolibrary.org/obo/ENVO_01000628 |plant litter               |146  |
|envo:http://purl.obolibrary.org/obo/ENVO_01000337 |herb and fern layer        |138  |
|envo:http://purl.obolibrary.org/obo/ENVO_02500020 |results in proliferation of|135  |
|envo:http://purl.obolibrary.org/obo/ENVO_00003887 |blue cheese                |122  |
|envo:http://purl.obolibrary.org/obo/ENVO_00000111 |forest                     |115  |
|envo:http://purl.obolibrary.org/obo/ENVO_00000467 |university campus          |107  |
|envo:http://purl.obolibrary.org/obo/ENVO_01000352 |field                      |105  |
|envo:http://purl.obolibrary.org/obo/ENVO_00003046 |Bleu  de Bresse            |103  |
|envo:http://purl.obolibrary.org/obo/ENVO_00000109 |woodland                   |101  |
|envo:http://purl.obolibrary.org/obo/ENVO_00005790 |red soil                   |94   |
|envo:http://purl.obolibrary.org/obo/UBERON_0000332|yellow bone marrow         |92   |
|envo:http://purl.obolibrary.org/obo/ENVO_01000336 |shrub layer                |84   |
|envo:http://purl.obolibrary.org/obo/ENVO_00001998 |soil                       |81   |
|envo:http://purl.obolibrary.org/obo/ENVO_00000337 |orange juice               |79   |
|envo:http://purl.obolibrary.org/obo/ENVO_00003061 |Vacherin Mont d'Or         |76   |
|envo:http://purl.obolibrary.org/obo/ENVO_01000338 |litter layer               |74   |
|envo:http://purl.obolibrary.org/obo/ENVO_0010117  |velvet bean product        |69   |
|envo:http://purl.obolibrary.org/obo/ENVO_00003971 |brown sugar                |68   |
+--------------------------------------------------+---------------------------+-----+
only showing top 20 rows


In [9]:
# Let's graph that distribution
# `terms` was ordered by descending count, so after toPandas() the row position
# of each term is its rank (assumes seaborn keeps categories in row order).
terms_pdf = terms.toPandas()

# Spacing between labelled ranks on the x axis.
TICK_STEP = 500
# Derive the tick positions from the number of terms actually present rather
# than a hard-coded upper bound, so no tick lands beyond the last bar.
ticks = list(range(0, len(terms_pdf), TICK_STEP))

# seaborn (sns) and pyplot (plt) are imported in the first cell of the notebook.
fig, ax = plt.subplots()
sns.barplot(x="term_id", y="count", data=terms_pdf, color="black", ax=ax)
ax.set_yscale('log')
# One bar per term_id: replace the unreadable URI labels with sparse rank ticks.
ax.get_xaxis().set_ticks(ticks)
ax.get_xaxis().set_ticklabels(ticks)
ax.set_title("Distribution of ENVO Terms Found")
ax.set_ylabel("Count (Log scale)")
ax.set_xlabel("Rank of Unique Terms")


Out[9]:
<matplotlib.text.Text at 0x7f9994e04d10>

In [ ]: