In [1]:
%matplotlib inline
from __future__ import print_function
import os
from pyspark import SQLContext
from pyspark.sql import Row
import pyspark.sql.functions as sql
import pyspark.sql.types as types
#from pyspark.sql.functions import udf, length
import matplotlib.pyplot as plt
import numpy
import math
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
import pyspark.ml.feature as feature
:0: FutureWarning: IPython widgets are experimental and may change in the future.
In [2]:
# Load the ENVO term lookup results for the iDigBio nouns.
# Note: the input read here is a JSON file, not parquet.
ENVO_TERMS_JSON = "../data/idigbio_envo_terms.json"

# `sc` is not created in this notebook; it is expected to already exist in
# the kernel (presumably the SparkContext provided by the pyspark launcher).
sqlContext = SQLContext(sc)
ols = sqlContext.read.json(ENVO_TERMS_JSON)

# Report how many records were loaded and which fields each record carries.
total_records = ols.count()
print(total_records)
ols.printSchema()
# FIXME: Now have phrases in this file, need to filter them out!
84158
root
|-- noun: string (nullable = true)
|-- response: struct (nullable = true)
| |-- description: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- id: string (nullable = true)
| |-- iri: string (nullable = true)
| |-- is_defining_ontology: boolean (nullable = true)
| |-- label: string (nullable = true)
| |-- obo_id: string (nullable = true)
| |-- ontology_name: string (nullable = true)
| |-- ontology_prefix: string (nullable = true)
| |-- short_form: string (nullable = true)
| |-- type: string (nullable = true)
|-- term_id: string (nullable = true)
|-- term_label: string (nullable = true)
|-- time: string (nullable = true)
In [3]:
# Number of rows with a non-empty term_id (a term was found for the noun) ...
print(ols.where(ols["term_id"] != "").count())
# ... and number of rows with an empty term_id (nothing was found).
print(ols.where(ols["term_id"] == "").count())
27411
56747
In [4]:
# Manual spot check of the mapping quality: print a small random sample of
# rows that got a term (1% of them) followed by a smaller sample of rows that
# did not (0.2% of them).
# No seed is passed to sample(), so each run shows a different sample.
(
    ols
    .filter(sql.col("term_id") != "")
    .sample(False, 0.01)
    .unionAll(
        ols
        .filter(sql.col("term_id") == "")
        .sample(False, 0.002)
    )
    .select("noun", "term_label")
    .show(1000, truncate=False)
)
+---------------------------------------------------+-------------------------------------------+
|noun |term_label |
+---------------------------------------------------+-------------------------------------------+
|estuary. |estuary |
|service |national wildlife refuge |
|'sand |sand |
|mlb08/31/2007 |herb and fern layer |
|59-721 |stream |
|also |atmospheric carbon dioxide |
|1/27/79 |peak |
|emergence |hot spring |
|given. |biosphere reserve |
|pit-tagged |pit |
|viii-4-43 |marine reef |
|covered |covered latrine |
|21:00h |oceanic benthopelagic zone biome |
|facility |sports facility |
|m.b.200 |marine bathyal zone biome |
|-500m. |Environment Ontology |
|48o23.5'w |inheres in part of |
|13-20.i.2009 |loam |
|5/2006 |karst field |
|230 |bayou |
|8.v.91 |V-shaped |
|home |human dwelling |
|mud. |mud |
|ceiling. |ceiling |
|dead |dead |
|fraction |mine tailing |
|cracked. |glacial plucking |
|23:30 |saline hydrographic feature |
|12.vi.73 |sisal plantation |
|10/12/80 |hyperthermophilic sediment |
|hk03-0221-1 |colloidal sediment |
|viii-24-30 |saline hydrographic feature |
|'these |storm surge |
|disturbed. |disturbed area |
|cilia. |ciliated epithelium |
|weather. |storm surge |
|7/6-31/1982 |acidic water |
|w/purple |inheres in part of |
|p.66 |NAD(P) |
|52773-76 |tea plantation |
|relay |dorsal horn of spinal cord |
|effective |national geopolitical entity |
|'mustard |mustard |
|alas. |alas |
|12-aug-08 |piece of gravel |
|date |date |
|rigid. |lithosphere |
|drift |tombolo |
|26.iv.1987 |along-front current |
|bryophyte |sphagnum bog |
|569 |first-order administrative region |
|exchange. |closed ecological system |
|1-dec-89 |colloidal sediment |
|7/31/00 |organic feature |
|green-yellow |yellow bone marrow |
|10/14/2009. |marine basaltic hydrothermal vent biome |
|brownish-green |Viridiplantae |
|rhr95-53. |cotton plantation |
|68-22. |plantation |
|326. |marsh |
|skin |skin environment |
|viii-ix.1982 |acoustico-facial VII-VIII ganglion complex |
|237. |beach |
|187. |badland |
|27-may-09 |scoria |
|vent |hydrothermal vent |
|6:37. |plant derived beverage |
|1241 |seamount |
|m.565 |marine bathyal zone biome |
|type |biome |
|27-viii-39 |conifer woodland |
|anchor |irregular bone |
|21-28.vii.1977 |continental margin |
|fork |watercourse |
|29.v.1986 |V-shaped |
|n=60 |tobacco warehouse |
|pollinator |land degradation |
|specimennumber:593 |stream |
|hk9700203-1 |colloidal sediment |
|19.ii.1996 |national park |
|v:01 |V-shaped |
|v:02 |V-shaped |
|810. |undersea feature |
|60-381 |liquefied natural gas |
|a=9 |alkaline environment |
|2x=33 |oak woodland |
|sharp |saline wedge estuary |
|15/iv/2003 |mesophilic sediment |
|xii:1 |colloidal sediment |
|road-cut |road cut |
|-rhr |Environment Ontology |
|7/21/1997 |acidic water |
|2'steep |cliff |
|6-5-25 |paraffin |
|4-18-82 |canopy |
|1/2000 |broadleaf forest |
|21.viii.1995 |legume |
|burns. |paraffin lamp |
|157 |wadi mouth |
|'w'-marks |intertidal zone |
|drainage-tracks |drainage canal |
|marshes |marine salt marsh biome |
|mountains. |seamount |
|tl=51 |siltstone |
|1+2 |partially_surrounded_by |
|occipital |head somite |
|kau-08-0815-1 |amictic lake |
|ti-83-20 |transport feature |
|lr-10 |respirable suspended particulate matter |
|6-21-42 |building |
|59-120 |western boundary current |
|id-54 |id |
|5-dec-01 |karst field |
|15'x50'x4.5'. |mesophilic sediment |
|iv-21-51 |siltstone |
|mcpherson-rice |rice straw |
|look |mineral material |
|n=18 |tobacco warehouse |
|blue-gray-green |blue cheese |
|hk-133 |surface layer |
|465. |marine channel |
|pallidum |brainstem |
|9-10-12-13 |subtropical |
|taste- |gustatory system |
|504. |sea shore |
|11:50 |thermosphere |
|5.xi.74 |container glass |
|serves |buffer zone |
|23.vii.1978 |must |
|12/06/85 |intermittent wetland |
|169-175 |asphalt lake |
|re-collection |anatomical collection |
|nodes. |cervical lymph node |
|59-25 |compressed natural gas |
|10:55am |respirable suspended particulate matter |
|fleshy |melon fruit product |
|deciduous |deciduous (plant) |
|snow |snow |
|2w/w. |inheres in part of |
|48o23.6'w |inheres in part of |
|sensu |fallopian tube |
|12:23. |must |
|xii:23 |must |
|intermedium |intermediate mesoderm |
|die-off |cutoff |
|3-viii-1977 |arsenate treated wood |
|10w/0. |fatty acid anion 28:0 |
|29.iii.74 |container glass |
|'sus |pork |
|people. |vehicle |
|botanical |botanical garden |
|rotate |volcanic dike |
|pathology. |mpath_slim |
|trs-data. |mesotrophic lake |
|pairs. |interacts with |
|63833-49 |nest of termite |
|230. |bayou |
|old |old |
|8-1-1946 |colloidal sediment |
|60-442 |liquefied natural gas |
|annotation |forest process |
|rocky-sandy-clay |clay |
|saddle. |saddle dam |
|ammonia |ammonia |
|tt69-23 |must |
|tt69-22 |highland subtropical |
|bottomland |flood plain |
|spinach |spinach |
|1191 |heath |
|acc:2007 |herb and fern layer |
|59-93 |farm |
|7/18 |acidic water |
|steam |fumarole |
|rhyolite |stratovolcano |
|17-dec-81 |buffer zone |
|hk07-1021-1 |ocean trench |
|made. |man-made tunnel |
|02:00 |organic feature |
|robins/october |turlough |
|6:36. |animal derived beverage |
|pinus-quercus. |pinyon juniper woodland |
|15:15. |nitrogen-15 atom |
|tests=blues |calcareous ooze |
|2n=33 |oak woodland |
|exped. to brazil |results in proliferation of |
|polylepis/shrub litter mixed polylepis/shrub litter|plant litter |
|atrraction trap |pour-flush pit latrine |
|polydomous nest |nest of bird |
|oak-pine forest leaf litter |litter layer |
|contorta stand |woodland |
|soil. common |soil erosion |
|section hispidoderma |section of aorta |
|strongly tuberculate |solonetz |
|flower garden |garden |
|is now holst state forest |forest |
|elev not lsited |track |
|labels. see uwf notes. -a varandas/july2004 |alpine |
|mine ecosystem reconstruction |ecosystem-wide aerobic respiration |
|coniferous grove |subpolar coniferous forest biome |
|barely attached |attached to |
|collybia box |neurocranium |
|pink petals |Vacherin Mont d'Or |
|rico/penn state college |university campus |
|next to cowpens |results in proliferation of |
|gill net |marine downwelling |
|lindgren funnel |uterine tube infundibulum |
|field notes |field |
|purpureus xanthopus |hyacinth bean product |
|good condition |altitudinal condition |
|noturus leptacanthus collected but not included |hydrological condensation process |
|see uwf notes. -g sheehy/july04 |ecosystem-wide aerobic respiration |
|id not verified |id |
|sugar/yeast trap |sugar |
|grazing present |pasture |
|shade form |batholith |
|flowers yellow. ne1/4 |spice product |
|unable to fly |saline pan |
|purple. vine |whole plant |
|museum no.8359 |flash flooding |
|white pine |white smoker |
|united states washington |dairy |
|ones white |white smoker |
|page note. jcs/aug05 |page |
|eucalyptus oil |oil |
|leptophylla association |national geopolitical entity |
|inner paper |paper product |
|ssapling leaves 2-4m above running waters |neritic sea surface microlayer biome |
|southwest facing slope |slope |
|throughout area |vegetated area |
|brood patch |Peyer's patch |
|pp879-886. see field notes |field |
|pauw university |university campus |
|museum no.11816 |flash flooding |
|univ. of washington |Environment Ontology |
|rotting grass trimmings |grass silage |
|under alnus |understory |
|tag see original tag |ecosystem-wide aerobic respiration |
|survey of nebraska |Environment Ontology |
|a decaying log |watercraft |
|just starting to open |open anatomical space |
|returned to tunisia |results in proliferation of |
|bunch grass |grass silage |
|commutata var. falcatum |velvet bean product |
|level very low |low tide zone |
|of cave |cave floor |
|yellow syn=parmelia hypoleucina |Emmenthal |
|abundant perennial |ice field |
|pink to purple |Vacherin Mont d'Or |
|same jar. station=gv52-12 |collection of organisms of the same species|
|died later |anabranch |
|center dorsally |germinal center |
|deep red-brown |poultry deep litter |
|cloud forest ridge litter |cloud |
|exposed to photo-hypo |exposed |
|a collector |human house |
|brillantes con borde morado |multi-tissue plant structure |
|to ge |epididymis |
|much lower than previous visit to site |important site |
|formaldehyde method |chemical toilet fixture |
|cloacal protuberance |protuberance |
|annona | |
|peepee | |
|dove | |
|psilocarya | |
|920916 | |
|silenes | |
|fatto | |
|ameroseius | |
|orson | |
|00095 | |
|84mm | |
|2265 | |
|su. | |
|41000040106 | |
|dentro | |
|alpium | |
|races | |
|dryomys | |
|hawaiian | |
|1532 | |
|13.7 | |
|hannegan | |
|wiley | |
|hualien | |
|acantholapitha | |
|niobrara | |
|potrero | |
|thi | |
|5937 | |
|henk | |
|m.f.742 | |
|nordstrom | |
|51.2mm | |
|sl/tl | |
|jpd | |
|calabaria | |
|chrysopilus | |
|525. | |
|utms | |
|service\r\ndepartment | |
|jorhat | |
|igapo | |
|nigrifrons | |
|fencerows | |
|2991 | |
|jobertina | |
|2x5 | |
|painter | |
|borgmeier | |
|108th | |
|dara | |
|ravines. | |
|sportsman | |
|1649 | |
|no.rec.inf. | |
|65.5mm | |
|51302060305 | |
|castanhal | |
|2816 | |
|235897 | |
|garcete | |
|4463-4491 | |
|30670 | |
|96898 | |
|peasley | |
|41000040401 | |
|varandas/june2003 | |
|dendrocopus | |
|headlands. | |
|8203 | |
|mescalero | |
|26.5 | |
|f.h. | |
|50400020801 | |
|wildfire. | |
|arroyos | |
|d.o. | |
|bulky | |
|50.5 | |
|lattke | |
|serviceberry | |
|232134 | |
|screening | |
|jtl | |
|*nigropinnatus* | |
|scarletbush | |
|50600030603 | |
|sta | |
|1.70 | |
|41100010501 | |
|wingate | |
|daz | |
|densities | |
|janet | |
|quernoa | |
|*lampanyctus* | |
|passando | |
|flouresces | |
|utk | |
|1.5'ave. | |
|cantharellus | |
|trilobate | |
|1864 | |
|rafters | |
|1754 | |
|p273 | |
|illinois | |
|tartarea frigida | |
+---------------------------------------------------+-------------------------------------------+
In [5]:
# Need to filter out terms that were the result of mapping a numerical string.
# The criterion implemented here is "the noun contains any digit at all"; a
# looser rule (perhaps more than one number in a string?) was considered but
# is not what the code does. (Remember nouns had to be >2 chars to try to map.)
def is_digit(s):
    """Return True if ``s`` contains at least one ASCII digit (0-9).

    Despite the name, this does not test whether the whole string is
    numeric: "59-721" and "84mm" give True, "pit-tagged" gives False.

    ``None`` is treated as containing no digits and gives False instead of
    raising TypeError, so a null noun passed through the Spark UDF does not
    fail the job.
    """
    if s is None:
        return False
    # any() stops at the first digit found instead of building a full list.
    return any(digit in s for digit in "1234567890")
# Expose is_digit to Spark SQL as a UDF producing a boolean column.
is_digit_udf = sql.udf(is_digit, types.BooleanType())

# Keep only the rows for which the UDF reports False for the noun.
ols_no_digits = ols.filter(~is_digit_udf(sql.col("noun")))
ols_no_digits.show(10)
+---------------+--------------------+--------------------+----------------+--------------------+
| noun| response| term_id| term_label| time|
+---------------+--------------------+--------------------+----------------+--------------------+
| fawn|[null,null,null,n...| | |2016-04-16T22:55:...|
| biennials|[null,null,null,n...| | |2016-04-16T22:55:...|
| vexillifera|[null,null,null,n...| | |2016-04-16T22:55:...|
| tasok|[null,null,null,n...| | |2016-04-16T22:55:...|
| peas.|[WrappedArray(A p...|envo:http://purl....|dry peas product|2016-04-16T22:55:...|
|cylindrosporium|[null,null,null,n...| | |2016-04-16T22:55:...|
| maderista|[null,null,null,n...| | |2016-04-16T22:58:...|
| dothidea|[null,null,null,n...| | |2016-04-16T22:58:...|
| fms.|[null,null,null,n...| | |2016-04-16T22:58:...|
| invert.|[null,null,null,n...| | |2016-04-16T23:01:...|
+---------------+--------------------+--------------------+----------------+--------------------+
only showing top 10 rows
In [6]:
# What kinds of things were we finding?
# Among the digit-free nouns, drop rows with a null or empty term_id, then
# count rows per (term_id, term_label) pair, most frequent pair first.
terms = (
    ols_no_digits
    .dropna(subset=["term_id"])
    .filter(sql.col("term_id") != "")
    .groupBy("term_id", "term_label")
    .count()
    .orderBy(sql.col("count").desc())
)

# Number of distinct pairs, followed by the 20 most frequent ones.
print(terms.count())
terms.show(20, truncate=False)
2497
+--------------------------------------------------+---------------------------+-----+
|term_id |term_label |count|
+--------------------------------------------------+---------------------------+-----+
|envo: |Environment Ontology |230 |
|envo:http://purl.obolibrary.org/obo/ENVO_01000257 |white smoker |152 |
|envo:http://purl.obolibrary.org/obo/ENVO_01000628 |plant litter |146 |
|envo:http://purl.obolibrary.org/obo/ENVO_01000337 |herb and fern layer |138 |
|envo:http://purl.obolibrary.org/obo/ENVO_02500020 |results in proliferation of|135 |
|envo:http://purl.obolibrary.org/obo/ENVO_00003887 |blue cheese |122 |
|envo:http://purl.obolibrary.org/obo/ENVO_00000111 |forest |115 |
|envo:http://purl.obolibrary.org/obo/ENVO_00000467 |university campus |107 |
|envo:http://purl.obolibrary.org/obo/ENVO_01000352 |field |105 |
|envo:http://purl.obolibrary.org/obo/ENVO_00003046 |Bleu de Bresse |103 |
|envo:http://purl.obolibrary.org/obo/ENVO_00000109 |woodland |101 |
|envo:http://purl.obolibrary.org/obo/ENVO_00005790 |red soil |94 |
|envo:http://purl.obolibrary.org/obo/UBERON_0000332|yellow bone marrow |92 |
|envo:http://purl.obolibrary.org/obo/ENVO_01000336 |shrub layer |84 |
|envo:http://purl.obolibrary.org/obo/ENVO_00001998 |soil |81 |
|envo:http://purl.obolibrary.org/obo/ENVO_00000337 |orange juice |79 |
|envo:http://purl.obolibrary.org/obo/ENVO_00003061 |Vacherin Mont d'Or |76 |
|envo:http://purl.obolibrary.org/obo/ENVO_01000338 |litter layer |74 |
|envo:http://purl.obolibrary.org/obo/ENVO_0010117 |velvet bean product |69 |
|envo:http://purl.obolibrary.org/obo/ENVO_00003971 |brown sugar |68 |
+--------------------------------------------------+---------------------------+-----+
only showing top 20 rows
In [9]:
# Let's graph that distribution
# Bring the per-term counts to the driver as a pandas DataFrame. `terms` was
# built with a descending sort on count, so the bar order presumably follows
# frequency rank -- TODO confirm that row order survives toPandas().
terms_pdf = terms.toPandas()

# seaborn is already imported as `sns` in the first cell, so it is not
# re-imported here.

# One bar is drawn per term_id, far too many to label individually, so the
# x axis is labelled every 500 positions instead. The upper bound of 3000 is
# hard-coded; it must be raised if the number of terms ever exceeds it.
ticks = range(0, 3001, 500)
ax = sns.barplot(x="term_id", y="count", data=terms_pdf, color="black")
ax.set_yscale('log')
ax.get_xaxis().set_ticks(ticks)
ax.get_xaxis().set_ticklabels(ticks)
ax.set_title("Distribution of ENVO Terms Found")
ax.set_ylabel("Count (Log scale)")
# The trailing semicolon keeps the Text object returned by the last call out
# of the cell output.
ax.set_xlabel("Rank of Unique Terms");
Out[9]:
<matplotlib.text.Text at 0x7f9994e04d10>
In [ ]:
Content source: mjcollin/2016spr
Similar notebooks: