In [10]:
%matplotlib inline
from __future__ import print_function
import os
from pyspark import SQLContext
from pyspark.sql import Row
import pyspark.sql.functions as sql
import pyspark.sql.types as types
#from pyspark.sql.functions import udf, length
import matplotlib.pyplot as plt
import numpy
import math
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import pyspark.ml.feature as feature

In [11]:
# Read the processed phrase table from Parquet, ordered from the highest
# count to the lowest.
# NOTE(review): `sc` is not defined anywhere in this notebook; presumably the
# kernel provides the SparkContext - confirm.
sqlContext = SQLContext(sc)
phrases = (
    sqlContext.read
    .parquet("../data/idigbio_phrases.parquet")
    .orderBy(sql.col("count"), ascending=False)
)

# Row count of the phrase table, followed by its schema.
total_records = phrases.count()
print(total_records)
phrases.printSchema()


36387
root
 |-- phrase: string (nullable = true)
 |-- count: long (nullable = true)


In [3]:
# The upstream job did not keep the total number of phrases found,
# so add up the per-phrase counts here.
phrases.select(sql.sum("count")).show()


+----------+
|sum(count)|
+----------+
|    308305|
+----------+


In [4]:
# Summary statistics, then the first 30 rows of `phrases` printed without
# truncating the cell contents.
phrases.describe().show()
phrases.show(n=30, truncate=False)


+-------+------------------+
|summary|             count|
+-------+------------------+
|  count|             36387|
|   mean| 8.472943633715339|
| stddev|294.87045518010905|
|    min|                 1|
|    max|             52092|
+-------+------------------+

+---------------------------------------+-----+
|phrase                                 |count|
+---------------------------------------+-----+
|intercept trap                         |52092|
|forest litter                          |12723|
|field notes                            |10554|
|field notebook                         |5607 |
|fogging fungus covered log             |4744 |
|tropical forest flight intercept trap  |3083 |
|for. litter berlese forest litter      |2874 |
|berlese forest litter                  |2724 |
|montane forest litter                  |2714 |
|flight intercept trap                  |2633 |
|rainforest litter                      |2445 |
|cloud forest flight intercept trap     |2434 |
|frank-t. krell                         |2402 |
|h. welch                               |2236 |
|fungusy log                            |1949 |
|of british columbia herbarium          |1785 |
|rain forest flight intercept trap      |1618 |
|see field notes                        |1570 |
|forest cloud forest litter             |1440 |
|in fieldbook                           |1351 |
|station sheet                          |1299 |
|not yet published                      |1231 |
|funnel extraction                      |1188 |
|botanical garden herbarium             |1176 |
|data not interpreted                   |1141 |
|large trees                            |1099 |
|rainforest flight intercept trap       |1060 |
|moving thru pass against wind-migration|1025 |
|nest series                            |1024 |
|american flora                         |949  |
+---------------------------------------+-----+
only showing top 30 rows


In [5]:
# Median phrase count
# Rank the counts from highest to lowest, keep the first N // 2 + 1 rows and
# display the smallest values of that slice; the first row shown is the
# median (for an even N it is the lower of the two middle values).
# The row count and the descending sort are both computed in this cell so the
# result does not depend on `total_records` (reassigned later in the notebook)
# or on how `phrases` happened to be ordered when it was loaded.
phrase_total = phrases.count()
phrases\
    .select(sql.col("count"))\
    .orderBy(sql.col("count"), ascending=False)\
    .limit(phrase_total // 2 + 1)\
    .orderBy(sql.col("count"))\
    .show(3)


+-----+
|count|
+-----+
|    1|
|    1|
|    1|
+-----+
only showing top 3 rows


In [6]:
# How many phrases have a count greater than 1?
(
    phrases
    .where(sql.col("count") > 1)
    .count()
)


Out[6]:
10112

In [7]:
# Distribution of phrase frequency: collect the first 10000 rows of `phrases`
# into a pandas DataFrame, which the plotting cell reads.
phrases_pdf = (
    phrases
    .limit(10000)
    .toPandas()
)

In [8]:
# Bar chart of count per phrase on a log-scaled y axis
# (seaborn is already imported as `sns` in the notebook's import cell).
ax = sns.barplot(data=phrases_pdf, x="phrase", y="count", color="black")
ax.set_yscale('log')

# Replace the per-phrase x tick labels with a few numeric position markers.
ticks = list(range(0, 10001, 2000))
ax.xaxis.set_ticks(ticks)
ax.xaxis.set_ticklabels(ticks)

ax.set_title("Distribution of Noun Phrases Found")
ax.set_ylabel("Count (Log scale)")
ax.set_xlabel("Rank of Unique Noun Phrases")


ERROR! Session/line number was not unique in database. History logging moved to new session 117
Out[8]:
<matplotlib.text.Text at 0x7f4b97285b50>

In [15]:
# Write the frequently used phrases (count > 1) out as JSON.
# The "phrase" column is renamed to "noun" to match the noun output, so the
# same OLS script can be used on this file.
# mode("overwrite") replaces the output of an earlier run; with the default
# save mode the write raises once the path already exists, which made this
# cell fail on every re-run.
phrases\
    .filter(sql.col("count") > 1)\
    .select(sql.col("phrase").alias("noun"), sql.col("count"))\
    .write.format("json").mode("overwrite").save("../data/idigbio_phrases.json")

In [12]:
# The lookups have been done; now analyze just the phrases that were looked up.
# Load the lookup results from JSON, keeping only rows whose "time" value is
# greater than "2016-04-19T00:00:00.00".
ols = (
    sqlContext.read
    .json("../data/idigbio_envo_terms.json")
    .where(sql.col("time") > "2016-04-19T00:00:00.00")
)

total_records = ols.count()
print(total_records)
# ols.printSchema()  # uncomment to inspect the schema


9579

In [13]:
# Print the number of lookups with a non-empty term_id, then the number with
# an empty one.
for term_id_test in (sql.col("term_id") != "", sql.col("term_id") == ""):
    print(ols.filter(term_id_test).count())


7917
1662

In [14]:
# What kinds of things were we finding?
# Count lookups per (term_id, term_label) pair, skipping rows whose term_id is
# null or empty, and order the pairs from most to least frequent.
terms = (
    ols
    .dropna(subset=["term_id"])
    .where(sql.col("term_id") != "")
    .groupBy("term_id", "term_label")
    .count()
    .sort(sql.col("count"), ascending=False)
)

# Number of (term_id, term_label) groups, then the 20 most frequent ones.
print(terms.count())
terms.show(n=20, truncate=False)


1208
+--------------------------------------------------+---------------------------+-----+
|term_id                                           |term_label                 |count|
+--------------------------------------------------+---------------------------+-----+
|envo:http://purl.obolibrary.org/obo/ENVO_01000352 |field                      |220  |
|envo:http://purl.obolibrary.org/obo/ENVO_01000116 |pebble sediment            |178  |
|envo:                                             |Environment Ontology       |159  |
|envo:http://purl.obolibrary.org/obo/ENVO_01000709 |flash flooding             |135  |
|envo:http://purl.obolibrary.org/obo/ENVO_02500020 |results in proliferation of|135  |
|envo:http://purl.obolibrary.org/obo/ENVO_01000337 |herb and fern layer        |123  |
|envo:http://purl.obolibrary.org/obo/ENVO_01000628 |plant litter               |109  |
|envo:http://purl.obolibrary.org/obo/ENVO_01000257 |white smoker               |108  |
|envo:http://purl.obolibrary.org/obo/ENVO_00000111 |forest                     |94   |
|envo:http://purl.obolibrary.org/obo/ENVO_00000467 |university campus          |94   |
|envo:http://purl.obolibrary.org/obo/ENVO_00003046 |Bleu  de Bresse            |88   |
|envo:http://purl.obolibrary.org/obo/ENVO_01000338 |litter layer               |73   |
|envo:http://purl.obolibrary.org/obo/CHEBI_35209   |label                      |73   |
|envo:http://purl.obolibrary.org/obo/ENVO_01000336 |shrub layer                |67   |
|envo:http://purl.obolibrary.org/obo/ENVO_00003887 |blue cheese                |67   |
|envo:http://purl.obolibrary.org/obo/ENVO_01000507 |iron sheet                 |66   |
|envo:http://purl.obolibrary.org/obo/ENVO_0010117  |velvet bean product        |63   |
|envo:http://purl.obolibrary.org/obo/PATO_0001422  |dead                       |61   |
|envo:http://purl.obolibrary.org/obo/UBERON_0000332|yellow bone marrow         |60   |
|envo:http://purl.obolibrary.org/obo/ENVO_01000335 |understory                 |53   |
+--------------------------------------------------+---------------------------+-----+
only showing top 20 rows


In [18]:
# Manually review a random sample of positive results and negative results.
# No seed is passed to sample(), so a different random sample is drawn every
# time this cell runs.
matched_sample = (
    ols
    .where(sql.col("term_id") != "")
    .sample(withReplacement=False, fraction=0.01)
)
unmatched_sample = (
    ols
    .where(sql.col("term_id") == "")
    .sample(withReplacement=False, fraction=0.02)
)

# Rows with a term come first, followed by the rows without one.
(
    matched_sample
    .unionAll(unmatched_sample)
    .select("noun", "term_label")
    .show(1000, truncate=False)
)


+-----------------------------------------------------------------------+-------------------------------------------+
|noun                                                                   |term_label                                 |
+-----------------------------------------------------------------------+-------------------------------------------+
|dark blue-purple                                                       |blue cheese                                |
|herb. columbia college                                                 |herb and fern layer                        |
|surface. see field notes                                               |snow field                                 |
|brasil. regnell. musei bot. stockholm. plantae itineris regnelliani iii|flat glass                                 |
|herb. lafayette college                                                |herb and fern layer                        |
|laurelia novae-zelandiae leaf litter                                   |plant litter                               |
|de lsd393                                                              |Bleu  de Bresse                            |
|a wayside weed                                                         |human house                                |
|human dung trap                                                        |dung building floor                        |
|states national herbarium                                              |national forest                            |
|nest box                                                               |nest of bird                               |
|oak/alder/pine forest litter                                           |litter layer                               |
|light blue                                                             |blue cheese                                |
|mech dislodg study                                                     |nature reserve                             |
|forest/montane tropical transition forest under fungusy log            |tropical upper montane forest              |
|an escape                                                              |acidification of an aquatic environment    |
|now restricted to jungermannia rubra                                   |environmental condition                    |
|robins/june2001 no field notes                                         |field                                      |
|mouth see field notes                                                  |mouth                                      |
|only skull only                                                        |skull                                      |
|museum no.9730                                                         |flash flooding                             |
|det_comments:38 mm                                                     |pebble sediment                            |
|hot pink                                                               |hot spring                                 |
|another plant                                                          |sewage plant                               |
|same item                                                              |collection of organisms of the same species|
|genus/species listed                                                   |cypress strand swamp                       |
|college herbarium                                                      |university campus                          |
|jht77-120. swffs                                                       |western boundary current                   |
|mounting board                                                         |road                                       |
|kanouse name                                                           |marine sediment                            |
|similar to ecclesiasticus                                              |transpiration                              |
|very showy pink flowers                                                |maelstrom                                  |
|of falls                                                               |Environment Ontology                       |
|fallen epiphyte laiden branch under bark                               |understory                                 |
|intercept trap night                                                   |pour-flush pit latrine                     |
|herb. barnard college                                                  |herb and fern layer                        |
|or more of elevation                                                   |Environment Ontology                       |
|frequent herbaceous perennial                                          |herb and fern layer                        |
|deep sea habitats expedition                                           |Environment Ontology                       |
|var. glabrata leonard                                                  |velvet bean product                        |
|oliver komar 's catalogue                                              |weir                                       |
|white oak                                                              |oak woodland                               |
|streamside shrub litter                                                |shrub layer                                |
|varandas/february2004 see field notes                                  |field                                      |
|juan river                                                             |river                                      |
|pond. -rh robins                                                       |pond                                       |
|section channelized                                                    |section of aorta                           |
|feet away                                                              |latrine pit                                |
|high elevation                                                         |elevation                                  |
|sphagnum under betula                                                  |sphagnum bog                               |
|going up tree trunk. night                                             |plant litter                               |
|head lamping                                                           |head                                       |
|rainforest jelly fungus                                                |tropical moist broadleaf forest biome      |
|also field number cbf-3                                                |field                                      |
|greenish brown                                                         |brown sugar                                |
|donated to fred b                                                      |attached to                                |
|tropical forest favolus hexagonalis                                    |tropical mangrove                          |
|museum no.13596                                                        |flash flooding                             |
|sampling.-ma phillips/september2009                                    |axilla skin                                |
|pebble turret                                                          |pebble                                     |
|information. -rh robins/march2008                                      |radio device                               |
|pale red                                                               |red soil                                   |
|en cooperacion con la universidad de los andes                         |Bleu  de Bresse                            |
|margin prominent-lateral                                               |continental margin                         |
|scrub-grass margin to remnant kahikatea forest                         |forest                                     |
|elev not listed                                                        |environmental condition                    |
|old growth forest                                                      |forest                                     |
|flow.-ma phillips/september2009                                        |debris flow                                |
|leg. m.47                                                              |leg                                        |
|h. chrysargeum                                                         |NAD(P)H                                    |
|exsicatti label                                                        |label                                      |
|month record                                                           |arid subtropical                           |
|rotten wood. uv+                                                       |wood                                       |
|level. rocky roadside. soil- silty clay loam                           |loam                                       |
|robins/november2000 see field notes                                    |field                                      |
|abundant perennial                                                     |ice field                                  |
|against window                                                         |hydrological condensation process          |
|probably wrong                                                         |constructed feature                        |
|stream. tj vigliotti/december2006                                      |stream                                     |
|rust stage                                                             |S-shaped body                              |
|not thigmotropic                                                       |track                                      |
|university of montana                                                  |university campus                          |
|head lamp                                                              |lamp                                       |
|agave bases                                                            |fermented agave beverage                   |
|usually sterile                                                        |sterile water                              |
|a hiking trail                                                         |track                                      |
|de rio                                                                 |river                                      |
|herbarium kansas state collegee                                        |foam                                       |
|light purple                                                           |photoreceptor array                        |
|a las ruinas                                                           |human house                                |
|little forage value                                                    |silage                                     |
|deciduous wood                                                         |wood                                       |
|rocky limestone slope                                                  |rocky slope                                |
|umbellularia californica                                               |                                           |
|gallinago gallinago delicata                                           |                                           |
|camponotus crematogaster strimigenys                                   |                                           |
|modoc formica neorufibarbis strays                                     |                                           |
|norcross-bartlett expedition                                           |                                           |
|terrestre. inflorescencia amarilla                                     |                                           |
|flores rojas                                                           |                                           |
|unkown. -march2003                                                     |                                           |
|hyla bistincta                                                         |                                           |
|lepraria arctica                                                       |                                           |
|hypnum revolutum                                                       |                                           |
|much branched and tall                                                 |                                           |
|anemone chinensis                                                      |                                           |
|muy escaso                                                             |                                           |
|blackish purple                                                        |                                           |
|bryologicum hjalmar moller                                             |                                           |
|backpack shocker                                                       |                                           |
|veraepacis latifasciatus                                               |                                           |
|csm seine                                                              |                                           |
|herbarium ksu                                                          |                                           |
|albatross philipp                                                      |                                           |
|imaged palmaceae                                                       |                                           |
|castanopsis chrysophylla                                               |                                           |
|aduncus paternus                                                       |                                           |
|society foray                                                          |                                           |
|dried out and disc                                                     |                                           |
|pannaria praetermissa                                                  |                                           |
|forel mars                                                             |                                           |
|chrysophyllum caespitosum                                              |                                           |
|trichomitrion immersus                                                 |                                           |
|augusti cactorum                                                       |                                           |
|populeum ovatum                                                        |                                           |
|oreophilus and artemisia tridentata                                    |                                           |
+-----------------------------------------------------------------------+-------------------------------------------+


In [ ]: