author: lukethompson@gmail.com
date: 5 Oct 2017
language: Python 3.5
conda environment: emp-py3
license: BSD3

envo_hierarchy_lookup.ipynb


In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
# Relative path to the ENVO biome text file that is passed to
# term_parent_to_dict() further down.
path_envobiome = '../../data/metadata-refine/envo_biome_name_is_a.txt'

In [3]:
# Turn off pandas' chained-assignment check for this session, so no
# SettingWithCopy warning is emitted when writing into DataFrame slices.
pd.options.mode.chained_assignment = None

In [4]:
def term_to_envo_hierarchy(term, dictionary):
    """Return the lineage of ``term``, ordered from the top-most ancestor down.

    Parameters
    ----------
    term : hashable
        The term to look up (e.g. ``'desert biome'``).
    dictionary : dict
        Mapping of each term to its direct parent.

    Returns
    -------
    list
        ``[root, ..., parent, term]``. A term that only ever appears as a
        parent (a root) yields ``[term]``.

    Raises
    ------
    KeyError
        If ``term`` appears neither as a key nor as a parent in ``dictionary``.
    ValueError
        If following the parent links revisits a term (cyclic mapping).
    """
    if term not in dictionary:
        # A root has no parent entry of its own but is still a known term;
        # anything else is genuinely unknown and keeps raising KeyError.
        if term not in dictionary.values():
            raise KeyError(term)
        return [term]

    lineage = [term]
    visited = {term}
    while term in dictionary:
        term = dictionary[term]
        if term in visited:
            # Without this guard a cyclic mapping would loop forever.
            raise ValueError('cycle detected at term: {!r}'.format(term))
        visited.add(term)
        lineage.append(term)

    # Built child -> root; callers expect root -> child.
    lineage.reverse()
    return lineage

In [5]:
def term_parent_to_dict(path):
    """Build a ``{term name: parent name}`` dictionary from an ENVO biome file.

    The file is expected to consist of alternating lines::

        name: <term>
        is_a: <parent>

    Lines are paired purely by position (1st with 2nd, 3rd with 4th, ...).
    Empty lines are ignored, and a trailing unpaired line is dropped.

    The file is read as plain text rather than as CSV, so commas and quotes
    inside a term are kept verbatim instead of being treated as delimiters.

    Parameters
    ----------
    path : str
        Path to the text file.

    Returns
    -------
    dict
        Maps each ``name`` value to its ``is_a`` value, with the
        ``'name: '`` / ``'is_a: '`` markers removed.
    """
    with open(path, encoding='utf-8') as handle:
        stripped = (raw.rstrip('\n') for raw in handle)
        lines = [text for text in stripped if text]

    # Even positions hold the 'name:' lines, odd positions the 'is_a:' lines.
    names = [text.replace('name: ', '') for text in lines[0::2]]
    parents = [text.replace('is_a: ', '') for text in lines[1::2]]

    # zip() stops at the shorter list, which drops an unpaired last name.
    return dict(zip(names, parents))

In [6]:
# Build the term -> parent lookup from the file at path_envobiome.
dict_name_is_a = term_parent_to_dict(path_envobiome)

In [8]:
# Example lookup: hierarchy for 'desert biome' using dict_name_is_a.
term_to_envo_hierarchy('desert biome', dict_name_is_a)


Out[8]:
['environmental system', 'biome', 'terrestrial biome', 'desert biome']

Bash script to generate input text files

#!/bin/bash

# """"""""""""""""""""
# Contact: jkanbar@ucsd.edu (jad kanbar for questions)
# The file 'envo.obo' has to be in the same directory as this script.
# This script filters the envo ontology, by taking the file 'envo.obo'
# and outputting two files in the same directory as the script:
# 1.) envo_name_is_a_relationship_full.txt
# 2.) envo_name_is_a.txt.
#
# An OBO stanza in 'envo.obo' looks like this:
#    [term]
#    tag: value
#    tag: value
#    tag: value
# Relevant ontology terms have this term relationship:
#    name: value
#    is_a: value
#    relationship: value
#
# 1.) 'envo_name_is_a_relationship_full.txt' outputs:
#      name: value
#      is_a: value
#      relationship: value
# Example:
#      name: waterfall
#      is_a: hydrographic feature
#      relationship: part_of stream
#
# 'envo_name_is_a_relationship_full.txt' outputs all relevant
# EMP envo ontology classes including:
# environmental feature (EMP metadata category: env_feature)
# environmental matter (EMP metadata category: env_matter)
# environmental system (EMP metadata category: env_biome)
# Not all terms have both an 'is_a: value' and 'relationship: value',
# but each term must have at least one. Each 'name: value' may have multiple
# associated 'is_a: value' and 'relationship: value' indicating a term's link to
# multiple class types (i.e. env_feature and env_biome). Spaces do not separate
# terms.
#
# 2.) 'envo_name_is_a.txt' outputs:
#      name: *biome
#      is_a: value
# Example:
#      name: polar desert biome
#      is_a: desert biome
#
# 'envo_name_is_a.txt' outputs only class environmental system
# (EMP metadata category: env_biome).
# Each 'name: *biome' has only one associated 'is_a: value'. Spaces do not
# separate terms.
#
# """"""""""""""""""""


#######################################
# Filter an OBO file down to its 'name:', 'is_a:' and 'relationship:' lines
# and derive the biome-only name/is_a pairs from them.
# Arguments:
#   $1 - path to the OBO file (e.g. envo.obo)
# Outputs:
#   Prints $1 to stdout. Writes 'envo_name_is_a_relationship_full.txt' and
#   'envo_name_is_a.txt' into the current working directory.
# Returns:
#   0 on success; non-zero if the input cannot be read or a stage fails.
#######################################
envo_parse () {
  local obo_file="${1-}"
  local full_out='envo_name_is_a_relationship_full.txt'
  local biome_out='envo_name_is_a.txt'
  local work_dir
  local status=0

  if [[ -z "$obo_file" || ! -r "$obo_file" ]]; then
    printf 'envo_parse: cannot read OBO file: %s\n' "$obo_file" >&2
    return 1
  fi
  printf '%s\n' "$obo_file"

  # Intermediate files go into a private directory so that nothing in the
  # working directory is overwritten and cleanup is a single removal.
  work_dir=$(mktemp -d) || return 1

  {
    # Retrieves, in order, every line whose tag is one of:
    #   name: value
    #   is_a: value
    #   relationship: value
    awk '$1 == "name:" || $1 == "is_a:" || $1 == "relationship:"' \
      < "$obo_file" > "$work_dir/tags" &&

    # Drops every 'name:' line that is immediately followed by another
    # 'name:' line, i.e. a term with no 'is_a:' or 'relationship:' line.
    # Per the original author these are the obsolete terms.
    awk '{ tag[NR] = $1; line[NR] = $0 }
         END {
           for (i = 1; i <= NR; i++)
             if (!(tag[i] == "name:" && tag[i + 1] == "name:"))
               print line[i]
         }' "$work_dir/tags" > "$work_dir/paired" &&

    # Drops the last remaining line.
    sed '$d' "$work_dir/paired" > "$work_dir/trimmed" &&

    # Strips the "ENVO:" prefix, every digit and the " ! " separator, which
    # leaves only the human-readable values.
    awk '{ gsub("ENVO:", ""); gsub("[0-9]", ""); gsub(" ! ", ""); print }' \
      "$work_dir/trimmed" > "$work_dir/clean" &&

    # Removes the last two lines, which per the original author are:
    #   name: has_increased_levels_of
    #   is_a: has_parthas_part
    # and do not hold meaningful envo ontology information.
    sed 'N;$!P;$!D;$d' "$work_dir/clean" > "$full_out" &&

    # Emits each biome 'name:' line together with the 'is_a:' line directly
    # after it. A line counts as a biome name when it is a 'name:' line and
    # either ends in "biome" or is one of the listed zone terms, which lack
    # the "biome" suffix in the ontology. Requiring the 'name:' tag keeps an
    # 'is_a: ... biome' line followed by a second 'is_a:' line from being
    # emitted as a pair.
    # NOTE(review): the spelling 'nertic' is kept exactly as in the original
    # script; confirm it against the term name used in envo.obo.
    awk 'BEGIN {
           extra["name: neritic supra-littoral zone"] = 1
           extra["name: nertic littoral zone"] = 1
           extra["name: neritic sub-littoral zone"] = 1
         }
         { tag[NR] = $1; last[NR] = $NF; line[NR] = $0 }
         END {
           for (i = 1; i < NR; i++)
             if (tag[i] == "name:" && tag[i + 1] == "is_a:" &&
                 (last[i] == "biome" || (line[i] in extra)))
               print line[i] "\n" line[i + 1]
         }' "$full_out" > "$biome_out"
  } || status=$?

  rm -rf -- "$work_dir"
  return "$status"
}

# Run the parser on 'envo.obo', resolved relative to the current directory.
envo_parse envo.obo