#!/bin/bash
# """"""""""""""""""""
# Contact: jkanbar@ucsd.edu (jad kanbar for questions)
# The file 'envo.obo' has to be in the same directory as this script.
# This script filters the envo ontology, by taking the file 'envo.obo' and
# outputting two files in the same directory as the script:
# 1.) envo_name_is_a_relationship_full.txt
# 2.) envo_name_is_a.txt.
#
# An OBO stanza in 'envo.obo' looks like this:
# [term]
# tag: value
# tag: value
# tag: value
# Relevant ontology terms have this term relationship:
# name: value
# is_a: value
# relationship: value
#
# 1.) 'envo_name_is_a_relationship_full.txt' outputs:
# name: value
# is_a: value
# relationship: value
# Example:
# name: waterfall
# is_a: hydrographic feature
# relationship: part_of stream
#
# 'envo_name_is_a_relationship_full.txt' outputs all relevant
# EMP envo ontology classes including:
# environmental feature (EMP metadata category: env_feature)
# environmental matter (EMP metadata category: env_matter)
# environmental system (EMP metadata category: env_biome)
# Not all terms have both an 'is_a: value' and 'relationship: value',
# but each term must have at least one. Each 'name: value' may have multiple
# associated 'is_a: value' and 'relationship: value' indicating a term's link to
# multiple class types (i.e. env_feature and env_biome). Spaces do not separate
# terms.
#
# 2.) 'envo_name_is_a.txt' outputs:
# name: *biome
# is_a: value
# Example:
# name: polar desert biome
# is_a: desert biome
#
# 'envo_name_is_a.txt' outputs only class environmental system
# (EMP metadata category: env_biome).
# Each 'name: *biome' has only one associated 'is_a: value'. Spaces do not
# separate terms.
#
# """"""""""""""""""""
#######################################
# Filter an OBO ontology file down to its name / is_a / relationship lines
# and derive the biome-only name / is_a pairs from that result.
# Arguments:
#   $1 - path to the OBO file (e.g. envo.obo)
# Outputs:
#   Prints $1 on stdout.
#   Writes envo_name_is_a_relationship_full.txt and envo_name_is_a.txt into
#   the current working directory. No scratch files are created.
# Returns:
#   0 on success, 1 if the input cannot be read or a stage fails,
#   2 if no argument is given.
#######################################
envo_parse () {
  local obo_file full_out biome_out

  if (( $# < 1 )); then
    printf 'usage: envo_parse <obo-file>\n' >&2
    return 2
  fi
  obo_file=$1
  full_out=envo_name_is_a_relationship_full.txt
  biome_out=envo_name_is_a.txt

  printf '%s\n' "$obo_file"

  if [[ ! -r "$obo_file" || -d "$obo_file" ]]; then
    printf 'envo_parse: cannot read OBO file: %s\n' "$obo_file" >&2
    return 1
  fi

  # Pipeline stages, in order:
  #  1. Keep only lines whose first field is name:, is_a: or relationship:.
  #  2. Drop a 'name:' line that is immediately followed by another 'name:'
  #     line, i.e. a term with no is_a/relationship (obsolete terms). Each
  #     line is held back until the next one has been seen.
  #  3. Drop the final line of the stream (kept from the original pipeline,
  #     which gives no rationale for it).
  #  4. Strip the "ENVO:" prefix, every digit and the " ! " separator so only
  #     the human-readable labels remain.
  #  5. Drop the last two lines, which for envo.obo are
  #       name: has_increased_levels_of
  #       is_a: has_parthas_part
  #     and carry no meaningful ontology information.
  # The file is fed through a redirect so a path containing spaces, '=' or a
  # leading '-' is never interpreted by awk.
  awk '$1 == "name:" || $1 == "is_a:" || $1 == "relationship:"' < "$obo_file" \
    | awk '
        NR > 1 && !(held_tag == "name:" && $1 == "name:") { print held }
        { held_tag = $1; held = $0 }
        END { if (NR > 0) print held }
      ' \
    | sed '$d' \
    | awk '{ gsub(/ENVO:/, ""); gsub(/[0-9]/, ""); gsub(/ ! /, ""); print }' \
    | sed 'N;$!P;$!D;$d' > "$full_out" || return 1

  # Biome pairs: a 'name:' line that either ends in the word "biome" or is
  # one of the zone terms listed below (biome-class terms whose names lack
  # the suffix), followed directly by an 'is_a:' line. Only 'name:' lines may
  # open a pair, so an 'is_a: ... biome' line followed by a second 'is_a:'
  # line is not emitted as a pair of its own.
  # NOTE(review): "nertic" is spelled exactly as in the original script;
  # confirm it matches the spelling used in envo.obo.
  awk '
    BEGIN {
      unsuffixed["name: neritic supra-littoral zone"] = 1
      unsuffixed["name: nertic littoral zone"] = 1
      unsuffixed["name: neritic sub-littoral zone"] = 1
    }
    {
      tag[NR] = $1
      text[NR] = $0
      biome_name[NR] = ($1 == "name:" && ($NF == "biome" || ($0 in unsuffixed)))
    }
    END {
      for (i = 1; i < NR; i++)
        if (biome_name[i] && tag[i + 1] == "is_a:")
          print text[i] "\n" text[i + 1]
    }
  ' "$full_out" > "$biome_out" || return 1
}
# Entry point: parse 'envo.obo'. The path is relative, so the file is looked
# up in the directory the script is run from.
envo_parse envo.obo