In [83]:
import requests
from io import StringIO
from pygenprop.results import GenomePropertiesResults
from pygenprop.database_file_parser import parse_genome_properties_flat_file
from pygenprop.assignment_file_parser import parse_interproscan_file, parse_genome_property_longform_file
In [84]:
# Genome Properties is a flat-file database that can be found on GitHub.
# The latest release of the database can be found at the following URL.
genome_properties_database_url = 'https://raw.githubusercontent.com/ebi-pf-team/genome-properties/master/flatfiles/genomeProperties.txt'

# For this tutorial we download the file directly into the Jupyter notebook. Alternatively,
# one could download the file with the Unix wget or curl commands.
with requests.Session() as current_download:
    response = current_download.get(genome_properties_database_url, stream=True)
    # Fail loudly on an HTTP error (e.g. 404 or 500) instead of handing an
    # error page to the flat-file parser as if it were the database.
    response.raise_for_status()
    # The parser reads from a file-like object, so wrap the downloaded text in a StringIO.
    tree = parse_genome_properties_flat_file(StringIO(response.text))
In [85]:
# Count the genome properties in the parsed tree.
# The tutorial's original run reported 1286; the count may differ for other database releases.
len(tree)
Out[85]:
In [86]:
# Print the name of every property in the tree whose type is "GUILD".
for guild_property in filter(lambda candidate: candidate.type == 'GUILD', tree):
    print(guild_property.name)
In [87]:
# Parse the InterProScan file of the first sample (E_coli_K12.tsv).
with open('E_coli_K12.tsv') as first_sample_handle:
    assignment_cache_1 = parse_interproscan_file(first_sample_handle)
In [88]:
# Parse the InterProScan file of the second sample (E_coli_O157_H7.tsv).
with open('E_coli_O157_H7.tsv') as second_sample_handle:
    assignment_cache_2 = parse_interproscan_file(second_sample_handle)
In [89]:
# Build the results comparison object from both samples' assignments
# and the parsed Genome Properties tree.
results = GenomePropertiesResults(
    assignment_cache_1,
    assignment_cache_2,
    properties_tree=tree,
)
In [90]:
# Get a single property from the tree by its identifier
# (GenProp0074, which this tutorial refers to as virulence).
virulence = tree['GenProp0074']
In [91]:
# Show the `virulence` property object as this cell's output.
virulence
Out[91]:
In [92]:
# Collect the identifier of each child property of virulence.
types_of_vir = [child_property.id for child_property in virulence.children]
In [93]:
# `property_results` is used to compare property assignments between the two samples.
results.property_results
Out[93]:
In [94]:
# `step_results` is used to compare step assignments between the two samples.
results.step_results
Out[94]:
In [95]:
# Get the properties with differing assignments
# (presumably those whose assignment differs between the two samples).
results.differing_property_results
Out[95]:
In [96]:
# Get property-level assignments (steps=False) for the child properties of virulence.
results.get_results(*types_of_vir, steps=False)
Out[96]:
In [97]:
# Get step-level assignments (steps=True) for the child properties of virulence.
results.get_results(*types_of_vir, steps=True)
Out[97]:
In [98]:
# Get raw counts (normalize=False) of virulence properties assigned
# YES, NO, and PARTIAL per organism.
results.get_results_summary(*types_of_vir, steps=False, normalize=False)
Out[98]:
In [99]:
# Get raw counts (normalize=False) of virulence steps assigned
# YES, NO, and PARTIAL per organism.
results.get_results_summary(*types_of_vir, steps=True, normalize=False)
Out[99]:
In [100]:
# Get percentages (normalize=True) of virulence steps assigned
# YES, NO, and PARTIAL per organism.
results.get_results_summary(*types_of_vir, steps=True, normalize=True)
Out[100]:
In [ ]: