In [1]:
from pygenprop.database_file_parser import parse_genome_property_file
from pygenprop.results import GenomePropertiesResults
from pygenprop.assignment_file_parser import parse_genome_property_longform_file, parse_interproscan_file
import pandas as pd
In [2]:
def generate_differential_assignments(*results):
    """
    Collect the rows on which the supplied assignment tables disagree.

    The inputs are placed side by side with an outer join on their index.
    A row is kept when its values are not all identical. A label that is
    missing from one of the inputs shows up as NaN after the join and is
    counted as a disagreement rather than being ignored.

    :param results: One or more pandas objects accepted by ``pd.concat``
                    (e.g. one assignment table per sample).
    :return: The joined table restricted to the rows whose values differ.
    """
    comparison = pd.concat(results, axis=1, sort=False)

    # dropna=False makes NaN a value in its own right; the default would
    # report a "present in one, absent in the other" row as having a single
    # distinct value and hide it from the comparison.
    distinct_values_per_row = comparison.nunique(axis=1, dropna=False)

    # A plain boolean array sidesteps any index re-alignment during selection.
    return comparison[(distinct_values_per_row != 1).values]
In [3]:
# Location of the Genome Properties flat-file database used for this run.
GENOME_PROPERTIES_FLATFILE_PATH = '/Users/lee/Dropbox/RandD/Repositories/genome-properties/flatfiles/genomeProperties.txt'

# Parse the flat file into the tree that both result objects are built against.
with open(GENOME_PROPERTIES_FLATFILE_PATH) as genome_properties_file:
    genprop_tree = parse_genome_property_file(genome_properties_file)
In [4]:
# Long-form Genome Properties assignment file for C_chlorochromatii_CaD3.
LONGFORM_ASSIGNMENT_PATH = '/Users/lee/Google_Drive/Neufeld_Lab/Projects/ELA_GSB_test_data_for_Lee/Jackson_InterProScan_GenProp_Results/Updated_Fall_2018/genome_properties/C_chlorochromatii_CaD3'

with open(LONGFORM_ASSIGNMENT_PATH) as assignment_file:
    # Parse the precomputed assignments, then attach them to the property tree.
    file_assignment_results = parse_genome_property_longform_file(assignment_file)
    genome_properties_file_result = GenomePropertiesResults(file_assignment_results, genome_properties_tree=genprop_tree)
In [5]:
# InterProScan TSV output for the same organism (C_chlorochromatii_CaD3).
INTERPROSCAN_TSV_PATH = '/Users/lee/Google_Drive/Neufeld_Lab/Projects/ELA_GSB_test_data_for_Lee/Jackson_InterProScan_GenProp_Results/Updated_Fall_2018/interproscan_results/C_chlorochromatii_CaD3.tsv'

with open(INTERPROSCAN_TSV_PATH) as assignment_file:
    # NOTE: this rebinds file_assignment_results, replacing the long-form parse result.
    file_assignment_results = parse_interproscan_file(assignment_file)
    # Give this sample its own label; presumably this keeps the two result
    # columns distinguishable when they are compared side by side (confirm).
    file_assignment_results.sample_name = "C_chlorochromatii_CaD3_InterProScan"
    inteproscan_file_result = GenomePropertiesResults(file_assignment_results, genome_properties_tree=genprop_tree)
In [6]:
# Size of the tree parsed from the genome properties flat file.
len(genprop_tree) # Number of properties in the tree.
Out[6]:
In [7]:
# Size of the property-level results built from the long-form assignment file.
len(genome_properties_file_result.property_results) # Number of assigned properties from genome properties assignment file.
Out[7]:
In [8]:
# Size of the property-level results built from the InterProScan TSV file.
# This is also the denominator of the property mismatch percentages printed at the end.
len(inteproscan_file_result.property_results) # Number of assigned properties from InterProScan TSV file.
Out[8]:
In [9]:
# Property-level comparison: keep only the properties on which the long-form
# file and the InterProScan-derived results disagree.
differential_property_assignments = generate_differential_assignments(
    genome_properties_file_result.property_results,
    inteproscan_file_result.property_results,
)
In [10]:
# Row count of the property-level difference table.
len(differential_property_assignments) # Number of differing property assignments between InterProScan file and genome properties assignment file.
Out[10]:
In [11]:
# Step-level comparison: keep only the steps on which the long-form file and
# the InterProScan-derived results disagree.
differential_step_assignments = generate_differential_assignments(
    genome_properties_file_result.step_results,
    inteproscan_file_result.step_results,
)
In [12]:
# Row count of the step-level difference table.
len(differential_step_assignments) # Number of differing step assignments between InterProScan file and genome properties assignment file.
Out[12]:
In [13]:
# Size of the step-level results built from the InterProScan TSV file.
# This is also the denominator of the step mismatch percentages printed at the end.
len(inteproscan_file_result.step_results) # Number of steps assigned.
Out[13]:
In [14]:
# Identifiers of every node the tree reports through its `leafs` attribute.
global_leaf_genome_property_ids = set(leaf.id for leaf in genprop_tree.leafs)

# Of the properties whose assignments differ, keep the ones that are leaves.
differential_genome_property_leaf_ids = (
    set(differential_property_assignments.index.tolist()) & global_leaf_genome_property_ids
)
In [15]:
# Restrict the property-level differences to rows whose first index level is
# one of the differing leaf property identifiers.
differential_leaf_genome_property_assignments = differential_property_assignments.loc[
    differential_property_assignments.index.get_level_values(0).isin(differential_genome_property_leaf_ids)
]
In [16]:
len(differential_leaf_genome_property_assignments) # Number of differing leaf property assignments between InterProScan file and genome properties assignment file.
Out[16]:
In [17]:
# Restrict the step-level differences to steps whose first index level (the
# owning property) is one of the differing leaf property identifiers.
differential_leaf_step_assignments = differential_step_assignments.loc[
    differential_step_assignments.index.get_level_values(0).isin(differential_genome_property_leaf_ids)
]
In [18]:
# Number of differing step assignments whose owning property is a leaf property
# that itself has a differing assignment.
len(differential_leaf_step_assignments)
Out[18]:
See: https://github.com/ebi-pf-team/genome-properties/issues/30
In [19]:
# Display the differing step assignments that belong to differing leaf properties.
differential_leaf_step_assignments
Out[19]:
In [20]:
# Display the differing property assignments restricted to leaf properties.
differential_leaf_genome_property_assignments
Out[20]:
In [21]:
# Boolean flag: True when no differing leaf property appears as the owning
# property of any differing leaf step, i.e. the two identifier sets are disjoint.
differential_property_assignments_not_caused_by_leaves = set(
    differential_leaf_genome_property_assignments.index.tolist()
).isdisjoint(
    # Each step index entry unpacks as (property identifier, step identifier).
    {owning_property for owning_property, _step in differential_leaf_step_assignments.index.tolist()}
)
In [22]:
# Display the disjointness flag: True when the differing leaf properties and the
# owning properties of the differing leaf steps share no identifier.
differential_property_assignments_not_caused_by_leaves
Out[22]:
In [26]:
# Summarise the comparison as percentages of the InterProScan-derived results.
# The '.2f' presentation type forces fixed-point output with two decimals;
# a bare '.2' precision means two significant digits and switches to exponent
# notation for values of 10 or more (e.g. '1.2e+01').
print('Missmatched Properties: {:.2f}%'.format(len(differential_property_assignments)/len(inteproscan_file_result.property_results)*100))
print('Missmatched Steps: {:.2f}%'.format(len(differential_step_assignments)/len(inteproscan_file_result.step_results)*100))
print('Missmatched Leaf Properties: {:.2f}%'.format(len(differential_leaf_genome_property_assignments)/len(inteproscan_file_result.property_results)*100))
print('Missmatched Leaf Steps: {:.2f}%'.format(len(differential_leaf_step_assignments)/len(inteproscan_file_result.step_results)*100))
# This last value is a boolean (result of a set disjointness check), not a count.
print('Missmatched Leaf Properties Not Caused By Missmatched Steps: {}'.format(differential_property_assignments_not_caused_by_leaves))
In [ ]: