In [1]:
from pygenprop.database_file_parser import parse_genome_property_file
from pygenprop.results import GenomePropertiesResults
from pygenprop.assignment_file_parser import parse_genome_property_longform_file, parse_interproscan_file
import pandas as pd

In [2]:
def generate_differential_assignments(*results):
    """
    Build a side-by-side table of the assignments that differ between result sets.

    The given tables are joined column-wise on their index and only the rows whose
    values are not identical across every column are kept.

    :param results: One or more pandas Series/DataFrames to compare, joined on their index.
    :return: A DataFrame containing only the rows where the columns disagree. A row that
             is present in one input but absent from another (NaN after the join) counts
             as a disagreement.
    :raises ValueError: If no results are given (raised by ``pd.concat``).
    """
    comparison = pd.concat(results, axis=1, sort=False)

    # dropna=False makes a missing value count as a distinct value. With the default
    # (dropna=True) a row such as ('YES', NaN) has a single unique value and would be
    # reported as "no difference" even though one result set lacks the assignment.
    distinct_values_per_row = comparison.nunique(axis=1, dropna=False)

    # Index positionally with a plain boolean array so no index re-alignment is involved.
    return comparison[(distinct_values_per_row != 1).values]

In [3]:
# Parse the Genome Properties flat-file database into the property tree (`genprop_tree`).
# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
genome_properties_database_path = '/Users/lee/Dropbox/RandD/Repositories/genome-properties/flatfiles/genomeProperties.txt'

with open(genome_properties_database_path) as genome_properties_file:
    genprop_tree = parse_genome_property_file(genome_properties_file)

In [4]:
# Load the assignments stored in the Genome Properties longform assignment file and wrap
# them in a results object bound to the property tree.
# NOTE(review): absolute, machine-specific path.
longform_assignment_path = '/Users/lee/Google_Drive/Neufeld_Lab/Projects/ELA_GSB_test_data_for_Lee/Jackson_InterProScan_GenProp_Results/Updated_Fall_2018/genome_properties/C_chlorochromatii_CaD3'

with open(longform_assignment_path) as assignment_file:
    file_assignment_results = parse_genome_property_longform_file(assignment_file)
    genome_properties_file_result = GenomePropertiesResults(
        file_assignment_results,
        genome_properties_tree=genprop_tree,
    )

In [5]:
# Load the assignments parsed from the InterProScan TSV for the same organism and wrap
# them in a results object bound to the property tree.
# NOTE(review): absolute, machine-specific path.
interproscan_tsv_path = '/Users/lee/Google_Drive/Neufeld_Lab/Projects/ELA_GSB_test_data_for_Lee/Jackson_InterProScan_GenProp_Results/Updated_Fall_2018/interproscan_results/C_chlorochromatii_CaD3.tsv'

with open(interproscan_tsv_path) as assignment_file:
    file_assignment_results = parse_interproscan_file(assignment_file)

    # Override the sample name; presumably so this result set gets its own column label
    # when compared against the longform-file results for the same organism.
    file_assignment_results.sample_name = "C_chlorochromatii_CaD3_InterProScan"

    inteproscan_file_result = GenomePropertiesResults(
        file_assignment_results,
        genome_properties_tree=genprop_tree,
    )

In [6]:
len(genprop_tree) # Number of genome properties in the parsed database tree.


Out[6]:
1286

In [7]:
len(genome_properties_file_result.property_results) # Number of property results built from the Genome Properties longform assignment file.


Out[7]:
1286

In [8]:
len(inteproscan_file_result.property_results) # Number of property results built from the InterProScan TSV file.


Out[8]:
1286

In [9]:
# Keep only the properties whose assignment is not identical in both result sets
# (longform assignment file vs. InterProScan TSV).
differential_property_assignments = generate_differential_assignments(
    genome_properties_file_result.property_results,
    inteproscan_file_result.property_results,
)

In [10]:
len(differential_property_assignments) # Number of properties whose assignment differs between the InterProScan file and the genome properties assignment file.


Out[10]:
37

In [11]:
# Keep only the steps whose assignment is not identical in both result sets
# (longform assignment file vs. InterProScan TSV).
differential_step_assignments = generate_differential_assignments(
    genome_properties_file_result.step_results,
    inteproscan_file_result.step_results,
)

In [12]:
len(differential_step_assignments) # Number of steps whose assignment differs between the InterProScan file and the genome properties assignment file.


Out[12]:
87

In [13]:
len(inteproscan_file_result.step_results) # Total number of step results in the InterProScan-derived result set.


Out[13]:
6525

In [14]:
# IDs of every property listed in `genprop_tree.leafs`.
global_leaf_genome_property_ids = set(leaf.id for leaf in genprop_tree.leafs)

# Of the properties with differing assignments, keep the IDs that are also leaf IDs.
differing_property_ids = set(differential_property_assignments.index.tolist())
differential_genome_property_leaf_ids = differing_property_ids & global_leaf_genome_property_ids

In [15]:
# Restrict the differing property assignments to rows whose property ID is a differing leaf ID.
is_differing_leaf_property_row = differential_property_assignments.index.get_level_values(0).isin(
    differential_genome_property_leaf_ids
)
differential_leaf_genome_property_assignments = differential_property_assignments.loc[is_differing_leaf_property_row]

In [16]:
len(differential_leaf_genome_property_assignments) # Number of leaf genome properties whose assignment differs between the InterProScan file and the genome properties assignment file.


Out[16]:
22

In [17]:
# Restrict the differing step assignments to steps whose parent property ID (index level 0)
# is one of the differing leaf property IDs.
is_step_of_differing_leaf_property = differential_step_assignments.index.get_level_values(0).isin(
    differential_genome_property_leaf_ids
)
differential_leaf_step_assignments = differential_step_assignments.loc[is_step_of_differing_leaf_property]

In [18]:
len(differential_leaf_step_assignments) # Number of differing step assignments that belong to the differing leaf genome properties.


Out[18]:
20

Differential steps are due to the Genome Properties Perl code improperly assigning YES to steps where there are multiple insufficient evidences and some of these are missing.

See: https://github.com/ebi-pf-team/genome-properties/issues/30


In [19]:
# Show the differing step assignments that belong to the differing leaf genome properties.
differential_leaf_step_assignments


Out[19]:
C_chlorochromatii_CaD3 C_chlorochromatii_CaD3_InterProScan
Genome_Property_ID Step_Number
GenProp0457 6 YES NO
7 YES NO
GenProp0458 2 YES NO
GenProp0685 1 YES NO
GenProp0701 1 YES NO
GenProp0715 2 YES NO
GenProp0724 8 YES NO
GenProp0750 1 YES NO
3 YES NO
GenProp0754 1 YES NO
GenProp0756 1 YES NO
GenProp0759 2 YES NO
GenProp0839 2 YES NO
GenProp0840 2 YES NO
GenProp0842 3 YES NO
GenProp0877 1 YES NO
GenProp0917 12 YES NO
GenProp0927 2 YES NO
GenProp1074 2 YES NO
GenProp1094 3 YES NO

In [20]:
# Show the leaf genome properties whose assignment differs between the two result sets.
differential_leaf_genome_property_assignments


Out[20]:
C_chlorochromatii_CaD3 C_chlorochromatii_CaD3_InterProScan
GenProp0457 PARTIAL NO
GenProp0458 PARTIAL NO
GenProp0617 PARTIAL NO
GenProp0685 PARTIAL NO
GenProp0701 PARTIAL NO
GenProp0715 PARTIAL NO
GenProp0724 PARTIAL NO
GenProp0750 YES PARTIAL
GenProp0754 YES NO
GenProp0756 PARTIAL NO
GenProp0759 PARTIAL NO
GenProp0813 PARTIAL NO
GenProp0839 YES PARTIAL
GenProp0840 YES PARTIAL
GenProp0842 YES PARTIAL
GenProp0877 YES PARTIAL
GenProp0899 PARTIAL NO
GenProp0917 PARTIAL NO
GenProp0927 YES PARTIAL
GenProp1072 PARTIAL NO
GenProp1074 YES NO
GenProp1094 PARTIAL NO

In [21]:
# IDs of the leaf properties whose property-level assignment differs.
leaf_property_ids_with_differences = set(differential_leaf_genome_property_assignments.index.tolist())

# IDs of the properties that own at least one differing leaf step.
property_ids_with_step_differences = {
    property_id
    for property_id, _step_number in differential_leaf_step_assignments.index.tolist()
}

# True only when the two ID sets share no member at all, i.e. when *no* differing leaf
# property has a differing step.
# NOTE(review): a single shared ID makes this False, so False does not show that every
# differing leaf property is explained by a differing step - confirm this is the intended check.
differential_property_assignments_not_caused_by_leaves = leaf_property_ids_with_differences.isdisjoint(
    property_ids_with_step_differences
)

In [22]:
# True only when no differing leaf property shares an ID with any differing leaf step.
differential_property_assignments_not_caused_by_leaves


Out[22]:
False

In [26]:
# Summarise how often the two result sets disagree, as percentages.
# All four percentages are relative to the totals of the InterProScan-derived result set.
# NOTE(review): the leaf figures are divided by the totals for *all* properties/steps,
# not by leaf-only totals - confirm that this is the intended denominator.
total_property_count = len(inteproscan_file_result.property_results)
total_step_count = len(inteproscan_file_result.step_results)

# '{:.2f}' replaces '{:1.2}': without a presentation type, a precision of 2 switches to
# scientific notation for any value of 10 or more (e.g. '1.2e+01%').
print('Missmatched Properties: {:.2f}%'.format(len(differential_property_assignments) / total_property_count * 100))
print('Missmatched Steps: {:.2f}%'.format(len(differential_step_assignments) / total_step_count * 100))
print('Missmatched Leaf Properties: {:.2f}%'.format(len(differential_leaf_genome_property_assignments) / total_property_count * 100))
print('Missmatched Leaf Steps: {:.2f}%'.format(len(differential_leaf_step_assignments) / total_step_count * 100))
print('Missmatched Leaf Properties Not Caused By Missmatched Steps: {}'.format(differential_property_assignments_not_caused_by_leaves))


Missmatched Properties: 2.9%
Missmatched Steps: 1.3%
Missmatched Leaf Properties: 1.7%
Missmatched Leaf Steps: 0.31%
Missmatched Leaf Properties Not Caused By Missmatched Steps: False

In [ ]: