In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from itertools import permutations
import networkx as nx
import os
In [2]:
def print_edges(edges):
print(','.join([str(e) for e in edges]))
In [3]:
G = nx.DiGraph()
with open("assembly/K127/prelim_graph.gfa") as f:
for line in f:
line = line.split()
line_type = line[0]
# S 238024 ACCAATTAT KC:i:37210
if line_type == "S":
v_name = int(line[1])
v_length = len(line[2])
G.add_node(v_name, length=v_length)
# L 238322 + 19590 - 55M
if line_type == "L":
v1 = int(line[1])
v2 = int(line[3])
G.add_edge(v1, v2)
print('Number of components:', nx.number_weakly_connected_components(G))
# remain only largest component
new_G = nx.DiGraph()
for g in nx.weakly_connected_component_subgraphs(G):
print(g.number_of_nodes())
if new_G.number_of_nodes() < g.number_of_nodes():
new_G = g.copy()
G = new_G.copy()
In [4]:
df_ref = pd.read_csv("refs/refs_edges.txt", header=None, names=["e"])
df_ref = df_ref["e"].str.split('\t', 1, expand=True)
df_ref.columns = ["e_id", "strains"]
df_ref = df_ref.set_index("e_id")
df_ref.index = df_ref.index.astype("int")
df_ref.head()
Out[4]:
Оставляем только ребра из большой компоненты:
In [5]:
df_ref = df_ref.loc[list(G.nodes)]
Выводим рёбра, которые никому не принадлежат, на экран, и выкидываем их из таблицы:
In [6]:
nobody_edges = df_ref[df_ref["strains"].isnull()].index
print("{n} edges are Nobody".format(n=len(nobody_edges)), '\n')
print_edges(nobody_edges)
df_ref = df_ref.loc[set(df_ref.index) - set(nobody_edges)]
Сплитим список референсов:
In [7]:
df_ref["strains"] = df_ref["strains"].str.split('\t')
df_ref["strains"] = df_ref["strains"].apply(lambda x: [s.rpartition('_')[0] for s in x])
df_ref["strains"] = df_ref["strains"].apply(Counter)
df_ref.head()
Out[7]:
Считаем копийность каждого ребра:
In [8]:
df_ref["single_copy"] = df_ref["strains"].apply(lambda x: x.most_common(1)[0][1] == 1)
df_ref.head()
Out[8]:
Считываем длину рёбер:
In [9]:
df_length = pd.read_csv("edges_lengths.tsv", sep='\t', header=None, names=["length"])
df_length = df_length.astype("int")
df_ref = df_ref.join(df_length, how='inner')
df_ref.head()
Out[9]:
In [10]:
ref_profile = pd.read_csv("refs/profile.csv", header=None, index_col=0)
for i in range(1, 11):
ref_profile[i] = ref_profile[i] / ref_profile[i].sum()
ref_profile
Out[10]:
In [11]:
desman_profile = pd.read_csv("desman_results/%s_0/desman_freqs.csv" % len(ref_profile),
header=None, index_col=0, dtype=float)
desman_profile.index = desman_profile.index.astype(int)
desman_profile
Out[11]:
Ищем соответствие между профилями:
In [12]:
ref_freqs = ref_profile.as_matrix()
ans_error = float("Inf")
ans_permut = None
for cur_permut in permutations(desman_profile.index):
desman_freqs = desman_profile.loc[cur_permut, :].as_matrix()
#print(cur_error, cur_permut)
cur_error = ((ref_freqs - desman_freqs) ** 2).sum()
if cur_error < ans_error:
ans_error = cur_error
ans_permut = cur_permut
print("Error:", ans_error)
In [13]:
def invert_permutation(permutation):
return [i for i, j in sorted(enumerate(permutation), key=lambda x: x[1])]
In [15]:
strains = list('s' + ref_profile.iloc[invert_permutation(ans_permut), :].index.astype(str))
strains
Out[15]:
In [16]:
df_desman = pd.read_csv("desman_results/%s_0/gene_assignment_etaS_df.csv" % len(strains), skiprows=1,
names=["e_id"] + strains)
df_desman['e_id'] = df_desman['e_id'].str[1:].astype("int")
df_desman = df_desman.set_index('e_id')
df_desman[strains] = df_desman[strains].astype('int')
#df_desman = df_desman.join(df_length, how='inner')
df_desman = df_desman.loc[set(df_ref.index) - set(nobody_edges)]
df_desman.head()
Out[16]:
In [17]:
for cur_s in strains:
df_ref[cur_s] = df_ref['strains'].apply(lambda x: int(cur_s in x))
df_ref.head()
Out[17]:
In [18]:
df_ref.sort_index(inplace=True)
df_desman.sort_index(inplace=True)
In [19]:
right_answers = (df_ref[strains] == df_desman[strains]).sum(axis=1) == len(strains)
print("Accuracy on all edges: %.2f" % (right_answers.sum() / len(df_ref)))
In [20]:
long_edges = df_ref['length'] > 500
print("Percent of long edges: %.2f" % (long_edges.sum() / len(df_ref)))
print("Accuracy on long edges: %.2f" % ((right_answers & long_edges).sum() / long_edges.sum()))
In [21]:
if not os.path.exists("bandage_colors"):
os.makedirs("bandage_colors")
for cur_s in strains:
print('\n\n', "_________________________", cur_s, '\n')
selected_nodes = df_ref[df_ref[cur_s] == 1].index
G_sub = G.subgraph(selected_nodes)
print("components in reference subgraph:", nx.number_weakly_connected_components(G_sub))
selected_nodes = df_desman[df_desman[cur_s] == 1].index
G_sub = G.subgraph(selected_nodes)
print("components in DESMAN subgraph:", nx.number_weakly_connected_components(G_sub))
df_ref['color'] = "#b0b0b0" # grey
long = df_ref['length'] >= 500
single = df_ref['single_copy']
real_true = df_ref[cur_s] == 1
desman_true = df_desman[cur_s] == 1
df_ref.loc[~long & real_true, 'color'] = 'Brown'
df_ref.loc[long & single & real_true & desman_true, 'color'] = 'Lime'
df_ref.loc[long & ~single & real_true & desman_true, 'color'] = 'Green'
df_ref.loc[long & single & real_true & ~desman_true, 'color'] = 'Teal'
df_ref.loc[long & ~single & real_true & ~desman_true, 'color'] = 'Navy'
df_ref.loc[long & single & ~real_true & desman_true, 'color'] = 'Yellow'
df_ref.loc[long & ~single & ~real_true & desman_true, 'color'] = 'Orange'
df_ref['strains_print'] = df_ref['strains'].apply(lambda x: ", ".join('{}({})'.format(k, v) for k, v in x.items()))
df_ref['strains_print'] = df_ref['strains_print'].apply(lambda x: x.replace('(1)', ''))
df_ref[['strains_print', 'color']].to_csv("bandage_colors/{}.csv".format(cur_s), index_label='name')
print("\nlong FN")
print_edges(df_ref[long & real_true & ~desman_true].index)
print("\nlong FP")
print_edges(df_ref[long & ~real_true & desman_true].index)
In [ ]:
In [ ]: