In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from itertools import permutations
import networkx as nx
import os
In [2]:
def print_edges(edges):
    """Print the given edges on a single line, separated by commas."""
    as_text = map(str, edges)
    print(','.join(as_text))
Я переименовала контиги одного референса таким образом, чтобы они были в формате
имяРеференса_номерКонтига
In [3]:
# Preview the first 5 lines of refs_edges.txt (shell command, needs a Unix-like `head`).
!head -5 refs_edges.txt
Считываем файл ответа sequence-threader, как он есть
In [4]:
# Load the raw sequence-threader output, one edge per line: an edge id, then
# (optionally) a tab and the tab-separated names of the contigs it maps to.
# NOTE(review): read_csv runs with its default comma separator and a single
# column name, so lines are assumed to contain no commas -- confirm.
df_ref = pd.read_csv("refs_edges.txt", header=None, names=["e"])

# Split on the FIRST tab only, separating the edge id from the contig list.
# `n` is passed by keyword because recent pandas makes it keyword-only.
# reindex guarantees both columns exist even when no line contains a tab.
df_ref = df_ref["e"].str.split('\t', n=1, expand=True).reindex(columns=[0, 1])
df_ref.columns = ["e_id", "strains"]
df_ref = df_ref.set_index("e_id")
df_ref.index = df_ref.index.astype("int")

# Edges with no contig list get a placeholder name so later steps can treat
# every row uniformly.
df_ref.loc[df_ref["strains"].isnull(), "strains"] = "nobody_0"
df_ref.head()
Out[4]:
Разбиваем список референсов на отдельные имена:
In [5]:
# Replace each tab-separated contig list with a Counter keyed by the part of
# the contig name before its last underscore.
contig_lists = df_ref["strains"].str.split('\t')
reference_lists = contig_lists.apply(
    lambda contigs: [name.rpartition('_')[0] for name in contigs])
df_ref["strains"] = reference_lists.apply(Counter)
df_ref.head()
Out[5]:
Считаем копийность каждого ребра:
In [6]:
# An edge is single-copy when even its most frequent reference occurs exactly once.
def _is_single_copy(counts):
    """Return True if the highest count in the Counter equals 1."""
    _, top_count = counts.most_common(1)[0]
    return top_count == 1

df_ref["single_copy"] = df_ref["strains"].apply(_is_single_copy)
df_ref.head()
Out[6]:
In [7]:
# Reference abundance profile: no header row, first CSV column is the index.
# NOTE(review): presumably rows are strains and columns are samples -- confirm.
ref_profile = pd.read_csv("profile.csv", header=None, index_col=0)

# Normalise every column so that it sums to 1. Dividing by the column sums
# works for any number of columns, unlike looping over hard-coded labels 1..10.
ref_profile = ref_profile / ref_profile.sum()
ref_profile
Out[7]:
In [8]:
# DESMAN frequency table: no header row, first column is the index, all values
# parsed as floats.
desman_profile = pd.read_csv(
    "desman_freqs.csv",
    dtype=float,
    index_col=0,
    header=None,
)
# Convert the index labels to integers.
int_labels = desman_profile.index.astype(int)
desman_profile.index = int_labels
desman_profile
Out[8]:
Ищем соответствие между профилями:
In [9]:
# Find the row ordering of the DESMAN profile that is closest to the reference
# profile, measured by the sum of squared element-wise differences.
# This tries every permutation of the DESMAN rows (n! candidates), so it is
# only practical for a small number of rows.
# `.values` replaces DataFrame.as_matrix(), which was removed in pandas 1.0.
ref_freqs = ref_profile.values
ans_error = float("Inf")
ans_permut = None
for cur_permut in permutations(desman_profile.index):
    # .loc gets an explicit list so the tuple is never read as a multi-level key.
    desman_freqs = desman_profile.loc[list(cur_permut), :].values
    cur_error = ((ref_freqs - desman_freqs) ** 2).sum()
    if cur_error < ans_error:
        ans_error = cur_error
        ans_permut = cur_permut
print("Error:", ans_error)
In [10]:
def invert_permutation(permutation):
    """Return the positions of ``permutation`` ordered by the values stored there.

    For a permutation of ``0..n-1`` the result is its inverse. Equal values
    keep their original relative order.
    """
    values = list(permutation)
    return sorted(range(len(values)), key=values.__getitem__)
In [11]:
# Reorder the reference profile rows by the inverse of the best permutation and
# build the strain names as 's' + row label.
inverse_positions = invert_permutation(ans_permut)
reordered_labels = ref_profile.iloc[inverse_positions, :].index.astype(str)
strains = ['s' + label for label in reordered_labels]
strains
Out[11]:
In [12]:
# Preview the first 5 lines of gene_assignment_etaS_df.csv (shell command).
!head -5 gene_assignment_etaS_df.csv
In [13]:
# Read the DESMAN gene assignment table, skipping its first line and supplying
# our own column names: the edge id followed by one column per strain.
column_names = ["e_id"] + strains
df_desman = pd.read_csv(
    "gene_assignment_etaS_df.csv",
    skiprows=1,
    names=column_names,
)
# Drop the first character of each id and keep the rest as an integer.
numeric_ids = df_desman['e_id'].str[1:].astype("int")
df_desman['e_id'] = numeric_ids
df_desman = df_desman.set_index('e_id')
df_desman[strains] = df_desman[strains].astype('int')
df_desman.head()
Out[13]:
In [14]:
# Add one 0/1 column per strain: 1 when the strain name is a key of the edge's
# reference Counter, 0 otherwise.
for cur_s in strains:
    df_ref[cur_s] = [int(cur_s in counts) for counts in df_ref['strains']]
df_ref.head()
Out[14]:
In [15]:
# Put both tables into ascending index order so they can be compared row by row.
df_ref = df_ref.sort_index()
df_desman = df_desman.sort_index()
In [16]:
# An edge is counted as correct only when every strain column agrees between
# the reference table and the DESMAN table.
per_strain_match = df_ref[strains] == df_desman[strains]
right_answers = per_strain_match.all(axis=1)
accuracy = right_answers.sum() / len(df_ref)
print("Accuracy on all edges: %.2f" % accuracy)
In [17]:
# Write one colour/label CSV per strain into bandage_colors/ and print the
# edges where DESMAN disagrees with the reference.

# exist_ok avoids the race between a separate existence check and the creation.
os.makedirs("bandage_colors", exist_ok=True)

# The label does not depend on the strain, so it is built once, before the loop.
# A count of 1 is left implicit ("ref" rather than "ref(1)"); formatting it
# directly avoids stripping a literal "(1)" that is part of a reference name.
df_ref['strains_print'] = df_ref['strains'].apply(
    lambda x: ", ".join(
        k if v == 1 else '{}({})'.format(k, v) for k, v in x.items()))

single = df_ref['single_copy']

for cur_s in strains:
    print('\n\n_______________', cur_s)

    real_true = df_ref[cur_s] == 1        # strain is on the edge per the reference
    desman_true = df_desman[cur_s] == 1   # strain is on the edge per DESMAN

    # Default: grey. Rows matching none of the six cases below keep this colour.
    df_ref['color'] = "#b0b0b0"
    # Reference and DESMAN both say present (light = single-copy, dark = multi-copy).
    df_ref.loc[single & real_true & desman_true, 'color'] = 'Lime'
    df_ref.loc[~single & real_true & desman_true, 'color'] = 'Green'
    # Present in the reference but missed by DESMAN (false negatives).
    df_ref.loc[single & real_true & ~desman_true, 'color'] = 'Teal'
    df_ref.loc[~single & real_true & ~desman_true, 'color'] = 'Navy'
    # Reported by DESMAN but absent in the reference (false positives).
    df_ref.loc[single & ~real_true & desman_true, 'color'] = 'Yellow'
    df_ref.loc[~single & ~real_true & desman_true, 'color'] = 'Orange'

    df_ref[['strains_print', 'color']].to_csv(
        "bandage_colors/{}.csv".format(cur_s), index_label='name')

    print("\nFN")
    print_edges(df_ref[real_true & ~desman_true].index)
    print("\nFP")
    print_edges(df_ref[~real_true & desman_true].index)
Теперь в папке bandage_colors лежит раскраска для каждого из штаммов соответственно
In [18]:
# List the files in the bandage_colors directory (shell command).
!ls bandage_colors
In [20]:
# Preview the start of one colour file.
# NOTE(review): the file name s5.csv is hard-coded -- this assumes a strain named "s5" exists.
!head bandage_colors/s5.csv
In [ ]: