In [1]:
import pandas as pd
import numpy as np

from collections import Counter
from itertools import permutations

import networkx as nx

import os

In [2]:
def print_edges(edges):
    print(','.join([str(e) for e in edges]))

Табличка с референсами

Я переименовавыла контиги одного референса таким образом, чтобы они были в формате

имяРеференса_номерКонтига


In [3]:
!head -5 refs_edges.txt


5515248	s7_3	s9_8	s5_2
5607954	s7_3	s5_2
5601674	s7_10	s9_27	s5_1
5427068	s7_10	s5_1
5546564	s9_27

Считываем файл ответа sequence-threader, как он есть


In [4]:
df_ref = pd.read_csv("refs_edges.txt", header=None, names=["e"])

df_ref = df_ref["e"].str.split('\t', 1, expand=True)
df_ref.columns = ["e_id", "strains"]
df_ref = df_ref.set_index("e_id")
df_ref.index = df_ref.index.astype("int")
df_ref.loc[df_ref["strains"].isnull(), "strains"] = "nobody_0"
df_ref.head()


Out[4]:
strains
e_id
5515248 s7_3\ts9_8\ts5_2
5607954 s7_3\ts5_2
5601674 s7_10\ts9_27\ts5_1
5427068 s7_10\ts5_1
5546564 s9_27

Сплитим список референсов:


In [5]:
df_ref["strains"] = df_ref["strains"].str.split('\t')
df_ref["strains"] = df_ref["strains"].apply(lambda x: [s.rpartition('_')[0] for s in x])
df_ref["strains"] = df_ref["strains"].apply(Counter)
df_ref.head()


Out[5]:
strains
e_id
5515248 {'s7': 1, 's9': 1, 's5': 1}
5607954 {'s7': 1, 's5': 1}
5601674 {'s7': 1, 's9': 1, 's5': 1}
5427068 {'s7': 1, 's5': 1}
5546564 {'s9': 1}

Считаем копийность каждого ребра:


In [6]:
df_ref["single_copy"] = df_ref["strains"].apply(lambda x: x.most_common(1)[0][1] == 1)
df_ref.head()


Out[6]:
strains single_copy
e_id
5515248 {'s7': 1, 's9': 1, 's5': 1} True
5607954 {'s7': 1, 's5': 1} True
5601674 {'s7': 1, 's9': 1, 's5': 1} True
5427068 {'s7': 1, 's5': 1} True
5546564 {'s9': 1} True

Считываем профили


In [7]:
ref_profile = pd.read_csv("profile.csv", header=None, index_col=0)
for i in range(1, 11):
    ref_profile[i] = ref_profile[i] / ref_profile[i].sum()
ref_profile


Out[7]:
1 2 3 4 5 6 7 8 9 10
0
7 0.44 0.13 0.10 0.42 0.06 0.55 0.76 0.3 0.22 0.23
9 0.55 0.84 0.11 0.53 0.12 0.32 0.14 0.2 0.78 0.10
5 0.01 0.03 0.79 0.05 0.82 0.13 0.10 0.5 0.00 0.67

In [8]:
desman_profile = pd.read_csv("desman_freqs.csv",
                             header=None, index_col=0, dtype=float)
desman_profile.index = desman_profile.index.astype(int)
desman_profile


Out[8]:
1 2 3 4 5 6 7 8 9 10
0
0 0.494 0.164 0.121 0.475 0.074 0.582 0.773 0.320 0.261 0.252
1 0.494 0.802 0.098 0.472 0.108 0.286 0.122 0.179 0.736 0.091
2 0.012 0.034 0.781 0.053 0.818 0.132 0.105 0.501 0.002 0.658

Ищем соответствие между профилями:


In [9]:
ref_freqs = ref_profile.as_matrix()
ans_error = float("Inf")
ans_permut = None
for cur_permut in permutations(desman_profile.index):
    desman_freqs = desman_profile.loc[cur_permut, :].as_matrix()
    #print(cur_error, cur_permut)
    cur_error = ((ref_freqs - desman_freqs) ** 2).sum()
    if cur_error < ans_error:
        ans_error = cur_error
        ans_permut = cur_permut
print("Error:", ans_error)


Error: 0.023954

In [10]:
def invert_permutation(permutation):
    return [i for i, j in sorted(enumerate(permutation), key=lambda x: x[1])]

In [11]:
strains = list('s' + ref_profile.iloc[invert_permutation(ans_permut), :].index.astype(str))
strains


Out[11]:
['s7', 's9', 's5']

Табличка ответов DESMAN


In [12]:
!head -5 gene_assignment_etaS_df.csv


,0,1,2
e5515248,1.0,1.0,1.0
e5607954,0.0,0.0,1.0
e5601674,1.0,1.0,1.0
e5427068,1.0,0.0,1.0

In [13]:
df_desman = pd.read_csv("gene_assignment_etaS_df.csv", skiprows=1, names=["e_id"] + strains)
df_desman['e_id'] = df_desman['e_id'].str[1:].astype("int")
df_desman = df_desman.set_index('e_id')
df_desman[strains] = df_desman[strains].astype('int')

df_desman.head()


Out[13]:
s7 s9 s5
e_id
5515248 1 1 1
5607954 0 0 1
5601674 1 1 1
5427068 1 0 1
5546564 0 1 0

In [14]:
for cur_s in strains:
    df_ref[cur_s] = df_ref['strains'].apply(lambda x: int(cur_s in x))
    
df_ref.head()


Out[14]:
strains single_copy s7 s9 s5
e_id
5515248 {'s7': 1, 's9': 1, 's5': 1} True 1 1 1
5607954 {'s7': 1, 's5': 1} True 1 0 1
5601674 {'s7': 1, 's9': 1, 's5': 1} True 1 1 1
5427068 {'s7': 1, 's5': 1} True 1 0 1
5546564 {'s9': 1} True 0 1 0

Точность DESMAN


In [15]:
df_ref.sort_index(inplace=True)
df_desman.sort_index(inplace=True)

In [16]:
right_answers = (df_ref[strains] == df_desman[strains]).sum(axis=1) == len(strains)
print("Accuracy on all edges: %.2f" % (right_answers.sum() / len(df_ref)))


Accuracy on all edges: 0.80

Раскрашиваем граф для каждого штамма


In [17]:
if not os.path.exists("bandage_colors"):
    os.makedirs("bandage_colors")


for cur_s in strains:
    
    print('\n\n_______________', cur_s)

    df_ref['color'] = "#b0b0b0"  # grey


    #long = df_ref['length'] >= 500
    single = df_ref['single_copy']
    real_true = df_ref[cur_s] == 1
    desman_true = df_desman[cur_s] == 1

    #df_ref.loc[~long & real_true, 'color'] = 'Brown'

    df_ref.loc[ single  &  real_true  &  desman_true, 'color'] = 'Lime'
    df_ref.loc[~single  &  real_true  &  desman_true, 'color'] = 'Green'

    df_ref.loc[ single  &  real_true  & ~desman_true, 'color'] = 'Teal'
    df_ref.loc[~single  &  real_true  & ~desman_true, 'color'] = 'Navy'

    df_ref.loc[ single  & ~real_true  &  desman_true, 'color'] = 'Yellow'
    df_ref.loc[~single  & ~real_true  &  desman_true, 'color'] = 'Orange'


    df_ref['strains_print'] = df_ref['strains'].apply(
        lambda x: ", ".join('{}({})'.format(k, v) for k, v in x.items()))
    df_ref['strains_print'] = df_ref['strains_print'].apply(lambda x: x.replace('(1)', ''))

    df_ref[['strains_print', 'color']].to_csv("bandage_colors/{}.csv".format(cur_s), index_label='name')
    
    print("\nFN")
    print_edges(df_ref[real_true & ~desman_true].index)
    
    print("\nFP")
    print_edges(df_ref[~real_true & desman_true].index)



_______________ s7

FN
46552,325316,362196,482888,555284,648310,884642,902088,1156174,1219216,1233698,1253944,4691195,4711570,4809726,4865236,4879836,4899716,5047786,5056686,5161862,5190986,5210786,5221422,5229452,5239610,5243762,5260758,5318210,5321426,5328452,5336482,5337232,5338210,5338838,5343482,5346520,5347056,5347476,5354274,5354634,5355352,5365868,5375578,5388898,5399188,5402840,5409120,5414356,5417722,5419506,5419602,5426606,5442114,5453454,5453634,5460092,5462924,5466008,5483818,5483820,5486206,5488202,5490028,5490178,5491812,5491840,5498078,5499888,5500622,5502848,5506160,5510576,5514920,5516648,5522364,5527192,5535846,5536118,5536138,5536720,5536766,5537190,5539754,5541886,5541970,5542390,5542842,5543492,5544658,5545752,5547076,5559598,5570250,5570426,5570504,5572448,5574208,5574404,5575730,5578572,5580860,5580908,5582100,5582320,5582366,5584076,5585704,5586430,5586432,5586526,5586548,5589210,5589592,5589926,5590268,5590852,5591328,5592242,5592254,5592948,5593026,5593740,5593820,5593828,5594200,5594324,5595488,5595832,5598860,5598862,5599238,5599434,5599762,5599796,5600246,5600814,5600880,5601096,5601372,5601540,5602504,5602776,5603086,5603216,5603666,5603736,5603830,5603842,5603916,5604464,5604914,5604916,5605660,5605934,5605962,5606032,5606034,5606266,5606282,5606416,5606812,5607056,5607158,5607314,5607704,5607728,5607740,5607954

FP
1924,1926,19100,19102,19556,37286,47888,61404,89534,98704,189884,192284,203714,214858,215678,223104,223632,255314,272750,272984,289270,296028,303384,326238,377992,378376,379706,380526,402846,406924,406926,432460,434000,442066,450514,450516,473134,482080,505620,506858,517798,524488,533136,533268,534588,555282,555944,564898,564900,584338,618854,629748,636354,636478,636480,642606,652064,652066,656780,662306,668060,673240,699306,708460,721836,733230,733940,743010,743394,749452,749454,767906,790998,828112,831256,832228,842334,858356,861448,863838,864430,864432,913366,924810,925670,937036,957132,974684,986752,990714,1005690,1011618,1011620,1014674,1029092,1030836,1048444,1048446,1076706,1088300,1101978,1116336,1135644,1150706,1167784,1184412,1185728,1187864,1232754,1234270,1252044,1255324,1270220,1304358,1315332,1318376,1320286,1320288,1332154,1332156,1340038,1340040,1366022,1368474,1376472,1381530,1384180,1385844,1387262,1395252,1428658,1444198,1444200,1510046,1515266,1518744,1526146,1528596,4623028,4647719,4651235,4658189,4695988,4697916,4709432,4709546,4709746,4710836,4711564,4714708,4734636,4766200,4770830,4777566,4781760,4783818,4784096,4793502,4796452,4829986,4830776,4833792,4839184,4842422,4843088,4843454,4851222,4855764,4862632,4869950,4872410,4874958,4876212,4876780,4899430,4901574,4903710,4911290,4917270,4924108,4925050,4927956,4934746,4936930,4942238,4942862,4963332,4970476,4970978,4975146,4979258,4984570,4993418,5014028,5026924,5037904,5040370,5050374,5050932,5054966,5058304,5058706,5072540,5073506,5075298,5075556,5080716,5083632,5083958,5085710,5087656,5089160,5090084,5095774,5096734,5103960,5108606,5109762,5113422,5114206,5120630,5120722,5126140,5131070,5143476,5144858,5147270,5149260,5149310,5152316,5153856,5158392,5160600,5160638,5161778,5165654,5165728,5169354,5172580,5184292,5184816,5190064,5190928,5191212,5198536,5199614,5205140,5211310,5212142,5213290,5215892,5217652,5226266,5226416,5230622,5230624,5234904,5234910,5256394,5264522,5279898,5284030,5287862,5295056,5307490,5318688,5319120,5324016,5326370,5328478,5333592,5345120,5358178,5375566,5390538,5407030,5466218,5492668,5507914,5510380,5510382,5521944,5535446,5536142,5536258,5537610,5538154,5539776,5540484,5541236,5542250,5542578,5543376,5543520,5543568,5543714,5544302,5544784,5544836,5545178,5545476,5547126,5547624,5548096,5549426,5549468,5549526,5551578,5552972,5553894,5553902,5553932,5555874,5556602,5559988,5561024,5562462,5562470,5563548,5563550,5565068,5565070,5565248,5565612,5566040,5567856,5568640,5571752,5581080,5583330,5584124,5584130,5585270,5585278,5587058,5587524,5587532,5587544,5589804,5590382,5591388,5591776,5591896,5592502,5592732,5592876,5592880,5592998,5593426,5593462,5593574,5593728,5593856,5594714,5594862,5594968,5595084,5595130,5595158,5595378,5595444,5596552,5597150,5597388,5597978,5598250,5598390,5598512,5598516,5598550,5598596,5598604,5598608,5598706,5598778,5599002,5599056,5599058,5599162,5599522,5599528,5600514,5602416,5604078,5604134,5604622,5604692,5604704,5605202,5605210,5605264,5605354,5605564,5605568,5605716,5605844,5605854,5605930,5605954,5605994,5606090,5606094,5606120,5606140,5606272,5606380,5606384,5606386,5606404,5606446,5606514,5606536,5606538,5606636,5606690,5606700,5606712,5606734,5606750,5606764,5606802,5606816,5606858,5606862,5606864,5607014,5607036,5607040,5607042,5607182,5607716,5607906,5608046,5608048


_______________ s9

FN
243264,406926,432460,652066,1234270,1320288,4636893,5072540,5087656,5095774,5113730,5196536,5199614,5202180,5264522,5339384,5340408,5347922,5356042,5362610,5362872,5407030,5421760,5423168,5423630,5426862,5444828,5466218,5536258,5559596,5571752,5574438,5579288,5581080,5588018,5599270,5605906,5606284,5606804

FP
26948,110802,224254,244714,446182,447168,447854,447856,664488,783820,786128,843772,922428,936446,1253734,1315954,4640025,4691822,4709744,4963210,4984666,4987864,5017174,5045742,5079722,5115742,5137988,5142372,5161022,5180854,5184292,5195458,5211048,5211310,5213712,5215726,5215996,5217652,5220558,5222314,5224046,5227854,5228058,5228460,5250704,5263674,5287186,5290974,5291530,5291632,5291644,5318688,5319120,5322176,5323512,5324016,5328478,5333592,5352086,5469524,5478216,5488354,5488362,5488370,5488374,5506938,5511658,5511666,5511706,5512970,5515450,5518630,5518636,5525040,5532182,5533916,5537606,5537666,5539776,5543676,5543750,5543752,5544784,5545476,5546176,5546526,5546728,5546746,5547764,5547830,5547918,5548536,5548570,5549468,5549526,5551578,5555874,5557030,5565070,5565248,5567856,5568640,5568648,5571886,5572296,5575106,5579384,5580330,5582226,5587014,5587972,5587974,5588434,5588436,5588838,5588840,5589538,5589804,5590278,5590420,5591300,5591492,5591776,5592502,5592732,5592880,5594044,5594746,5594758,5594862,5595158,5595498,5596552,5597388,5598250,5598390,5598512,5598516,5598550,5598574,5598580,5598608,5598778,5599056,5599468,5600764,5601360,5603204,5603712,5603718,5603818,5604382,5604384,5604622,5604664,5604922,5605154,5605542,5605716,5605844,5605852,5605854,5605930,5605954,5605994,5606090,5606094,5606120,5606272,5606380,5606446,5606514,5606536,5606538,5606676,5606690,5606698,5606700,5606734,5606764,5606802,5606816,5607114,5607126


_______________ s5

FN
243264,533138,555944,652066,1101978,1334606,4823474,4901574,4927956,5058706,5072540,5085710,5109762,5120722,5147270,5149260,5149310,5158392,5205248,5226416,5230618,5230622,5230624,5336546,5345120,5347922,5358178,5393536,5466218,5514072,5519696,5535164,5536922,5547624,5548918,5550404,5552972,5571752,5578618,5589934,5592884,5601102

FP
18054,20152,72464,89534,182492,414398,447854,447856,580704,642606,664488,922428,1315954,1468424,1528594,4623861,4710466,5017174,5045742,5070318,5082420,5117298,5180854,5220558,5256394,5291632,5291640,5295056,5326370,5467704,5488354,5488362,5488370,5511658,5511666,5511706,5518630,5531628,5532182,5533024,5537664,5541480,5543568,5543714,5543750,5546576,5547918,5548536,5549468,5551578,5553902,5554022,5555874,5561024,5562462,5563544,5563548,5563550,5565070,5565248,5566040,5567856,5574016,5574148,5574156,5587014,5587166,5587660,5587682,5587972,5587974,5588434,5588838,5588840,5592502,5592876,5592880,5593462,5593856,5594968,5595084,5595378,5595498,5596264,5596272,5596552,5597388,5597716,5597718,5597978,5598250,5598390,5598512,5598516,5598550,5598596,5598604,5598608,5598706,5598778,5599002,5599056,5599058,5599162,5603652,5604622,5604636,5604638,5604700,5604706,5604746,5605154,5605476,5605554,5605844,5605954,5605994,5606016,5606090,5606094,5606120,5606272,5606380,5606384,5606386,5606392,5606404,5606446,5606498,5606514,5606536,5606538,5606636,5606690,5606700,5606712,5606750,5606764,5606802,5606816,5606858,5606862,5606864,5607014,5607036,5607040,5607042,5607350,5607358,5607362,5608046,5608090

Теперь в папке bandage_colors лежит раскраска для каждого из штаммов соответственно


In [18]:
!ls bandage_colors


s5.csv	s7.csv	s9.csv

In [20]:
!head bandage_colors/s5.csv


name,strains_print,color
1924,nobody,#b0b0b0
1926,nobody,#b0b0b0
6308,"s7, s9",#b0b0b0
18054,nobody,Yellow
19100,nobody,#b0b0b0
19102,nobody,#b0b0b0
19556,nobody,#b0b0b0
19572,nobody,#b0b0b0
20152,nobody,Yellow

In [ ]: