In [1]:
import pandas as pd
import numpy as np

from collections import Counter
from itertools import permutations

import networkx as nx

import os

In [2]:
def print_edges(edges):
    print(','.join([str(e) for e in edges]))

Считываем граф

Oставляем только большую компоненту связности


In [3]:
G = nx.DiGraph()

with open("assembly/K127/prelim_graph.gfa") as f:
    for line in f:
        line = line.split()
        line_type = line[0]
        
        # S 238024 ACCAATTAT KC:i:37210
        if line_type == "S":
            v_name = int(line[1])
            v_length = len(line[2])
            G.add_node(v_name, length=v_length)
        
        # L 238322 + 19590 - 55M
        if line_type == "L":
            v1 = int(line[1])
            v2 = int(line[3])
            G.add_edge(v1, v2)
            
print('Number of components:', nx.number_weakly_connected_components(G))      
            
# remain only largest component
new_G = nx.DiGraph()
for g in nx.weakly_connected_component_subgraphs(G):
    print(g.number_of_nodes())
    if new_G.number_of_nodes() < g.number_of_nodes():
        new_G = g.copy()
G = new_G.copy()


Number of components: 213
3111
2
1
1
2
1
3
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
4
1
1
1
1
3
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
3
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
5
1
1
1
1
1
1
1
1
1
1
1
2
1
1
3
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1

Табличка с референсами

Считываем файл ответа, как он есть


In [4]:
df_ref = pd.read_csv("refs/refs_edges.txt", header=None, names=["e"])

df_ref = df_ref["e"].str.split('\t', 1, expand=True)
df_ref.columns = ["e_id", "strains"]
df_ref = df_ref.set_index("e_id")
df_ref.index = df_ref.index.astype("int")
df_ref.head()


Out[4]:
strains
e_id
3051154 s9_8\ts6_1
2899616 s6_1
3094698 s9_8
2935900 s6_1
2652616 s6_1

Оставляем только ребра из большой компоненты:


In [5]:
df_ref = df_ref.loc[list(G.nodes)]

Выводим рёбра, которые никому не принадлежат, на экран, и выкидываем их из таблицы:


In [6]:
nobody_edges = df_ref[df_ref["strains"].isnull()].index
print("{n} edges are Nobody".format(n=len(nobody_edges)), '\n')
print_edges(nobody_edges)

df_ref = df_ref.loc[set(df_ref.index) - set(nobody_edges)]


152 edges are Nobody 

729174,3088472,3096792,3031324,860472,3088702,3088748,573858,697314,3007572,3097690,549984,492682,754962,427370,345464,296672,3098422,3098516,2803630,3008452,3098612,3098614,3098630,2992144,469208,3098894,3090860,3099142,35410,3099352,60138,3099394,3099422,3099442,52020,3099466,3099472,3099478,3099530,3099572,2878418,2968584,3099750,3099812,584978,2977164,93618,822772,3084050,3970,3084272,3084280,3084288,3084294,2699274,413714,2633754,3100712,2977882,594124,3100972,2634046,3101016,2879852,528752,3084726,291264,2634264,2634292,3027512,242300,848530,406166,3085038,2986754,472022,504790,2987088,2634836,2962672,2635004,2815234,3093794,2635068,3085692,3102122,2635270,2668084,513620,3102368,38598,2766560,3102448,2635526,137132,2635698,3102662,432070,407536,407538,2938888,3102740,2635832,3102828,47402,3103042,3094850,3103052,825696,3103086,3103108,2636170,358814,2636198,547280,2636246,3103196,498146,367082,276994,621094,3103380,3095228,3095258,2636516,3021554,3103480,3103542,3103720,834592,3103792,3103834,3103866,736400,3103926,3103964,3013918,2997548,736582,540050,3104178,384480,3104334,3104412,3104484,761578,737108,3096404,3104654,3104666,3104710

Сплитим список референсов:


In [7]:
df_ref["strains"] = df_ref["strains"].str.split('\t')
df_ref["strains"] = df_ref["strains"].apply(lambda x: [s.rpartition('_')[0] for s in x])
df_ref["strains"] = df_ref["strains"].apply(Counter)
df_ref.head()


Out[7]:
strains
e_id
2719746 {'s9': 2}
3096592 {'s6': 1}
2981908 {'s6': 1}
3096602 {'s9': 1}
3096610 {'s9': 1, 's6': 1}

Считаем копийность каждого ребра:


In [8]:
df_ref["single_copy"] = df_ref["strains"].apply(lambda x: x.most_common(1)[0][1] == 1)
df_ref.head()


Out[8]:
strains single_copy
e_id
2719746 {'s9': 2} False
3096592 {'s6': 1} True
2981908 {'s6': 1} True
3096602 {'s9': 1} True
3096610 {'s9': 1, 's6': 1} True

Считываем длину рёбер:


In [9]:
df_length = pd.read_csv("edges_lengths.tsv", sep='\t', header=None, names=["length"])
df_length = df_length.astype("int")

df_ref = df_ref.join(df_length, how='inner')
df_ref.head()


Out[9]:
strains single_copy length
2719746 {'s9': 2} False 140
3096592 {'s6': 1} True 1674
2981908 {'s6': 1} True 305
3096602 {'s9': 1} True 2779
3096610 {'s9': 1, 's6': 1} True 8878

Считываем профили


In [10]:
ref_profile = pd.read_csv("refs/profile.csv", header=None, index_col=0)
for i in range(1, 11):
    ref_profile[i] = ref_profile[i] / ref_profile[i].sum()
ref_profile


Out[10]:
1 2 3 4 5 6 7 8 9 10
0
9 0.41 0.83 0.98 0.74 0.09 0.82 0.49 0.99 0.37 0.99
6 0.59 0.17 0.02 0.26 0.91 0.18 0.51 0.01 0.63 0.01

In [11]:
desman_profile = pd.read_csv("desman_results/%s_0/desman_freqs.csv" % len(ref_profile),
                             header=None, index_col=0, dtype=float)
desman_profile.index = desman_profile.index.astype(int)
desman_profile


Out[11]:
1 2 3 4 5 6 7 8 9 10
0
0 0.435 0.841 0.984 0.753 0.127 0.838 0.508 0.991 0.396 0.991
1 0.565 0.159 0.016 0.247 0.873 0.162 0.492 0.009 0.604 0.009

Ищем соответствие между профилями:


In [12]:
ref_freqs = ref_profile.as_matrix()
ans_error = float("Inf")
ans_permut = None
for cur_permut in permutations(desman_profile.index):
    desman_freqs = desman_profile.loc[cur_permut, :].as_matrix()
    #print(cur_error, cur_permut)
    cur_error = ((ref_freqs - desman_freqs) ** 2).sum()
    if cur_error < ans_error:
        ans_error = cur_error
        ans_permut = cur_permut
print("Error:", ans_error)


Error: 0.007252

In [13]:
def invert_permutation(permutation):
    return [i for i, j in sorted(enumerate(permutation), key=lambda x: x[1])]

In [15]:
strains = list('s' + ref_profile.iloc[invert_permutation(ans_permut), :].index.astype(str))
strains


Out[15]:
['s9', 's6']

Табличка ответов DESMAN


In [16]:
df_desman = pd.read_csv("desman_results/%s_0/gene_assignment_etaS_df.csv" % len(strains), skiprows=1,
                  names=["e_id"] + strains)
df_desman['e_id'] = df_desman['e_id'].str[1:].astype("int")
df_desman = df_desman.set_index('e_id')
df_desman[strains] = df_desman[strains].astype('int')

#df_desman = df_desman.join(df_length, how='inner')

df_desman = df_desman.loc[set(df_ref.index) - set(nobody_edges)]
df_desman.head()


Out[16]:
s9 s6
e_id
2719746 1 0
3096592 0 1
2981908 0 1
3096602 1 0
3096610 1 0

In [17]:
for cur_s in strains:
    df_ref[cur_s] = df_ref['strains'].apply(lambda x: int(cur_s in x))
    
df_ref.head()


Out[17]:
strains single_copy length s9 s6
2719746 {'s9': 2} False 140 1 0
3096592 {'s6': 1} True 1674 0 1
2981908 {'s6': 1} True 305 0 1
3096602 {'s9': 1} True 2779 1 0
3096610 {'s9': 1, 's6': 1} True 8878 1 1

Точность DESMAN


In [18]:
df_ref.sort_index(inplace=True)
df_desman.sort_index(inplace=True)

In [19]:
right_answers = (df_ref[strains] == df_desman[strains]).sum(axis=1) == len(strains)
print("Accuracy on all edges: %.2f" % (right_answers.sum() / len(df_ref)))


Accuracy on all edges: 0.68

In [20]:
long_edges = df_ref['length'] > 500
print("Percent of long edges: %.2f" % (long_edges.sum() / len(df_ref)))
print("Accuracy on long edges: %.2f" % ((right_answers & long_edges).sum() / long_edges.sum()))


Percent of long edges: 0.82
Accuracy on long edges: 0.69

Раскрашиваем граф для конкретного штамма


In [21]:
if not os.path.exists("bandage_colors"):
    os.makedirs("bandage_colors")


for cur_s in strains:
    print('\n\n', "_________________________", cur_s, '\n')
    
    selected_nodes = df_ref[df_ref[cur_s] == 1].index
    G_sub = G.subgraph(selected_nodes)
    print("components in reference subgraph:", nx.number_weakly_connected_components(G_sub))

    selected_nodes = df_desman[df_desman[cur_s] == 1].index
    G_sub = G.subgraph(selected_nodes)
    print("components in   DESMAN  subgraph:", nx.number_weakly_connected_components(G_sub))

    df_ref['color'] = "#b0b0b0"  # grey


    long = df_ref['length'] >= 500
    single = df_ref['single_copy']
    real_true = df_ref[cur_s] == 1
    desman_true = df_desman[cur_s] == 1

    df_ref.loc[~long & real_true, 'color'] = 'Brown'

    df_ref.loc[long & single  &  real_true  &  desman_true, 'color'] = 'Lime'
    df_ref.loc[long & ~single  &  real_true  &  desman_true, 'color'] = 'Green'

    df_ref.loc[long &  single  &  real_true  & ~desman_true, 'color'] = 'Teal'
    df_ref.loc[long & ~single  &  real_true  & ~desman_true, 'color'] = 'Navy'

    df_ref.loc[long &  single  & ~real_true  &  desman_true, 'color'] = 'Yellow'
    df_ref.loc[long & ~single  & ~real_true  &  desman_true, 'color'] = 'Orange'


    df_ref['strains_print'] = df_ref['strains'].apply(lambda x: ", ".join('{}({})'.format(k, v) for k, v in x.items()))
    df_ref['strains_print'] = df_ref['strains_print'].apply(lambda x: x.replace('(1)', ''))

    df_ref[['strains_print', 'color']].to_csv("bandage_colors/{}.csv".format(cur_s), index_label='name')
    
    print("\nlong FN")
    print_edges(df_ref[long & real_true & ~desman_true].index)
    
    print("\nlong FP")
    print_edges(df_ref[long & ~real_true & desman_true].index)



 _________________________ s9 

components in reference subgraph: 1
components in   DESMAN  subgraph: 22

long FN
3104004

long FP
2636772,2702468,2713804,2720574,2772264,2774372,2793100,2824318,2831124,2859096,2867600,2883122,2892396,2893568,2899676,2902722,2916484,2922788,2928688,2932392,2932886,2934428,2935938,2938262,2942248,2957606,2958440,2958574,2958918,2959666,2959850,2960582,2960832,2961778,2963054,2963380,2965194,2966204,2966448,2968910,2968944,2969870,2970378,2971380,2971916,2971960,2972212,2973592,2974534,2974998,2975152,2976938,2978002,2978272,2978986,2979040,2979130,2979952,2980184,2980312,2980930,2981574,2982708,2983094,2984292,2984914,2985162,2985766,2987998,2988864,2988942,2989212,2989532,2989642,2989646,2989870,2992374,2995236,2996364,3004272,3004868,3005364,3007012,3013726,3084856,3085372,3086446,3086732,3087016,3087936,3088714,3094106,3094388,3095060,3095446,3095648,3095700,3095748,3096308,3096436,3096446,3096752,3096972,3097144,3097682,3097708,3097896,3098010,3098060,3098202,3099012,3099084,3099342,3099396,3099724,3100022,3102882,3102922,3103116,3103158,3103182,3103274,3103362,3103576,3103744,3103878,3103988,3104058,3104068,3104292,3104296,3104298,3104304,3104350,3104380,3104462,3104522,3104678,3104736,3104742


 _________________________ s6 

components in reference subgraph: 91
components in   DESMAN  subgraph: 681

long FN
2636772,2702468,2713804,2720574,2772264,2774372,2793100,2824318,2831124,2859096,2867600,2883122,2892396,2893568,2899676,2902722,2916484,2922788,2928688,2932392,2932886,2934428,2935938,2938262,2942248,2957606,2958440,2958574,2958918,2959666,2959850,2960582,2960832,2961778,2963054,2963380,2965194,2966204,2966448,2968910,2968944,2969870,2970378,2971380,2971916,2971960,2972212,2973592,2974534,2974998,2975152,2976788,2976938,2977290,2977486,2978002,2978272,2978986,2979040,2979130,2979294,2979952,2980184,2980312,2980930,2981574,2982708,2983094,2984292,2984416,2984914,2985162,2985766,2987998,2988548,2988864,2988942,2989212,2989374,2989440,2989532,2989534,2989624,2989640,2989642,2989646,2989776,2989800,2989826,2989848,2989870,2990186,2992374,2994706,2995236,2996364,2997832,3004272,3004868,3005364,3007012,3008862,3013726,3031166,3033290,3036848,3038232,3038888,3040880,3041506,3041732,3041840,3042450,3043508,3043568,3043794,3043860,3043884,3045904,3045984,3046734,3047268,3047328,3047832,3048244,3048612,3048946,3049254,3049620,3049750,3049786,3050024,3050300,3050712,3050946,3051154,3051258,3051282,3051398,3051568,3051712,3052428,3052506,3052512,3052940,3053296,3053520,3053852,3053980,3054126,3054990,3055162,3055486,3056558,3056668,3056694,3056706,3057360,3057564,3057634,3057990,3058014,3058104,3058110,3058456,3058638,3058774,3058786,3058792,3059126,3059548,3059652,3059670,3059806,3060126,3060144,3060228,3060548,3060820,3060906,3062118,3063476,3063616,3063744,3064096,3064464,3064732,3064828,3064984,3065112,3065192,3065252,3065384,3066206,3066374,3067076,3067844,3068750,3069418,3069652,3070400,3070894,3071212,3071356,3071370,3071372,3072008,3072024,3073046,3073234,3073236,3073430,3073802,3074494,3074496,3074630,3076262,3076310,3076312,3079400,3079764,3081120,3082712,3083624,3084520,3084562,3084708,3084804,3084856,3084868,3085014,3085152,3085154,3085222,3085324,3085352,3085372,3085534,3085546,3085664,3085918,3085972,3086316,3086390,3086406,3086424,3086426,3086446,3086484,3086522,3086564,3086598,3086604,3086692,3086700,3086706,3086732,3086804,3086842,3086904,3086952,3086988,3086992,3087016,3087072,3087100,3087178,3087194,3087200,3087306,3087350,3087354,3087424,3087742,3087838,3087860,3087906,3087936,3087942,3087948,3087952,3088152,3088396,3088450,3088608,3088676,3088714,3088762,3088808,3088836,3088868,3088884,3088888,3088894,3088922,3090110,3091446,3091702,3091708,3091996,3092168,3092306,3092386,3092426,3092570,3092750,3093298,3094014,3094034,3094074,3094106,3094164,3094170,3094176,3094226,3094276,3094352,3094370,3094388,3094466,3094584,3094586,3094636,3094734,3094778,3095048,3095060,3095306,3095418,3095444,3095446,3095470,3095538,3095648,3095676,3095684,3095692,3095694,3095700,3095748,3095768,3095826,3095922,3096006,3096026,3096028,3096040,3096112,3096180,3096192,3096246,3096260,3096294,3096308,3096400,3096436,3096446,3096574,3096610,3096616,3096640,3096678,3096720,3096748,3096752,3096756,3096860,3096868,3096882,3096896,3096972,3096996,3097038,3097040,3097144,3097146,3097160,3097182,3097256,3097284,3097294,3097320,3097404,3097438,3097460,3097496,3097500,3097510,3097548,3097560,3097572,3097588,3097626,3097648,3097664,3097682,3097694,3097708,3097806,3097820,3097836,3097880,3097882,3097886,3097896,3097900,3097946,3097966,3097976,3098010,3098020,3098038,3098052,3098060,3098074,3098176,3098182,3098202,3098210,3098246,3098270,3098274,3098294,3098310,3098334,3098350,3098440,3098446,3098456,3098462,3098488,3098636,3098648,3098680,3098738,3098750,3098762,3098796,3098798,3098804,3098814,3098854,3098872,3098904,3098918,3098978,3099012,3099058,3099084,3099090,3099092,3099098,3099104,3099112,3099118,3099130,3099154,3099166,3099172,3099220,3099232,3099266,3099270,3099308,3099368,3099396,3099638,3099720,3099782,3099820,3099824,3099826,3099828,3099834,3099842,3099850,3099852,3099856,3099860,3099864,3099868,3099870,3099874,3100312,3100676,3100978,3101028,3101148,3101168,3101198,3101210,3101332,3101346,3101376,3101418,3101454,3101548,3101566,3101626,3101632,3101742,3101806,3102472,3102474,3102486,3102500,3102526,3102582,3102586,3102600,3102614,3102630,3102632,3102658,3102668,3102674,3102680,3102682,3102706,3102756,3102760,3102764,3102766,3102824,3102830,3102832,3102838,3102856,3102868,3102872,3102882,3102920,3102922,3102932,3102936,3102944,3102988,3103010,3103032,3103036,3103038,3103058,3103080,3103088,3103090,3103096,3103116,3103132,3103136,3103140,3103158,3103166,3103180,3103182,3103186,3103190,3103192,3103220,3103232,3103242,3103250,3103254,3103266,3103270,3103274,3103290,3103308,3103312,3103324,3103348,3103356,3103362,3103372,3103390,3103402,3103416,3103420,3103422,3103464,3103488,3103496,3103498,3103500,3103504,3103508,3103520,3103522,3103536,3103540,3103548,3103552,3103554,3103556,3103558,3103560,3103566,3103578,3103580,3103590,3103592,3103596,3103600,3103602,3103608,3103618,3103622,3103628,3103630,3103632,3103636,3103644,3103660,3103680,3103686,3103688,3103700,3103704,3103710,3103730,3103744,3103758,3103788,3103798,3103808,3103820,3103836,3103838,3103848,3103852,3103864,3103878,3103886,3103896,3103898,3103906,3103908,3103914,3103920,3103922,3103924,3103936,3103940,3103944,3103948,3103958,3103962,3103968,3103980,3103988,3103990,3104000,3104016,3104020,3104034,3104036,3104038,3104058,3104060,3104068,3104074,3104102,3104104,3104110,3104146,3104152,3104154,3104156,3104158,3104162,3104164,3104168,3104172,3104184,3104186,3104188,3104190,3104194,3104198,3104200,3104204,3104214,3104224,3104232,3104238,3104250,3104258,3104266,3104274,3104288,3104292,3104296,3104298,3104304,3104310,3104314,3104316,3104328,3104342,3104346,3104350,3104366,3104372,3104374,3104376,3104378,3104382,3104384,3104388,3104392,3104400,3104406,3104410,3104414,3104416,3104418,3104420,3104422,3104424,3104426,3104428,3104430,3104432,3104434,3104436,3104438,3104440,3104442,3104444,3104446,3104448,3104450,3104452,3104454,3104456,3104458,3104460,3104462,3104492,3104498,3104572,3104634,3104646,3104648

long FP
3097766,3098518,3100814,3104472

In [ ]:


In [ ]: