In [21]:
import networkx as nx

In [22]:


In [22]:
def to_my_int(node_name):
    node_name = node_name[:-1]
    node_name = "".join(node_name.split('_'))
    node_name = int(node_name)
    return node_name

In [23]:
def print_nodes(nodes_list):
    print(','.join([str(node) for node in nodes_list]))

In [24]:
def to_single_format(nodes_list):
    return set([int(node[:-1]) for node in nodes_list])

def to_double_format(nodes_list):
    nodes_1 = [str(node) + "+" for node in nodes_list]
    nodes_2 = [str(node) + "-" for node in nodes_list]
    return nodes_1 + nodes_2

In [25]:
def rev(node_name):
    direction = node_name[-1]
    if direction == "-":
        rev_direction = "+"
    else:
        rev_direction = "-"
    return node_name[:-1] + rev_direction

Считываем граф

Oставляем только большую компоненту связности


In [75]:
G = nx.DiGraph()

with open("/home/margarita/AU/meta_strains_project/final_algo/data/g5_r1/prelim_graph.gfa") as f:
    for line in f:
        line = line.split()
        line_type = line[0]
        
        # S 238024 ACCAATTAT KC:i:37210
        if line_type == "S":
            v_name = line[1]
            v_length = len(line[2])
            G.add_node(v_name + "+", length=v_length)
            G.add_node(v_name + "-", length=v_length)
        
        # L 238322 + 19590 - 55M
        if line_type == "L":
            v1 = line[1] + line[2]
            v2 = line[3] + line[4]
            G.add_edge(v1, v2)
            G.add_edge(rev(v2), rev(v1))
            
print('Number of components:', nx.number_weakly_connected_components(G))      
            
# remain only largest component
new_G = nx.DiGraph()
for g in nx.weakly_connected_component_subgraphs(G):
    if new_G.number_of_nodes() < g.number_of_nodes():
        new_G = g.copy()
G = new_G.copy()


Number of components: 811

In [76]:
self_loops = []
for node in G.nodes:
    if node in list(nx.all_neighbors(G, node)):
        self_loops.append(node)
        
print_nodes([node[:-1] for node in self_loops])


7611649,7611649,6079470,7618517,6079470,7618517

In [77]:
deleted_tips = []
tips = [node for node in G.nodes if (G.in_degree(node) == 0 
                                     and nx.get_node_attributes(G,'length')[node] <= 500)]

deleted_tips.extend(tips)

while len(tips) > 0:
    G.remove_nodes_from(tips)
    
    tips = [node for node in G.nodes if (G.in_degree(node) == 0 
                                         and nx.get_node_attributes(G,'length')[node] <= 500)]
    deleted_tips.extend(tips)
    
deleted_tips = set([node for node in deleted_tips])
print_nodes([node for node in deleted_tips])


570052+,699452-,428238-,7472799-,6339981+,7631847-,7489641-,1832832-,7496465-,6251249-,6276775+,7632445-,6337135+,21148+,1770028-,715646-,25082+,25080+,1092978-,6966935-,6247389+,6668757+,1736092-,6376865+,1821908-,6633785+,1295828+,759360+,1725264+,1106344-,6780089-,6813761-,124778+,7611569-,7616117-,759362+,124776+

In [ ]: