In [3]:
import os
import pickle
import shutil
import Queue

import networkx as nx
import numpy as np
import pandas as pd

# from ghsom import main_no_labels as ghsom_main
from ghsom_parallel import main as ghsom_main
# from ghsom_parallel_signal_frequency import main as ghsom_main
# from ghsom_parallel_edge_constrained import main as ghsom_main

def save_obj(obj, name):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)
    
    
root_dir = "/home/david/Documents/ghsom/hierarchical_exploration_10000/"

# for data in ["yeast_reactome", "yeast_uetz", "collins", "ccsb", "ito_core", "lc_multiple"]:
for data in ["yeast_reactome"]:
    
    print "data={}".format(data)
    print
    
#     for e_sg in [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1]:
#     for e_sg in [0.8, 0.7, 0.6, 0.5, 0.4]:
    for e_sg in [0.8, 0.6, 0.5]:
        
        print "e_sg={}".format(e_sg)
        print

#         for e_en in [0.7, 0.6, 0.5, 0.4, 0.3]:
#         for e_en in [e_sg]:
        for e_en in [0.3]:
        
            print "e_en={}".format(e_en)
            print

            os.chdir(root_dir)

            #ghsom parameters
            params = {'eta': 0.0001,
                     'sigma': 1,
                      'e_sg': e_sg,
                     'e_en': e_en}

            map_file = '{}_hierarchy_communities_{}_{}'.format(data, e_sg, e_en)

            if not os.path.isfile("{}.pkl".format(map_file)):

                #run ghsom and save output
                print "running GHSOM and saving to {}.pkl".format(map_file)


#                 G, map = ghsom_main(params, '../embedded_{}.gpickle'.format(data), lam=1000)
#                 save_obj((G, map), map_file)
#                 print '\nnumber of communities found: {}, saved maps to {}'.format(len(map), map_file)

                G, networks = ghsom_main(params, '../embedded_{}.gpickle'.format(data), num_iter=10000, num_threads=5)
                save_obj((G, networks), map_file)

                print '\nnumber of maps grown: {}, saved maps to {}'.format(len(networks), map_file)

            else:

                print "{}.pkl already exists, loading maps".format(map_file)    
                #load output
#                 G, map = load_obj(map_file)
                G, networks = load_obj(map_file)

            #save results to file
            dir_name = "{}_hierarchy_communities_{}_{}".format(data, e_sg, e_en)
            if not os.path.isdir(dir_name):
#                 shutil.rmtree(dir_name)

                os.mkdir(dir_name)
                print 'made directory {}'.format(dir_name)

            os.chdir(dir_name)
            print "moved to {}".format(dir_name)

            #all genes
            all_genes_file = "all_genes.txt"
            with open(all_genes_file, 'w') as f:
                for n, d in G.nodes(data=True):
                    f.write("{}\n".format(n))
            print "wrote {}".format(all_genes_file)
            
            genes = G.nodes()
            gene_assignments = {k: v for k, v in zip(genes, 
                np.array([["" for j in range(100)] for i in range(len(genes))], dtype="S100"))}
            
            for map_id, network, e in networks:

                #shortest path matrix
                shortest_path = nx.floyd_warshall_numpy(network)
                communities_in_this_map = np.array([v for k, v in nx.get_node_attributes(network, "ID").items()])
                shortest_path_df = pd.DataFrame(shortest_path, index=communities_in_this_map)
                shortest_path_file = "{}_shortest_path.csv".format(map_id)
                
                shortest_path_df.to_csv(shortest_path_file, index=True, header=False, sep=',')
                print 'wrote shortest path matrix and saved as {}'.format(shortest_path_file)

                for n, d in network.nodes(data=True):
                    
                    community_assignment = d["ID"]
                    layer = community_assignment.count("-")
                    
                    for node in d["ls"]:
                        gene_assignments[node][layer] = community_assignment

#             #map queue
#             q = Queue.Queue()

#             c = 1
#             depth = 0
#             q.put((c, depth, map))

#             genes = G.nodes()
#             gene_assignments = {k: v for k, v in zip(genes, 
#                 np.array([["" for j in range(10)] for i in range(len(genes))], dtype="S20"))}

#             while not q.empty():

#                 map_id, depth, map = q.get()
#                 c = 1

#                 #shortest path matrix
#                 communities_in_this_map = np.array(["{}-{}".format(str(map_id).zfill(2), 
#                                                                    str(i).zfill(2)) for i in range(c, c + len(map))])

#                 shortest_path = nx.floyd_warshall_numpy(map).astype(np.int)
#     #             shortest_path = np.insert(shortest_path, 0, communities_in_this_map, axis=1)
#                 shortest_path_df = pd.DataFrame(shortest_path, index=communities_in_this_map)
#                 shortest_path_file = "{}_shortest_path.csv".format(str(map_id).zfill(2))
#     #             np.savetxt(shortest_path_file, shortest_path, fmt='%i', delimiter=",")
#                 shortest_path_df.to_csv(shortest_path_file, index=True, header=False, sep=',')
#                 print 'wrote shortest path matrix and saved as {}'.format(shortest_path_file)



#                 #gene community assignments
#                 for n, d in map.nodes(data=True):

#                     community = communities_in_this_map[c - 1]

#                     for node in d['ls']:
#                         gene_assignments[node][depth] = community

#                     #add map to queue
#                     m = d['n']

#                     if not m == []:

#                         q.put((community, depth + 1, m))   

#                     c += 1

            #back to matrix
            assignment_matrix = np.array([v for k, v in gene_assignments.items()])
            #remove unnecessary columns
            mask = assignment_matrix != ""
            idx = mask.any(axis = 0)
            assignment_matrix = assignment_matrix[:,idx]
            assignment_matrix = np.insert(assignment_matrix, 0, "01", axis=1)

            assignment_df = pd.DataFrame(assignment_matrix, index=genes)

            assignment_file = "assignment_matrix.csv"
        
            assignment_df.to_csv(assignment_file, index=True, header=False, sep=',')
                                 
            print "wrote assignment matrix and saved it as {}".format(assignment_file)
            print
            
            
print "DONE"


data=yeast_reactome

e_sg=0.8

e_en=0.3

running GHSOM and saving to yeast_reactome_hierarchy_communities_0.8_0.3.pkl
MQE_0=4.0863156855, growth target=3.2690525484
MQE=3.20459957995, size of map=3
growth terminated, MQE: 3.20459957995, target: 3.2690525484, number of deleted neurons: 0
submitted job: ID=01-01, e=4.53247228254, number of nodes=279MQE_0=4.53247228254, growth target=3.62597782603 

MQE_0=3.1533586474, growth target=2.52268691792submitted job: ID=01-02, e=3.1533586474, number of nodes=270

submitted job: ID=01-03, e=1.92796780993, number of nodes=588
 MQE_0=1.92796780993, growth target=1.54237424794
MQE=2.46108394658, size of map=3
growth terminated, MQE: 2.46108394658, target: 2.52268691792, number of deleted neurons: 0
 submitted job: ID=01-02-01, e=2.6180420127, number of nodes=112MQE_0=2.6180420127, growth target=2.09443361016MQE_0=2.0126789747, growth target=1.61014317976


submitted job: ID=01-02-02, e=2.0126789747, number of nodes=95
submitted job: ID=01-02-03, e=2.75253085235, number of nodes=63
MQE_0=2.75253085235, growth target=2.20202468188
MQE=3.72601530366, size of map=3
MQE=1.51432577324, size of map=3
growth terminated, MQE: 1.51432577324, target: 2.20202468188, number of deleted neurons: 0
MQE=1.45636478681, size of map=3
growth terminated, MQE: 1.45636478681, target: 1.61014317976, number of deleted neurons: 0
submitted job: ID=01-02-02-03, e=1.15188183444, number of nodes=55
MQE_0=1.15188183444, growth target=0.921505467555
MQE=1.90315888023, size of map=3
growth terminated, MQE: 1.90315888023, target: 2.09443361016, number of deleted neurons: 0
submitted job: ID=01-02-01-03, e=2.08001246065, number of nodes=59
MQE_0=2.08001246065, growth target=1.66400996852
MQE=0.684355007265, size of map=3
growth terminated, MQE: 0.684355007265, target: 0.921505467555, number of deleted neurons: 0
MQE=1.26359248407, size of map=3
growth terminated, MQE: 1.26359248407, target: 1.66400996852, number of deleted neurons: 0
MQE=3.27439369381, size of map=4
growth terminated, MQE: 3.27439369381, target: 3.62597782603, number of deleted neurons: 0
MQE_0=2.79823437017, growth target=2.23858749614submitted job: ID=01-01-02, e=2.79823437017, number of nodes=143

submitted job: ID=01-01-04, e=3.71552959818, number of nodes=59
MQE_0=3.71552959818, growth target=2.97242367854
MQE=1.51759935904, size of map=3
growth terminated, MQE: 1.51759935904, target: 1.54237424794, number of deleted neurons: 0
submitted job: ID=01-03-01, e=1.59588670479, number of nodes=210
submitted job: ID=01-03-02, e=1.36624032563, number of nodes=118MQE_0=1.59588670479, growth target=1.27670936383

submitted job: ID=01-03-03, e=1.59067104669, number of nodes=260MQE_0=1.36624032563, growth target=1.0929922605

MQE_0=1.59067104669, growth target=1.27253683735
MQE=1.82487223187, size of map=3
growth terminated, MQE: 1.82487223187, target: 2.97242367854, number of deleted neurons: 0
MQE=0.787919042375, size of map=3
growth terminated, MQE: 0.787919042375, target: 1.0929922605, number of deleted neurons: 0
submitted job: ID=01-03-02-03, e=1.00606033007, number of nodes=57
MQE_0=1.00606033007, growth target=0.804848264056
MQE=2.02384106913, size of map=3
growth terminated, MQE: 2.02384106913, target: 2.23858749614, number of deleted neurons: 0
submitted job: ID=01-01-02-01, e=1.44870411107, number of nodes=88
MQE_0=1.44870411107, growth target=1.15896328886
MQE=0.236281556847, size of map=3
growth terminated, MQE: 0.236281556847, target: 0.804848264056, number of deleted neurons: 0
MQE=1.04878530379, size of map=3
growth terminated, MQE: 1.04878530379, target: 1.15896328886, number of deleted neurons: 0
MQE=1.36997642211, size of map=3
MQE=0.934518342941, size of map=3
growth terminated, MQE: 0.934518342941, target: 1.27253683735, number of deleted neurons: 0
submitted job: ID=01-03-03-02, e=0.912913573232, number of nodes=68
 submitted job: ID=01-03-03-03, e=1.44814573716, number of nodes=63MQE_0=0.912913573232, growth target=0.730330858586

MQE_0=1.44814573716, growth target=1.15851658973
MQE=0.964292227627, size of map=3
growth terminated, MQE: 0.964292227627, target: 1.15851658973, number of deleted neurons: 0
MQE=0.309685285076, size of map=3
growth terminated, MQE: 0.309685285076, target: 0.730330858586, number of deleted neurons: 0
MQE=1.31133863395, size of map=4
MQE=1.08476080109, size of map=5
growth terminated, MQE: 1.08476080109, target: 1.27670936383, number of deleted neurons: 0 
MQE_0=0.539361037316, growth target=0.431488829853submitted job: ID=01-03-01-03, e=0.539361037316, number of nodes=67

submitted job: ID=01-03-01-04, e=1.75679352857, number of nodes=60
MQE_0=1.75679352857, growth target=1.40543482286
MQE=1.13975817869, size of map=3
growth terminated, MQE: 1.13975817869, target: 1.40543482286, number of deleted neurons: 0
MQE=0.542426525828, size of map=3
MQE=0.438669205614, size of map=4
MQE=0.331825966555, size of map=5
growth terminated, MQE: 0.331825966555, target: 0.431488829853, number of deleted neurons: 0
DONE

number of maps grown: 20, saved maps to yeast_reactome_hierarchy_communities_0.8_0.3
made directory yeast_reactome_hierarchy_communities_0.8_0.3
moved to yeast_reactome_hierarchy_communities_0.8_0.3
wrote all_genes.txt
wrote shortest path matrix and saved as 01_shortest_path.csv
wrote shortest path matrix and saved as 01-02_shortest_path.csv
wrote shortest path matrix and saved as 01-02-03_shortest_path.csv
wrote shortest path matrix and saved as 01-02-02_shortest_path.csv
wrote shortest path matrix and saved as 01-02-01_shortest_path.csv
wrote shortest path matrix and saved as 01-02-02-03_shortest_path.csv
wrote shortest path matrix and saved as 01-02-01-03_shortest_path.csv
wrote shortest path matrix and saved as 01-01_shortest_path.csv
wrote shortest path matrix and saved as 01-03_shortest_path.csv
wrote shortest path matrix and saved as 01-01-04_shortest_path.csv
wrote shortest path matrix and saved as 01-03-02_shortest_path.csv
wrote shortest path matrix and saved as 01-01-02_shortest_path.csv
wrote shortest path matrix and saved as 01-03-02-03_shortest_path.csv
wrote shortest path matrix and saved as 01-01-02-01_shortest_path.csv
wrote shortest path matrix and saved as 01-03-03_shortest_path.csv
wrote shortest path matrix and saved as 01-03-03-03_shortest_path.csv
wrote shortest path matrix and saved as 01-03-03-02_shortest_path.csv
wrote shortest path matrix and saved as 01-03-01_shortest_path.csv
wrote shortest path matrix and saved as 01-03-01-04_shortest_path.csv
wrote shortest path matrix and saved as 01-03-01-03_shortest_path.csv
wrote assignment matrix and saved it as assignment_matrix.csv

e_sg=0.6

e_en=0.3

yeast_reactome_hierarchy_communities_0.6_0.3.pkl already exists, loading maps
moved to yeast_reactome_hierarchy_communities_0.6_0.3
wrote all_genes.txt
wrote shortest path matrix and saved as 01_shortest_path.csv
wrote shortest path matrix and saved as 01-05_shortest_path.csv
wrote shortest path matrix and saved as 01-02_shortest_path.csv
wrote shortest path matrix and saved as 01-01_shortest_path.csv
wrote shortest path matrix and saved as 01-07_shortest_path.csv
wrote shortest path matrix and saved as 01-03_shortest_path.csv
wrote shortest path matrix and saved as 01-08_shortest_path.csv
wrote shortest path matrix and saved as 01-10_shortest_path.csv
wrote shortest path matrix and saved as 01-02-03_shortest_path.csv
wrote shortest path matrix and saved as 01-01-02_shortest_path.csv
wrote shortest path matrix and saved as 01-09_shortest_path.csv
wrote shortest path matrix and saved as 01-06_shortest_path.csv
wrote shortest path matrix and saved as 01-04_shortest_path.csv
wrote shortest path matrix and saved as 01-04-02_shortest_path.csv
wrote shortest path matrix and saved as 01-09-02_shortest_path.csv
wrote shortest path matrix and saved as 01-02-02_shortest_path.csv
wrote assignment matrix and saved it as assignment_matrix.csv

e_sg=0.5

e_en=0.3

yeast_reactome_hierarchy_communities_0.5_0.3.pkl already exists, loading maps
moved to yeast_reactome_hierarchy_communities_0.5_0.3
wrote all_genes.txt
wrote shortest path matrix and saved as 01_shortest_path.csv
wrote shortest path matrix and saved as 01-02_shortest_path.csv
wrote shortest path matrix and saved as 01-06_shortest_path.csv
wrote shortest path matrix and saved as 01-01_shortest_path.csv
wrote shortest path matrix and saved as 01-08_shortest_path.csv
wrote shortest path matrix and saved as 01-05_shortest_path.csv
wrote shortest path matrix and saved as 01-07_shortest_path.csv
wrote shortest path matrix and saved as 01-16_shortest_path.csv
wrote shortest path matrix and saved as 01-01-01_shortest_path.csv
wrote shortest path matrix and saved as 01-01-03_shortest_path.csv
wrote shortest path matrix and saved as 01-13_shortest_path.csv
wrote shortest path matrix and saved as 01-01-01-03_shortest_path.csv
wrote shortest path matrix and saved as 01-04_shortest_path.csv
wrote assignment matrix and saved it as assignment_matrix.csv

DONE

In [ ]: