In [2]:
    
# import standard scientific computing tools
import matplotlib.pyplot as plt 
import pandas as pd 
import numpy as np 
import networkx as nx 
# use mygene.info to translate between entrez and gene symbol
import mygene 
mg = mygene.MyGeneInfo()
# latex rendering of text in graphs
import matplotlib as mpl
mpl.rc('text', usetex = False)
mpl.rc('font', family = 'serif')
# allow for in-cell plotting
% matplotlib inline
# import the visJS2jupyter tool
import visJS2jupyter.visJS_module 
import visJS2jupyter.visualizations
    
Download and unzip the human STRING interactome from this link https://string-db.org/download/protein.links.v10.5/9606.protein.links.v10.5.txt.gz
Then load the interactome into a networkX object using the load_STRING function
Note: make sure to set the path to the correct download location!
In [3]:
    
def load_STRING(datafile = '../interactomes/string_human/9606.protein.links.v10.txt',conf_thresh=700,species='human'):
    '''
    Helper function to parse and load STRING network
    
    '''
    
    string_df = pd.read_csv(datafile,sep=' ')
    string_df = string_df.loc[string_df['combined_score']>conf_thresh]
                              
    # create the network
    G_str = nx.Graph()
    G_str.add_weighted_edges_from(zip(string_df['protein1'],string_df['protein2'],string_df['combined_score']))
    
    # convert from ensembleID to gene symbol
    string_genes_temp = [n[n.find('.')+1:] for n in G_str.nodes()]
    mg_temp = mg.querymany(string_genes_temp,scopes='ensemblprotein',fields='symbol',species=species)
    ensembl_list = [x['query'] for x in mg_temp]
    symbol_list = [x['symbol'] if 'symbol' in x.keys() else 'None' for x in mg_temp]
    ensembl_to_symbol = dict(zip(ensembl_list,symbol_list))
    ensembl_to_symbol = pd.Series(ensembl_to_symbol)
    
    # relabel nodes with symbols
    G_str = nx.relabel_nodes(G_str,dict(zip(G_str.nodes(),string_genes_temp)))
    G_str = nx.relabel_nodes(G_str,dict(ensembl_to_symbol[ensembl_to_symbol!='None']))  # only keep the proteins that have associated genes
    
    return G_str
    
In [4]:
    
# load the string interactome
# NOTE: MAKE SURE TO SET THE PATH TO WHERE YOU DOWNLOADED THE STRING DATABASE
G_str = load_STRING('9606.protein.links.v10.5.txt',
                                    conf_thresh=700,species='human')
    
    
In [5]:
    
ASD_df = pd.read_csv('gene-report.csv',)
ASD_genes = ASD_df['Gene Symbol'].unique().tolist()
print(len(ASD_genes)) # how many unique ASD genes are in the table?
ASD_genes = list(np.intersect1d(ASD_genes,G_str.nodes())) # only keep genes in STRING
len(ASD_genes) # how many genes ASD risk genes are found in the STRING high confidence interactome?
    
    
    Out[5]:
In [6]:
    
Wprime = visJS2jupyter.visualizations.normalized_adj_matrix(G_str)
    
In [7]:
    
num_seeds=50
np.random.shuffle(ASD_genes) # shuffle the ASD_genes
ASD_random_N = ASD_genes[0:num_seeds] # randomly select num_seeds from the ASD gene set
    
In [11]:
    
visJS2jupyter.visualizations.draw_heat_prop(G_str,
                              seed_nodes=ASD_random_N,
                              num_nodes=200,
                              Wprime=Wprime,
                              k=.7,
                              edge_width=.2,
                              edge_smooth_enabled=True,
                              edge_smooth_type='bezier',
                              largest_connected_component=True,
                              node_size_transform='Math.sqrt',
                              node_size_multiplier=2,
                              hover = False,
                              hover_connected_edges = False,
                              physics_enabled=False,
                              node_scaling_label_draw_threshold=10,
                              min_label_size=100,
                              max_label_size=100,
                              max_visible=100,
                              node_font_size=20,
                              highlight_nodes=ASD_genes
                             )
    
    Out[11]:
In [12]:
    
num_seeds = 25 # number of known ASD genes with which to seed the simulation
num_reps=10 # number of times to repeat the simulation --> increase for robustness
topN_range = range(100,2000,100) # measure the number of recovered ASD genes for each of these numbers
frac_recovered = pd.DataFrame(np.zeros((len(topN_range),num_reps)),index=topN_range) # initialize fraction ASD recovered genes
frac_recovered_control = pd.DataFrame(np.zeros((len(topN_range),num_reps)),index=topN_range) # initialize fraction control recovered genes
# loop simulation num_reps times
for rep in range(num_reps):
    print(rep)
    all_genes = G_str.nodes() # all genes in STRING high confidence interactome
    np.random.shuffle(all_genes) # shuffle all_genes
    CONTROL_genes = all_genes[0:len(ASD_genes)] # randomly select len(ASD_genes) control genes 
    np.random.shuffle(CONTROL_genes) # shuffle these
    CONTROL_random_N = CONTROL_genes[0:num_seeds] # randomly select num_seeds from this control set to recover
    np.random.shuffle(ASD_genes) # shuffle the ASD_genes
    ASD_random_N = ASD_genes[0:num_seeds] # randomly select num_seeds from the ASD gene set
    Fnew = visJS2jupyter.visualizations.network_propagation(G_str,Wprime,ASD_random_N)  # network propagation simulation for ASD
    #Fnew_control = visJS2jupyter.visualizations.network_propagation(G_str,Wprime,CONTROL_random_N) # network propagation simulation for control
    Fnew.sort_values(ascending=False,inplace=True) # sort ASD heat vector
    #Fnew_control.sort_values(ascending=False,inplace=True) # sort Control heat vector
    
    ASD_remainder = np.setdiff1d(ASD_genes,ASD_random_N) # genes not in ASD seed set
    CONTROL_remainder = np.setdiff1d(CONTROL_genes,CONTROL_random_N) # genes not in control seed set
    
    # fill in fraction ASD recovered and fraction control recovered, for each N in topN_range
    for N in topN_range:
        frac_recovered.loc[N][rep] = len(np.intersect1d(Fnew.head(N).index.tolist(),ASD_remainder))/float(len(ASD_remainder))
        #frac_recovered_control.loc[N][rep] = len(np.intersect1d(Fnew_control.head(N).index.tolist(),CONTROL_remainder))/float(len(CONTROL_remainder))
        
        # measure the fraction of ASD_remainder are randomly selected from the network
        frac_recovered_control.loc[N][rep]=len(np.intersect1d(all_genes[:N],ASD_remainder))/float(len(ASD_remainder))
    
    
In [13]:
    
num_recovered = frac_recovered*len(ASD_remainder)
num_recovered_control = frac_recovered_control*len(ASD_remainder)
    
In [14]:
    
# plot the results
plt.figure(figsize=(5,5))
plt.errorbar(num_recovered.index.tolist(),
             list(num_recovered.mean(axis=1)),
             list(num_recovered.std(axis=1)),
             markersize=8,
             linewidth=2,
             label='heat propagation',fmt='ro-')
plt.errorbar(num_recovered_control.index.tolist(),
             list(num_recovered_control.mean(axis=1)),
             list(num_recovered_control.std(axis=1)),
             markersize=8,
             linewidth=2,
             fmt='ko-',label='random')
plt.xlabel('top N genes',fontsize=22)
plt.ylabel('number autism risk\ngenes recovered',fontsize=22)
plt.legend(loc='upper left',fontsize=16)
ax = plt.gca()
ax.tick_params(axis='both',which='major',labelsize=14)
#plt.title(str(num_seeds) + ' seed genes',fontsize=16)
#plt.savefig('../../figures/number_recovered_'+str(num_seeds)+'SeedGenes.png',dpi=200,bbox_inches='tight')
    
    
In [15]:
    
# how significantly different are heat_prop and random, at N = 200?
from scipy.stats import ranksums
tmp, p = ranksums(frac_recovered.loc[200],frac_recovered_control.loc[200])
print(p)
    
    
In [ ]: