notebook.community

Edit and run



In [2]:

    
import warnings

# Our numerical workhorses
import numpy as np
import pandas as pd
import scipy.stats as st

# Import pyplot for plotting
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d

# Seaborn, useful for graphics
import seaborn as sns

# Magic function to make matplotlib inline; other style specs must come AFTER
%matplotlib inline

# This enables SVG graphics inline.  There is a bug, so uncomment if it works.
%config InlineBackend.figure_formats = {'png', 'retina'}

# JB's favorite Seaborn settings for notebooks
rc = {'lines.linewidth': 2, 
      'axes.labelsize': 18, 
      'axes.titlesize': 18, 
      'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style('darkgrid', rc=rc)

# Import Bokeh modules for interactive plotting
import bokeh.charts
import bokeh.charts.utils
import bokeh.io
import bokeh.models
import bokeh.palettes
import bokeh.plotting

# Display graphics in this notebook
bokeh.io.output_notebook()

# Suppress future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.utils.extmath import cartesian

import networkx as nx









    



/Users/davidangeles/anaconda/envs/py35/lib/python3.5/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))






    




    


    

    
        
        BokehJS successfully loaded.



In [3]:

    
fname= '../input/c_elegans_gene_interactions.txt'
maxL= 0
i= 0
with open(fname, 'r') as f:
    for line in f:
        if line[0] != '#':
            l= line.split('\t') 
            if len(l) > maxL:
                maxL= len(l)

print(maxL) #as of Jan 1, 2016, maxL was 44 -- this is the number of rows in the df



In [4]:

    
maxL= 44
fname= '../input/c_elegans_gene_interactions.txt'
series= []
with open(fname, 'r') as f:
    for line in f:
        if line[0] != '#':
            line= line.rstrip()
            l= line.split('\t') 
            empty= [np.nan]*(maxL - len(l))
            l= l+empty
            series.append(l)
df= pd.DataFrame(data= series)



In [5]:

    
# WBInteractionID	Interaction_type	Interaction_subtype	Summary	Citation	Interactor1	Common-name	Role1	Interactor2	Common-name	Role2	...
columns_main= ['WBInteractionID', 'interaction_type', 'interaction_subtype', 'summary', 'citation']
column_repeat= ['interactor','common_name','role']



In [6]:

    
columns_sub= []
columns_sub_no_name= []

#max number of interactors in this df
maxInteractors= (maxL - len(columns_main))/len(column_repeat)


for i in np.arange(1, maxInteractors+1):
    for word in column_repeat:
        word= word+str(int(i))
        columns_sub= columns_sub+[word]
        if word != 'common_name'+str(int(i)):
            columns_sub_no_name= columns_sub_no_name+[word]

columns= columns_main+columns_sub



In [7]:

    
#try to clean up the 
maxL= 44
fname= '../input/c_elegans_gene_interactions.txt'
series= []

with open(fname, 'r') as f:
    for line in f:
        c= []
        
        if line[0] != '#':  
            
            line= line.rstrip()
            l= line.split('\t') 
            keys= columns[0:len(l)]
            d= dict(zip(keys, l))
            
            if d['interaction_type'] == 'Regulatory':
                
            
                eff_indices= np.where(np.array([x == 'Trans_regulator' for x in l]))[0]
                aff_indices= np.where(np.array([x == 'Trans_regulated' for x in l]))[0]
                
                d= dict(zip(keys, l))
                interactors= (len(l) - len(columns_main))/3

                #if no multiple interactors, assume it's autoregulatory
                #only accept trans_regulator roles:
                if interactors == 1 and d['role1'] == 'Trans_regulator':
                    
                    
                    d['role1']= 'Trans_regulated' #to denote auto-regulation
                    mykeys= ['interactor1','common_name1','role1']
                    add= [d[x] for x in mykeys]
                    
                    c= l+add      
                    series.append(c)
                    
                else:
                    
                    
                    combos= cartesian([eff_indices, aff_indices])
                    #print(aff_indices)
                    for pair in combos:
                        eff_ind= pair[0]
                        aff_ind= pair[1]
                        
                        #info on the effector:
                        effector= l[eff_ind-2:eff_ind+1]
                        affector= l[aff_ind-2: aff_ind+1]
                        
                        
                        c= l[:len(columns_main)] + effector + affector
                        series.append(c)

cols= columns_main+ ['interactor1','common_name1','role1']+['interactor2','common_name2','role2']
df= pd.DataFrame(data= series, columns= cols)



In [8]:

    
df= df.drop(['interaction_type', 'interaction_subtype', 'summary', 'citation'],1)
df.head()









    Out[8]:






  
    
      
      WBInteractionID
      interactor1
      common_name1
      role1
      interactor2
      common_name2
      role2
    
  
  
    
      0
      WBInteraction000003067
      WBGene00003003
      lin-14
      Trans_regulator
      WBGene00001080
      dpy-21
      Trans_regulated
    
    
      1
      WBInteraction000003068
      WBGene00001085
      dpy-26
      Trans_regulator
      WBGene00003003
      lin-14
      Trans_regulated
    
    
      2
      WBInteraction000003069
      WBGene00001086
      dpy-27
      Trans_regulator
      WBGene00003003
      lin-14
      Trans_regulated
    
    
      3
      WBInteraction000003070
      WBGene00001087
      dpy-28
      Trans_regulator
      WBGene00003003
      lin-14
      Trans_regulated
    
    
      4
      WBInteraction000003071
      WBGene00006962
      xol-1
      Trans_regulator
      WBGene00004745
      sdc-1
      Trans_regulated



In [9]:

    
df.shape









    Out[9]:





(7249, 7)



In [10]:

    
g= nx.from_pandas_dataframe(df, 'interactor1', 'interactor2', nx.DiGraph())



In [11]:

    
#degreeness of the network
val= np.array([])
for key, value in nx.degree(g).items():
    if np.size(val)== 0:
        val= np.array(value)
    else:
        val= np.append(val, value)
    if value == 0:
        print('uhoh')
        
print(val)









    



[1 1 3 ..., 2 1 4]



In [23]:

    
n_bins= np.floor(np.cbrt(len(val))) #rule of thumb, but use cbrt since we are using log-bins
MIN= min(val)
MAX= max(val)

#plot histogram with log spaced bins
plt.hist(val, bins = 10 ** np.linspace(np.log10(MIN), np.log10(MAX), n_bins))

plt.gca().set_xscale('log')
plt.gca().set_yscale('log')
plt.gca().set_xlabel('Node Degree')
plt.gca().set_ylabel('Frequency')
plt.gca().set_title(r'Trans-regulated Gene Network of $C. elegans$')
plt.savefig('../output/Trans-regulated Gene Network of C. elegans.png')



In [13]:

    
conn_comp= nx.connected_component_subgraphs(nx.Graph(g))



In [15]:

    
print(nx.degree_histogram(g))









    



[0, 1377, 526, 224, 176, 117, 67, 68, 53, 39, 29, 24, 27, 28, 14, 21, 12, 5, 5, 7, 4, 9, 5, 3, 7, 2, 4, 3, 2, 2, 4, 1, 0, 2, 0, 0, 3, 1, 0, 0, 0, 1, 0, 0, 3, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]



In [27]:

    
nx.draw_networkx(g, node_size= 20, alpha= 0.4, node_color='blue', with_labels=False)
plt.xlim(-.45, .45)
plt.ylim(-.45,.45)
plt.savefig('../output/trans_reg_worm_network_zoom_in.png')



In [22]:

    
plt.figure(figsize=(8,8))
nx.draw_spring(g,node_size=20, alpha=0.2,node_color="blue", with_labels=False)
plt.axis('equal')
plt.xlim(-1, 1)
plt.ylim(-1,1)
plt.savefig('../output/trans_reg_worm_network.png')



In [ ]:

    
mc= max_clique(nx.Graph(g))



In [84]:

    
from networkx.algorithms.approximation import ramsey
__author__ = """Nicholas Mancuso (nick.mancuso@gmail.com)"""
__all__ = ["clique_removal","max_clique"]

def max_clique(G):
    r"""Find the Maximum Clique

    Finds the `O(|V|/(log|V|)^2)` apx of maximum clique/independent set
    in the worst case.

    Parameters
    ----------
    G : NetworkX graph
        Undirected graph

    Returns
    -------
    clique : set
        The apx-maximum clique of the graph

    Notes
    ------
    A clique in an undirected graph G = (V, E) is a subset of the vertex set
    `C \subseteq V`, such that for every two vertices in C, there exists an edge
    connecting the two. This is equivalent to saying that the subgraph
    induced by C is complete (in some cases, the term clique may also refer
    to the subgraph).

    A maximum clique is a clique of the largest possible size in a given graph.
    The clique number `\omega(G)` of a graph G is the number of
    vertices in a maximum clique in G. The intersection number of
    G is the smallest number of cliques that together cover all edges of G.

    http://en.wikipedia.org/wiki/Maximum_clique

    References
    ----------
    .. [1] Boppana, R., & Halldórsson, M. M. (1992).
        Approximating maximum independent sets by excluding subgraphs.
        BIT Numerical Mathematics, 32(2), 180–196. Springer.
        doi:10.1007/BF01994876
    """
    if G is None:
        raise ValueError("Expected NetworkX graph!")

    # finding the maximum clique in a graph is equivalent to finding
    # the independent set in the complementary graph
    cgraph = nx.complement(G)
    iset, _ = clique_removal(cgraph)
    return iset

def clique_removal(G):
    """ Repeatedly remove cliques from the graph.

    Results in a `O(|V|/(\log |V|)^2)` approximation of maximum clique
    & independent set. Returns the largest independent set found, along
    with found maximal cliques.

    Parameters
    ----------
    G : NetworkX graph
        Undirected graph

    Returns
    -------
    max_ind_cliques : (set, list) tuple
        Maximal independent set and list of maximal cliques (sets) in the graph.

    References
    ----------
    .. [1] Boppana, R., & Halldórsson, M. M. (1992).
        Approximating maximum independent sets by excluding subgraphs.
        BIT Numerical Mathematics, 32(2), 180–196. Springer.
    """
    graph = G.copy()
    c_i, i_i = ramsey.ramsey_R2(graph)
    cliques = [c_i]
    isets = [i_i]
    while graph:
        graph.remove_nodes_from(c_i)
        c_i, i_i = ramsey.ramsey_R2(graph)
        if c_i:
            cliques.append(c_i)
        if i_i:
            isets.append(i_i)

    maxiset = max(isets)
    return maxiset, cliques



In [78]:

    
def power_law(nodes, alpha):
    """
    A function to model power-law growth of a network
    """

    if np.min(alpha)> 0:
        return nodes**-alpha
    else:
        raise RuntimeError('alpha range includes values less than 0, please specify only positive alpha') 
        
def mle_power_law(data, alpha):
    if max(data) != 1:
        data= data/max(data)
    
    nodes= np.linspace(1, len(data), len(data)) #number of nodes
    log_mle= -len(d)/2*np.log(np.sum(data[0] - power_law(nodes, alpha)**2))
    
    return np.exp(log_mle)



In [81]:

    
x= np.linspace(1, 20, 100)
y= power_law(nodes, 1.5)
plt.plot(x, y)

alpha= np.linspace(0.1, 3, 100)
mle= []
for a in alpha:
    mle.append(mle_power_law(val, a))

fig, ax= plt.subplots()
ax.plot(alpha, mle)









    Out[81]:





[<matplotlib.lines.Line2D at 0x112414048>]



In [83]:

    
mle[0]









    Out[83]:





nan



In [64]:

    
val.shape









    Out[64]:





(2893,)



In [65]:

    
val[0]









    Out[65]:





1



In [ ]:

	WBInteractionID	interactor1	common_name1	role1	interactor2	common_name2	role2
0	WBInteraction000003067	WBGene00003003	lin-14	Trans_regulator	WBGene00001080	dpy-21	Trans_regulated
1	WBInteraction000003068	WBGene00001085	dpy-26	Trans_regulator	WBGene00003003	lin-14	Trans_regulated
2	WBInteraction000003069	WBGene00001086	dpy-27	Trans_regulator	WBGene00003003	lin-14	Trans_regulated
3	WBInteraction000003070	WBGene00001087	dpy-28	Trans_regulator	WBGene00003003	lin-14	Trans_regulated
4	WBInteraction000003071	WBGene00006962	xol-1	Trans_regulator	WBGene00004745	sdc-1	Trans_regulated