In [236]:
import warnings
# Our numerical workhorses
import numpy as np
import pandas as pd
import scipy.stats as st
# Import pyplot for plotting
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d
# Seaborn, useful for graphics
import seaborn as sns
# Magic function to make matplotlib inline; other style specs must come AFTER
%matplotlib inline
# Request PNG and retina (high-DPI) inline figures. Note: SVG output is NOT
# enabled by this line, despite what earlier versions of this comment said.
%config InlineBackend.figure_formats = {'png', 'retina'}
# JB's favorite Seaborn settings for notebooks: line width, axis label/title
# font sizes and axes background colour. The same dict is passed to both the
# context and the style call below.
rc = {'lines.linewidth': 2,
'axes.labelsize': 18,
'axes.titlesize': 18,
'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style('darkgrid', rc=rc)
# Import Bokeh modules for interactive plotting
import bokeh.charts
import bokeh.charts.utils
import bokeh.io
import bokeh.models
import bokeh.palettes
import bokeh.plotting
# Route Bokeh output to this notebook
bokeh.io.output_notebook()
# Ignore FutureWarning from this point on. The imports above this line have
# already executed, so only code that runs after this call is affected.
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.utils.extmath import cartesian
import networkx as nx
In [165]:
# Scan the interaction file once to find the widest record, measured in
# tab-separated fields. Lines starting with '#' are header/comment lines.
fname = '../input/c_elegans_gene_interactions.txt'
maxL = 0
with open(fname, 'r') as f:
    for line in f:
        if line[0] != '#':
            # Strip trailing whitespace before splitting, exactly as the
            # loading cells do, so the width matches what they will produce.
            maxL = max(maxL, len(line.rstrip().split('\t')))
print(maxL)  # as of Jan 1, 2016, maxL was 44 -- this is the number of columns in the df
In [166]:
# Load every non-comment record into a rectangular DataFrame, right-padding
# short records with NaN up to maxL fields.
maxL = 44
fname = '../input/c_elegans_gene_interactions.txt'
series = []
with open(fname, 'r') as f:
    for raw in f:
        if raw[0] == '#':
            continue  # header / comment line
        fields = raw.rstrip().split('\t')
        series.append(fields + [np.nan] * (maxL - len(fields)))
df = pd.DataFrame(data=series)
In [198]:
# Header layout of the input file:
#   WBInteractionID Interaction_type Interaction_subtype Summary Citation
#   Interactor1 Common-name Role1 Interactor2 Common-name Role2 ...
# The first five fields appear once per record ...
columns_main = [
    'WBInteractionID',
    'interaction_type',
    'interaction_subtype',
    'summary',
    'citation',
]
# ... followed by this triple, repeated once per interactor.
column_repeat = [
    'interactor',
    'common_name',
    'role',
]
In [222]:
# Build the numbered per-interactor column names (interactor1, common_name1,
# role1, interactor2, ...) and the full column list for a maxL-wide record.
# max number of interactors in this df
maxInteractors = (maxL - len(columns_main)) / len(column_repeat)
columns_sub = []
columns_sub_no_name = []  # same as columns_sub, minus the common_nameN entries
for idx in np.arange(1, maxInteractors + 1):
    suffix = str(int(idx))  # np.arange yields floats here; cast for the label
    for base in column_repeat:
        name = base + suffix
        columns_sub.append(name)
        if base != 'common_name':
            columns_sub_no_name.append(name)
columns = columns_main + columns_sub
In [232]:
# Re-parse the interaction file, keeping only 'Regulatory' interactions, and
# flatten each record into one row per (Trans_regulator, Trans_regulated) pair.
# Each output row holds the five main fields followed by the
# (interactor, common_name, role) triples of the regulator and of its target.
maxL = 44
fname = '../input/c_elegans_gene_interactions.txt'
n_main = len(columns_main)
series = []
with open(fname, 'r') as f:
    for line in f:
        if line[0] == '#':
            continue  # header / comment line
        l = line.rstrip().split('\t')
        # zip stops at the shorter sequence, so only the fields that are
        # actually present in this record get a key.
        d = dict(zip(columns, l))
        # .get(): a blank or truncated line has no interaction_type field and
        # must be skipped rather than raise KeyError.
        if d.get('interaction_type') != 'Regulatory':
            continue

        # if no multiple interactors, assume it's autoregulatory
        # only accept trans_regulator roles:
        if len(l) - n_main == 3 and d['role1'] == 'Trans_regulator':
            # Duplicate the lone gene as its own target, with the role set to
            # 'Trans_regulated' to denote auto-regulation.
            series.append(l + [d['interactor1'], d['common_name1'], 'Trans_regulated'])
            continue

        # Positions of the role fields; each role is the last element of its
        # (interactor, common_name, role) triple.
        eff_indices = [i for i, x in enumerate(l) if x == 'Trans_regulator']
        aff_indices = [i for i, x in enumerate(l) if x == 'Trans_regulated']
        # Every regulator paired with every regulated gene, regulator-major
        # order (same ordering as a cartesian product of the two index lists).
        for eff_ind in eff_indices:
            effector = l[eff_ind - 2:eff_ind + 1]
            for aff_ind in aff_indices:
                affector = l[aff_ind - 2:aff_ind + 1]
                series.append(l[:n_main] + effector + affector)

cols = (columns_main
        + ['interactor1', 'common_name1', 'role1']
        + ['interactor2', 'common_name2', 'role2'])
df = pd.DataFrame(data=series, columns=cols)
In [233]:
# Keep only the interaction ID and the two interactor triples. `axis` is
# passed by keyword because the positional form is not accepted by current
# pandas releases.
df = df.drop(['interaction_type', 'interaction_subtype', 'summary', 'citation'], axis=1)
df.head()
Out[233]:
In [234]:
# (rows, columns) of the pairwise regulatory-interaction table
df.shape
Out[234]:
In [287]:
# Build the interaction graph: one edge per row of df, from the 'interactor1'
# column to the 'interactor2' column.
# NOTE(review): nx.DiGraph() is passed as the 4th positional argument. In
# networkx 1.x that slot appears to be `edge_attr`, not `create_using` --
# confirm. If so, g is an undirected Graph rather than a DiGraph, and the
# later connected-components cell presumably depends on that.
g= nx.from_pandas_dataframe(df, 'interactor1', 'interactor2', nx.DiGraph())
In [249]:
# Quick look at the whole network using networkx's default drawing
nx.draw(g)
In [296]:
# Degree of every node in the network, collected into a 1-D array.
# dict(...) accepts both a plain {node: degree} dict and an iterable of
# (node, degree) pairs, so this does not depend on the networkx return type.
# `val` is rebuilt from scratch on every run, so the cell works on a fresh
# kernel and does not accumulate values when re-executed.
val = np.array(list(dict(nx.degree(g)).values()))
for value in val:
    if value == 0:
        # a zero-degree node would break the log-scaled histogram below
        print('uhoh')
print(val)
In [299]:
# Log-log histogram of node degree.
# rule of thumb, but use cbrt since we are using log-bins; cast to int because
# np.linspace needs an integer number of samples.
n_bins = int(np.floor(np.cbrt(len(val))))
MIN = min(val)
MAX = max(val)
# n_bins bins need n_bins + 1 edges, log-spaced between the smallest and
# largest degree. Assumes MIN > 0 (log10(0) is -inf).
bin_edges = 10 ** np.linspace(np.log10(MIN), np.log10(MAX), n_bins + 1)
# plot histogram with log spaced bins
fig, ax = plt.subplots()
ax.hist(val, bins=bin_edges)
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Node Degree')
ax.set_ylabel('Frequency')
ax.set_title('Log-Log Histogram of Node Degree Frequency')
Out[299]:
In [298]:
# Connected components of g, each as its own subgraph.
# NOTE(review): in networkx 1.x this presumably returns a single-pass
# generator -- confirm; wrap it in list(...) if it must be iterated twice.
conn_comp= nx.connected_component_subgraphs(g)
In [277]:
# Prints the repr of conn_comp itself; it does not iterate over the components.
print(conn_comp)
In [281]:
# Raw output of nx.degree_histogram for g (the same data is plotted in the next cell)
print(nx.degree_histogram(g))
In [282]:
# Line plot of the degree histogram on linear axes: x is the node degree (the
# list index), y is the number of nodes with that degree. Labels match the
# log-log histogram cell so the figure stands alone.
plt.plot(nx.degree_histogram(g))
plt.gca().set_xlabel('Node Degree')
plt.gca().set_ylabel('Frequency')
Out[282]:
In [300]:
# Draw g using a Graphviz-computed layout.
# NOTE(review): presumably needs a Graphviz Python binding (e.g. pygraphviz or
# pydot) to be installed -- confirm before relying on this cell.
nx.draw_graphviz(g)
In [303]:
import graphviz
In [ ]: