In [3]:
import h5py
import numpy as np
# Get ACES data
h5ds = h5py.File("ACES/experiments/data/U133A_combat.h5")
X = np.array(h5ds['U133A_combat_RFS']['ExpressionData'])
y = np.array(h5ds['U133A_combat_RFS']['PatientClassLabels'])
aces_gene_names = np.array(h5ds['U133A_combat_RFS']['GeneLabels'])
h5ds.close()
In [4]:
%pylab inline
In [5]:
X.shape
Out[5]:
In [22]:
y==1
Out[22]:
In [29]:
gene_idx = 45
plt.scatter(np.where(y==1)[0], X[np.where(y==1)[0], gene_idx], color='blue')
plt.scatter(np.where(y==0)[0], X[np.where(y==0)[0], gene_idx], color='orange')
plt.xlim(0, X.shape[0])
Out[29]:
In [35]:
print "Proportion of zeros: %.3f" % (float(len(np.where(X<-1)[0]))/float(X.shape[0]*X.shape[1]))
In [46]:
# Number of constant genes (always expressed)
X_zeroed = np.where(X<-1, 1, 0)
print len(np.where(np.sum(X_zeroed, axis=0)==0)[0]), "genes are always expressed"
I2D network: 12 643 nodes, of which 10 018 are present in the data, and 142309 edges.
In [47]:
print aces_gene_names[:10]
In [54]:
alist = list(aces_gene_names[:10])
gn1 = 'Entrez_5982'
gn2 = 'Entrez_76'
print alist.index(gn1)
print alist.index(gn2)
In [76]:
aces_gene_names = list(aces_gene_names)
edges_set = set([]) # (gene_idx_1, gene_idx_2)
# gene_idx_1 < gene_idx_2
# idx in aces_gene_names, starting at 0
with open('ACES/experiments/data/I2D_edges_0411.sif') as f:
for line in f:
ls = line.split()
gene_name_1 = 'Entrez_%s' % ls[0]
gene_name_2 = 'Entrez_%s' % ls[2]
# Exclude self edges
if gene_name_1 == gene_name_2:
continue
try:
gene_idx_1 = aces_gene_names.index(gene_name_1)
gene_idx_2 = aces_gene_names.index(gene_name_2)
except ValueError:
continue
if gene_idx_1 < gene_idx_2:
e = (gene_idx_1, gene_idx_2)
else:
e = (gene_idx_2, gene_idx_1)
edges_set.add(e)
f.close()
In [83]:
np.savetxt('I2D_edges.txt', np.array([list(x) for x in list(edges_set)]), fmt='%d')
In [77]:
len(edges_set)
Out[77]:
In [ ]:
In [66]:
edges_list = np.array(edges_list)
In [85]:
genes_in_network = set(np.array([list(x) for x in list(edges_set)]).flatten())
print len(genes_in_network)
In [86]:
len(set(np.where(np.sum(X_zeroed, axis=0)==0)[0]).intersection(genes_in_network))
Out[86]:
9900 genes in the network. 4181 of those are always expressed.
In [87]:
X_binary = np.where(X<-1, 0, 1)
Out[87]:
In [94]:
float(np.count_nonzero(X_binary))/(X.shape[1]*X.shape[1])
Out[94]:
In [92]:
np.count_nonzero(X_binary)
Out[92]:
In [ ]: