In [1]:
import numpy as np
In [2]:
import csv
# Open the data file for reading; the handle is left open on purpose because
# the following cells keep consuming rows from `reader` before closing it.
csvfile = open('data/cellcycle.txt', 'r')
# Row iterator that splits every line on tab characters.
reader = csv.reader(csvfile, delimiter='\t')
A primeira linha contém o nome das colunas:
In [3]:
# Consume the header row and drop its first field, keeping only the names of
# the remaining columns. The built-in next() works on both Python 2 and 3,
# whereas the reader.next() method only exists on Python 2.
cycles = next(reader)[1:]
In [4]:
# Number of column names read from the header, followed by the first five.
# Single-argument print(...) produces the same output on Python 2 and 3.
print(len(cycles))
print(cycles[0:5])
Já a primeira coluna contém o nome dos genes:
In [5]:
# Walk the rows still left in the reader and keep the first field of each one
# (the gene name).
genes = []
for fields in reader:
    genes.append(fields[0])
In [6]:
# Number of gene names collected, followed by the first five.
# Single-argument print(...) produces the same output on Python 2 and 3.
print(len(genes))
print(genes[0:5])
In [7]:
# Release the file handle opened for the csv reader.
csvfile.close()
Para facilitar a visualização, vamos trabalhar com os primeiros 250 genes:
In [8]:
# Load the numeric part of the table: skip the header line, read columns
# 1..82 (column 0 is left out) and replace empty fields with 0.
data = np.genfromtxt(
    'data/cellcycle.txt',
    delimiter='\t',
    skip_header=1,
    usecols=range(1, 83),
    missing_values='',
    filling_values=0.,
)
# Restrict the analysis to the first 250 rows.
X = data[0:250, :]
n, m = X.shape
# Pre-formatted string so the output ("n m") is identical on Python 2 and 3;
# the original `print n, m` statement is a syntax error on Python 3.
print('%d %d' % (n, m))
In [9]:
# Display the selected sub-matrix as the cell output.
X
Out[9]:
In [10]:
%matplotlib notebook
import matplotlib.pyplot as plt
In [11]:
from scipy.cluster.hierarchy import dendrogram, linkage
In [12]:
# Hierarchical clustering of the rows of X using the 'ward' linkage method.
Z = linkage(X, method='ward')
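Na $i$-ésima iteração, os clusters de índices `Z[i, 0]` e `Z[i, 1]` são combinados para formar o cluster $n + i$. `Z[i, 2]` é a distância entre esses dois clusters e `Z[i, 3]` é o número de observações originais contidas no novo cluster.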
In [13]:
# First five rows of the linkage matrix.
print(Z[0:5])
print('...')
# Last five rows of the linkage matrix. The previous slice Z[-6:-1] stopped
# one row early and therefore never showed the final merge.
print(Z[-5:])
In [14]:
# Display the dimensions of the linkage matrix as the cell output.
Z.shape
Out[14]:
Função que, para um índice $i$, devolve o nome do gene:
In [15]:
def llf(id):
    """Return the entry of the global ``genes`` list at position ``id``.

    Used as the ``leaf_label_func`` callback when drawing the dendrogram.
    """
    return genes[id]
In [16]:
# Tall figure so that every leaf label gets its own line.
plt.figure(figsize=(12, 30))
plt.title('Dendrograma')
# With orientation='left' the merge distance runs along the x axis and the
# leaves are stacked along the y axis.
plt.xlabel(u'Distância (Ward)')
plt.ylabel('Gene')
# Keep the returned dict: its 'leaves' entry gives the leaf ordering that the
# heatmap cell reuses.
den = dendrogram(
    Z,
    orientation='left',
    leaf_font_size=9,
    leaf_label_func=llf,
)
In [17]:
import seaborn as sns
In [18]:
plt.figure(2, figsize=(12, 36))
# Rows and their labels both follow the leaf order computed by the dendrogram.
leaf_order = den['leaves']
gene_labels = [genes[i] for i in leaf_order]
# Pass the labels to seaborn instead of overwriting the tick labels afterwards:
# seaborn then places exactly one label per row, whatever tick layout the
# installed version uses internally (reversed rows in old releases, thinned
# automatic ticks in newer ones), so each name stays next to its own row.
hm = sns.heatmap(X[leaf_order, :], square=True, cbar=False,
                 xticklabels=False, yticklabels=gene_labels)
# Keep the gene names horizontal.
ylabels = hm.get_yticklabels()
for label in ylabels:
    label.set_rotation(0)