As always, we start by setting matplotlib inline and importing pyplot.
In [ ]:
%matplotlib inline
from matplotlib import pyplot as plt
Next, we set the working directory...
In [ ]:
import os

# Local directory holding the fetched feature matrices; expanduser
# resolves the '~' so chdir gets an absolute path.
working_dir = '~/Data/marcelle-hcs-sample/features'
os.chdir(os.path.expanduser(working_dir))
The working directory contains a pre-populated file called dirs.txt, listing all the directories on the merri cluster in which a feature map (as well as the RGB images) was created. (The list was generated simply with an ls -d command on merri.) We use it to copy over each feature matrix.
In [ ]:
import paramiko as mk

# dirs.txt holds one space-separated line of remote directory names.
with open('dirs.txt', 'r') as fin:
    line = fin.readline().rstrip()
dirs = line.split(' ')

from husc.screens import myofusion as myo

remote_base_dir = 'marcelle/raw-data'
# Materialize the plate ids as a list: on Python 3, `map` returns a
# one-shot iterator that would be exhausted by the comprehension below,
# leaving the later sanity-check cells with nothing to inspect.
plateids = list(map(myo.dir2plate, dirs))
feature_fns = ['features-%i.h5' % p for p in plateids]
# set up SFTP transport to fetch files (authenticate via the SSH agent)
trans = mk.Transport(('merri.vlsci.unimelb.edu.au', 22))
trans.connect(pkey=mk.Agent()._keys[0], username='jni')
sftp = mk.SFTPClient.from_transport(trans)
# Fetch loop kept commented out: run once to populate the local cache.
#for d, f in zip(dirs, feature_fns):
#    if not os.path.exists(f):
#        sftp.get(os.path.join(remote_base_dir, d, 'features.h5'), f)
Let's do a sanity check to ensure plate ids are unique:
In [ ]:
from collections import Counter

# Sanity check: every directory and every plate id should be unique.
print(len(dirs), len(set(dirs)),
      len(plateids), len(set(plateids)))
# Counter replaces the hand-rolled setdefault/append duplicate finder.
counts = Counter(plateids)
print([plate for plate, n in counts.items() if n > 1])
That repeated "0" is because of the "NOCODE" plates above. We ignore them and plow bravely on.
In [ ]:
import os
import pandas as pd
import numpy as np
In [ ]:
# Collect the local feature files, skipping uncoded ("NOCODE") plates.
# NOTE: the files were written as 'features-<plateid>.h5' above, so the
# prefix must be 'features-'; the original 'feature-' matched nothing.
h5_fns = sorted(f for f in os.listdir('.')
                if f.startswith('features-') and f.endswith('.h5')
                and not f.endswith('NOCODE.h5'))
datasets = [pd.read_hdf(f, 'data') for f in h5_fns]
In [ ]:
# Stack all per-plate feature tables into one DataFrame (rows = wells).
data = pd.concat(datasets)
print(data)
(This is a reduced dataset for demo --- full dataset has 45,000 rows!)
In [ ]:
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition
In [ ]:
# Standardize the features, then project onto the top two principal
# components.  Fitting with n_components=2 up front replaces the
# original double fit (full PCA fit, then a second fit_transform after
# shrinking n_components) with a single fit.
X = StandardScaler().fit_transform(data)
pca = decomposition.PCA(n_components=2).fit(X)
X_reduced = pca.transform(X)
print(X_reduced.shape)
The obligatory PCA scatter:
In [ ]:
plt.scatter(*(X_reduced.T))
This is not what a PCA scatter normally looks like. The assumptions of normality are clearly violated. What can I do about this?
What features contributed most to the two top PCA components?
In [ ]:
# Indices of the three largest-magnitude loadings in each of the top two
# principal components, and the corresponding feature names.
abs_loadings = np.abs(pca.components_)
pca_comp1 = np.argsort(abs_loadings[0])[::-1][:3]
pca_comp2 = np.argsort(abs_loadings[1])[::-1][:3]
data.columns[pca_comp1], data.columns[pca_comp2]
And how much were these contributions?
In [ ]:
# Fraction of each component's total weight carried by its top-3 loadings.
# NOTE(review): the denominator is the *signed* sum of loadings, which can
# nearly cancel when signs are mixed and make these "fractions" exceed 1
# or flip sign; a sum of absolute values may be the intended denominator.
# Confirm before interpreting.
(pca.components_[0][pca_comp1] / (pca.components_[0]).sum(),
 pca.components_[1][pca_comp2] / (pca.components_[1]).sum())
In [ ]:
from sklearn import cluster
In [ ]:
# Target an average cluster size of ~20 wells by choosing
# n_clusters = n_samples // 20.  (The original comment said 10, which
# did not match the value actually used.)
avg_cluster_size = 20
K = cluster.MiniBatchKMeans(n_clusters=(X.shape[0] // avg_cluster_size))
K = K.fit(X)
(minibatch KMeans is an online clustering algorithm that performs KMeans on subsets of the data, adding subsets sequentially.)
In [ ]:
from husc import preprocess as pre
from husc.screens import myofusion as myo
myores_semantic_filename = myo.myores_semantic_filename
In [ ]:
# Re-key each row by its (plate, well) pair, parsed from the filename index.
semantics = [myo.myores_semantic_filename(fn) for fn in data.index]
data.index = [(sem['plate'], sem['well']) for sem in semantics]
In [ ]:
# Unique plate ids present in the data (first element of each index key).
plates = {ix[0] for ix in data.index}
# `print plates` was Python 2 statement syntax (a SyntaxError on
# Python 3); use the function form like the rest of the notebook.
print(plates)
In [ ]:
from pymongo import MongoClient
collection = MongoClient()["myofusion"]["wells"]
def get_cluster_gene_data(cluster_id):
    """Return (index keys, MongoDB documents) for the wells in a cluster.

    Looks up the rows whose MiniBatchKMeans label equals `cluster_id`,
    maps them to their (plate, well) index keys, and fetches the matching
    document for each key from the `wells` collection.
    """
    member_rows = np.flatnonzero(K.labels_ == cluster_id)
    cluster_members = [data.index[row] for row in member_rows]
    cluster_entries = [collection.find_one({"_id": myo.key2mongo(member)})
                       for member in cluster_members]
    return cluster_members, cluster_entries
Notes:
collection.find_one returns a "Document". Later we will see collection.find, which returns a "Cursor", which is a lazy iterable collection of matching documents.
Documents are simple {key: value} collections. Valid JSON objects are supported. Missing keys allowed. Only restriction is _id key must exist and be unique for each document. (This can be done automatically by MongoDB.)
MongoDB does not support tuples as keys, so I wrote a simple function to convert a (plate, well) tuple to a string.
We are going to examine some clusters and we want moderately-sized ones, so let's plot the cluster sizes.
In [ ]:
# How many samples were clustered, how many distinct clusters were
# populated, and the highest label id; then plot each cluster's size.
print(len(K.labels_), len(np.unique(K.labels_)), np.max(K.labels_))
plt.plot(np.bincount(K.labels_))
In [ ]:
# Inspect cluster 39: its (plate, well) keys and the first few MongoDB
# documents describing those wells.
cluster_members_1, cluster_entries_1 = get_cluster_gene_data(39)
print(len(cluster_members_1))
cluster_entries_1[:3]
We can use the plate and well numbers to grab the images that were analysed.
The database contains the locations of each file in merri. We use our previously established paramiko connection (held by the sftp object) to grab files on request.
In [ ]:
import tempfile as tmp
import mahotas as mh
def get_doc_value(mongo_doc, key):
    """Look up `key` in a MongoDB document.

    Returns a (value, missing) pair: (document[key], False) when the key
    exists, or ("", True) when the lookup raises KeyError.
    """
    try:
        value = mongo_doc[key]
    except KeyError:
        return "", True
    return value, False
def get_images(key, key_type='_id',
               ids_returned=["_id", "gene_name"],
               failsafe_ids=["label"]):
    """Fetch all images whose document matches {key_type: key}.

    Each matching document's "filename" is downloaded from merri over the
    module-level `sftp` connection into a local temp file and read with
    mahotas.  Every image is paired with a space-joined label built from
    the document fields in `ids_returned` (with `failsafe_ids` as a
    fallback — see note below).

    Returns the result of `zip(labels, images)` — a list on Python 2 but
    a one-shot iterator on Python 3.
    """
    # MongoDB cannot use a tuple as a key, so (plate, well) ids are
    # converted to their string form first.
    if key_type == "_id" and type(key) == tuple:
        key = myo.key2mongo(key)
    im_entries = collection.find({key_type: key})
    ims = []
    ids = []
    for entry in im_entries:
        im_path = entry["filename"]
        # Keep the remote file's extension so the image reader picks the
        # right decoder.  mkstemp returns (fd, path); only the path is kept.
        tmpfile = tmp.mkstemp(suffix='.' + im_path.split('.')[-1],
                              dir='.')[1]
        sftp.get(im_path, tmpfile)
        ims.append(mh.imread(tmpfile))
        id_list = []
        fail = False
        for ix in ids_returned:
            val, fail = get_doc_value(entry, ix)
            id_list.append(val)
        # NOTE(review): `fail` is overwritten on every iteration above, so
        # it only reflects the *last* key in ids_returned; the failsafe
        # fields are appended only when that final lookup missed.  Confirm
        # this is the intended behavior rather than "any lookup failed".
        if fail:
            for ix in failsafe_ids:
                id_list.append(get_doc_value(entry, ix)[0])
        ids.append(" ".join(map(str, id_list)))
    labeled_ims = zip(ids, ims)
    return labeled_ims
Now we want to look at the images. The function below shows the labeled images in a grid three columns wide.
In [ ]:
def show_3ims(label_ims):
    """Display (label, image) pairs on a grid three columns wide."""
    n_images = len(label_ims)
    # Ceiling division: same result as int(np.ceil(n / 3)) for n >= 0.
    num_rows = (n_images + 2) // 3
    fig, axes = plt.subplots(num_rows, 3, figsize=(15, 5 * num_rows))
    for axis, (name, image) in zip(axes.ravel(), label_ims):
        axis.imshow(image)
        axis.set_title(name)
In [ ]:
cluster_members_1
In [ ]:
# Grab the first image for every member of the cluster.  On failure,
# print the remote filenames being requested (to identify the bad entry),
# then re-raise.
try:
    # next(iter(...)) takes the first (label, image) pair; unlike [0] it
    # also works if get_images returns a one-shot zip iterator (Python 3).
    ims = [next(iter(get_images(k))) for k in cluster_members_1]
except Exception:  # narrowed from bare `except:`; still re-raises below
    print([c["filename"] for c in cluster_entries_1])
    raise
In [ ]:
show_3ims(ims)
In [ ]:
plate_cluster_2, gene_cluster_2 = get_cluster_gene_data(40)
plate_cluster_2
In [ ]:
# Grab the first image for every member of cluster 40; on failure print
# the filenames involved, then re-raise.
try:
    ims2 = [next(iter(get_images(k))) for k in plate_cluster_2]
except Exception:
    # The original passed a generator expression to print, which printed
    # the generator object itself; a list comprehension (matching the
    # cluster-39 cell) actually shows the filenames.
    print([c["filename"] for c in gene_cluster_2])
    raise
In [ ]:
show_3ims(ims2)
In [ ]:
import itertools as it

# Fetch every image for a few genes of interest.  chain.from_iterable
# consumes the per-gene iterables lazily instead of materializing a list
# only to unpack it with `*`.
ims3 = list(it.chain.from_iterable(
    get_images(g, key_type="gene_name")
    for g in ["Sass6", "Son", "Cenpq"]))
In [ ]:
show_3ims(ims3)
These need to be checked systematically: