In [1]:
from goetia.hashing import UKHShifter
from goetia.minimizers import UKHSSignature, UKHSCountSignature
from goetia.processors import UKHSCountSignatureProcessor
import screed
import numpy as np
from scipy.spatial import distance
from glob import glob
import os
import pandas as pd
import seaborn as sns
from ficus import FigureManager
import re
from collections import OrderedDict
import yaml
from yaml import CLoader as Loader, CDumper as Dumper
import json
from sklearn.manifold import TSNE
from skbio.stats.distance import mantel
from snakemake.utils import listfiles
from scipy.stats import kendalltau as KennyT
In [2]:
def load_fastani_matrix(filename):
df = pd.read_csv(filename, delimiter='\t',
names=['A', 'B', 'ani', 'matches', 'n_fragments'])
df['A'] = [os.path.basename(n) for n in df.A]
df['B'] = [os.path.basename(n) for n in df.B]
sim_df = df.set_index(['A', 'B'])['ani'].unstack()
dissim_df = (100 - sim_df) / 100
dissim_df.values[[np.arange(dissim_df.shape[0])]*2] = 0
make_sym(dissim_df)
return df, dissim_df
In [3]:
def load_sourmash_matrix(np_mat_filename, labels_filename):
labels = [os.path.basename(line.split()[0])
for line in open(labels_filename).readlines()]
mat = np.load(np_mat_filename)
df = pd.DataFrame(mat, columns=labels, index=labels)
dissim_df = 1 - df
dissim_df.values[[np.arange(dissim_df.shape[0])]*2] = 0
return df, dissim_df
In [4]:
def load_ukhs_sigs_matrix(sig_filenames, metric='cosine'):
sigs = {}
for sigfile in sig_filenames:
with open(sigfile) as fp:
data = json.load(fp)[0]
sigs[data['name']] = data['signature']
df = pd.DataFrame(sigs).T
sim_df = pd.DataFrame(distance.squareform(distance.pdist(df.values, metric)),
columns = df.index,
index = df.index)
sim_df = sim_df.sort_index(axis=0).sort_index(axis=1)
return df, sim_df
In [9]:
def get_streaming_distances(filename, metric='cosine'):
df = pd.read_csv(filename, index_col=0, sep=',', engine='c', header=None, na_filter=False, dtype=np.uint64, low_memory=False)
dfunc = getattr(distance, metric)
distances = np.zeros(len(df) - 1, dtype=np.float64)
for i in range(len(df) - 1):
distances[i] = dfunc(df.values[i, :], df.values[i+1, :])
ddf = pd.DataFrame({'read_n': df.index[1:],
'd_distance': distances})
ddf['t'] = ddf['read_n'] / ddf['read_n'].max()
return df, ddf
In [ ]:
fastani_df, fastani_dissim_df = load_fastani_matrix('ecoli_many_sigs/all_by_all_ani.txt')
In [ ]:
sourmash_df, sourmash_dissim_df = load_sourmash_matrix('ecoli_many_sigs/sourmash_sim.npb',
'ecoli_many_sigs/sourmash_sim.txt.labels.txt')
sourmash_dissim_df.drop('NC_027387.1.fa', inplace=True)
sourmash_dissim_df.drop('NC_027387.1.fa', inplace=True, axis=1)
In [ ]:
ukhs_df, ukhs_dissim_df = load_ukhs_sigs_matrix(glob('ecoli_many_sigs/*.uksig.json'))
ukhs_dissim_df.drop('NC_027387.1.fa', inplace=True)
ukhs_dissim_df.drop('NC_027387.1.fa', inplace=True, axis=1)
In [ ]:
ukhs_dissim_df.head(2)
In [ ]:
fastani_dissim_df.head(2)
In [ ]:
sourmash_dissim_df.head(2)
In [ ]:
mantel(fastani_dissim_df, sourmash_dissim_df)
In [ ]:
mantel(fastani_dissim_df, ukhs_dissim_df)
In [ ]:
mantel(ukhs_dissim_df, sourmash_dissim_df)
In [ ]:
ukhs_df, ukhs_dissim_df = load_ukhs_sigs_matrix(glob('ecoli_many_sigs/*.uksig.json'))
ukhs_dissim_df.drop('NC_027387.1.fa', inplace=True)
ukhs_dissim_df.drop('NC_027387.1.fa', inplace=True, axis=1)
In [ ]:
for W in [31, 51]:
for K in [7, 8, 9]:
files = glob('/work/scihack/2019-03-08-scihack/results/ecoli/{0}/{1}/*.sig'.format(W, K))
edf, eddf = load_ukhs_sigs_matrix(files)
eddf.drop('NC_027387.1.fa', inplace=True)
eddf.drop('NC_027387.1.fa', inplace=True, axis=1)
print(W, K, mantel(fastani_dissim_df, eddf))
print(W, K, KennyT(eddf.values.flatten(), fastani_dissim_df.values.flatten()))
print()
In [ ]:
#with FigureManager(figsize=(15,15), show=True) as (fig, ax):
jp = sns.jointplot(x='ukhs', y='ANI', height=12,
data=pd.DataFrame({'ukhs': ukhs_dissim_df.values.flatten(),
'ANI': fastani_dissim_df.values.flatten()}) )
In [ ]:
jp.savefig('ukhs_ani_joint.png')
In [ ]:
sns.clustermap(fastani_dissim_df, square=True, xticklabels=False, figsize=(15,15))
In [ ]:
ukhs_cm = sns.clustermap(ukhs_dissim_df, square=True, xticklabels=False, figsize=(15,15))
ukhs_cm.savefig('ecoli.ukhssig.clustered.png')
In [ ]:
sns.clustermap(sourmash_dissim_df, square=True, xticklabels=False)
In [ ]:
with FigureManager(figsize=(18, 8), show=True) as (fig, ax):
sns.heatmap(ecoli_sig_data.clip(upper=1000), xticklabels=False, ax=ax)
In [ ]:
sns.clustermap(ecoli_sig_data.clip(upper=1000), cbar=False,
xticklabels=False, figsize=(20,12))
In [10]:
hu_files = sorted(list(listfiles('/work/scihack/2019-03-08-scihack/results/hu/31/9/{hu_id}.sigs.csv')), key=lambda item: item[1].hu_id)
hu_data = {}
for fn, wildcards in hu_files:
df, ddf = get_streaming_distances(fn)
hu_data[wildcards.hu_id] = {'sigs': df, 'distances': ddf}
In [12]:
ncols, nrows = 3, 8
with FigureManager(figsize=(15,48), nrows=nrows, ncols=ncols, show=True) as (fig, ax):
for i, (hu_id, data) in enumerate(hu_data.items()):
row = i // ncols
col = i % ncols
try:
sns.lineplot(x='read_n', y='d_distance', ax=ax[row][col], data=data['distances'])
ax[row][col].set_title(hu_id)
except:
pass
In [11]:
df, ddf = get_streaming_distances('/work/scihack/2019-03-08-scihack/results/mmetsp/31/9/SRR1300445.sigs.csv')
In [12]:
with FigureManager(figsize=(12,8), show=True) as (fig, ax):
df.sort_values(df.index[-1], axis=1, ascending=False).T.iloc[:100].T.plot(ax=ax, legend=False)
In [ ]:
log_df = np.log2(mmetsp_streaming_df)
In [ ]:
log_df.values[np.isneginf(log_df.values)] = 0
In [ ]:
with FigureManager(figsize=(20, 10), show=True, filename='Ast_gla.streamed.sigs.sorted.png') as (fig, ax):
sns.heatmap(log_df.sort_values(13221196, axis=1), xticklabels=False, ax=ax)
In [ ]:
clst = sns.clustermap(log_df, col_cluster=True, row_cluster=False, xticklabels=False, figsize=(15,15))
In [ ]:
clst.savefig('Ast_gla.streamed.sigs.clustered.png')
In [ ]:
hu_df, hu_dissim_df = load_ukhs_sigs_matrix(glob('/work/scihack/2019-03-08-scihack/results/hu/31/9/*.sig'))
In [ ]:
hu_cm = sns.clustermap(hu_dissim_df, square=True, xticklabels=False, figsize=(15,15))
hu_cm.savefig('hu.ukhssig.clustered.png')
In [ ]:
assemblies = glob('/store/public/sequencing-runs/MMETSP/data/assemblies/*.fasta')
In [ ]:
mmetsp_sigs, mmetsp_sig_df, mmetsp_dist_df = build_sigs(assemblies[:20])
In [ ]:
with FigureManager(figsize=(8, 20), show=True) as (fig, ax):
sns.heatmap(mmetsp_sig_df.T.clip(upper=10000), yticklabels=False, ax=ax)
In [ ]:
sns.clustermap(mmetsp_sig_df.T.clip(upper=10000), yticklabels=False, figsize=(8,10), row_cluster=True)
In [ ]:
n_sigs = len(sigs)
with FigureManager(figsize=(12, 2 * n_sigs), nrows=n_sigs, show=True) as (fig, ax):
for n, (gen, name, sig) in enumerate(sigs):
sns.heatmap(sig.T, xticklabels=False, yticklabels=False, cbar=False, ax=ax[n])
In [ ]: