In [1]:
# NOTE(review): %pylab is deprecated in IPython — it star-imports numpy and
# matplotlib.pyplot into the global namespace; later cells (figsize, subplots,
# legend, ...) rely on that. Prefer %matplotlib inline + explicit imports in
# new work.
%pylab inline
# Render inline figures at double resolution for high-DPI ("retina") displays.
%config InlineBackend.figure_format = 'retina'
In [2]:
import pandas as pd
import seaborn as sns
In [3]:
# Load the decision-node tables produced at two k-mer sizes for
# Asterionellopsis glacialis; skipinitialspace strips the blank that
# follows each comma in the CSV.
csv_template = 'mmetsp/Asterionellopsis_glacialis/{k}/decision_nodes.csv'
k35_df = pd.read_csv(csv_template.format(k='k35'), skipinitialspace=True)
k27_df = pd.read_csv(csv_template.format(k='k27'), skipinitialspace=True)
In [4]:
# Sanity check: preview the first rows of the k=35 table (columns used below
# include hash, position, l_degree, r_degree, read_n).
k35_df.head()
Out[4]:
We can find the number of decision nodes in the de Bruijn graph (dBG) by counting unique hashes...
In [101]:
# Number of distinct decision nodes (unique hashes) at each k.
(k27_df['hash'].nunique(), k35_df['hash'].nunique())
Out[101]:
We'll make a new column for total degree, for convenience.
In [102]:
# Convenience column: total degree is the sum of left and right degree.
for frame in (k35_df, k27_df):
    frame['degree'] = frame['l_degree'] + frame['r_degree']
Let's start with the overall degree distribution during the entire construction process.
In [103]:
# Degree distributions over the whole construction: total, left, and right
# degree, for k=35 (top row) and k=27 (bottom row).
figsize(18, 10)
fig, ax_mat = subplots(ncols=3, nrows=2)

# sns.distplot was deprecated in seaborn 0.11 and removed in 0.14;
# histplot with its default stat='count' reproduces distplot(kde=False).
top = ax_mat[0]
sns.histplot(k35_df.degree, ax=top[0], bins=8)
sns.histplot(k35_df.l_degree, ax=top[1], bins=5)
sns.histplot(k35_df.r_degree, ax=top[2], bins=5)

bottom = ax_mat[1]
sns.histplot(k27_df.degree, ax=bottom[0], bins=8)
sns.histplot(k27_df.l_degree, ax=bottom[1], bins=5)
sns.histplot(k27_df.r_degree, ax=bottom[2], bins=5)
Out[103]:
So most decision nodes in this dataset have degree 3. Note that a few have degree 2; these are forks without handles.
In [6]:
# Where along the reads do decision nodes occur? Overlay k=35 and k=27.
figsize(12, 8)
# distplot is deprecated/removed in modern seaborn; histplot is the
# count-histogram replacement for distplot(kde=False).
sns.histplot(k35_df.position, label='K=35')
sns.histplot(k27_df.position, label='K=27')
legend()
Out[6]:
In [118]:
# Reshape to long form: one row per (node, side), with l_degree / r_degree
# stacked into the default `variable` / `value` columns.
k35_melted_df = pd.melt(
    k35_df,
    id_vars=['hash', 'position'],
    value_vars=['l_degree', 'r_degree'],
)
k27_melted_df = pd.melt(
    k27_df,
    id_vars=['hash', 'position'],
    value_vars=['l_degree', 'r_degree'],
)
k27_melted_df.head()
Out[118]:
In [119]:
# Violin plots of left/right degree as a function of position in the read.
figsize(18, 8)
# Pass seaborn arguments by keyword: passing more than one positional
# argument was deprecated in seaborn 0.12 and now raises a TypeError.
sns.violinplot(x='position', y='value', hue='variable', data=k27_melted_df)
Out[119]:
In [7]:
def dnodes_per_read(df):
    """Count decision nodes per read: one row per read_n with an n_dnodes column."""
    counts = df.groupby('read_n').count().reset_index()
    return counts[['read_n', 'hash']].rename(columns={'hash': 'n_dnodes'})

k35_dnodes_per_read = dnodes_per_read(k35_df)
k27_dnodes_per_read = dnodes_per_read(k27_df)
In [9]:
# 100-read rolling mean of decision nodes per read, for both k sizes.
# NOTE(review): `mpl` does not appear to be bound by %pylab (which defines
# `matplotlib`, `np`, `plt`, ...) — import the ticker module explicitly so
# this cell is self-contained.
from matplotlib import ticker as mticker

ax = k35_dnodes_per_read.rolling(100, min_periods=10, on='read_n').mean().plot(
    x='read_n', y='n_dnodes', label='k = 35')
ax = k27_dnodes_per_read.rolling(100, min_periods=10, on='read_n').mean().plot(
    x='read_n', y='n_dnodes', label='k = 27', ax=ax)
# Thousands separator on the x axis (read number).
ax.xaxis.set_major_formatter(mticker.StrMethodFormatter("{x:,}"))
In [1]:
from goetia.minimizers import WKMinimizer
from goetia.processors import MinimizerProcessor
In [6]:
# Build a minimizer and a short test DNA sequence.
# NOTE(review): argument order assumed to be (window size, k-mer size) from
# the WKMinimizer name — confirm against the goetia documentation.
M = WKMinimizer(10, 25)
S = "GACAACGGTAAAAGTTCTAATGCTGCCGAGTCACGGGAAGGATAGAGTGAGTCCCACCATATGGCGCACC"
In [7]:
# Print the sequence, then each minimizer k-mer indented so it lines up
# beneath its position in the sequence.
print(S)
for kmer, pos in M.get_minimizer_kmers(S):
    print(f"{' ' * pos}{kmer}")
In [2]:
%%time
# Time a full minimizer pass over the fugu genome assembly, writing results
# to minimizers.csv. NOTE(review): output_interval presumably controls how
# often progress/output is emitted — confirm against the goetia docs.
# NOTE(review): hardcoded absolute path — move to a DATA_DIR-style config
# constant so the notebook can run on other machines.
proc = MinimizerProcessor(25, 25, 'minimizers.csv')
proc.process('/store/biodb/genomes/fugu/Takifugu_rubripes.FUGU5.dna_rm.toplevel.fa', output_interval=1)
In [ ]: