In [1]:
%pylab inline
%config InlineBackend.figure_format = 'retina'


Populating the interactive namespace from numpy and matplotlib

In [2]:
import pandas as pd
import seaborn as sns

In [3]:
k35_df = pd.read_csv('mmetsp/Asterionellopsis_glacialis/k35/decision_nodes.csv', skipinitialspace=True)
k27_df = pd.read_csv('mmetsp/Asterionellopsis_glacialis/k27/decision_nodes.csv', skipinitialspace=True)

In [4]:
k35_df.head()


Out[4]:
read_n l_degree r_degree position hash
0 414 2 1 9 4552537295491192989
1 555 2 1 3 10400599629126302337
2 917 2 1 5 8559623946996443022
3 1056 1 2 12 9867647372000252236
4 1929 2 1 6 10446294276324287310

We can find the number of decision nodes in the dBG by counting unique hashes...


In [101]:
k27_df.hash.nunique(), k35_df.hash.nunique()


Out[101]:
(1465148, 966286)

We'll make a new column for total degree, for convenience.


In [102]:
k35_df['degree'] = k35_df['l_degree'] + k35_df['r_degree']
k27_df['degree'] = k27_df['l_degree'] + k27_df['r_degree']

Let's start with the overal degree distribution during the entire construction process.


In [103]:
figsize(18,10)
fig, ax_mat = subplots(ncols=3, nrows=2)
top = ax_mat[0]
sns.distplot(k35_df.degree, kde=False, ax=top[0], bins=8)
sns.distplot(k35_df.l_degree, kde=False, ax=top[1], bins=5)
sns.distplot(k35_df.r_degree, kde=False, ax=top[2], bins=5)

bottom = ax_mat[1]
sns.distplot(k27_df.degree, kde=False, ax=bottom[0], bins=8)
sns.distplot(k27_df.l_degree, kde=False, ax=bottom[1], bins=5)
sns.distplot(k27_df.r_degree, kde=False, ax=bottom[2], bins=5)


/home/camille/miniconda/envs/goetia/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "
Out[103]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5f175dccc0>

So most decision nodes in this dataset have degree 3. Note that a few have degree 2; these forks without handles.


In [6]:
figsize(12,8)
sns.distplot(k35_df.position, kde=False, label='K=35')
sns.distplot(k27_df.position, kde=False, label='K=27')
legend()


/home/camille/miniconda/envs/goetia/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "
Out[6]:
<matplotlib.legend.Legend at 0x7f9a5757e5c0>

In [118]:
k35_melted_df = k35_df.melt(id_vars=['hash', 'position'], value_vars=['l_degree', 'r_degree'], )
k27_melted_df = k27_df.melt(id_vars=['hash', 'position'], value_vars=['l_degree', 'r_degree'], )

k27_melted_df.head()


Out[118]:
hash position variable value
0 9070239862430184705 9 l_degree 2
1 13854235864955274342 3 l_degree 2
2 6350396296750610583 5 l_degree 2
3 18383274751344853795 20 l_degree 1
4 18256312570815508169 16 l_degree 2

In [119]:
figsize(18,8)
sns.violinplot('position', 'value', 'variable', k27_melted_df)


Out[119]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f5f16d4e160>

In [7]:
k35_dnodes_per_read = k35_df.groupby('read_n').count().\
    reset_index()[['read_n', 'hash']].rename({'hash': 'n_dnodes'}, axis='columns')
    
k27_dnodes_per_read = k27_df.groupby('read_n').count().\
    reset_index()[['read_n', 'hash']].rename({'hash': 'n_dnodes'}, axis='columns')

In [9]:
ax = k35_dnodes_per_read.rolling(100, min_periods=10, on='read_n').mean().plot(x='read_n', 
                                                                               y='n_dnodes', 
                                                                               label='k = 35')

ax = k27_dnodes_per_read.rolling(100, min_periods=10, on='read_n').mean().plot(x='read_n', 
                                                                               y='n_dnodes', 
                                                                               label='k = 27', 
                                                                               ax=ax)

ax.xaxis.set_major_formatter(mpl.ticker.StrMethodFormatter("{x:,}"))



In [1]:
from goetia.minimizers import WKMinimizer
from goetia.processors import MinimizerProcessor

In [6]:
M = WKMinimizer(10, 25)
S = "GACAACGGTAAAAGTTCTAATGCTGCCGAGTCACGGGAAGGATAGAGTGAGTCCCACCATATGGCGCACC"

In [7]:
print(S)
for kmer, pos in M.get_minimizer_kmers(S):
    print(pos * ' ', kmer, sep='')


GACAACGGTAAAAGTTCTAATGCTGCCGAGTCACGGGAAGGATAGAGTGAGTCCCACCATATGGCGCACC
     CGGTAAAAGTTCTAATGCTGCCGAG
         AAAAGTTCTAATGCTGCCGAGTCAC
                  AATGCTGCCGAGTCACGGGAAGGAT
                      CTGCCGAGTCACGGGAAGGATAGAG
                         CCGAGTCACGGGAAGGATAGAGTGA
                                  GGGAAGGATAGAGTGAGTCCCACCA
                                   GGAAGGATAGAGTGAGTCCCACCAT
                                        GATAGAGTGAGTCCCACCATATGGC

In [2]:
%%time 
proc = MinimizerProcessor(25, 25, 'minimizers.csv')
proc.process('/store/biodb/genomes/fugu/Takifugu_rubripes.FUGU5.dna_rm.toplevel.fa', output_interval=1)


CPU times: user 51.8 s, sys: 1min 10s, total: 2min 1s
Wall time: 2min 6s

In [ ]: