In [1]:
%run ~/relmapping/annot/notebooks/__init__.ipynb


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping

In [2]:
for stage in config['stages']:
    fp_fwd = 'lcap808_geo/tracks_linear_fwd/lcap_%s_linear_fwd.bw' % (stage,)
    fp_rev = 'lcap808_geo/tracks_linear_rev/lcap_%s_linear_rev.bw' % (stage,)
    !md5sum {fp_fwd} #| awk '{{print $$1}}'
    !md5sum {fp_rev} #| awk '{{print $$1}}'


d55698d8accd71607d134c08b8bd2690  lcap808_geo/tracks_linear_fwd/lcap_wt_emb_linear_fwd.bw
dcc52795f635366e375cfccc81b21e7f  lcap808_geo/tracks_linear_rev/lcap_wt_emb_linear_rev.bw
5535e09dd9e508380974d6030ada2fea  lcap808_geo/tracks_linear_fwd/lcap_wt_l1_linear_fwd.bw
4554bdf1bdb86b9ff58057cbc0753e1e  lcap808_geo/tracks_linear_rev/lcap_wt_l1_linear_rev.bw
0a13890d7c131e39e37fc9d36bfc6f78  lcap808_geo/tracks_linear_fwd/lcap_wt_l2_linear_fwd.bw
9774f077750b061d65a8c2e67fcbf2ec  lcap808_geo/tracks_linear_rev/lcap_wt_l2_linear_rev.bw
073f73ef3e76e97b45a591903133e24a  lcap808_geo/tracks_linear_fwd/lcap_wt_l3_linear_fwd.bw
a8c38368f2bf7af7b1a396da853f9d07  lcap808_geo/tracks_linear_rev/lcap_wt_l3_linear_rev.bw
95ba34a590659fb56bf9c1ff7aab0b4b  lcap808_geo/tracks_linear_fwd/lcap_wt_l4_linear_fwd.bw
74e1a1e869ebce339b80d4a8e9401e3e  lcap808_geo/tracks_linear_rev/lcap_wt_l4_linear_rev.bw
8e12b40588ba3b9c7be7108a80422675  lcap808_geo/tracks_linear_fwd/lcap_wt_ya_linear_fwd.bw
86c3d83b5150556bb41d46cf47941a88  lcap808_geo/tracks_linear_rev/lcap_wt_ya_linear_rev.bw
9b7a4bbcdf3d3e711f06e6bffc7b1359  lcap808_geo/tracks_linear_fwd/lcap_glp1_d1_linear_fwd.bw
4ddd5ccd5821104ffa29b6263a1597a7  lcap808_geo/tracks_linear_rev/lcap_glp1_d1_linear_rev.bw
1f79baf1ff46cd72344007164d1555a0  lcap808_geo/tracks_linear_fwd/lcap_glp1_d2_linear_fwd.bw
758c430dac37973f4cce748cb907f9d6  lcap808_geo/tracks_linear_rev/lcap_glp1_d2_linear_rev.bw
86888ab69cf1faa243df47eb88ab9ac9  lcap808_geo/tracks_linear_fwd/lcap_glp1_d6_linear_fwd.bw
25e48f96bb025555f02ca542e06c99ae  lcap808_geo/tracks_linear_rev/lcap_glp1_d6_linear_rev.bw
6f06c89497e33fdb1d5f46b349a96bd9  lcap808_geo/tracks_linear_fwd/lcap_glp1_d9_linear_fwd.bw
6bd55eb64fce0ee2860dbe6db8c2a161  lcap808_geo/tracks_linear_rev/lcap_glp1_d9_linear_rev.bw
9ada731c47c56e3076e48c9a55c24de1  lcap808_geo/tracks_linear_fwd/lcap_glp1_d13_linear_fwd.bw
c4dd557df66c1416f38c040787d0cc6c  lcap808_geo/tracks_linear_rev/lcap_glp1_d13_linear_rev.bw

In [3]:
for stage in config['stages']:
    fp_fwd = 'lcap808_geo/tracks_log2_fwd/lcap_%s_log2_fwd.bw' % (stage,)
    fp_rev = 'lcap808_geo/tracks_log2_rev/lcap_%s_log2_rev.bw' % (stage,)
    !md5sum {fp_fwd} #| awk '{{print $$1}}'
    !md5sum {fp_rev} #| awk '{{print $$1}}'


3cb93a6195f6e42ffc99c8f8d4db1ef8  lcap808_geo/tracks_log2_fwd/lcap_wt_emb_log2_fwd.bw
38dafe29d571c329cdf8e8dc9cbd01a1  lcap808_geo/tracks_log2_rev/lcap_wt_emb_log2_rev.bw
8d3bc492cd0c22ee5046747b3ff02d93  lcap808_geo/tracks_log2_fwd/lcap_wt_l1_log2_fwd.bw
64878d1087c12060699980ea9070c125  lcap808_geo/tracks_log2_rev/lcap_wt_l1_log2_rev.bw
c29c5cbfecbf234ffad5a40109b1930f  lcap808_geo/tracks_log2_fwd/lcap_wt_l2_log2_fwd.bw
83f62edf8724e134451684ce23d74c1e  lcap808_geo/tracks_log2_rev/lcap_wt_l2_log2_rev.bw
7b6304577c9fd444ade6a19348cacef7  lcap808_geo/tracks_log2_fwd/lcap_wt_l3_log2_fwd.bw
a6f0f0dacda83170aebc6af14107642a  lcap808_geo/tracks_log2_rev/lcap_wt_l3_log2_rev.bw
401003673997b7c5ce4f66d7aa5fe595  lcap808_geo/tracks_log2_fwd/lcap_wt_l4_log2_fwd.bw
6da981088e12c18b7286809d47aa2a47  lcap808_geo/tracks_log2_rev/lcap_wt_l4_log2_rev.bw
b8f9a1bc2a00fdafec65d34ab0aa2f66  lcap808_geo/tracks_log2_fwd/lcap_wt_ya_log2_fwd.bw
c0c41b63edbf998f1edaf1f450bdbe70  lcap808_geo/tracks_log2_rev/lcap_wt_ya_log2_rev.bw
cf3e921efd643f869c80a19e2aad9aa7  lcap808_geo/tracks_log2_fwd/lcap_glp1_d1_log2_fwd.bw
ec8e553eccfda0790c0cdbd1fe04277d  lcap808_geo/tracks_log2_rev/lcap_glp1_d1_log2_rev.bw
67751a247df712c036197d3956689634  lcap808_geo/tracks_log2_fwd/lcap_glp1_d2_log2_fwd.bw
72825e21a0a3b59fd359e24d5789dabf  lcap808_geo/tracks_log2_rev/lcap_glp1_d2_log2_rev.bw
20d7874ed56f9a43f761aebe3093f7be  lcap808_geo/tracks_log2_fwd/lcap_glp1_d6_log2_fwd.bw
43f161db5383f877547c42f628fbb7d0  lcap808_geo/tracks_log2_rev/lcap_glp1_d6_log2_rev.bw
8f591db69dbdcd7f848920a1cc9fd454  lcap808_geo/tracks_log2_fwd/lcap_glp1_d9_log2_fwd.bw
710429e6573fb3499df50c6c4852b780  lcap808_geo/tracks_log2_rev/lcap_glp1_d9_log2_rev.bw
5ee0f166f696aa0c8540c7ed0ae875a3  lcap808_geo/tracks_log2_fwd/lcap_glp1_d13_log2_fwd.bw
448f76deb0756ae39093daa338647594  lcap808_geo/tracks_log2_rev/lcap_glp1_d13_log2_rev.bw

In [4]:
# file checksum -- read1
# For every sample key in config['lcap808'], print the contents of the text
# file resolved by pf('lcap808_<sample>', 'md5sum_r1', '.txt', 'lcap808').
# NOTE(review): pf is defined outside this cell (presumably loaded by the
# %run of __init__.ipynb) and is assumed to return a file path -- confirm.
for sample in list(config['lcap808'].keys()):
    fp = pf('lcap808_' + sample, 'md5sum_r1', '.txt', 'lcap808')
    # IPython substitutes {fp} before handing the line to the shell; everything
    # after `#` is a shell comment, so the awk column filter is switched off
    # and the whole file content is shown.
    !cat {fp} #| awk '{{print $$1}}'


d0dbf86e849bcbf0ebbafab55c0130bf  samples/lcap808_wt_emb_rep1a.r1.fq.gz
6f7a8c9d7d1260f80ec876318b9356c9  samples/lcap808_wt_emb_rep1b.r1.fq.gz
128d09a932e5118e1d6e8ee43b8c378f  samples/lcap808_wt_emb_rep2.r1.fq.gz
a4f96f57f023020a1de1b95a46f3ac78  samples/lcap808_wt_l1_rep1a.r1.fq.gz
6865caa3084627e0f418fe7911a90258  samples/lcap808_wt_l1_rep1b.r1.fq.gz
d04a625527051425427b43a08eeb2a22  samples/lcap808_wt_l1_rep2a.r1.fq.gz
17a18c43302dece5e36da12d165e7fa6  samples/lcap808_wt_l1_rep2b.r1.fq.gz
fbe3fe2cb2e7e462f158226f9d16c56b  samples/lcap808_wt_l2_rep1.r1.fq.gz
320b91361cabea124503a8c173aed5a5  samples/lcap808_wt_l2_rep2a.r1.fq.gz
02f27363b55702fd35f523236fc7bc83  samples/lcap808_wt_l2_rep2b.r1.fq.gz
571544a35151d0d50212025703b89948  samples/lcap808_wt_l3_rep1a.r1.fq.gz
34da5ce08760f6e9b4fef15964cd1b14  samples/lcap808_wt_l3_rep1b.r1.fq.gz
e16c8a40a06753f005535323bba631a6  samples/lcap808_wt_l3_rep2a.r1.fq.gz
6a9ed1fcad605e2f17d297054a2f172d  samples/lcap808_wt_l3_rep2b.r1.fq.gz
c4126778af2b18042208d38de4112dd2  samples/lcap808_wt_l4_rep1a.r1.fq.gz
e6f770287d3650f13c411429b788fccf  samples/lcap808_wt_l4_rep1b.r1.fq.gz
0a12c41de1bb6275731010f51b977a62  samples/lcap808_wt_l4_rep2a.r1.fq.gz
d1067f3805f879c80537bf8a2fbf8acd  samples/lcap808_wt_l4_rep2b.r1.fq.gz
7c485078f6d7d2a30547c199580af182  samples/lcap808_wt_ya_rep1a.r1.fq.gz
a05ebb30e2883eabd5d6fc79a73820c9  samples/lcap808_wt_ya_rep1b.r1.fq.gz
512d99524766f7c4b551f127f3f19d66  samples/lcap808_wt_ya_rep2a.r1.fq.gz
15e6188c6759fe5376ca26be441724c9  samples/lcap808_wt_ya_rep2b.r1.fq.gz
f1b5d4a21a586059f5fa3c4e4ad5e428  samples/lcap808_glp1_d1_rep1.r1.fq.gz
182742e7d22a1e2a2018ed65a9c0d3d8  samples/lcap808_glp1_d1_rep2.r1.fq.gz
e27fcfa43f63bdcc4f0c495e50c35d04  samples/lcap808_glp1_d2_rep1.r1.fq.gz
2295cca8d274405b17eb6d78640d2e76  samples/lcap808_glp1_d2_rep2.r1.fq.gz
f2d4ad84ee73beba18d386e393817277  samples/lcap808_glp1_d6_rep1.r1.fq.gz
fc21f935afb4edd936cbfd2d0ee4bcae  samples/lcap808_glp1_d6_rep2.r1.fq.gz
42e1374098efd876aeb14da71c91614d  samples/lcap808_glp1_d9_rep1.r1.fq.gz
d6cb37ec9f0635bd75934e1b02461781  samples/lcap808_glp1_d9_rep2.r1.fq.gz
0197aabab20686876dbcf321611f1ec7  samples/lcap808_glp1_d13_rep1.r1.fq.gz
37aaf691e10e4a59d3d5d003e1a980a3  samples/lcap808_glp1_d13_rep2.r1.fq.gz

In [5]:
# file checksum -- read2
# For every sample key in config['lcap808'], print the contents of the text
# file resolved by pf('lcap808_<sample>', 'md5sum_r2', '.txt', 'lcap808').
# NOTE(review): pf is defined outside this cell (presumably loaded by the
# %run of __init__.ipynb) and is assumed to return a file path -- confirm.
for sample in list(config['lcap808'].keys()):
    fp = pf('lcap808_' + sample, 'md5sum_r2', '.txt', 'lcap808')
    # IPython substitutes {fp} before handing the line to the shell; everything
    # after `#` is a shell comment, so the awk column filter is switched off
    # and the whole file content is shown.
    !cat {fp} #| awk '{{print $$1}}'


c7595ce075e9ec682b60484c920407d2  samples/lcap808_wt_emb_rep1a.r2.fq.gz
8870fab098e904116b788499ee41bb3f  samples/lcap808_wt_emb_rep1b.r2.fq.gz
074be32ca5a20f55e8c74afdac58c0ce  samples/lcap808_wt_emb_rep2.r2.fq.gz
85e5bf59c0e791c90adba50381d83139  samples/lcap808_wt_l1_rep1a.r2.fq.gz
3a1b19d0bd0ddfe6a3a399020bc5735c  samples/lcap808_wt_l1_rep1b.r2.fq.gz
7f3b670e70174430b208f952d9fa2213  samples/lcap808_wt_l1_rep2a.r2.fq.gz
aab0487b2f5b7e76a30a460c50d306c1  samples/lcap808_wt_l1_rep2b.r2.fq.gz
588d7d64d1eee8f43a08c7aa3601b474  samples/lcap808_wt_l2_rep1.r2.fq.gz
fa19ad5fe577ec1d7ed423611cbbbade  samples/lcap808_wt_l2_rep2a.r2.fq.gz
32acda34ac1f49073d18fd35fdab9747  samples/lcap808_wt_l2_rep2b.r2.fq.gz
f27ec164ec177d70c700b1c42b7eae11  samples/lcap808_wt_l3_rep1a.r2.fq.gz
b02c87d9c77464c513a5a2e959468733  samples/lcap808_wt_l3_rep1b.r2.fq.gz
8c7e0f45a7f69a8ff3e5e338402daefb  samples/lcap808_wt_l3_rep2a.r2.fq.gz
6d7526195d56c0463cb34db841431eff  samples/lcap808_wt_l3_rep2b.r2.fq.gz
bc49da7721dae000c526c73da56e3654  samples/lcap808_wt_l4_rep1a.r2.fq.gz
559e9c1bf23c1f224bc8f9f4cd715efd  samples/lcap808_wt_l4_rep1b.r2.fq.gz
2f6fb465d95db00d210575d5cf6b80d7  samples/lcap808_wt_l4_rep2a.r2.fq.gz
761dbc1ac9c9c56d6121c9647eec4d73  samples/lcap808_wt_l4_rep2b.r2.fq.gz
6beb66dd3ac04a07712d35f65ac7fab7  samples/lcap808_wt_ya_rep1a.r2.fq.gz
a5f8b9f5e44d823afbe8ded149cebb69  samples/lcap808_wt_ya_rep1b.r2.fq.gz
f1d19af3fcfec03a517365113078d634  samples/lcap808_wt_ya_rep2a.r2.fq.gz
12216efec73293702871795215c2df04  samples/lcap808_wt_ya_rep2b.r2.fq.gz
d739d930a6119e1372869e230029cc8f  samples/lcap808_glp1_d1_rep1.r2.fq.gz
f9116b1cc8ce8d1e143f0534acf257b3  samples/lcap808_glp1_d1_rep2.r2.fq.gz
87ba1a6392a14bb300869bb40d5e82dc  samples/lcap808_glp1_d2_rep1.r2.fq.gz
b92d17eb3313b9b4d3e1aaf75f8ddefb  samples/lcap808_glp1_d2_rep2.r2.fq.gz
d0f8ff2e6ee3277b0e61e85a96f251bc  samples/lcap808_glp1_d6_rep1.r2.fq.gz
25b3225ed4d51111a1c646664ab11292  samples/lcap808_glp1_d6_rep2.r2.fq.gz
e3acccd0a960942f5ce0efcd138f9632  samples/lcap808_glp1_d9_rep1.r2.fq.gz
dac7dcb0a429e5f89a3818e352cb1a6e  samples/lcap808_glp1_d9_rep2.r2.fq.gz
391a20f261a2fd4a6376736323db5b02  samples/lcap808_glp1_d13_rep1.r2.fq.gz
6773236943e503fd66c817c4b428423f  samples/lcap808_glp1_d13_rep2.r2.fq.gz

In [6]:
# read length -- read1
# For every sample key in config['lcap808'], print the contents of the text
# file resolved by pf('lcap808_<sample>', 'readlen_r1', '.txt', 'lcap808').
# NOTE(review): pf is defined outside this cell (presumably loaded by the
# %run of __init__.ipynb) and is assumed to return a file path -- confirm.
for sample in list(config['lcap808'].keys()):
    fp = pf('lcap808_' + sample, 'readlen_r1', '.txt', 'lcap808')
    # IPython substitutes {fp} before handing the line to the shell; everything
    # after `#` is a shell comment, so the awk column filter is switched off.
    !cat {fp} #| awk '{{print $$1}}'
    # Emit a line break after each file's contents so the values do not run
    # together (presumably the files lack a trailing newline -- confirm).
    print()


100
51
62
50
51
50
51
50
50
51
50
51
50
51
50
51
50
51
50
51
50
51
60
60
60
60
60
60
60
60
60
60

In [7]:
# read length -- read2
# For every sample key in config['lcap808'], print the contents of the text
# file resolved by pf('lcap808_<sample>', 'readlen_r2', '.txt', 'lcap808').
# NOTE(review): pf is defined outside this cell (presumably loaded by the
# %run of __init__.ipynb) and is assumed to return a file path -- confirm.
for sample in list(config['lcap808'].keys()):
    fp = pf('lcap808_' + sample, 'readlen_r2', '.txt', 'lcap808')
    # IPython substitutes {fp} before handing the line to the shell; everything
    # after `#` is a shell comment, so the awk column filter is switched off.
    !cat {fp} #| awk '{{print $$1}}'
    # Emit a line break after each file's contents so the values do not run
    # together (presumably the files lack a trailing newline -- confirm).
    print()


100
51
62
50
51
50
51
50
50
51
50
51
50
51
50
51
50
51
50
51
50
51
60
60
60
60
60
60
60
60
60
60

In [8]:
# mean read length
# Count-weighted mean of the `size` column of each sample's `.fsizes.txt`
# table, read as two tab-separated, header-less columns (size, count).
# NOTE(review): the `fsizes` suffix suggests these are fragment sizes rather
# than raw read lengths -- confirm that the "read length" label is intended.
for sample in itertools.islice(config['lcap808'].keys(), None):  # stop=None -> every sample
    fp = pf('lcap808_' + sample, 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.fsizes', '.txt', 'lcap808')
    df = pd.read_csv(fp, sep='\t', names=['size', 'count'])
    # np.average works on the (size, count) histogram directly, so the table is
    # not expanded into one list entry per observation.
    mean1 = np.average(df['size'], weights=df['count'])
    # '%d' truncates the mean to an integer for display.
    print('%d' % (mean1,))


208
209
225
244
245
278
280
284
294
294
307
306
280
282
274
275
304
308
290
292
309
310
293
316
306
288
313
310
299
274
279
310

In [9]:
# Population standard deviation (ddof=0, the np.std default) of the `size`
# column, weighted by `count`, for each sample's `.fsizes.txt` table, read as
# two tab-separated, header-less columns (size, count).
for sample in itertools.islice(config['lcap808'].keys(), None):  # stop=None -> every sample
    fp = pf('lcap808_' + sample, 'trim20.bwa_pe.rm_unmapped_pe.rm_chrM.rm_rRNA_broad.rm_blacklist.rm_q10.fsizes', '.txt', 'lcap808')
    df = pd.read_csv(fp, sep='\t', names=['size', 'count'])
    # Compute the spread on the histogram itself: sqrt of the count-weighted
    # mean squared deviation. This equals np.std of the expanded per-observation
    # list but needs memory proportional to the number of distinct sizes only.
    fsize_mean = np.average(df['size'], weights=df['count'])
    fsize_std = np.sqrt(np.average((df['size'] - fsize_mean) ** 2, weights=df['count']))
    # '%d' truncates the value to an integer for display.
    print('%d' % (fsize_std,))


110
111
123
128
128
161
162
171
170
169
195
194
163
164
151
151
175
178
172
172
201
201
207
214
216
178
223
217
209
176
177
203