In [1]:
# Execute the shared setup notebook in this kernel's namespace.
# NOTE(review): the cells below use pd, os, itertools, collections, hts and
# `config` without defining them, so they are presumably provided here -- TODO confirm.
# The relative 'scap815_geo/...' paths used below resolve against the working directory.
%run ~/relmapping/annot/notebooks/__init__.ipynb


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping

In [2]:
# Two-column table: one row per (label, bid) entry of config['scap815'].
labels_ = pd.DataFrame.from_records(
    list(config['scap815'].items()),
    columns=['label', 'bid'],
)

# Per-library annotations, one row per bid, restricted to strain N2.
# `qc_fail != qc_fail` holds only where qc_fail is NaN (NaN is unequal to itself),
# so this keeps the rows whose qc_fail is NaN.
annot_ = pd.DataFrame(config['scap']).transpose().query('(strain == "N2") & (qc_fail != qc_fail)')

# Inner-join labels onto annotations (annotation index == bid) and keep the summary columns.
df_ = labels_.merge(annot_, left_on='bid', right_index=True)[
    ['label', 'bid', 'collection_id', 'enzyme', 'library_series_id']
]
df_


Out[2]:
label bid collection_id enzyme library_series_id
0 wt_emb_rep1 HS182_JA25-N2-scRNA-S4r JA25 TAP EMB-JA25-scRNA-S4r_unsure
1 wt_emb_rep2 HS182_JA26r-N2-scRNA-S6r JA26 TAP EMB-JA26r-scRNA-S6r_unsure
2 wt_emb_rep3a chen13_scap_rep1a RonEMB1 TAP EMB-RonEMB1-N2-scRNA-S1_unsure1
3 wt_emb_rep3b chen13_scap_rep1b_trim20 RonEMB1 TAP EMB-RonEMB1-N2-scRNA-S1_unsure2
4 wt_emb_rep4 chen13_scap_rep2 RonEMB2 TAP EMB-RonEMB2-N2-scRNA-S1_unsure
5 wt_l1_rep1a HS571_JA2_N2_scRNA JA2 TAP L1-JA2-N2-scRNA-HS571
6 wt_l1_rep1b HS578_JA2r_N2_scRNA JA2 TAP L1-JA2-N2-scRNA-HS578
7 wt_l1_rep2 HS604_scRNA_JA6_L1 JA6 TAP L1-JA6-N2-scRNA-HS604
8 wt_l2_rep1a HS571_JA1_N2_scRNA JA1 TAP L2-JA1-N2-scRNA-HS571
9 wt_l2_rep1b HS582_JA1r_N2_scRNA JA1 TAP L2-JA1-N2-scRNA-HS582
10 wt_l2_rep2a HS308_JA7_scRNA_S2 JA7 Cap-Clip L2-JA7-N2-scRNA-S2
11 wt_l2_rep2b HS333_crb55_JA7 JA7 Cap-Clip L2-JA7-N2-scRNA-S2r
12 wt_l3_rep1 chen13_scap_extra_l3 JA34 TAP L3-JA34-N2-scRNA-S1
13 wt_l3_rep2a HS252_Yan_S1 JA22 Dpph L3-JA22-scRNA-S1
14 wt_l3_rep2b HS252_Yan_S2 JA22 Cap-Clip L3-JA22-scRNA-S2
15 wt_l4_rep1 HS604_scRNA_JA3_L4 JA3 TAP L4-JA3-N2-scRNA-HS604
16 wt_l4_rep2a HS571_JA9_N2_scRNA JA9 TAP L4-JA9-N2-scRNA-HS571
17 wt_l4_rep2b HS578_JA9r_N2_scRNA JA9 TAP L4-JA9-N2-scRNA-HS578
18 wt_ya_rep1 weick2014_ya_n2_tap_extra RonYA1 TAP YA-RonYA1-N2-scRNA-S1_unsure
19 wt_ya_rep2 HS053_Yan-02_S16 RonYA2 TAP YA-RonYA2-N2-scRNA-S2_1

In [3]:
# bigWig checksums
for stage in ['wt_all'] + config['stages_wt']:
    #print(stage)
    fp = 'scap815_geo/tracks_fwd/scap_%s_fwd.bw' % (stage,)
    !md5sum {fp} | awk '{{print $$1}}'
    fp = 'scap815_geo/tracks_rev/scap_%s_rev.bw' % (stage,)
    !md5sum {fp} | awk '{{print $$1}}'


b973e61d76e7ee633d40df72763151cc
3e842f9c22617541d093677c6cb000a2
75b69a4cd24d7ed2ce0b6f83ae76816b
a0599b9cbab8301362bc4a679adeb8df
40a5d717642632b2600fe36cda27f06b
c96cb8e3ae406661507fbc85500e9f14
f553246fe61edb5487a7aeffcd828dfb
5da9a1e1178e547dd1e80387727f4573
bb68a74e663d27460d35d567fb74e01f
024f8f9321e809afa90e108e642cc37e
9da7b2d9c59094b2f10561ac6a258b43
c3153cf327c6a3d3a864c6929ba92fc1
b1a8bbcb998107f610486e2f018a07c3
477b6f14c7485ed0a8eafa523bc7a9f4

In [4]:
# fastq files -- md5 checksums
l_geo = ['wt_emb_rep1', 'wt_emb_rep2',
            'wt_l1_rep1a', 'wt_l1_rep1b', 'wt_l1_rep2',
            'wt_l2_rep1a', 'wt_l2_rep1b', 'wt_l2_rep2a', 'wt_l2_rep2b',
            'wt_l3_rep1', 'wt_l3_rep2a', 'wt_l3_rep2b',
            'wt_l4_rep1', 'wt_l4_rep2a', 'wt_l4_rep2b',
            'wt_ya_rep1', 'wt_ya_rep2']

for rep in itertools.islice(l_geo, None):
    fp = 'scap815_geo/reads/scap_%s.fastq.gz' % (rep,)
    print(os.path.basename(fp))

for rep in itertools.islice(l_geo, None):
    fp = 'scap815_geo/reads/scap_%s.fastq.gz' % (rep,)
    !md5sum {fp} | awk '{{print $$1}}'


scap_wt_emb_rep1.fastq.gz
scap_wt_emb_rep2.fastq.gz
scap_wt_l1_rep1a.fastq.gz
scap_wt_l1_rep1b.fastq.gz
scap_wt_l1_rep2.fastq.gz
scap_wt_l2_rep1a.fastq.gz
scap_wt_l2_rep1b.fastq.gz
scap_wt_l2_rep2a.fastq.gz
scap_wt_l2_rep2b.fastq.gz
scap_wt_l3_rep1.fastq.gz
scap_wt_l3_rep2a.fastq.gz
scap_wt_l3_rep2b.fastq.gz
scap_wt_l4_rep1.fastq.gz
scap_wt_l4_rep2a.fastq.gz
scap_wt_l4_rep2b.fastq.gz
scap_wt_ya_rep1.fastq.gz
scap_wt_ya_rep2.fastq.gz
3a68696ad3e2d493738c750ff7ba815c
4ecf534a4db891050bcb11a7a70e4965
9e978c41e0cf0a7ccdb4fdcbbc6e5e1c
80e95c186a5a984f693297a04aaeb325
11cff21103c21b75efac2fce3c0ef7d3
741dca9430c674588ba94dde7d9e0913
53f92fcdff11076349951afcf07c2227
31c784c62ce4cd7b48d262c99fcc446b
ec4d45718b9a464f0ed61d335c1c9acc
85f7a875e6b4a9fcb46d499fc8b805e0
ef1c0af1cfd39a722761d5f23056172d
0d400d422d38c6ad4c5079d3d67792ee
358dee4c06acead3f7fa2655460f2159
2dd4425efdff895e7c98a6bf50b41e05
58157f1be1344c06579e98fbe6a5ba43
04bed45d1dd30b2b3b0f7b108e5a207d
460420bffd978fdc681cbf1c46e13de1

In [5]:
# Fastq read lengths
def readlen_(fp_inp, n_reads=int(1e6)):
    """Return the longest record length among the first `n_reads` records of a fastq file.

    Note that this is the *maximum* observed length, not the most common one.

    Args:
        fp_inp: Path handed unchanged to `hts.FastqReader`.
        n_reads: Number of leading records to inspect (default: 1M); `None`
            inspects every record.

    Returns:
        The largest `len()` over the inspected records.

    Raises:
        ValueError: If the reader yields no records.
    """
    # assumes len() of each item yielded by hts.FastqReader is the read's
    # sequence length -- TODO confirm against the reader implementation
    lengths = map(len, itertools.islice(hts.FastqReader(fp_inp), n_reads))
    longest = max(lengths, default=None)
    if longest is None:
        raise ValueError('no reads found in %s' % (fp_inp,))
    return longest

# One read length per GEO replicate, in l_geo order
for rep in l_geo:
    print(readlen_('scap815_geo/reads/scap_%s.fastq.gz' % (rep,)))


100
100
51
51
51
51
51
51
36
51
36
36
51
51
51
40
36

In [ ]: