In [1]:
%run ~/relmapping/annot/notebooks/__init__.ipynb
In [2]:
# Table of the scap815 libraries: one row per (label, bid) pair taken from
# config['scap815'], joined on bid to the matching row built from
# config['scap'], and reduced to the five columns of interest.
df_ = (
    pd.DataFrame.from_records(
        list(config['scap815'].items()),
        columns=['label', 'bid'],
    )
    .merge(
        # Transposing turns the top-level keys of config['scap'] into the
        # index that the join matches against (presumably library ids --
        # confirm against the config file). The query keeps N2 rows whose
        # qc_fail is missing: a missing value compares unequal to itself.
        pd.DataFrame(config['scap']).T.query('(strain == "N2") & (qc_fail != qc_fail)'),
        left_on='bid',
        right_index=True,
    )
    .loc[:, ['label', 'bid', 'collection_id', 'enzyme', 'library_series_id']]
)
df_
Out[2]:
In [3]:
# bigWig checksums
for stage in ['wt_all'] + config['stages_wt']:
#print(stage)
fp = 'scap815_geo/tracks_fwd/scap_%s_fwd.bw' % (stage,)
!md5sum {fp} | awk '{{print $$1}}'
fp = 'scap815_geo/tracks_rev/scap_%s_rev.bw' % (stage,)
!md5sum {fp} | awk '{{print $$1}}'
In [4]:
# fastq files -- md5 checksums
l_geo = ['wt_emb_rep1', 'wt_emb_rep2',
'wt_l1_rep1a', 'wt_l1_rep1b', 'wt_l1_rep2',
'wt_l2_rep1a', 'wt_l2_rep1b', 'wt_l2_rep2a', 'wt_l2_rep2b',
'wt_l3_rep1', 'wt_l3_rep2a', 'wt_l3_rep2b',
'wt_l4_rep1', 'wt_l4_rep2a', 'wt_l4_rep2b',
'wt_ya_rep1', 'wt_ya_rep2']
for rep in itertools.islice(l_geo, None):
fp = 'scap815_geo/reads/scap_%s.fastq.gz' % (rep,)
print(os.path.basename(fp))
for rep in itertools.islice(l_geo, None):
fp = 'scap815_geo/reads/scap_%s.fastq.gz' % (rep,)
!md5sum {fp} | awk '{{print $$1}}'
In [5]:
# Fastq read lengths
def readlen_(fp_inp):
    """Return the largest record length seen at the start of a fastq file.

    Only the first 1M records yielded by hts.FastqReader(fp_inp) are
    inspected. The value returned is the maximum of len(record) over those
    records, not the most frequent length.

    NOTE(review): presumably len() of a record is its sequence length --
    confirm against hts.FastqReader.

    Raises ValueError (from max) if the reader yields no records.
    """
    max_records = int(1e6)  # cap on the number of leading records examined
    leading_records = itertools.islice(hts.FastqReader(fp_inp), max_records)
    length_tally = collections.Counter(len(record) for record in leading_records)
    # Iterating a Counter yields its keys, i.e. the distinct lengths observed.
    return max(length_tally)
# Print one read length per fastq file, in l_geo order.
for rep in l_geo:
    print(readlen_('scap815_geo/reads/scap_%s.fastq.gz' % (rep,)))
In [ ]: