In [1]:
# Shared project setup. The cells below use `config`, `pf`, `itertools`, `np` and `pd`
# without defining them, so they are presumably provided by this notebook -- confirm.
%run ~/relmapping/annot/notebooks/__init__.ipynb


/mnt/home3/jj374/anaconda36/lib/python3.6/site-packages/statsmodels/compat/pandas.py:56: FutureWarning: The pandas.core.datetools module is deprecated and will be removed in a future version. Please use the pandas.tseries module instead.
  from pandas.core import datetools
os.getcwd(): /mnt/beegfs/scratch_copy/ahringer/jj374/lab/relmapping

In [2]:
# MD5 checksum of the per-stage .bw track file, one output line per entry of config['stages'].
for stage in config['stages']:
    # Relative path, resolved against the kernel's working directory.
    fp = 'atac814_geo/tracks/atac_%s.bw' % (stage,)
    # IPython shell escape: {fp} is interpolated from Python, while {{ }} and $$ are
    # escapes that reach the shell as literal { } and $, so awk runs '{print $1}'
    # and keeps only the first field of md5sum's output (the hash, not the file name).
    !md5sum {fp} | awk '{{print $$1}}'


6382c56d3a3c6cc25c46e4bcf6589a73
77edcbdd2221c27d7a00b7f8bcb7c7b9
3804a389b3389cb8bfc0355b633abd55
6c99bf3241e32b7c4dd3a28f377edd70
da8d09bb2a1703b6843cb05ee12a78f9
5a818c26a206fd7c94e1c51af128a5de
08f2eb374e6e20f66aed949dcfa30c9e
a9f6490bfa2be3a935a101213c0e21bb
36a5df989f8098c05c4b59ce1b405696
c7a4e7f3c300091410d60dd31085d5a0
e853e910c192fbec75f792fef74116a7

In [3]:
# file checksum -- glp1, read1 only
# Prints the first whitespace-separated field of each atac814_se sample's stored
# 'md5sum_r1' text file (presumably the MD5 hash of the read-1 file -- confirm).
for sample in list(config['atac814_se'].keys()):
    # pf(...) is defined outside this notebook; presumably it builds the path of a
    # per-sample pipeline output from (sample id, step name, extension, dataset) -- confirm.
    fp = pf('atac814_' + sample, 'md5sum_r1', '.txt', 'atac814')
    # {{ }} and $$ are IPython escapes for literal { } and $; awk sees '{print $1}'.
    !cat {fp} | awk '{{print $$1}}'


04f06e0f97e9dee59ba04499a5b3bbdd
aaf8bc6a31c24b9b2784cbca3ad9cae2
695cff3116883a33c9b59d4b1a63d069
bbbd654411c580e3c220c0aa5c251be3
4609e7d3d5639c13ac18c4aac0e872c6
fa7f8fef5cc5c6ed48c76caa11cbf368
36aedbf10eac58fb811a0d73099f2529
076510408707ac2e67a16c44ecacaa08
e73c55cabc382bc90f1338516bfecb8c
e3f2712ad6838dd2c2c9e3fb89575a69
ec7673917698198756f126778851cae7
4c9c3e5d3eae255881683b5a6c70a37d
33e9589fb621f22f0ff2f7512bb8fd52
7f22404690fddf6320bff1cd64c44e62
d28ef23b03972da07f46f68e8cf7d01e
ac9aac41427175948a190bdeda1955f6
cef1eda984fede08acbf8e87250215a1
e0d53ba6940bc07baf23e14a2b3a54da
3a93695c49bbc107603f2c41a7902550
05afbba3ed46f73be0ff2e32df9294be
76ee7a87c8ce5aa2bcefc6de5172c7cb
884bab9bcf7f9ad987e1b56e2d2e737b
5e4e6868f3fb7a8b730678743681f619
dec5084daf9a0815900e0087a91ab9c9
88c570bd07d38c67f0f73c289631216d
7b5f3d1003c346f9667f4fd6af86dea4
1f1ca66245fbb7958c1333e035f6bf9e
e2bcae2e3238b579b9d5e08cb149187d
d7868726080ad7449b15af5891bd0eb2
c02fbc8f03c1fc4dce46639087d03ce9

In [4]:
# file checksum -- read1, atac814_pe samples
# NOTE(review): this header previously said "glp1", identical to the atac814_se cell.
# The readlen cells label atac814_pe as "wt" and atac814_se as "glp1", so this set
# is presumably wt -- confirm.
for sample in list(config['atac814_pe'].keys()):
    # pf(...) is defined outside this notebook; presumably it builds the path of a
    # per-sample pipeline output from (sample id, step name, extension, dataset) -- confirm.
    fp = pf('atac814_' + sample, 'md5sum_r1', '.txt', 'atac814')
    # {{ }} and $$ are IPython escapes for literal { } and $; awk sees '{print $1}'
    # and prints only the first field of the stored checksum file.
    !cat {fp} | awk '{{print $$1}}'


7db7fc80c05c213552a1320ec2a8e6ae
adb761ac75d00b7affe0fc65c7c09b4f
d51dfe34c55df1aad1de7533a31c97c0
db44b7abcc62e1460a8f9f54f8f2f8ea
d3d9e8911160abf17e3ca338da9bc619
33c7817c1b79453013bd84c55ad0b9c2
c7b5789f39e9022e8174de78c542527b
4513135672b949c3ffe15be77c0211d9
a578e129e0223c7799a1d272d0ca7a8a
19e8e9e8b64d1371f8d7338dacc29a9b
fce120e42c3d1039df8726dc5c08b87a
10afd1f4cf1b36b27e783d50962d10d4
d7aebd358e64353c6a05a8030dd9bdfa
f52512957f26454252a637ffc86f8448
948c3e83b976bae9a0f1087d779e90c6
34ca2dc7012be686bf375c91e21a4526
5bf7d853b654af16a4daab3ca7842344
48b560f7537dc8203dc69eb79e844d4e
13f9e75b8a2182c1074ecd80ecd50a07
8ecfc5983be450df51d95ed04cac5753
611d5628a856727a22f5a3b4fc52507e

In [5]:
# file checksum -- read2, atac814_pe samples
# NOTE(review): this header previously said "glp1, read1 only", but the cell reads the
# 'md5sum_r2' files. The readlen cells label atac814_pe as "wt" -- confirm the strain label.
for sample in list(config['atac814_pe'].keys()):
    # pf(...) is defined outside this notebook; presumably it builds the path of a
    # per-sample pipeline output from (sample id, step name, extension, dataset) -- confirm.
    fp = pf('atac814_' + sample, 'md5sum_r2', '.txt', 'atac814')
    # {{ }} and $$ are IPython escapes for literal { } and $; awk sees '{print $1}'
    # and prints only the first field of the stored checksum file.
    !cat {fp} | awk '{{print $$1}}'


7db26fb329cf9e9b1eaca964a022c60d
8e59e93be82726716f97dadc4a46144b
fd0db4057982199d1e60de4eb5d70e4c
34c467c39345e8398a158ac9792c96bb
76cb8959a4e6d4fac807afd8838abde3
95655dd09c2f8314235d20fc3e4d452b
ebb6e54a47bdf55ea999d4dc2c3cd66d
10116bfa3bceebe679598e22bd7b56d2
ab9a061557f6fbd88690fb696a718998
587e9adaec98525fff79ed369bd1d6a0
b93a17dfe0b00dbf4d3413e967b1f0d6
c1788fe75f86558715137186aadd7568
2ed6321df31160e7fe789fd1b5b53208
81874b2baccac1c62a37ecae15cec6e7
daf83918b419c9ed9afbc608f5d54cae
2bddbc2ba554d0bcffd723d0af09a584
c1c2896d1d6bcd3657ffe069ff60e60c
d059dec54dc649661f34dcc6bf004032
30f56715a9e9a08d5e74312da7bc4339
76a415cecebfc9f96351022117ed9431
96f2732239f0cfa1b981146d3e389f15

In [6]:
# readlen -- glp1, read1 only
# Dumps each atac814_se sample's 'readlen_r1' text file, one sample per output line.
for sample in list(config['atac814_se'].keys()):
    # pf(...) is defined outside this notebook; presumably it returns the path of the
    # per-sample output for the given step name and extension -- confirm.
    fp = pf('atac814_' + sample, 'readlen_r1', '.txt', 'atac814')
    # Everything after '#' is a shell comment, so the awk filter is disabled and the
    # file is printed whole.
    !cat {fp} #| awk '{{print $$1}}'
    # NOTE(review): presumably the file has no trailing newline and this print()
    # supplies the line break between samples -- confirm.
    print()


51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51
51

In [7]:
# readlen -- wt, read1
# Dumps each atac814_pe sample's 'readlen_r1' text file, one sample per output line.
for sample in list(config['atac814_pe'].keys()):
    # pf(...) is defined outside this notebook; presumably it returns the path of the
    # per-sample output for the given step name and extension -- confirm.
    fp = pf('atac814_' + sample, 'readlen_r1', '.txt', 'atac814')
    # Everything after '#' is a shell comment, so the awk filter is disabled and the
    # file is printed whole.
    !cat {fp} #| awk '{{print $$1}}'
    # NOTE(review): presumably the file has no trailing newline and this print()
    # supplies the line break between samples -- confirm.
    print()


32
32
51
51
32
32
32
51
36
32
51
36
32
51
32
36
32
50
32
32
50

In [8]:
# readlen -- wt, read2
# Dumps each atac814_pe sample's 'readlen_r2' text file, one sample per output line.
for sample in list(config['atac814_pe'].keys()):
    # pf(...) is defined outside this notebook; presumably it returns the path of the
    # per-sample output for the given step name and extension -- confirm.
    fp = pf('atac814_' + sample, 'readlen_r2', '.txt', 'atac814')
    # Everything after '#' is a shell comment, so the awk filter is disabled and the
    # file is printed whole.
    !cat {fp} #| awk '{{print $$1}}'
    # NOTE(review): presumably the file has no trailing newline and this print()
    # supplies the line break between samples -- confirm.
    print()


32
32
51
51
32
32
32
51
36
32
51
36
32
51
32
36
32
50
32
32
50

In [9]:
# mean read length
# NOTE(review): the input is each sample's '.fsizes' table, so this is presumably the
# mean fragment size rather than the read length -- confirm and relabel if so.
#
# For every atac814_pe sample, prints the mean of the 'size' column weighted by the
# 'count' column, one integer per line.
for sample in config['atac814_pe'].keys():
    fp = pf('atac814_' + sample, 'tg_pe.bwa_pe.rm_unmapped_pe.rm_chrM.rm_blacklist.rm_q10.fsizes', '.txt', 'atac814')
    # Two tab-separated columns with no header row: a size and how often it occurred.
    df = pd.read_csv(fp, sep='\t', names=['size', 'count'])
    # The weighted average works directly on the histogram; the table is not expanded
    # into one list entry per observation, which was costly and never used.
    mean1 = np.average(df['size'], weights=df['count'])
    # '%d' truncates the fractional part (it does not round).
    print('%d' % (mean1,))


158
171
207
213
140
186
181
162
148
148
168
154
153
193
172
184
188
152
142
140
212

In [10]:
# For every atac814_pe sample, prints the standard deviation of the 'size' column of its
# '.fsizes' table, weighted by the 'count' column, one integer per line.
for sample in config['atac814_pe'].keys():
    fp = pf('atac814_' + sample, 'tg_pe.bwa_pe.rm_unmapped_pe.rm_chrM.rm_blacklist.rm_q10.fsizes', '.txt', 'atac814')
    # Two tab-separated columns with no header row: a size and how often it occurred.
    df = pd.read_csv(fp, sep='\t', names=['size', 'count'])
    # Population standard deviation (ddof=0, the same definition as np.std's default)
    # computed straight from the histogram: sqrt(sum(count * (size - mean)^2) / sum(count)).
    # This avoids building a Python list with one element per observation.
    size_mean = np.average(df['size'], weights=df['count'])
    size_var = np.average((df['size'] - size_mean) ** 2, weights=df['count'])
    # '%d' truncates the fractional part (it does not round).
    print('%d' % (np.sqrt(size_var),))


113
130
192
192
103
123
118
127
114
118
131
118
122
166
149
125
134
87
95
92
153

In [ ]: