#!/bin/bash ## $1 == chr1 ## $2 == chr2 ## $3 == output_file echo -e "run juicebox dump\n\t resolution: 10000\n\t normalization method: KR\n\t output: observed notmalized sparce matrix" touch $3 java -jar /Users/pubudu/Downloads/MacTools/juicebox_tools.7.5.jar dump observed KR /Users/pubudu/Documents/RefData/raoPaper/copy_GSE63525_HMEC_combined.hic $1 $2 BP 10000 $3 echo -e "\#\#\#\#juicebox dump observed KR copy_GSE63525_HMEC_combined.hic $1 $2 BP 10000 $3\n$(cat $3)" > $3
In [ ]:
import pandas as pd
import subprocess
from itertools import combinations
In [ ]:
chr_list=['1','2','3','4','5','6','7','8','9','10','11','12','13','14','15','16','17','18','19','20','21','22','X','Y'] # List of chromosomes
f_counter = 0 # Counter for the number of files created
set_path = "/Users/pubudu/Documents/RefData/raoPaper/" # path of the .hic file and dump files
file_list = [] # list of all the dump files created
In [ ]:
# Run juicebox dump to
for chroms in chr_list:
print 'Juicebox dump run - chromosome: {}'.format(chroms)
f_name = '{}.{}_{}.dumpOut.txt'.format(f_counter, chroms, chroms)
arg_list = [chroms, chroms, f_name]
file_list.append(f_name)
subprocess.call(['/Users/pubudu/Documents/RefData/raoPaper/runDump.bash', str(arg_list[0]), str(arg_list[1]), ''.join([set_path, str(arg_list[2])]) ])
f_counter += 1
In [ ]:
# itertools.combinations(iterable, r)
### Return r length subsequences of elements from the input iterable.
### Combinations are emitted in lexicographic sort order. So, if the input iterable is sorted, the combination tuples will be produced in sorted order.
### Elements are treated as unique based on their position, not on their value. So if the input elements are unique, there will be no repeat values in each combination.
for combo in combinations(chr_list, 2):
print 'Juicebox dump run - chromosomes: {} and {}'.format(combo[0], combo[1])
f_name = '{}.{}_{}.dumpOut.txt'.format(f_counter, combo[0], combo[1])
arg_list = [combo[0], combo[1], f_name]
subprocess.call(['/Users/pubudu/Documents/RefData/raoPaper/runDump.bash', str(arg_list[0]), str(arg_list[1]), ''.join([set_path, str(arg_list[2])]) ])
file_list.append(f_name)
f_counter += 1
In [ ]:
#print file_list
print len(file_list)
In [ ]:
### Fill start and end postions in juicebox dump file
res = 10000 # Resolution used to run juicebox dump command
lambdafunc = lambda x: pd.Series([x['start1'],
int(x['start1'])+res,
x['start2'],
int(x['start2'])+res,
x['normC']])
In [ ]:
#### format juicebox dump file to 7 column hibrowse input format
for f in file_list:
print 'Reformatting to 7 columns: {}'.format(f)
int_chrs = f.split('.')[1].split('_')
print int_chrs
dump_out = pd.read_table(''.join([set_path,f]), sep='\t', skiprows=1, names=['start1', 'start2', 'normC'], na_values='')
dump_out = dump_out.fillna(0)
hibrowsein = pd.DataFrame()
hibrowsein = dump_out.apply(lambdafunc, axis=1)
hibrowsein.columns = ['start1', 'end1', 'start2', 'end2', 'intCount']
hibrowsein['chr1'] = str(int_chrs[0])
hibrowsein['chr2'] = str(int_chrs[1])
hibrowsein = hibrowsein[['chr1', 'start1', 'end1', 'chr2', 'start2', 'end2', 'intCount']]
hibrowsein = hibrowsein.astype({'chr1':str,
'start1':int,
'end1':int,
'chr2':str,
'start2':int,
'end2':int,
'intCount':float})
print 'Writing {}.hibrowse.txt file...'.format(''.join([set_path,f]))
hibrowsein.to_csv('{}.hibrowse.txt'.format(''.join([set_path,f])),
sep='\t',
header=True,
index=False,
quoting=None,
line_terminator='\n')
In [ ]: