In [1]:
import sys
import os
import subprocess
import pyfaidx
from pprint import pprint
In [2]:
def print_fasta_region(faidx_obj, chrom, pos, pad, strand='+'):
#print(faidx_object[chrom][pos])
s = faidx_obj[chrom][pos-pad-1:pos+pad]
if strand == '-':
s = s.complement.reverse
print(str(s[0:pad]), end='')
print("\x1b[41m{}\x1b[0m".format(s[pad]), end='')
print(str(s[pad+1:2*pad+1]), end='\n')
def print_alignment(faidx_obj1, chrom1, pos1, strand1,
faidx_obj2, chrom2, pos2, strand2,
pad):
s1 = faidx_obj1[chrom1][pos1-pad-1:pos1+pad]
if strand1 == '-':
s1 = s1.complement.reverse
s2 = faidx_obj2[chrom2][pos2-pad-1:pos2+pad]
if strand2 == '-':
s2 = s2.complement.reverse
str1 = ''
str2 = ''
for i in range(len(s1)):
if str(s1[i]).upper() == str(s2[i]).upper():
if i == pad:
str1 += "\x1b[43m{}\x1b[0m".format(s1[i])
str2 += "\x1b[43m{}\x1b[0m".format(s2[i])
else:
str1 += str(s1[i])
str2 += str(s2[i])
else:
if i == pad:
str1 += "\x1b[45m{}\x1b[0m".format(s1[i])
str2 += "\x1b[45m{}\x1b[0m".format(s2[i])
else:
str1 += "\x1b[41m{}\x1b[0m".format(s1[i])
str2 += "\x1b[41m{}\x1b[0m".format(s2[i])
print(str1)
print(str2)
#print(str(s1[0:pad]), end='')
#print("\x1b[41m{}\x1b[0m".format(s1[pad]), end='')
#print(str(s1[pad+1:2*pad+1]), end='\n')
#print(str(s2[0:pad]), end='')
#print("\x1b[41m{}\x1b[0m".format(s2[pad]), end='')
#print(str(s2[pad+1:2*pad+1]), end='\n')
In [3]:
def run_blast(query_seq, database, echo=True):
cols = ['qid', 'chrom', 'identity', 'alignment length', 'mismatches', 'gap openings',
'q start', 'q end', 'start', 'end', 'e-value', 'bit score']
blast_output = subprocess.check_output("echo {} | blastall -p blastn -d {} -m 8".
format(str(query_seq), database), shell=True)
blast_output = blast_output.decode('utf-8')
if echo:
print('#','\t'.join(cols))
print(blast_output)
blast_results = []
for line in blast_output.split('\n'):
fields = line.split('\t')
blast_results.append(dict(zip(cols,fields)))
#pprint(blast_results)
return blast_results
def run_blast2(query_seq, database, echo=True):
cols = ['qid', 'chrom', 'identity', 'alignment length', 'mismatches', 'gap openings',
'q start', 'q end', 'start', 'end', 'e-value', 'bit score']
blast_output = subprocess.check_output("echo {} | blastall -p blastn -d {} -m 0".
format(str(query_seq), database), shell=True)
blast_output = blast_output.decode('utf-8')
if echo:
print('#','\t'.join(cols))
print(blast_output)
In [5]:
# CONSTANTS
ref_fasta_filename = '/data/archive/reference/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa'
fasta_filename = '/data/archive/reference/Aara_Sharakhov_assem/AaraCHR+AgamP4-Mt.fa'
#fasta_filename = '/data/archive/reference/Anopheles-arabiensis-Dongola_SCAFFOLDS_AaraD1.fa'
chrom = '3R'
pos = 22369522
pad = 50
In [6]:
## Running blast with fixed sequences
#seq = 'ACGTTGGATGACGATTCGTTGCTGCTTGCC'[10:]
seq = 'GCTTTTAAGGCACAGATG'
run_blast2(seq, fasta_filename)
In [5]:
# get the sequences from the first fasta
ref_fasta = pyfaidx.Fasta(ref_fasta_filename)
s = ref_fasta[chrom][pos-pad-1:pos+pad]
In [6]:
blast_results = run_blast(s, fasta_filename)
#blast_results = run_blast(s1, fasta_filename)
#blast_results = run_blast(s2, fasta_filename)
In [7]:
# go ahead and load the target fasta file
fasta = pyfaidx.Fasta(fasta_filename)
# get the aligned region
hit = blast_results[0]
if hit['start'] > hit['end']: # reverse strand?
hit_pos = int(hit['end'])+pad
hit_strand = '-'
else:
hit_pos = int(hit['start'])+pad
hit_strand = '+'
#hit_pos = hit_pos-4
#hit_pos = 28881048
#hit_strand = '+'
print(chrom, pos, '+ ref')
print_alignment(ref_fasta, chrom, pos, '+',
fasta, hit['chrom'], hit_pos, hit_strand,
pad=pad)
#print_fasta_region(ref_fasta, chrom, pos, pad, strand='+')
#print_fasta_region(fasta, hit['chrom'], hit_pos, pad, strand=hit_strand)
# output the uep seqence
if False:
uep = seq
uep_reversed = True
if hit_strand == '-':
uep_reversed = not uep_reversed
if not uep_reversed:
print('-'*(pad-len(uep))+uep)
else:
compliment_translation_table = str.maketrans('ACGTacgt','TGCAtgca')
uep = uep.translate(compliment_translation_table)[::-1]
print(' '*(pad+1)+uep+'-'*(pad-len(uep)))
print(hit['chrom'], hit_pos, hit_strand, 'target')
In [ ]:
In [217]:
snps = [ # chrom, AgamP4_pos, AaraCHR_pos, Aara_strand
['2R', 20784963, 25183147, '-'],
['2R', 21818783, 24053815, '-'],
['2R', 23770295, 21865803, '-'],
['2R', 24166682, 21476204, '-'],
['2R', 25659187, 19988535, '-'],
['2R', 26579153, 19067932, '-'],
['2R', 27058257, 27636451, '+'],
['2R', 27883925, 28567430, '+'],
['2R', 28059359, 28741353, '+'],
['2R', 28086876, 28768819, '+'],
['2R', 28214013, 28881048, '+'],
['3R', 13733466, 13730287, '+'],
['3R', 16162001, 16151295, '+'],
['3R', 17135374, 17092340, '+'],
['3R', 22369522, 22105101, '+'],
['3R', 28835529, 28859778, '+'],
['3R', 31827339, 32128616, '+'],
['3R', 33490452, 33676339, '+'],
['X', 10111683, 4755258, '-'],
['X', 206745, 1747660, '-'],
['2L', 22679503, 38222272, '-'],
['2L', 39898662, 20805020, '-'],
['2L', 7457678, 4166008, '+'],
['2R', 7280113, 7158784, '+'],
['3L', 18356615, 15240947, '+'],
['3L', 3093150, 1156210, '+'],
['3R', 10063316, 10078235, '+'],
]
In [222]:
ref_fasta_filename = '/data/archive/reference/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa'
fasta_filename = '/data/archive/reference/Aara_Sharakhov_assem/AaraCHR+AgamP4-Mt.fa'
pad = 50
ref_fasta = pyfaidx.Fasta(ref_fasta_filename)
fasta = pyfaidx.Fasta(fasta_filename)
for ch, agam_pos, aara_pos, aara_strand in snps:
print('#####', ch+':'+str(agam_pos), '->', ch+':'+str(aara_pos), aara_strand)
#print_fasta_region(ref_fasta, ch, agam_pos, pad, strand='+')
#print_fasta_region(fasta, ch, aara_pos, pad, strand=aara_strand)
print_alignment(ref_fasta, ch, agam_pos, '+',
fasta, ch, aara_pos, aara_strand,
pad)
In [ ]:
In [226]:
ref_fasta_filename = '/data/archive/reference/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa'
pos_list_str = '2R:20784963 2R:21818783 2R:23770295 2R:24166682 2R:25659187 2R:26579153 2R:27058257 2R:27883925 2R:28059359 2R:28086876 2R:28214013 3R:13733466 3R:16162001 3R:17135374 3R:22369522 3R:28835529 3R:31827339 3R:33490452 X:10111683 X:206745 2L:22679503 2L:39898662 2L:7457678 2R:7280113 3L:18356615 3L:3093150 3R:10063316'
pos_list = [ x.split(':') for x in pos_list_str.split('\t')]
#pprint(pos_list)
ref_fasta = pyfaidx.Fasta(ref_fasta_filename)
for pos in pos_list:
print(':'.join(pos), ref_fasta[pos[0]][int(pos[1])-1])
print(','.join((str(ref_fasta[pos[0]][int(pos[1])-1]) for pos in pos_list)))
In [ ]: