In [1]:
import sys
import os
import subprocess
import pyfaidx
from pprint import pprint

In [2]:
def print_fasta_region(faidx_obj, chrom, pos, pad, strand='+'):
    #print(faidx_object[chrom][pos])
    s = faidx_obj[chrom][pos-pad-1:pos+pad]
    if strand == '-':
        s = s.complement.reverse
    print(str(s[0:pad]), end='')
    print("\x1b[41m{}\x1b[0m".format(s[pad]), end='')
    print(str(s[pad+1:2*pad+1]), end='\n')
    
def print_alignment(faidx_obj1, chrom1, pos1, strand1, 
                    faidx_obj2, chrom2, pos2, strand2,
                    pad):
    s1 = faidx_obj1[chrom1][pos1-pad-1:pos1+pad]
    if strand1 == '-':
        s1 = s1.complement.reverse
    s2 = faidx_obj2[chrom2][pos2-pad-1:pos2+pad]
    if strand2 == '-':
        s2 = s2.complement.reverse
    str1 = ''
    str2 = ''
    for i in range(len(s1)):
        if str(s1[i]).upper() == str(s2[i]).upper():
            if i == pad:
                str1 += "\x1b[43m{}\x1b[0m".format(s1[i])
                str2 += "\x1b[43m{}\x1b[0m".format(s2[i])
            else:
                str1 += str(s1[i])
                str2 += str(s2[i])
        else:
            if i == pad:
                str1 += "\x1b[45m{}\x1b[0m".format(s1[i])
                str2 += "\x1b[45m{}\x1b[0m".format(s2[i])
            else:
                str1 += "\x1b[41m{}\x1b[0m".format(s1[i])
                str2 += "\x1b[41m{}\x1b[0m".format(s2[i])
    
    print(str1)
    print(str2)
    
    #print(str(s1[0:pad]), end='')
    #print("\x1b[41m{}\x1b[0m".format(s1[pad]), end='')
    #print(str(s1[pad+1:2*pad+1]), end='\n')
    #print(str(s2[0:pad]), end='')
    #print("\x1b[41m{}\x1b[0m".format(s2[pad]), end='')
    #print(str(s2[pad+1:2*pad+1]), end='\n')

In [3]:
def run_blast(query_seq, database, echo=True):
    cols = ['qid', 'chrom', 'identity', 'alignment length', 'mismatches', 'gap openings', 
            'q start', 'q end', 'start', 'end', 'e-value', 'bit score']
    blast_output = subprocess.check_output("echo {} | blastall -p blastn -d {} -m 8".
                                 format(str(query_seq), database), shell=True)
    blast_output = blast_output.decode('utf-8')
    if echo:
        print('#','\t'.join(cols))
        print(blast_output)
    blast_results = []
    for line in blast_output.split('\n'):
        fields = line.split('\t')
        blast_results.append(dict(zip(cols,fields)))
    #pprint(blast_results)
    return blast_results

def run_blast2(query_seq, database, echo=True):
    cols = ['qid', 'chrom', 'identity', 'alignment length', 'mismatches', 'gap openings', 
            'q start', 'q end', 'start', 'end', 'e-value', 'bit score']
    blast_output = subprocess.check_output("echo {} | blastall -p blastn -d {} -m 0".
                                 format(str(query_seq), database), shell=True)
    blast_output = blast_output.decode('utf-8')
    if echo:
        print('#','\t'.join(cols))
        print(blast_output)

In [5]:
# CONSTANTS
ref_fasta_filename = '/data/archive/reference/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa'
fasta_filename = '/data/archive/reference/Aara_Sharakhov_assem/AaraCHR+AgamP4-Mt.fa'
#fasta_filename = '/data/archive/reference/Anopheles-arabiensis-Dongola_SCAFFOLDS_AaraD1.fa'

chrom = '3R'
pos = 22369522
pad = 50

In [6]:
## Running blast with fixed sequences
#seq = 'ACGTTGGATGACGATTCGTTGCTGCTTGCC'[10:]
seq = 'GCTTTTAAGGCACAGATG'
run_blast2(seq, fasta_filename)


# qid	chrom	identity	alignment length	mismatches	gap openings	q start	q end	start	end	e-value	bit score
BLASTN 2.2.26 [Sep-21-2011]


Reference: Altschul, Stephen F., Thomas L. Madden, Alejandro A. Schaffer, 
Jinghui Zhang, Zheng Zhang, Webb Miller, and David J. Lipman (1997), 
"Gapped BLAST and PSI-BLAST: a new generation of protein database search
programs",  Nucleic Acids Res. 25:3389-3402.

Query= 
         (18 letters)

Database: AaraCHR+AgamP4-Mt.fa 
           6 sequences; 216,350,637 total letters

Searching..................................................done



                                                                 Score    E
Sequences producing significant alignments:                      (bits) Value

3R                                                                     36   0.008
3L                                                                     28   2.0  
2R                                                                     28   2.0  
2L                                                                     28   2.0  
X                                                                      26   8.1  

>3R 
          Length = 49805857

 Score = 36.2 bits (18), Expect = 0.008
 Identities = 18/18 (100%)
 Strand = Plus / Plus

                                  
Query: 1        gcttttaaggcacagatg 18
                ||||||||||||||||||
Sbjct: 22105083 gcttttaaggcacagatg 22105100



 Score = 28.2 bits (14), Expect = 2.0
 Identities = 14/14 (100%)
 Strand = Plus / Minus

                             
Query: 4       tttaaggcacagat 17
               ||||||||||||||
Sbjct: 7327275 tttaaggcacagat 7327262



 Score = 28.2 bits (14), Expect = 2.0
 Identities = 14/14 (100%)
 Strand = Plus / Plus

                              
Query: 3        ttttaaggcacaga 16
                ||||||||||||||
Sbjct: 13462900 ttttaaggcacaga 13462913



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Plus

                           
Query: 2      cttttaaggcaca 14
              |||||||||||||
Sbjct: 344803 cttttaaggcaca 344815



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Minus

                            
Query: 1       gcttttaaggcac 13
               |||||||||||||
Sbjct: 4081612 gcttttaaggcac 4081600



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Minus

                            
Query: 2       cttttaaggcaca 14
               |||||||||||||
Sbjct: 5418114 cttttaaggcaca 5418102



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Minus

                             
Query: 1        gcttttaaggcac 13
                |||||||||||||
Sbjct: 10435710 gcttttaaggcac 10435698



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Plus

                             
Query: 1        gcttttaaggcac 13
                |||||||||||||
Sbjct: 15421882 gcttttaaggcac 15421894



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Minus

                             
Query: 2        cttttaaggcaca 14
                |||||||||||||
Sbjct: 17061363 cttttaaggcaca 17061351



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Plus

                             
Query: 5        ttaaggcacagat 17
                |||||||||||||
Sbjct: 43535749 ttaaggcacagat 43535761



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Minus

                             
Query: 3        ttttaaggcacag 15
                |||||||||||||
Sbjct: 44684571 ttttaaggcacag 44684559


>3L 
          Length = 38873271

 Score = 28.2 bits (14), Expect = 2.0
 Identities = 14/14 (100%)
 Strand = Plus / Plus

                             
Query: 1       gcttttaaggcaca 14
               ||||||||||||||
Sbjct: 7569272 gcttttaaggcaca 7569285



 Score = 28.2 bits (14), Expect = 2.0
 Identities = 14/14 (100%)
 Strand = Plus / Plus

                              
Query: 1        gcttttaaggcaca 14
                ||||||||||||||
Sbjct: 21143818 gcttttaaggcaca 21143831



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Minus

                            
Query: 1       gcttttaaggcac 13
               |||||||||||||
Sbjct: 8661630 gcttttaaggcac 8661618



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Minus

                             
Query: 4        tttaaggcacaga 16
                |||||||||||||
Sbjct: 14160005 tttaaggcacaga 14159993



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Plus

                             
Query: 2        cttttaaggcaca 14
                |||||||||||||
Sbjct: 15306555 cttttaaggcaca 15306567


>2R 
          Length = 59049218

 Score = 28.2 bits (14), Expect = 2.0
 Identities = 14/14 (100%)
 Strand = Plus / Plus

                             
Query: 4       tttaaggcacagat 17
               ||||||||||||||
Sbjct: 8049451 tttaaggcacagat 8049464



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Plus

                             
Query: 3        ttttaaggcacag 15
                |||||||||||||
Sbjct: 23718477 ttttaaggcacag 23718489



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Minus

                             
Query: 2        cttttaaggcaca 14
                |||||||||||||
Sbjct: 44471279 cttttaaggcaca 44471267



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Plus

                             
Query: 4        tttaaggcacaga 16
                |||||||||||||
Sbjct: 46094421 tttaaggcacaga 46094433



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Minus

                             
Query: 3        ttttaaggcacag 15
                |||||||||||||
Sbjct: 53218116 ttttaaggcacag 53218104



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Minus

                             
Query: 6        taaggcacagatg 18
                |||||||||||||
Sbjct: 53906296 taaggcacagatg 53906284


>2L 
          Length = 47444456

 Score = 28.2 bits (14), Expect = 2.0
 Identities = 14/14 (100%)
 Strand = Plus / Plus

                              
Query: 1        gcttttaaggcaca 14
                ||||||||||||||
Sbjct: 29046248 gcttttaaggcaca 29046261



 Score = 28.2 bits (14), Expect = 2.0
 Identities = 14/14 (100%)
 Strand = Plus / Minus

                              
Query: 5        ttaaggcacagatg 18
                ||||||||||||||
Sbjct: 29231752 ttaaggcacagatg 29231739



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Plus

                            
Query: 5       ttaaggcacagat 17
               |||||||||||||
Sbjct: 2025764 ttaaggcacagat 2025776



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Plus

                             
Query: 3        ttttaaggcacag 15
                |||||||||||||
Sbjct: 21672908 ttttaaggcacag 21672920



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Minus

                             
Query: 1        gcttttaaggcac 13
                |||||||||||||
Sbjct: 25811946 gcttttaaggcac 25811934



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Plus

                             
Query: 5        ttaaggcacagat 17
                |||||||||||||
Sbjct: 28304267 ttaaggcacagat 28304279



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Minus

                             
Query: 5        ttaaggcacagat 17
                |||||||||||||
Sbjct: 32045050 ttaaggcacagat 32045038



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Minus

                             
Query: 2        cttttaaggcaca 14
                |||||||||||||
Sbjct: 38691575 cttttaaggcaca 38691563


>X 
          Length = 21162472

 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Minus

                             
Query: 3        ttttaaggcacag 15
                |||||||||||||
Sbjct: 15064571 ttttaaggcacag 15064559



 Score = 26.3 bits (13), Expect = 8.1
 Identities = 13/13 (100%)
 Strand = Plus / Plus

                             
Query: 2        cttttaaggcaca 14
                |||||||||||||
Sbjct: 20220721 cttttaaggcaca 20220733


  Database: AaraCHR+AgamP4-Mt.fa
    Posted date:  Nov 10, 2014 10:24 AM
  Number of letters in database: 216,350,637
  Number of sequences in database:  6
  
Lambda     K      H
    1.37    0.711     1.31 

Gapped
Lambda     K      H
    1.37    0.711     1.31 


Matrix: blastn matrix:1 -3
Gap Penalties: Existence: 5, Extension: 2
Number of Sequences: 6
Number of Hits to DB: 19,008
Number of extensions: 34
Number of successful extensions: 34
Number of sequences better than 10.0: 5
Number of HSP's gapped: 34
Number of HSP's successfully gapped: 34
Length of query: 18
Length of database: 216,350,637
Length adjustment: 15
Effective length of query: 3
Effective length of database: 216,350,547
Effective search space: 649051641
Effective search space used: 649051641
X1: 11 (21.8 bits)
X2: 15 (29.7 bits)
X3: 50 (99.1 bits)
S1: 13 (26.3 bits)
S2: 13 (26.3 bits)


In [5]:
# get the sequences from the first fasta
ref_fasta = pyfaidx.Fasta(ref_fasta_filename)
s = ref_fasta[chrom][pos-pad-1:pos+pad]

In [6]:
blast_results = run_blast(s, fasta_filename)
#blast_results = run_blast(s1, fasta_filename)
#blast_results = run_blast(s2, fasta_filename)


# qid	chrom	identity	alignment length	mismatches	gap openings	q start	q end	start	end	e-value	bit score
1_0	3R	96.04	101	4	0	1	101	22105051	22105151	2e-41	 168
1_0	3R	95.24	21	1	0	60	80	32376811	32376831	0.93	34.2
1_0	3R	100.00	16	0	0	7	22	20649825	20649810	3.7	32.2
1_0	2R	95.24	21	1	0	78	98	43260075	43260095	0.93	34.2
1_0	2R	100.00	16	0	0	84	99	37678637	37678622	3.7	32.2
1_0	X	100.00	16	0	0	56	71	11322253	11322268	3.7	32.2
1_0	2L	100.00	16	0	0	67	82	20367971	20367986	3.7	32.2
1_0	2L	100.00	16	0	0	15	30	45525751	45525736	3.7	32.2


In [7]:
# go ahead and load the target fasta file
fasta = pyfaidx.Fasta(fasta_filename)
# get the aligned region
hit = blast_results[0]

if hit['start'] > hit['end']: # reverse strand?
    hit_pos = int(hit['end'])+pad
    hit_strand = '-'
else:
    hit_pos = int(hit['start'])+pad
    hit_strand = '+'

#hit_pos = hit_pos-4
#hit_pos = 28881048
#hit_strand = '+'

print(chrom, pos, '+ ref')
print_alignment(ref_fasta, chrom, pos, '+', 
                fasta, hit['chrom'], hit_pos, hit_strand,
                pad=pad)
#print_fasta_region(ref_fasta, chrom, pos, pad, strand='+')
#print_fasta_region(fasta, hit['chrom'], hit_pos, pad, strand=hit_strand)

# output the uep seqence
if False:
    uep = seq
    uep_reversed = True
    if hit_strand == '-':
        uep_reversed = not uep_reversed
    if not uep_reversed:
        print('-'*(pad-len(uep))+uep)
    else:
        compliment_translation_table = str.maketrans('ACGTacgt','TGCAtgca')
        uep = uep.translate(compliment_translation_table)[::-1]
        print(' '*(pad+1)+uep+'-'*(pad-len(uep)))
    
print(hit['chrom'], hit_pos, hit_strand, 'target')


3R 22369522 + ref
ATGCGGGGAAAGATGGCCGTAAACACGCGGGAGCTTTTAAGGCACAGATGTAGCGTTCCGGCAAGCAGCAACGAATCGTTTAGGTTGTAAAATTGCCCCTT
ATGCGGGGAAAGATTGCCGTAAACACGCGGGAGCTTTTAAGGCACAGATGCAGAGATCCGGCAAGCAGCAACGAATCGTTTAGGTTGTAAAATTGCCCCTT
3R 22105101 + target

In [ ]:


In [217]:
snps = [ # chrom, AgamP4_pos, AaraCHR_pos, Aara_strand
    ['2R',	20784963,	25183147, '-'],
    ['2R',	21818783,	24053815, '-'],
    ['2R',	23770295,	21865803, '-'],
    ['2R',	24166682,	21476204, '-'],
    ['2R',	25659187,	19988535, '-'],
    ['2R',	26579153,	19067932, '-'],
    ['2R',	27058257,	27636451, '+'],
    ['2R',	27883925,	28567430, '+'],
    ['2R',	28059359,	28741353, '+'],
    ['2R',	28086876,	28768819, '+'],
    ['2R',	28214013,	28881048, '+'],
    ['3R',	13733466,	13730287, '+'],
    ['3R',	16162001,	16151295, '+'],
    ['3R',	17135374,	17092340, '+'],
    ['3R',	22369522,	22105101, '+'],
    ['3R',	28835529,	28859778, '+'],
    ['3R',	31827339,	32128616, '+'],
    ['3R',	33490452,	33676339, '+'],
    ['X',	10111683,	4755258, '-'],
    ['X',	206745,	1747660, '-'],
    ['2L',	22679503,	38222272, '-'],
    ['2L',	39898662,	20805020, '-'],
    ['2L',	7457678,	4166008, '+'],
    ['2R',	7280113,	7158784, '+'],
    ['3L',	18356615,	15240947, '+'],
    ['3L',	3093150,	1156210, '+'],
    ['3R',	10063316,	10078235, '+'],
    ]

In [222]:
ref_fasta_filename = '/data/archive/reference/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa'
fasta_filename = '/data/archive/reference/Aara_Sharakhov_assem/AaraCHR+AgamP4-Mt.fa'
pad = 50

ref_fasta = pyfaidx.Fasta(ref_fasta_filename)
fasta = pyfaidx.Fasta(fasta_filename)

for ch, agam_pos, aara_pos, aara_strand in snps:
    print('#####', ch+':'+str(agam_pos), '->', ch+':'+str(aara_pos), aara_strand)
    #print_fasta_region(ref_fasta, ch, agam_pos, pad, strand='+')
    #print_fasta_region(fasta, ch, aara_pos, pad, strand=aara_strand)
    print_alignment(ref_fasta, ch, agam_pos, '+', 
                    fasta, ch, aara_pos, aara_strand,
                    pad)


##### 2R:20784963 -> 2R:25183147 -
CGCCGCCAGCGCACCACGTTCAGCAGCGAGCAAACCCTCCGACTGGAGGTGGAATTTCACCGAAACGAATACATTTCCCGAGGCAGACGGTTCGAGCTGGC
CGCCGCCAGCGCACCACGTTCAGCAGCGAGCAAACGCTCCGACTGGAGGTAGAATTTCACCGAAACGAATACATTTCCCGAGGCAGACGGTTCGAGCTGGC
##### 2R:21818783 -> 2R:24053815 -
GAACCGCACGTCAAGTCGCTGCATCTGACGCTGGCCTACCAATTCCCGCCCAGTCAGTTTAATGCGCTGAAAGCGTTGGTGGAAAATTTGGACGCTTCCTG
GAACCGCACGTCAAGTCGCTGCATCTGACGCTGGCTTACCAGTTCCCGCCGGGTCAGTTTAATGCGCTGAAAGCGTTGGTGGAAAATTTGGACGCTTCCTG
##### 2R:23770295 -> 2R:21865803 -
TACTTACTCGTCTTAAAAAGTCGCACCGCTTTCACATACTCGCTGTCCTCGTCGGGCGCACCTCCTTCCACCTCCATTTGCATCTCATCCGCCGCCGCTAC
AACTTACTCGTCTTAAACAGTCGCACCGCTTTCACATACTCGCTGTCCTCGTCGGGCGCACCTCCTTCAACCTCCATTTGCATCTCACCCGCCGCCGCTAG
##### 2R:24166682 -> 2R:21476204 -
CCGCTGGCGCCGGCGCGCAGCCCGGCCGCCGCCTTCGACGTACCCGCAATGCCGCCCAGCGTGCTCATGTCCCGCGCAAACCCGAACGAAATCTTCCCGTG
CCGCTGGCACCGGCGCGCAGCCGGGCCGCCGCCTTCGACGTCCCCGCAATGCCGCCCAGCGTGCTCATGTCCCGCGCAAACCCGAACGAAATCTTCCCGTG
##### 2R:25659187 -> 2R:19988535 -
CACGGCACGGTCGCGGAAGCGGAAGAAAAGGTACTGCGCTTTGATTTACCCGCCGGTTCGACCACGTTCGAGGATTCGTGCGTGGTGGAAGACGCACAGCA
CACGGCACGGTCGCGGAAGCGGAAGAGAAAGTACTGCGCTTTGATTTGCCCGCCGGTTCGACCACGTTCGAGGATTCGTGCGTGGTGGAAGACGCACAGCA
##### 2R:26579153 -> 2R:19067932 -
AGATCGATCTCCTCGAACGCTTTCTGCGTGTCGTCCGAGTCGCGGGACGCAATCTGCGGCAGACGGCCCTCCTCCTGCAGCTGGCGCATGTACCGCAGGAC
AGATCGATCTCCTCGAACGCTTTCTGGGTGTCGTCCGAGTCGCGGGACGCAATCTGCGGCAGACGGCCCTCCTCCTGCAGCTGGCGCATGTACCGCAGGAC
##### 2R:27058257 -> 2R:27636451 +
ATGCGGGACGGCACGAAGTACCGGCGCTTTTTGGTGATGGATATTCTGCAGCTCATCTTGGTCGAGCCGGACAGTAAGCGGTTAGGGTGGGGTGTGGCTAA
ATGCGAGACGGCACGAAGTACCGGCGCTTTTTGGTGATGGATATTCTGCAGCTCATCCTGGTCGAGCCGGACAGCAAGCGGTTAGGGTGGGGTGTGGCTAA
##### 2R:27883925 -> 2R:28567430 +
AGCGCCGGTCAAACGAGCGCTCCAGCACCATCGAGGATAACGACGGCCACACCAACGGCCCGAACTACTACGGTAAAAACATTCGAGCGTCCCATCGTATC
AGCGCCGGTCAAACGAGCGCTCCAGCACCACCGAGGATAACGACGGCCACGCCAACGGCCCGAACTACTACGGTAAAAACATTCGAGCGTCCCATCATATC
##### 2R:28059359 -> 2R:28741353 +
GACCGTCCGAGCCTACGATACGCGAGTCACCGGCCGATCTTGACGCAAGGCTACAGCTGTCCGAGGGTGGCCGCTGGTCGATCGTAATCAATCGCCGGCAG
GACCGTCCGAGCCTACGATACGCGAGTCACCGGCCGATCTTGACGCAAGGCTACAGCTGTCCGAGGGTGGCCGCTGGTCGATCGTAATCAATCGCCGGCAG
##### 2R:28086876 -> 2R:28768819 +
ATAGACGGGGGGTGCACGATGGTGTGGCACGGTTTAGGAGGGGCAGTGGCAGTAGTAGCGATAGTAGGACGAACTAGTTTAGGTAGAGCAAAGGGCATCGC
ATAGACGGGGGGTGCACGATGGTGTGGCACGGTTTAGGAGGGGCAGTGGCGGTAGTAGCGATAGTAGGACGAACTAGTTTAGGTAGGGCAAAGGGCATCGC
##### 2R:28214013 -> 2R:28881048 +
ACGGTTCGGCGGACGATTCGGTGGACGTGATTGTGGGCGAATCGCTTGCCGACCGGATAAAGCACGAATCCGACGACACGTGCAGCAGCATCAAGCGGGAA
ACGGTTCGGCGGACGATTCGGTGGACGTGATTGTGGGCGAATCGCTTGCCAACCGGATAAAGCACGAATCCGACGACACATGCAGCAGCATCAAGCGGGAA
##### 3R:13733466 -> 3R:13730287 +
ACGCTGGAAATGTGCAAGCTGACGCTCGAACCGTGCCGGATACTGCACAATACCAGCTTTTTTCCCGAGTTTCTCAAGTGCAACGAAACGCTGTACCCGTC
ACGCTGGAAATGTGCAAGCTGACGCTCGAACCGTGCCGGATACTGCACAATACCAGCTTTTTTCCCGAGTTTCTCAAGTGCAACGAAACGCTGTACCCGTC
##### 3R:16162001 -> 3R:16151295 +
ACACTGCACACCACCGACCGCTCACCCTTGCCACAGTGGCAGCTGTGCTGGACCTGCTCCGTGCACGTGTCGCAGCTCCCTGCGTGACACTGCCGGCCACA
ACGCTGCACACCACCGACCGCTCACCCTTGCCACAGTGGCAGCTGTGCTGGACCTGCTCCGTGCACGTGTCGCAGCTCCCTGCGTGACACTGCCGGTCGCA
##### 3R:17135374 -> 3R:17092340 +
TCCATGAGATCGAACTGATGCTCGTTCTTTTCTACCAGCCGAGTGTATTTTTTCACATTGGTGCCAGCCGTTAGCACGCACGTGAACAGATTATCGTTCAG
TCCATGAGATCGAACTGATGCTCGTTCTTTTCTACCAGCCGAGTGTATTTCTTCACATTGGTGCCGGCCGTTAGCACGCACGTGAACAGATTATCGTTCAG
##### 3R:22369522 -> 3R:22105101 +
ATGCGGGGAAAGATGGCCGTAAACACGCGGGAGCTTTTAAGGCACAGATGTAGCGTTCCGGCAAGCAGCAACGAATCGTTTAGGTTGTAAAATTGCCCCTT
ATGCGGGGAAAGATTGCCGTAAACACGCGGGAGCTTTTAAGGCACAGATGCAGAGATCCGGCAAGCAGCAACGAATCGTTTAGGTTGTAAAATTGCCCCTT
##### 3R:28835529 -> 3R:28859778 +
GGCGGAGAGACCTGCATCAACATCAACGAGTGTCCACGGTTCGGTCCACACTACCACGAGCCCGCCAAATGGACCGAGGAGTTATTGAATGAATTCCGGTC
ggcggagagacctgcatcaacatcaacgagtgtccacggttcggtccacattaccacgaacactccaaatggtccggcgggttattgaatgaattccggtc
##### 3R:31827339 -> 3R:32128616 +
TGCCCCTTTAGAGCATCGTCAAATTCGCCATCCTCTTCAATGACTGGATCCGTAGTGGAAGCGCCTTTGACAGCGGATTTCATATCCGGCGGTAGAGTGTT
TGCCCCTTTAGAGCATCGTCAAATTCGCCATCCTCTTCAATGACTGGATCCGTAGTGGGAGCGCCTTTGACAGCGGATTTCATATCCGGCGGTAGAGTGTT
##### 3R:33490452 -> 3R:33676339 +
TCCGGATCCGCCATCAGCGGGGGCTTTTGCTCCTCACTCGACATTTCTCCATCCTCCAGCTTACGCATCCGGTACGGATCGGTCGGGAAGGCAACCGATTC
TCCGGATCCGCCACCAGCGGGGGCTTTTGCTCCTCACTCGACATTTCTCCATCCTCCAGCTTACGCATCCGGTACGGATCGGTCGGGAAGGCAACTGATTC
##### X:10111683 -> X:4755258 -
gcgcgcgcgcTTGGGACGCGCCTGTGCAGCGTCGAAGCTGCCGCAGAGCCTCACTCACCTTGTAAAGAATGGTGCGGTTGGCGGTGGGCTCGCAGACGAGC
CGAAGAGCGCTTAGGACGCGCTTGTGGAGCGTCGAAGCTGCCGCAGAGCCGCACTCACCTTGTAAAGGATGGTGCGGTTGGCGGTGGGCTCGCAGACGAGC
##### X:206745 -> X:1747660 -
TACGAATCGGACCAAGCGCATGTTTCCAGCAATGAAAGTACTTCGTTCTCTTCAGACAGTGACCTGTTGTACTATTTGCTGGAACCTACCACGGGAACGGT
TACGAATCGGACCAAGCGCACGTTTCCAGCAATGAAAGTCCTTCGTTCGCTTCAGACAGTGACCTGTTGTACTATGTGCTGGAACCTACCACGGGAACGGT
##### 2L:22679503 -> 2L:38222272 -
GAGGCTTTTGTGAAGTTTTACAGAATGCGTGTCAAATAGCCAAACTGATGTGATGGAAGATTGACTCAGATCTATCATTTTGTCCCCTTTTTTTGCTTGGG
GAGGCTTTTGTGAAGTTTTACAGATTGCGTGTCAAATAGCCAAACTGATGTGACGGAAGATTGACTCAGATCTATCATTTTGTCCCCTTTTTTTGCTTGGG
##### 2L:39898662 -> 2L:20805020 -
CAGCAGGACAACAAGGACTACGATCTGAACATTGCGACCAACTTCTTCGTGGACGACTTCATCGAGGTGATCAACAAGTATCAGCAGATCGCGAACACGCA
CAGCAGGACAACAAGGACTACGATCTGAACATTGCGACCAACTTCTTCGTGGACGACTTCATCGAGGTGATCAACAAGTATCAGCAGATCGCAAACACGCA
##### 2L:7457678 -> 2L:4166008 +
GCCGGATTAAATAGATTAAATTCACCGTGGCGTGGTACACGAGCGGTGTACCCAAAGTCCGGCAGCTCCACTGGCTTGATCGAGATGTGATGCTTGGCTGT
GCCGGATTAAATAGATTAAATTCACCGTGGCGTGGTACACGAGCGGTGTATCCAAAGTCCGGCAGCTCCACTGGCTTGATCGAGATGTGATGCTTGGCTGT
##### 2R:7280113 -> 2R:7158784 +
GGCTGACGGAGAAGGAGAAGCCGGCAACCGTGCTGCTGAAAGCGCACACCTCGCTGCTGATGGACGATCCGGCTTGCGCCAAGGACTATATGATCTGCGCC
GGCTGACGGAGAAGGAGAAGCCGGCAACCGTGCTGCTGAAAGCGCACACCTCGCTGCTGATGGACGATCCGGCTTGCGCCAAGGACTATATGATCTGCGCC
##### 3L:18356615 -> 3L:15240947 +
GAGCTGAAGGAGCTGTATCTGCACCGGAATGCGCTGGCATCGATCGGGAACAGGTCGTTCTATTACCAGAAATCGCTCGAGGTGCTCACGATTGAGGAGAA
GAGCTGAAGGAGCTGTATCTGCACCGGAATGCGCTGGCATCGATCGGAAATAGGTCGTTCTATTACCAGAAATCGCTCGAGGTGCTCACGATCGCGGAGAA
##### 3L:3093150 -> 3L:1156210 +
ATTCAGTGCTATAGCCTGAAACAAAGATCTCTTGTAGATGCATTCCAGAAGACAATACGAAAAGCCGAAGAGGTGCTGAACTTGGTGTATAATAAGTATAT
ATTCAGTGCTATAGCCTGAAACAAAGATCTCTTGTAGATGCATTCCAGAAAACAATACGAAAAGCCGAAGAGGTGCTGAACTTGGTGTATAATAAGTATAT
##### 3R:10063316 -> 3R:10078235 +
GATCGCGCAACGAACGCCGTCAGCCAGGTGCTGCCGCTCTTATCGTTCTCACCGAACGCACTGAACGAACCGTCACGGTGCTTGTAGCCCAGCTCCCGCTG
GATCGCGCAACGAACGCCGTCAGCCAGGTGCTGCCGCTCTTATCGTTCTCACCGAACGCACTGAACGAACCGTCACGGTGCTTGTAGCTCAGCTCCCGCTG

In [ ]:

Looking up REF values from a fasta from a list of positions


In [226]:
ref_fasta_filename = '/data/archive/reference/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa'

pos_list_str = '2R:20784963	2R:21818783	2R:23770295	2R:24166682	2R:25659187	2R:26579153	2R:27058257	2R:27883925	2R:28059359	2R:28086876	2R:28214013	3R:13733466	3R:16162001	3R:17135374	3R:22369522	3R:28835529	3R:31827339	3R:33490452	X:10111683	X:206745	2L:22679503	2L:39898662	2L:7457678	2R:7280113	3L:18356615	3L:3093150	3R:10063316'
pos_list = [ x.split(':') for x in pos_list_str.split('\t')]
#pprint(pos_list)

ref_fasta = pyfaidx.Fasta(ref_fasta_filename)

for pos in pos_list:
    print(':'.join(pos), ref_fasta[pos[0]][int(pos[1])-1])

print(','.join((str(ref_fasta[pos[0]][int(pos[1])-1]) for pos in pos_list)))


2R:20784963 G
2R:21818783 C
2R:23770295 G
2R:24166682 G
2R:25659187 C
2R:26579153 A
2R:27058257 G
2R:27883925 A
2R:28059359 C
2R:28086876 A
2R:28214013 G
3R:13733466 T
3R:16162001 G
3R:17135374 T
3R:22369522 T
3R:28835529 C
3R:31827339 C
3R:33490452 A
X:10111683 T
X:206745 T
2L:22679503 T
2L:39898662 G
2L:7457678 C
2R:7280113 T
3L:18356615 C
3L:3093150 G
3R:10063316 A
G,C,G,G,C,A,G,A,C,A,G,T,G,T,T,C,C,A,T,T,T,G,C,T,C,G,A

In [ ]: