In [1]:
import pysam

import numpy as np
import pandas as pd

from six.moves import urllib

In [154]:
urltemplate = "https://raw.githubusercontent.com/hammerlab/varlens/master/test/data/CELSR1/bams/{}"
url = urllib.request.URLopener()
url.retrieve(urltemplate.format("bam_0.bam"), "bam_0.bam")
url.retrieve(urltemplate.format("bam_0.bam.bai"), "bam_0.bam.bai")

samfile = pysam.AlignmentFile("bam_0.bam", "rb")

In [219]:
def print_reads(samfile, chromosome, location, ref, alt):
    print("-", location, ref, alt)
    if len(alt) > 0 and ref.startswith(alt):
        ref = ref[len(alt):]
        location += len(alt)
        alt = ""
    if len(ref) > 0 and alt.startswith(ref):
        alt = alt[len(ref):]
        location += len(ref)
        ref = ""
        
    if len(ref) == 0:
        # by convention, if the mutation is an insertion 
        # then the location refers to the base before it
        start_pos = location 
        end_pos = location + 1
    elif len(alt) == 0:
        assert len(ref) > 0
        # if we're deleting 1 or more bases then the location refers to the first base that's deleted
        start_pos = location
        end_pos = location + len(ref) 
    else:
        assert len(ref) == 1
        assert len(alt) == 1
        start_pos = location
        end_pos = location + 1

    # Let pysam pileup the reads covering our location of interest for us
    for  column in samfile.pileup(chromosome, start_pos-1, end_pos):
        if column.pos != start_pos:
            continue
        for i, read in enumerate(column.pileups):
            if read.is_refskip:
                # if read sequence doesn't actually align here, skip it
                continue
            elif read.is_del and len(ref) > 0:
                # if read has a deletion at this location and variant isn't a deletion
                continue
            pos_in_read = read.query_position  # relative location
            reference_positions = read.alignment.get_reference_positions(full_length=False)
            full_read_seq = read.alignment.query_sequence  # read sequence
            
            if start_pos in reference_positions and end_pos in reference_positions:
                read_start_pos = reference_positions.index(start_pos)
                read_end_pos = reference_positions.index(end_pos)
                
                read_seq_at_variant_locus = full_read_seq[read_start_pos:read_end_pos]
                if read_seq_at_variant_locus == alt:
                    prefix = full_read_seq[:read_start_pos]
                    suffix = full_read_seq[read_end_pos:]
                    if len(prefix) > 0 and len(suffix) > 0:
                        print(prefix,read_seq_at_variant_locus, suffix)

In [220]:
# C -> T variant at this particular locus
chromosome = "chr22"
location = 46931062

ref = "G"
alt = "A"

In [221]:
print_reads(samfile, chromosome, location , ref, alt)


- 46931062 G A
-- 46931062 G A
GGCCGCATCCTCATTCAGACGAAGCTCGTAGGTGGGCTGCGTGAACACCGGGTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A G
CGCATCCTCATTCAGACGAAGCTCGTAGGTGGGCTGCGTGAACACCGGGTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTC
CATCCTCATTCAGACGAAGCTCGTAGGTGGGCTGCGTGAACACCGGGTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCAT
ATTCAGACGAAGCTCGTAGGTGGGCTGCGTGAACACCGGGTCGTTGTCATTCACGTCAAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGC
TCAGACGAAGCTCGTAGGTGGGCTGCGTGAACACCGGGTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGTGCCA
GACGAAGCTCGTAGGTGGGCTGCGTGAACACCGGGTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGCCAGCCG
CGAAGCTCGTAGGTGGGCTGCGTGAACACCGGGTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGT
AGCTCGTAGGTGGGCTGCGTGAACACCGGGTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGA
GCTCGTAGGTGGGCTGCGTGAACACCGGGTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTC
CTCGTAGGTGGGCTGCGTGAACACCGGGTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCC
TCGTAGGTGGGCTGCGTGAACACCGGGTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCA
CGTAGGTGGGCTGCGTGAACACCGGGTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCAC
GTGGGCTGCGTGAACACCGGGTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCT
GTGGGCTGCGTGAACACCGGGTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCT
GGCTGCGTGAACACCGGGTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCA
CGTGAACACCGGGTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCG
TGAACACCGGGTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAA
GAACACCGGGTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAG
CGGGTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAG
GGGTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGT
GGTTCGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGT
CGTTGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTAGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTC
TGTCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCAC
TCATTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCT
ATTCACGTCCAGCACCGTGATGGACACGCCGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCTCCCCGAAGCTGTAGTGCTCCACCACC
TTCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCT
TCACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCTC
ACGTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCTCGC
GTCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCTCGCGG
TCCAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTACTCGCGGT
CAGCACCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCTCGCGTTCC
AGCACCGTGATGGGCACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCTCGCGGTCCA
CCGTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCTCGCGGTCCAGCTC
GTGATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCTCGCGGTCCAGCTCGG
GATGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCTCGCGGTCCAGCTCGGCA
TGGACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCTCGCGGTCCAGCTCGGCACA
GACACGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCTCGCGGTCCAGCTCGGCACACA
CGCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCTCGCGGTCCAGCTCGGCACACACTGT
CGATGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCTCGCGGTCCAGCTCGGCACACACTGT
GCTGGTGGAGG A GCTCATGGGGGTCCAGCAGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCTCGCGGTCCAGCTCGGCACACACTGTG
GCTGGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCTCGCGGTCCAGCTCGGCACACACTGTG
GGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCTCGCGGTCCAGCTCGGCACACACTGTGATC
GGTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCTCGCGGTCCAGCTCGGCACACACTGTGATC
GTGGAGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCTCGTGGTCCAGCCCGGCACACACTGTGATCC
AGG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCTCGCGGTCCAGCTCGGCACACACTGTGATCCAACC
GG A GCTCATGGGGGGCCAGCCGTGGTCCACCGCCTCCACCCCGAAGCTGTAGTGCTCCACCTCCTCGCGGTCCAGCTCGGCACACACTGTGATCCAACCG

In [215]:
reads = samfile.fetch(reference="chr22", start=location-1, end=location)

In [192]:
read.seq == read.query_alignment_sequence


Out[192]:
True

In [195]:
aln = pysam.AlignedSegment??

In [196]:
aln = pysam.AlignedSegment()

In [197]:
ref = "ACCA"
aln.seq = "ACCCTTTA"

In [198]:
aln.cigarstring = "3M1I1D"


Out[198]:
'ACCCTTTA'

In [87]:


In [ ]: