In [14]:
#Widget()
Note that the understanding of 3 terms allows for a complete/useful way to use this notebook. Now, the terminology is adapted to my understanding and therefore the expressions that I present might be incomplete or not strictly as defined by science.
First, our input takes the "position" argument which it is a unique position in the genome. It is necessary to specify which position wants to be observed because when we query the server for reads, it takes a starting and an ending position to return the set of data that spans our region of interest.
Second, the "Read Groupset Names" are specific subjects which had their genome sequenced on the 1k genomes project. The data rests on the server in the form of read sets, which will be defined later on.
Third, the “Reference Set Name” is a group of contiguous defined regions in the genome, which I refer to as chromosomes, but according to the 1k genomes website, there is more than just the 23-regular chromosomal expressions. Therefore I can only assume that other regions or references have been defined.
A set of reads is the data provided by the sequencer in the form of contiguous alleles. It is natural to observe multiple reads which overlap in a particular region, as well as reads which cover the same area. But that only adds on to the certainty of the statistics which determine the allele occurrence in a given position, that is, the purpose of Pileup. Also, variants are a set of alleles that differ from the reference bases and they are known as SNPs (Single Nucleotide Polymorphisms).
Pileup is a set of functions which do inner-mutual calls to obtain the fields previously defined. After the specific set of reads that span the region of interest have been obtained, we proceed to dissect the specific position of interest. We stack them in a counter dictionary which is then passed to a function that does the frequency calculation and finishes by returning the alleles observed as well as their individualized frequency. When the functions detect that the highest frequency allele differs from the reference bases, there is a call to the variant set to obtain the name, the position at which it starts, the alternate bases, and the genotype. Finally, it is plotted in a pie chart with the proper distribution of frequency.
In [2]:
import ga4gh.client as client
c = client.HttpClient("http://1kgenomes.ga4gh.org")
In [3]:
dataset = c.searchDatasets().next()
referenceSet = c.searchReferenceSets().next()
references = [r for r in c.searchReferences(referenceSetId = referenceSet.id)]
In [4]:
contig ={}
for i in references:
contig[i.name] = str(i.id)
In [5]:
def GetReadsForName(Name):
Name = str(Name)
if type(getReadGroupsByReadGroupSetName(Name)) == str:
return getReadGroupsByReadGroupSetName(Name)
else:
return [i for i in getReadGroupsByReadGroupSetName(Name)]
def readGroupSetByName(name):
result = None
for rgs in c.searchReadGroupSets(name=name, datasetId=dataset.id):
return rgs
return result
def getReadGroupsByReadGroupSetName(readGroupSetName):
if None == readGroupSetByName(readGroupSetName):
return "Sorry, bad request for {}".format(readGroupSetName)
else:
return readGroupSetByName(readGroupSetName).read_groups
In [6]:
def chrfunct(Chromo):
chr1 = filter(lambda x: x.name == str(Chromo), references)[0]
return chr1
This function can be expanded in the sense that, INDELS are detected in this function. With more specifications this Pile-Up program with this function can be extended to also detect such variants. Also note that only 4 cigar operations are specified, because they were the only operations specified in the reads.
In [7]:
def Cigar_Interpreter(Sequence, observe, ReferBase):
Temp = 0
BaseCounter = 0
Variant = ""
AligSeq = Sequence.aligned_sequence
InterpArr = list([])
Iter = 0
for i in Sequence.alignment.cigar:
Length = i.operation_length
if i.Operation.Name(i.operation) == "ALIGNMENT_MATCH":
InterpArr[len(InterpArr):len(InterpArr)+Length] = AligSeq[Temp:Temp+Length]
Temp += Length
BaseCounter += Length
elif i.Operation.Name(i.operation) == "CLIP_SOFT":
Temp += Length
elif i.Operation.Name(i.operation) == "DELETE":
int_iter = 0
for i in range(Length):
InterpArr[len(InterpArr) : len(InterpArr)+1] = "N"
BaseCounter += 1
int_iter += 1
if BaseCounter == observe:
Variant = ReferBase[BaseCounter:BaseCounter+int_iter]
return Variant
elif i.Operation.Name(i.operation) == "INSERT":
for i in range(Length):
InterpArr[len(InterpArr):len(InterpArr)+1] = AligSeq[Temp : Temp+1]
Temp += 1
if (Temp == observe) and (len(InterpArr) >= Temp+Length+1):
Variant = "".join(InterpArr[Temp:Temp+Length+1])
return Variant
Iter += 1
if (Temp >= observe) and (len(Sequence.alignment.cigar) == Iter) :
return InterpArr[observe]
else:
return "N"
In [8]:
def find_variants(Start, End, RdGrpSetName, ChromoSm):
for variantSet in c.searchVariantSets(datasetId=dataset.id):
if variantSet.name == "phase3-release":
release = variantSet
for callSet in c.searchCallSets(variantSetId= release.id, name= str(RdGrpSetName)):
mycallset = callSet
for variant in c.searchVariants(release.id, referenceName=ChromoSm, start=Start, end=End, callSetIds=[mycallset.id]):
if len(variant.alternate_bases[0]) == 1 and len(variant.reference_bases) == 1:
print "\nA VARIANT WAS FOUND"
print "Variant Name: {}, Start: {}, End: {} \nAlternate Bases: {} \nGenotypes: {}".format(str(variant.names[0]), str(variant.start), str(variant.end), str(variant.alternate_bases[0]), str(variant.calls[0].genotype))
return
return False
In [9]:
def pileUp(contig, position, rgset, Chromosm):
alleles = []
rgset = GetReadsForName(rgset)
if type(rgset) != str:
for i in rgset:
for sequence in c.searchReads(readGroupIds=[i.id],start = position, end = position+1, referenceId=contig):
if sequence.alignment != None:
start = sequence.alignment.position.position
observe = position - sequence.alignment.position.position
end = start+len(sequence.aligned_sequence)
if observe > 100 or observe < 0:
continue
if len(sequence.alignment.cigar) > 1:
allele = Cigar_Interpreter(sequence, observe,c.listReferenceBases(chrfunct(Chromosm).id, start=start, end= end))
else:
allele = sequence.aligned_sequence[observe]
alleles.append({"allele": str(allele), "readGroupId":i.id})
return Calc_Freq(alleles)
else:
return rgset
The frequency is obtained from the occurrence of alleles in the observed position for all the reads which are mapped in that region. This function returns an array of occurrence alleles as well as their individualized frequency compared to all the reads detected.
In [10]:
def Calc_Freq(Test):
tot = len(Test)
AutCalc = {}
Arr = []
for i in range(tot):
if AutCalc.has_key(Test[i]["allele"]) == False and (Test[i]['allele'] != "N"):
AutCalc.setdefault(Test[i]["allele"], 1)
Arr.append(Test[i]['allele'])
else:
if Test[i]['allele'] == "N":
tot -= 1
else:
AutCalc[Test[i]["allele"]] = float(AutCalc.get(Test[i]["allele"]) + 1)
Freq = {}
print "\n{} Reads where used, to determine pile-up".format(tot)
tot = float(tot)
for i in Arr:
Freq.setdefault(i,float(AutCalc.get(i)/tot))
return Freq
In [11]:
def Variant_Comp(Position, ReadGroupSetName, Chromosm):
RdGrp = GetReadsForName(ReadGroupSetName)
Chrm = contig.get(Chromosm, None)
if (Chrm != None) and type(RdGrp) != (str) :
base = c.listReferenceBases(Chrm, start = Position, end = Position+1)
var = pileUp(Chrm, Position, ReadGroupSetName, Chromosm)
return (str(base), var)
else:
if RdGrp == None:
print"Read Group Set '{}' is not in the API".format(ReadGroupSetName)
else:
print"Chromosome '{}' is not in the API".format(Chromosm)
In [12]:
def plot_vars(Position, RdGrpName, Chromo):
%matplotlib inline
import matplotlib.pyplot as plt
Refer, Freqs = Variant_Comp(int(Position), str(RdGrpName),str(Chromo))
labels = Freqs.keys()
sizes = Freqs.values()
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral']
Expl= {}
Legend = []
print "Reference Bases:", Refer
for i in labels:
if Freqs.get(i) != max(sizes):
find_variants(int(Position), int(Position)+1, str(RdGrpName), str(Chromo))
Expl.setdefault(i, .15)
Legend.append("{}: {} %".format(i, str(Freqs.get(i)*100)[:4]))
elif i == Refer:
Expl.setdefault(i,0.8)
Legend.append("{}: {} %".format(i, str(Freqs.get(i)*100)[:4]))
else:
Expl.setdefault(i,0.0)
Legend.append("{}: {} %".format(i, str(Freqs.get(i)*100)[:4]))
explode = Expl.values()
plt.pie(sizes, explode=explode, labels=labels, colors=colors,autopct='%1.1f%%', shadow=True, startangle=0)
plt.axis('equal')
plt.legend(['%s' % str(x) for x in (Legend)])
plt.show()
In [13]:
def Widget():
from ipywidgets import widgets
from ipywidgets import interact
from IPython.display import display
t0 = widgets.Text(value="Position Exaple: '120394'", disabled=True)
text0 = widgets.Text()
t1 = widgets.Text(value="ReadGroupName Example: 'NA19102'", disabled=True)
text1 = widgets.Text()
t2 = widgets.Text(value= "ReferenceSets Example: '1'", disabled=True)
text2 = widgets.Text()
display(t0, text0, t1, text1, t2, text2)
button = widgets.Button(description="Submit")
exit = widgets.Button(description="Exit")
display(button, exit)
def exitFunct(c):
import sys
sys.exit(["Thank you, you have exited the function"])
def Submit(sender):
Pos, RgSetNm, Chrom = text0.value, text1.value, text2.value
chr1 = chrfunct(Chrom)
plot_vars(Pos, RgSetNm, Chrom)
def button_clicked(b):
print "Position: {}, ReadGrpSet: {}, Chrom: {}".format(text0.value, text1.value, text2.value)
Submit(b)
button.on_click(button_clicked)
exit.on_click(exitFunct)