We can query the database to obtain a pile-up of the reads from a given readgroup.
In [4]:
import ga4gh.client as client
c = client.HttpClient("http://1kgenomes.ga4gh.org")
In [5]:
dataset = c.searchDatasets().next()
referenceSet = c.searchReferenceSets().next()
references = [r for r in c.searchReferences(referenceSetId = referenceSet.id)]
In [38]:
contig = references[0].id
rgIdsArr = []
for r in c.searchReadGroupSets(datasetId=dataset.id):
rgIdsArr.append([e for e in r.readGroups])
In [63]:
def Revers_Compl(Sequence):
CompSeq = list(Sequence[:])
for i in range(len(Sequence)):
if Sequence[i]=="A":
CompSeq[i] = "T"
elif Sequence[i]=="C":
CompSeq[i] = "G"
elif Sequence[i] == "G":
CompSeq[i] = "C"
elif Sequence[i] == "T":
CompSeq[i] = "A"
else:
CompSeq[i] = "N"
return "".join(CompSeq)
In [64]:
def pileUp(contig, position, rgIdsArr):
alleles = []
for i in rgIdsArr[0]:
for sequence in c.searchReads(readGroupIds=[i.id],start = position, end = position+1, referenceId=contig):
if sequence.alignment != None:
start = sequence.alignment.position.position
observe = position-start
if sequence.alignment.position.strand == "NEG_STRAND":
Rev_Comp_Seq = Revers_Compl(sequence.alignedSequence)
allele = Rev_Comp_Seq[-(observe+1)]
alleles.append({"allele":allele, "readGroupId":i.id})
else:
allele = sequence.alignedSequence[observe]
alleles.append({"allele": allele, "readGroupId": i.id })
return alleles
In [89]:
def Calc_Freq(Position):
Test = pileUp(references[0].id, Position, rgIdsArr)
tot = len(Test)
A = [{"All": "A","Frq": float(0),"Occ": 0},{"All": "C","Frq": float(0), "Occ": 0},{"All": "G","Frq": float(0), "Occ": 0},{"All": "T","Frq": float(0), "Occ": 0}]
for i in range(tot):
if Test[i]["allele"] == "A":
A[0]["Occ"] += 1
elif Test[i]["allele"]=="C":
A[1]["Occ"] += 1
elif Test[i]["allele"] == "G":
A[2]["Occ"] += 1
elif Test[i]["allele"] == "T":
A[3]["Occ"] += 1
else:
tot -= 1
A[0]["Frq"] = float(A[0]["Occ"])/float(tot)
A[1]["Frq"] = float(A[1]["Occ"])/float(tot)
A[2]["Frq"] = float(A[2]["Occ"])/float(tot)
A[3]["Frq"] = float(A[3]["Occ"])/float(tot)
return A
In [91]:
X = Calc_Freq(10000)
Exampl = max(X)
print "The most frequent allele is : {}, with {} occurances and overall frequency of : {}".format(Exampl["All"], Exampl["Occ"], Exampl["Frq"])
In [ ]: