In [ ]:
"""This lesson used some personal genome sequencing data as well as packages I wrote myself.
The code in custom packages MotifMatcher and MotifFinder is found
in the Coursera class Biology Meets Progamming, with modifications.
"""
In [165]:
#Use pandas.DataFrame to store DNA chip results
import pandas as pd
In [190]:
#Personal genome sequencing using DNA chip
#Load raw data and print as a table
#Note that a small example of generic gDNA is used for demonstration purpose.
#The usual DNA chip sequencing result is between 300K to 4M rows long.
generic_gdna = pd.read_table("generic_gdna.txt")
print(generic_gdna)
#rsid: an ID number for a genetic mutation.
In [194]:
#Is Generic a compassionate person?
#Find rs53576 in SNPedia
generic_gdna[generic_gdna["rsid"] == 'rs53576']
Out[194]:
In [195]:
#Should Generic become an athlete?
generic_gdna[generic_gdna["rsid"] == 'rs1815739']
Out[195]:
In [196]:
#Does Generic have lactose intolerance?
generic_gdna[generic_gdna["rsid"] == "rs4988235"]
#Genotype not present in SNPedia but it probably means the chip detected the opposite strand
Out[196]:
In [198]:
#Plavis: an drug to present heart attacks and strokes. Patient response depends on genotype.
#plavix.txt includes a list of mutations that could diminish the efficacy of plavix.
plavix = pd.read_table("plavix.txt")
print(plavix)
In [199]:
#Store rsids associated with plavix response in a list
plavix_list = plavix["rsid"].tolist()
print(plavix_list)
#Loop through the list to find in Generic the genotypes associated with plavix response
for i in plavix_list:
print(generic_gdna[generic_gdna["rsid"] == i])
In [203]:
#Assemble into a new DataFrame
generic_plavix = generic_gdna.loc[generic_gdna['rsid'].isin(plavix_list)]
generic_plavix
Out[203]:
In [204]:
#Write the output to a new file.
generic_plavix.to_csv("generic_plavix.txt", index=None, sep='\t')
In [205]:
#Many reference lists available online
mendeley = pd.read_table("./annotation/mendeley.csv", delimiter = ",")
mendeley.head()
Out[205]:
In [ ]:
"""Below, two packages I wrote (MotifMatcher and MotifFinder) are used.
MotifMatcher is to find in a list of DNA sequences the strings that matches a given motif.
MotifFinder is to find in a list of DNA sequences the unkown motif.
These packages are easy to write if you go over the Coursera class Biology Meets Progamming.
"""
In [209]:
#Look for common DNA/protein sequences
#The code is for DNA, but for translated protein sequences, the idea is the same.
#This used a list of 10 generic chopped TCR sequences. The goal is to find the shared motif.
import MotifMatcher as mm
import MotifFinder as mf
In [210]:
#Use an example of generic TCR sequences
tcr = open("generic_tcr.txt").read()
tcr = [x for x in tcr.split('\n')]
print(tcr)
print("number of elements in tcr:",len(tcr))
In [213]:
#If the shared motif is known:
#Find the number of occurrences of a known motif
my_pat = "AATAGTTTAGATATAAAAGA"
#Find the number of exact matches in each TCR sequence.
exact_match = []
for i in tcr:
exact_match.append(mm.ApproximatePatternCount(my_pat, i, 0))
print(exact_match)
#Find in each TCR sequence the number of strings with a maximum of 10 mismatches.
ten_mismatches = []
for i in tcr:
ten_mismatches.append(mm.ApproximatePatternCount(my_pat, i, 10))
print(ten_mismatches)
In [217]:
#If the shared motif is unknown:
#Find the shared motif first
#This is an extremely time-consuming process. The most common approach is to use randomization.
#The idea of randomization is, if a motif is shared,
#then it will occur more times in a set of sequences,
#thus picked up more likely by randomization.
#And then check the goodness of the fit
#This finds a set of shared motifs by randomization.
best_motifs = GibbsSampler(tcr, 20, 10, 100)
print(best_motifs)
#This finds the number of differences between the approximate shared motif and the consensus,
#assembled from the shared motif.
print("the consensus is:", Consensus(best_motifs))
print("the number of differences is:", Score(best_motifs))
In [218]:
#Use the time module to get the runtime of your code
import time
from datetime import timedelta
start_time = time.monotonic()
#By repeatedly run the randomization, better and better best_motifs is found, with lower scores.
for i in range(30):
better_motifs = GibbsSampler(tcr, 20, 10, 100)
if Score(better_motifs) < Score(best_motifs):
best_motifs = better_motifs
end_time = time.monotonic()
print(timedelta(seconds=end_time - start_time))
print(best_motifs)
print("the consensus is:", Consensus(best_motifs))
print("the number of differences is:", Score(best_motifs))
Out[218]: