In [ ]:
"""This lesson used some personal genome sequencing data as well as packages I wrote myself.
The code in custom packages MotifMatcher and MotifFinder is found 
in the Coursera class Biology Meets Progamming, with modifications.
"""

In [165]:
#Use pandas.DataFrame to store DNA chip results

import pandas as pd

In [190]:
#Personal genome sequencing using DNA chip
#Load raw data and print as a table
#Note that a small example of generic gDNA is used for demonstration purpose. 
#The usual DNA chip sequencing result is between 300K to 4M rows long.

generic_gdna = pd.read_table("generic_gdna.txt")
print(generic_gdna)

#rsid: an ID number for a genetic mutation.


         rsid  chromosome   position genotype
0   rs4988235           2  136608646       GG
1     rs53576           3    8804371       GG
2  rs12248560          10   96521657       CC
3  rs28399504          10   96522463       AA
4  rs41291556          10   96535173       TT
5   rs4986893          10   96540410       GG
6   rs4244285          10   96541616       AG
7   rs1815739          11   66328095       CT

In [194]:
#Is Generic a compassionate person?
#Find rs53576 in SNPedia

generic_gdna[generic_gdna["rsid"] == 'rs53576']


Out[194]:
rsid chromosome position genotype
1 rs53576 3 8804371 GG

In [195]:
#Should Generic become an athlete?

generic_gdna[generic_gdna["rsid"] == 'rs1815739']


Out[195]:
rsid chromosome position genotype
7 rs1815739 11 66328095 CT

In [196]:
#Does Generic have lactose intolerance?

generic_gdna[generic_gdna["rsid"] == "rs4988235"]

#Genotype not present in SNPedia but it probably means the chip detected the opposite strand


Out[196]:
rsid chromosome position genotype
0 rs4988235 2 136608646 GG

In [198]:
#Plavis: an drug to present heart attacks and strokes. Patient response depends on genotype.
#plavix.txt includes a list of mutations that could diminish the efficacy of plavix.

plavix = pd.read_table("plavix.txt")
print(plavix)


         rsid identifier consensus mutation      pmid
0   rs4244285         *2        GG        A  23698643
1   rs4986893         *3        GG        A  20979470
2  rs28399504         *4        AA        G  20978260
3  rs41291556         *8        TT        C  19106083
4  rs12248560        *17        CC        T  21716271

In [199]:
#Store rsids associated with plavix response in a list
plavix_list = plavix["rsid"].tolist()
print(plavix_list)

#Loop through the list to find in Generic the genotypes associated with plavix response
for i in plavix_list:
    print(generic_gdna[generic_gdna["rsid"] == i])


['rs4244285', 'rs4986893', 'rs28399504', 'rs41291556', 'rs12248560']
        rsid  chromosome  position genotype
6  rs4244285          10  96541616       AG
        rsid  chromosome  position genotype
5  rs4986893          10  96540410       GG
         rsid  chromosome  position genotype
3  rs28399504          10  96522463       AA
         rsid  chromosome  position genotype
4  rs41291556          10  96535173       TT
         rsid  chromosome  position genotype
2  rs12248560          10  96521657       CC

In [203]:
#Assemble into a new DataFrame
generic_plavix = generic_gdna.loc[generic_gdna['rsid'].isin(plavix_list)]
generic_plavix


Out[203]:
rsid chromosome position genotype
2 rs12248560 10 96521657 CC
3 rs28399504 10 96522463 AA
4 rs41291556 10 96535173 TT
5 rs4986893 10 96540410 GG
6 rs4244285 10 96541616 AG

In [204]:
#Write the output to a new file. 
generic_plavix.to_csv("generic_plavix.txt", index=None, sep='\t')

In [205]:
#Many reference lists available online

mendeley = pd.read_table("./annotation/mendeley.csv", delimiter = ",")
mendeley.head()


Out[205]:
Name Position Chromosome Year First author Title DOI Open Access status Link
0 rs12662670 151918856 6 2011.0 Kristen N Stevens Common breast cancer susceptibility loci are a... 10.1158/0008-5472.CAN-11-1266 True http://www.mendeley.com/research/common-breast...
1 rs2046210 151948366 6 2011.0 Qiuyin Cai Replication and functional genomic analyses of... 10.1158/0008-5472.CAN-10-2733 True http://www.mendeley.com/research/replication-f...
2 rs2046210 151948366 6 2011.0 Wonshik Han Common genetic variants associated with breast... 10.1158/1055-9965.EPI-10-1282 False http://www.mendeley.com/research/common-geneti...
3 rs2046210 151948366 6 2009.0 Wei Zheng Genome-wide association study identifies a new... NaN True http://www.mendeley.com/research/genomewide-as...
4 rs2046210 151948366 6 2010.0 Simon N Stacey Ancestry-Shift Refinement Mapping of the C6orf... 10.1371/journal.pgen.1001029 True http://www.mendeley.com/research/ancestryshift...

In [ ]:
"""Below, two packages I wrote (MotifMatcher and MotifFinder) are used.
MotifMatcher is to find in a list of DNA sequences the strings that matches a given motif.
MotifFinder is to find in a list of DNA sequences the unkown motif.
These packages are easy to write if you go over the Coursera class Biology Meets Progamming.
"""

In [209]:
#Look for common DNA/protein sequences
#The code is for DNA, but for translated protein sequences, the idea is the same.
#This used a list of 10 generic chopped TCR sequences. The goal is to find the shared motif.

import MotifMatcher as mm
import MotifFinder as mf

In [210]:
#Use an example of generic TCR sequences

tcr = open("generic_tcr.txt").read()
tcr = [x for x in tcr.split('\n')]
print(tcr)
print("number of elements in tcr:",len(tcr))


['AATAGTTTAGATATAAAAGATTTCTCCCGATTCGCTCCGTTCCCCGTTTACCCGATCCCCGTGTTTCGCTTTTAGCCGAGCCGCAGGCCCAAATCGATTCCGTACCAGCTAGCAGAGGGATAGACTGT', 'AAGATGATAGGGATCGCTCCGATAGCTCGCTAAGAGCTTTCGGCTTTCGCCCTAGGGCTAGTTACTAGCTGCTAGGCTCGATTCGATCGATCGGATATAATAGATTTCTCCCAGAGTTCCGCTCGCTC', 'AAGATATAAAAGATTTCTCTTCTGGGCTGGCTGGCTTTAGGACGCGCCCAGGCGCGCGGGAACGGCAACACAGAGATCGGACCGCGACAGAAAGCTGAGGACGGCGAGAGAGCTGTAGGGAACGGATT', 'TTCTCTATCTCCCTACCTCTTTTCTTTCTCTCTACCACGATATAAAAGATTTGACCCGAGTCGCTCGCCTCCGCTAACTAGTCTACCTAGCTAGCTATGCTAGCTCGAGGATCGATCGATCGGGCTAT', 'TTGCTGGCTGCTACAGCTAGCACCCGGAGGATCGGGATCAAACTGATCGGATATAATCGACTTCTCCCGTTTCTCTTAAATCGTTGGCTTGACCCATAGCCCTAACCCTCGCTAGCTGGCTGGCTCAA', 'TTTCGTTGTCCCCGGCTAAACCATAACGCGCGCAAACAACCCAGGGATATAAAAGATTTCTCCCGGGCCTCGTTTTAAACAACAAAGAAAATGCCCCAAGATGCTAGCTTTAGCTAGCTTAGATCGAA', 'TTCCGCTCGTTGATATCATATACCAAATCATCATTCATCTTTTACAAACATGGGACGGAGGCAGGGGCCCAGTTTCATGATATAAACCATTTCTCCCGGGCTGAACTCGGGACCAGGCAGCGGGACTT', 'CCTCCCGATAGCTCGGCTCGCTTGCTCGTGATATAAAAGATTTCTCCCGTTCTCTCTTCTCCTCTCTCAATACTCTAAATTAACTTAAACTGGCGGGGATGATAGGGGCTGATCGGGCTGGTTTTGCT', 'GATATAAAAGATTTCTCCCGTGGCTCGCTACGATCGATTCGTACTCTAGCTTTAGCTCTATGCTCTCATGCTATACGTCTTCCAATGCTATCGTATCGGTACTAATACAAGCTTACGTCGGCTACTCC', 'TCCATCCCTACAACCCACCAGGGCCGGGATAGCTGGCTTGGGCGGGATCGGGCTTCTTTTCGGGATCTCTCTTCGGATTTCGCTCGGATTGGGCTTTTAGGCTTTCGGGATATAAAAGATTTCTCCCG']
number of elements in tcr: 10

In [213]:
#If the shared motif is known:
#Find the number of occurrences of a known motif

my_pat = "AATAGTTTAGATATAAAAGA"

#Find the number of exact matches in each TCR sequence.
exact_match = []
for i in tcr:
    exact_match.append(mm.ApproximatePatternCount(my_pat, i, 0))    
print(exact_match)

#Find in each TCR sequence the number of strings with a maximum of 10 mismatches.
ten_mismatches = []
for i in tcr:
    ten_mismatches.append(mm.ApproximatePatternCount(my_pat, i, 10))    
print(ten_mismatches)


[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[2, 5, 0, 2, 1, 4, 5, 2, 1, 2]

In [217]:
#If the shared motif is unknown:
#Find the shared motif first
#This is an extremely time-consuming process. The most common approach is to use randomization.

#The idea of randomization is, if a motif is shared, 
#then it will occur more times in a set of sequences, 
#thus picked up more likely by randomization.

#And then check the goodness of the fit

#This finds a set of shared motifs by randomization.
best_motifs = GibbsSampler(tcr, 20, 10, 100)
print(best_motifs)

#This finds the number of differences between the approximate shared motif and the consensus,
#assembled from the shared motif.
print("the consensus is:", Consensus(best_motifs))
print("the number of differences is:", Score(best_motifs))


['AAAAGATTTCTCCCGATTCG', 'AATAGATTTCTCCCAGAGTT', 'AAAAGATTTCTCTTCTGGGC', 'AAAAGATTTGACCCGAGTCG', 'AATCGACTTCTCCCGTTTCT', 'AAAAGATTTCTCCCGGGCCT', 'AAACCATTTCTCCCGGGCTG', 'AAAAGATTTCTCCCGTTCTC', 'AAAAGATTTCTCCCGTGGCT', 'TTCGGATTTCGCTCGGATTG']
the consensus is: AAAAGATTTCTCCCGGGTCG
the number of differences is: 46

In [218]:
#Use the time module to get the runtime of your code
import time
from datetime import timedelta
start_time = time.monotonic()

#By repeatedly run the randomization, better and better best_motifs is found, with lower scores.
for i in range(30):
    better_motifs = GibbsSampler(tcr, 20, 10, 100)
    if Score(better_motifs) < Score(best_motifs):
        best_motifs = better_motifs
        
end_time = time.monotonic()
print(timedelta(seconds=end_time - start_time))

print(best_motifs)

print("the consensus is:", Consensus(best_motifs))
print("the number of differences is:", Score(best_motifs))


0:00:02.142653
['GATATAAAAGATTTCTCCCG', 'GATATAATAGATTTCTCCCA', 'GATATAAAAGATTTCTCTTC', 'GATATAAAAGATTTGACCCG', 'GATATAATCGACTTCTCCCG', 'GATATAAAAGATTTCTCCCG', 'GATATAAACCATTTCTCCCG', 'GATATAAAAGATTTCTCCCG', 'GATATAAAAGATTTCTCCCG', 'GATATAAAAGATTTCTCCCG']
Out[218]:
12