notebook.community

Edit and run



In [ ]:

    
"""This lesson used some personal genome sequencing data as well as packages I wrote myself.
The code in custom packages MotifMatcher and MotifFinder is found 
in the Coursera class Biology Meets Progamming, with modifications.
"""



In [165]:

    
#Use pandas.DataFrame to store DNA chip results

import pandas as pd



In [190]:

    
#Personal genome sequencing using DNA chip
#Load raw data and print as a table
#Note that a small example of generic gDNA is used for demonstration purpose. 
#The usual DNA chip sequencing result is between 300K to 4M rows long.

generic_gdna = pd.read_table("generic_gdna.txt")
print(generic_gdna)

#rsid: an ID number for a genetic mutation.









    



         rsid  chromosome   position genotype
0   rs4988235           2  136608646       GG
1     rs53576           3    8804371       GG
2  rs12248560          10   96521657       CC
3  rs28399504          10   96522463       AA
4  rs41291556          10   96535173       TT
5   rs4986893          10   96540410       GG
6   rs4244285          10   96541616       AG
7   rs1815739          11   66328095       CT



In [194]:

    
#Is Generic a compassionate person?
#Find rs53576 in SNPedia

generic_gdna[generic_gdna["rsid"] == 'rs53576']









    Out[194]:






  
    
      
      rsid
      chromosome
      position
      genotype
    
  
  
    
      1
      rs53576
      3
      8804371
      GG



In [195]:

    
#Should Generic become an athlete?

generic_gdna[generic_gdna["rsid"] == 'rs1815739']









    Out[195]:






  
    
      
      rsid
      chromosome
      position
      genotype
    
  
  
    
      7
      rs1815739
      11
      66328095
      CT



In [196]:

    
#Does Generic have lactose intolerance?

generic_gdna[generic_gdna["rsid"] == "rs4988235"]

#Genotype not present in SNPedia but it probably means the chip detected the opposite strand









    Out[196]:






  
    
      
      rsid
      chromosome
      position
      genotype
    
  
  
    
      0
      rs4988235
      2
      136608646
      GG



In [198]:

    
#Plavis: an drug to present heart attacks and strokes. Patient response depends on genotype.
#plavix.txt includes a list of mutations that could diminish the efficacy of plavix.

plavix = pd.read_table("plavix.txt")
print(plavix)









    



         rsid identifier consensus mutation      pmid
0   rs4244285         *2        GG        A  23698643
1   rs4986893         *3        GG        A  20979470
2  rs28399504         *4        AA        G  20978260
3  rs41291556         *8        TT        C  19106083
4  rs12248560        *17        CC        T  21716271



In [199]:

    
#Store rsids associated with plavix response in a list
plavix_list = plavix["rsid"].tolist()
print(plavix_list)

#Loop through the list to find in Generic the genotypes associated with plavix response
for i in plavix_list:
    print(generic_gdna[generic_gdna["rsid"] == i])









    



['rs4244285', 'rs4986893', 'rs28399504', 'rs41291556', 'rs12248560']
        rsid  chromosome  position genotype
6  rs4244285          10  96541616       AG
        rsid  chromosome  position genotype
5  rs4986893          10  96540410       GG
         rsid  chromosome  position genotype
3  rs28399504          10  96522463       AA
         rsid  chromosome  position genotype
4  rs41291556          10  96535173       TT
         rsid  chromosome  position genotype
2  rs12248560          10  96521657       CC



In [203]:

    
#Assemble into a new DataFrame
generic_plavix = generic_gdna.loc[generic_gdna['rsid'].isin(plavix_list)]
generic_plavix



In [204]:

    
#Write the output to a new file. 
generic_plavix.to_csv("generic_plavix.txt", index=None, sep='\t')



In [205]:

    
#Many reference lists available online

mendeley = pd.read_table("./annotation/mendeley.csv", delimiter = ",")
mendeley.head()









    Out[205]:






  
    
      
      Name
      Position
      Chromosome
      Year
      First author
      Title
      DOI
      Open Access status
      Link
    
  
  
    
      0
      rs12662670
      151918856
      6
      2011.0
      Kristen N Stevens
      Common breast cancer susceptibility loci are a...
      10.1158/0008-5472.CAN-11-1266
      True
      http://www.mendeley.com/research/common-breast...
    
    
      1
      rs2046210
      151948366
      6
      2011.0
      Qiuyin Cai
      Replication and functional genomic analyses of...
      10.1158/0008-5472.CAN-10-2733
      True
      http://www.mendeley.com/research/replication-f...
    
    
      2
      rs2046210
      151948366
      6
      2011.0
      Wonshik Han
      Common genetic variants associated with breast...
      10.1158/1055-9965.EPI-10-1282
      False
      http://www.mendeley.com/research/common-geneti...
    
    
      3
      rs2046210
      151948366
      6
      2009.0
      Wei Zheng
      Genome-wide association study identifies a new...
      NaN
      True
      http://www.mendeley.com/research/genomewide-as...
    
    
      4
      rs2046210
      151948366
      6
      2010.0
      Simon N Stacey
      Ancestry-Shift Refinement Mapping of the C6orf...
      10.1371/journal.pgen.1001029
      True
      http://www.mendeley.com/research/ancestryshift...



In [ ]:

    
"""Below, two packages I wrote (MotifMatcher and MotifFinder) are used.
MotifMatcher is to find in a list of DNA sequences the strings that matches a given motif.
MotifFinder is to find in a list of DNA sequences the unkown motif.
These packages are easy to write if you go over the Coursera class Biology Meets Progamming.
"""



In [209]:

    
#Look for common DNA/protein sequences
#The code is for DNA, but for translated protein sequences, the idea is the same.
#This used a list of 10 generic chopped TCR sequences. The goal is to find the shared motif.

import MotifMatcher as mm
import MotifFinder as mf



In [210]:

    
#Use an example of generic TCR sequences

tcr = open("generic_tcr.txt").read()
tcr = [x for x in tcr.split('\n')]
print(tcr)
print("number of elements in tcr:",len(tcr))









    



['AATAGTTTAGATATAAAAGATTTCTCCCGATTCGCTCCGTTCCCCGTTTACCCGATCCCCGTGTTTCGCTTTTAGCCGAGCCGCAGGCCCAAATCGATTCCGTACCAGCTAGCAGAGGGATAGACTGT', 'AAGATGATAGGGATCGCTCCGATAGCTCGCTAAGAGCTTTCGGCTTTCGCCCTAGGGCTAGTTACTAGCTGCTAGGCTCGATTCGATCGATCGGATATAATAGATTTCTCCCAGAGTTCCGCTCGCTC', 'AAGATATAAAAGATTTCTCTTCTGGGCTGGCTGGCTTTAGGACGCGCCCAGGCGCGCGGGAACGGCAACACAGAGATCGGACCGCGACAGAAAGCTGAGGACGGCGAGAGAGCTGTAGGGAACGGATT', 'TTCTCTATCTCCCTACCTCTTTTCTTTCTCTCTACCACGATATAAAAGATTTGACCCGAGTCGCTCGCCTCCGCTAACTAGTCTACCTAGCTAGCTATGCTAGCTCGAGGATCGATCGATCGGGCTAT', 'TTGCTGGCTGCTACAGCTAGCACCCGGAGGATCGGGATCAAACTGATCGGATATAATCGACTTCTCCCGTTTCTCTTAAATCGTTGGCTTGACCCATAGCCCTAACCCTCGCTAGCTGGCTGGCTCAA', 'TTTCGTTGTCCCCGGCTAAACCATAACGCGCGCAAACAACCCAGGGATATAAAAGATTTCTCCCGGGCCTCGTTTTAAACAACAAAGAAAATGCCCCAAGATGCTAGCTTTAGCTAGCTTAGATCGAA', 'TTCCGCTCGTTGATATCATATACCAAATCATCATTCATCTTTTACAAACATGGGACGGAGGCAGGGGCCCAGTTTCATGATATAAACCATTTCTCCCGGGCTGAACTCGGGACCAGGCAGCGGGACTT', 'CCTCCCGATAGCTCGGCTCGCTTGCTCGTGATATAAAAGATTTCTCCCGTTCTCTCTTCTCCTCTCTCAATACTCTAAATTAACTTAAACTGGCGGGGATGATAGGGGCTGATCGGGCTGGTTTTGCT', 'GATATAAAAGATTTCTCCCGTGGCTCGCTACGATCGATTCGTACTCTAGCTTTAGCTCTATGCTCTCATGCTATACGTCTTCCAATGCTATCGTATCGGTACTAATACAAGCTTACGTCGGCTACTCC', 'TCCATCCCTACAACCCACCAGGGCCGGGATAGCTGGCTTGGGCGGGATCGGGCTTCTTTTCGGGATCTCTCTTCGGATTTCGCTCGGATTGGGCTTTTAGGCTTTCGGGATATAAAAGATTTCTCCCG']
number of elements in tcr: 10



In [213]:

    
#If the shared motif is known:
#Find the number of occurrences of a known motif

my_pat = "AATAGTTTAGATATAAAAGA"

#Find the number of exact matches in each TCR sequence.
exact_match = []
for i in tcr:
    exact_match.append(mm.ApproximatePatternCount(my_pat, i, 0))    
print(exact_match)

#Find in each TCR sequence the number of strings with a maximum of 10 mismatches.
ten_mismatches = []
for i in tcr:
    ten_mismatches.append(mm.ApproximatePatternCount(my_pat, i, 10))    
print(ten_mismatches)









    



[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[2, 5, 0, 2, 1, 4, 5, 2, 1, 2]



In [217]:

    
#If the shared motif is unknown:
#Find the shared motif first
#This is an extremely time-consuming process. The most common approach is to use randomization.

#The idea of randomization is, if a motif is shared, 
#then it will occur more times in a set of sequences, 
#thus picked up more likely by randomization.

#And then check the goodness of the fit

#This finds a set of shared motifs by randomization.
best_motifs = GibbsSampler(tcr, 20, 10, 100)
print(best_motifs)

#This finds the number of differences between the approximate shared motif and the consensus,
#assembled from the shared motif.
print("the consensus is:", Consensus(best_motifs))
print("the number of differences is:", Score(best_motifs))









    



['AAAAGATTTCTCCCGATTCG', 'AATAGATTTCTCCCAGAGTT', 'AAAAGATTTCTCTTCTGGGC', 'AAAAGATTTGACCCGAGTCG', 'AATCGACTTCTCCCGTTTCT', 'AAAAGATTTCTCCCGGGCCT', 'AAACCATTTCTCCCGGGCTG', 'AAAAGATTTCTCCCGTTCTC', 'AAAAGATTTCTCCCGTGGCT', 'TTCGGATTTCGCTCGGATTG']
the consensus is: AAAAGATTTCTCCCGGGTCG
the number of differences is: 46



In [218]:

    
#Use the time module to get the runtime of your code
import time
from datetime import timedelta
start_time = time.monotonic()

#By repeatedly run the randomization, better and better best_motifs is found, with lower scores.
for i in range(30):
    better_motifs = GibbsSampler(tcr, 20, 10, 100)
    if Score(better_motifs) < Score(best_motifs):
        best_motifs = better_motifs
        
end_time = time.monotonic()
print(timedelta(seconds=end_time - start_time))

print(best_motifs)

print("the consensus is:", Consensus(best_motifs))
print("the number of differences is:", Score(best_motifs))









    



0:00:02.142653
['GATATAAAAGATTTCTCCCG', 'GATATAATAGATTTCTCCCA', 'GATATAAAAGATTTCTCTTC', 'GATATAAAAGATTTGACCCG', 'GATATAATCGACTTCTCCCG', 'GATATAAAAGATTTCTCCCG', 'GATATAAACCATTTCTCCCG', 'GATATAAAAGATTTCTCCCG', 'GATATAAAAGATTTCTCCCG', 'GATATAAAAGATTTCTCCCG']






    Out[218]:





12

	rsid	chromosome	position	genotype
2	rs12248560	10	96521657	CC
3	rs28399504	10	96522463	AA
4	rs41291556	10	96535173	TT
5	rs4986893	10	96540410	GG
6	rs4244285	10	96541616	AG

	Name	Position	Chromosome	Year	First author	Title	DOI	Open Access status	Link
0	rs12662670	151918856	6	2011.0	Kristen N Stevens	Common breast cancer susceptibility loci are a...	10.1158/0008-5472.CAN-11-1266	True	http://www.mendeley.com/research/common-breast...
1	rs2046210	151948366	6	2011.0	Qiuyin Cai	Replication and functional genomic analyses of...	10.1158/0008-5472.CAN-10-2733	True	http://www.mendeley.com/research/replication-f...
2	rs2046210	151948366	6	2011.0	Wonshik Han	Common genetic variants associated with breast...	10.1158/1055-9965.EPI-10-1282	False	http://www.mendeley.com/research/common-geneti...
3	rs2046210	151948366	6	2009.0	Wei Zheng	Genome-wide association study identifies a new...	NaN	True	http://www.mendeley.com/research/genomewide-as...
4	rs2046210	151948366	6	2010.0	Simon N Stacey	Ancestry-Shift Refinement Mapping of the C6orf...	10.1371/journal.pgen.1001029	True	http://www.mendeley.com/research/ancestryshift...