In this notebook we define three functions as follows:
The complement function takes one value (a gene sequence) as input and outputs one value (the complement of that sequence), as outlined below.
By complement, we mean that the algorithm takes a gene sequence and substitutes each base as follows:
$A \rightarrow T$
$G \rightarrow C$
$T \rightarrow A$
$C \rightarrow G$
The reverse function takes one value (a sequence) as input and outputs one value, as outlined below.
By reverse, we simply mean that it outputs the sequence in reverse order.
For example, given CTGCCA, the program will compute ACCGTC.
This function takes three values as input (a file containing a gene sequence, a start coordinate, and an end coordinate) and outputs one value, as outlined below.
So, if we are given a file with the gene sequence
CTGCCAGTGCATGCCACAGCTCTTCACGGGCCTTTTCAAGCTGCTCATAGCCACGTCGGACGGGCTCGTCATGGCCTTCGACCGCCGCCTGGTCGACGCGCTCCTGCTAGTGTAGGCGTTC
and the (start, end) coordinates (1, 6), then the program will output the bases at positions 1 through 6 (1-based, both ends included):
CTGCCA
In [1]:
# Biopython's SeqIO module is used to parse the FASTA files containing the gene sequences
from Bio import SeqIO
In [2]:
## FASTA input files used by this notebook.
## NOTE: both are hard-coded absolute paths; adjust them to your environment before running.
# FASTA file with the species sequence
species_sequence = '/home/lgutierrezfunderburk/Documents/Test/45_row_gambiae_2L.fa'
# FASTA file read by the "Playing area" cell to exercise the helper functions
test_file = '/home/lgutierrezfunderburk/Documents/Test/seq_test.fa'
In [18]:
# Define complement function. Input is a string (or a list of one-letter strings) containing a gene sequence,
# and output is a list holding the complement of each letter of that sequence
def complement(seq):
    """Return the complement of ``seq`` as a list of one-letter strings.

    Each symbol is substituted using the pairing A <-> T and C <-> G, while
    ``N`` maps to itself. Lower-case symbols are complemented to lower case.

    Args:
        seq: Iterable of single-character symbols, e.g. a ``str`` or a list
            of letters.

    Returns:
        A list with the complement of every symbol, in the same order as the
        input. The result is a list, not a string; use ``''.join(...)`` on it
        when a string is needed.

    Raises:
        KeyError: If ``seq`` contains a symbol other than A, C, G, T or N
            (in either case).
    """
    # The lookup table is called ``pairing`` (not ``complement``) so that it
    # does not shadow this function's own name inside its body.
    pairing = {
        'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N',
        'a': 't', 'c': 'g', 'g': 'c', 't': 'a', 'n': 'n',
    }
    # Look up every symbol in order; an unknown symbol raises KeyError.
    return [pairing[base] for base in seq]
In [19]:
# Function from_array_to_string() takes as input an array whose elements are strings containing one letter
# and returns all elements of the array concatenated into a single string
def from_array_to_string(sequence_array):
    """Concatenate the elements of ``sequence_array`` into a single string.

    Args:
        sequence_array: Iterable of strings (typically one letter each).

    Returns:
        A ``str`` made of all elements joined in order, with no separator.
        An empty input yields the empty string.

    Raises:
        TypeError: If any element is not a string (raised by ``str.join``).
    """
    return ''.join(sequence_array)
In [20]:
# Define reverse function. Input is a sequence (a string or a list of letters), and output is the same
# kind of sequence with its elements in reverse order
def reverse(comp_seq):
    """Return ``comp_seq`` with its elements in reverse order.

    Args:
        comp_seq: Any sliceable sequence, e.g. a ``str`` or a list of
            one-letter strings.

    Returns:
        A new sequence of the same type as ``comp_seq`` (a string stays a
        string, a list stays a list) holding the elements back to front.
        The input itself is not modified.
    """
    # A slice with step -1 walks the sequence from the last element to the first.
    return comp_seq[::-1]
In [21]:
# Define get_sequence_from_file() function.
# This function takes as input a FASTA file, and outputs an array whose elements are the letters of the
# first sequence stored in that file
def get_sequence_from_file(file_name):
    """Read the first record of a FASTA file and return its letters as a list.

    Args:
        file_name: FASTA input, passed straight to ``SeqIO.parse``.

    Returns:
        A list with one element per letter of the first record's sequence.
        Any further records in the file are ignored.

    Raises:
        IndexError: If the file contains no FASTA records.
    """
    # Only the first record is ever used, so pull just that one from the
    # parser instead of loading every record of the file into memory.
    first_record = next(SeqIO.parse(file_name, 'fasta'), None)
    if first_record is None:
        # Same exception type as indexing an empty list, but with a message
        # that names the offending file.
        raise IndexError("no FASTA records found in {!r}".format(file_name))
    # Split the sequence so that each element of the array is one letter.
    return list(first_record.seq)
In [22]:
# Define exon_coordinate_partition() a function whose input is an array containing elements of a sequence, and two
# coordinates: start and end exons
# The function will return an array containing elements in the sequence corresponding to the outlined coordinates
#
# For example, if given a sequence=['a','a','g','g','c','t','a'] and the coordinates 3,6, this function
# will return ['g','g','c','t']
def exon_coordinate_partition(sequence, start_exon, end_exon):
    """Return the part of ``sequence`` between two 1-based, inclusive coordinates.

    Args:
        sequence: Sliceable sequence (a list of letters or a string).
        start_exon: 1-based position of the first element to keep.
        end_exon: 1-based position of the last element to keep. Values past
            the end of ``sequence`` are clipped by the slice.

    Returns:
        The elements at positions ``start_exon`` through ``end_exon``, as the
        same type as ``sequence``.

    Raises:
        ValueError: If ``start_exon`` is smaller than 1.
    """
    # Without this guard, start_exon <= 0 would turn into a negative slice
    # index and silently select elements counted from the END of the sequence.
    if start_exon < 1:
        raise ValueError(
            "start_exon must be >= 1 (coordinates are 1-based), got {!r}".format(start_exon)
        )
    # Shift the 1-based start to Python's 0-based index; the exclusive slice
    # end makes the 1-based end coordinate inclusive.
    return sequence[start_exon - 1:end_exon]
In [ ]:
# Continue
# TODO: this function is unfinished -- only its signature had been written.
def bringing_it_all_together(species_file, start_exon, end_exon, exon_direction):
    """Placeholder for the function that combines the helpers of this notebook.

    NOTE(review): presumably this is meant to read the sequence stored in
    ``species_file``, extract the region between ``start_exon`` and
    ``end_exon``, and apply the reverse/complement steps depending on
    ``exon_direction`` -- confirm the intended behaviour and the accepted
    values of ``exon_direction`` before implementing.

    Raises:
        NotImplementedError: Always, until the body is written.
    """
    # A ``def`` without a body is a syntax error; fail loudly instead.
    raise NotImplementedError("bringing_it_all_together() has not been implemented yet")
In [24]:
# Playing area: run the helpers end to end on positions 1-27 of the test file
start_exon = 1
end_exon = 27

# Read the letters of the test sequence, cut out the region, then derive
# its complement and its reverse complement (all three are lists of letters)
test_sequence = get_sequence_from_file(test_file)
partition_one = exon_coordinate_partition(test_sequence, start_exon, end_exon)
complement_partition_one = complement(partition_one)
reverse_complement_partition_one = reverse(complement_partition_one)

# Show the three lists, each separated from the next by an empty line
print(
    partition_one,
    "",
    complement_partition_one,
    "",
    reverse_complement_partition_one,
    sep="\n",
)

# The same three sequences, this time turned into plain strings
string_partition_one = from_array_to_string(partition_one)
string_complement_p_1 = from_array_to_string(complement_partition_one)
string_reverse_comp_p1 = from_array_to_string(reverse_complement_partition_one)

# Show the strings, again separated by empty lines
print(
    string_partition_one,
    "",
    string_complement_p_1,
    "",
    string_reverse_comp_p1,
    sep="\n",
)

# List concatenation: the complement followed by the reverse complement
print(complement_partition_one + reverse_complement_partition_one)
In [ ]:
In [ ]: