Welcome to the final exam.
If you haven't yet read the instructions, you can do so here.
Please run the program(s) that you have written on the following data set:
dna2.fasta
If you created your program(s) correctly, you will be able to answer the questions below.
In [1]:
import os.path
# Build the path of the FASTA input file from its directory and file name.
dna2_fasta_file_name = "dna2.fasta"
dna2_fasta_file_path = "./data"
dna2_fasta_file_directory = os.path.join(dna2_fasta_file_path, dna2_fasta_file_name)
print("dna2_fasta_file_directory:%s" % dna2_fasta_file_directory)

# Open the file and read its whole content into "data".
# The handle "f" is deliberately left open here: a later cell closes it.
try:
    f = open(dna2_fasta_file_directory, "r")
except IOError as e:
    print(e)
else:
    # Only attempt the read when the open succeeded; otherwise "f" is not
    # bound and the read would just report a misleading NameError.
    try:
        # read => str, readlines => list, readline => str
        data = f.read()
        print("read data from %s successfully." % dna2_fasta_file_directory)
    except IOError as e:
        print(e)
In [2]:
# Demo of str.count: number of non-overlapping occurrences of a substring.
s = "asdfdfsas"
print(s.count("s"))
In [3]:
# Count the records held in "data". Each record is introduced by a header
# line starting with ">", so count those lines instead of every ">" character
# (a ">" inside a header description would otherwise inflate the count).
record_num = sum(1 for line in data.splitlines() if line.startswith(">"))
print("record_num:%s" % record_num)
# Close the file handle opened when "data" was read.
try:
    f.close()
    print("close %s file successfully." % dna2_fasta_file_directory)
except IOError as e:
    print(e)
In [4]:
# Open the file again and load it as a list of lines (line breaks kept).
# The "with" block guarantees the handle is closed even if reading fails.
try:
    with open(dna2_fasta_file_directory, "r") as f:
        data_string_list = f.readlines()
    print("open %s file and load data successfully." % dna2_fasta_file_directory)
except IOError as e:
    print(e)
In [5]:
# Split data_string_list into records:
#   record_meta_list - one header line (starting with ">") per record
#   record_list      - the concatenated sequence lines of each record
record_list = []
record_meta_list = []
# Sequence lines of the record being read; None until the first header is
# seen, so lines preceding any header are ignored instead of raising.
cur_record_parts = None
for line in data_string_list:
    if line.startswith(">"):
        record_meta_list.append(line)
        # A new header terminates the previous record, if there was one.
        if cur_record_parts is not None:
            record_list.append("".join(cur_record_parts))
        cur_record_parts = []
    elif cur_record_parts is not None:
        cur_record_parts.append(line)
# Flush the last record; this also covers a file that ends with a header.
if cur_record_parts is not None:
    record_list.append("".join(cur_record_parts))
print("record num.:%s" % len(record_list))
print("len(record_meta_list):%s" % len(record_meta_list))
In [6]:
# Demo of str.replace: remove every occurrence of a character.
s = "asadafsdaafs"
print(s.replace("a", ""))
In [7]:
# Remove the line-break character "\n" from every record string in record_list.
# A real list is built (not a lazy map object) because record_list is
# traversed several times by the later cells.
record_list = [record_string.replace("\n", "") for record_string in record_list]
In [8]:
def add(s1, s2):
    """Return the sum (or concatenation) ``s1 + s2``."""
    return s1 + s2


# Demo of map() with a two-argument function applied pairwise to two lists.
# The ranges and the map result are materialised as lists so that printing
# shows the elements rather than a lazy range/map object.
s1_list = list(range(1, 5, 1))  # [1, 2, 3, 4]
s2_list = list(range(-1, -5, -1))  # [-1, -2, -3, -4]
print(s1_list)
print(s2_list)
print(list(map(add, s1_list, s2_list)))
In [9]:
# Length of every record, the maximum length, and (1-based index, length) pairs.
record_len_list = [len(record_string) for record_string in record_list]
max_len_record_length = max(record_len_list)
# enumerate() numbers however many records there are, instead of assuming
# exactly 18 of them.
each_record_len_list = list(enumerate(record_len_list, 1))
print("len(record_len_list):%s" % len(record_len_list))
print("record_len_list:%s" % record_len_list)
print("max_len_record_length:%s" % max_len_record_length)
print("each_record_len_list:%s" % each_record_len_list)
In [10]:
# Length of the shortest record.
min_len_record_length = min(record_len_list)
print("min_len_record_length:%s" % min_len_record_length)
In [11]:
# Demo of str.find versus str.index on a missing substring:
# find() returns -1, whereas index() raises ValueError.
s = "asdfa"
print(s[3:].find("z"))
try:
    print(s[3:].index("z"))
except ValueError as e:
    # Report the error instead of letting the demo abort the whole run.
    print(e)
In [12]:
# Demo of an endless loop left through "break": prints 1, 2, 3.
a = 0
while True:
    a += 1
    print(a)
    if a == 3:
        break
In [13]:
def get_sequence_string_accroding_2_frame_num(sequence_string, frame_num):
    """Return ``sequence_string`` shifted to the given forward reading frame.

    Frame 1 returns the sequence unchanged, frame 2 drops the first
    character and frame 3 drops the first two characters.

    Any other ``frame_num`` is not supported: the sequence is returned
    unshifted (as it always was) but a warning is now emitted so the
    mistake does not go unnoticed.
    """
    offset = {1: 0, 2: 1, 3: 2}.get(frame_num)
    if offset is None:
        import warnings

        warnings.warn(
            "unsupported frame_num %r; sequence returned unshifted" % (frame_num,),
            stacklevel=2
        )
        return sequence_string
    return sequence_string[offset:]
# Every record shifted to reading frame 2.
# NOTE(review): this used to pass frame_num = 6, a value the helper does not
# handle (the sequence came back unshifted, i.e. as frame 1). The variable
# name says frame 2, so frame 2 is requested here - confirm this is intended.
record_list_according_2_frame_2 = [
    get_sequence_string_accroding_2_frame_num(
        sequence_string=record,
        frame_num=2
    )
    for record in record_list
]
In [14]:
def get_Ngram_list(sequence_string, gramN=3):
    """Split ``sequence_string`` into consecutive, non-overlapping n-grams.

    ``gramN`` is the n-gram size (default 3, i.e. trigrams). Trailing
    characters that do not fill a complete n-gram are dropped.

    Raises ValueError if ``gramN`` is smaller than 1.
    """
    if gramN < 1:
        raise ValueError("gramN must be a positive integer, got %r" % (gramN,))
    # Integer division: number of complete n-grams in the sequence.
    Ngram_num = len(sequence_string) // gramN
    # Step by gramN so each n-gram starts where the previous one ended.
    return [
        sequence_string[start_index:start_index + gramN]
        for start_index in range(0, Ngram_num * gramN, gramN)
    ]
# Trigram list of every frame-2 record, followed by the record lengths.
Ngram_2d_list = [
    get_Ngram_list(sequence_string=record, gramN=3)
    for record in record_list_according_2_frame_2
]
print([len(record) for record in record_list_according_2_frame_2])
In [15]:
def find_length_of_longgest_ORF(Ngram_sequence_list):
    """Return the length, in codons, of the longest ORF in a codon list.

    ``Ngram_sequence_list`` is a list of trigram strings for one reading
    frame. An ORF starts at an "ATG" codon and ends at the first following
    stop codon ("TAA", "TAG" or "TGA"). The returned length counts the
    codons from the start codon up to, but not including, the stop codon.
    A start codon that is never followed by a stop codon does not form an
    ORF. Returns 0 when the list contains no complete ORF.

    Matching is case-sensitive.
    """
    start_codon = "ATG"
    end_codon_set = frozenset(("TAA", "TAG", "TGA"))
    cur_max_length = 0
    # Index of the start codon of the ORF currently open, or None.
    cur_start_index = None
    for codon_index, codon in enumerate(Ngram_sequence_list):
        if cur_start_index is None:
            if codon == start_codon:
                cur_start_index = codon_index
        elif codon in end_codon_set:
            # The earliest ATG before a stop yields the longest ORF ending
            # at that stop, so later ATGs inside an open ORF are ignored.
            cur_max_length = max(cur_max_length, codon_index - cur_start_index)
            cur_start_index = None
    return cur_max_length
In [ ]:
# Longest-ORF length of every record's codon list.
# A list is built so that len() below works on it.
ORF_max_length_for_each_record_list = [
    find_length_of_longgest_ORF(Ngram_sequence_list)
    for Ngram_sequence_list in Ngram_2d_list
]
print(len(ORF_max_length_for_each_record_list))
In [30]:
# Interactive exploration aid: show the built-in help text of the list type.
help(list)
In [16]:
def get_sequence_string_accroding_2_frame_num(sequence_string, frame_num):
    """Return ``sequence_string`` shifted to the given forward reading frame.

    Frame 1 returns the sequence unchanged, frame 2 drops the first
    character and frame 3 drops the first two characters.

    Any other ``frame_num`` is not supported: the sequence is returned
    unshifted (as it always was) but a warning is now emitted so the
    mistake does not go unnoticed.
    """
    offset = {1: 0, 2: 1, 3: 2}.get(frame_num)
    if offset is None:
        import warnings

        warnings.warn(
            "unsupported frame_num %r; sequence returned unshifted" % (frame_num,),
            stacklevel=2
        )
        return sequence_string
    return sequence_string[offset:]
# Every record shifted to reading frame 3 (first two characters dropped).
record_list_according_2_frame3 = [
    get_sequence_string_accroding_2_frame_num(sequence_string, frame_num=3)
    for sequence_string in record_list
]
In [17]:
def get_Ngram_list(sequence_string, gramN=3):
    """Split ``sequence_string`` into consecutive, non-overlapping n-grams.

    ``gramN`` is the n-gram size (default 3, i.e. trigrams). Trailing
    characters that do not fill a complete n-gram are dropped.

    Raises ValueError if ``gramN`` is smaller than 1.
    """
    if gramN < 1:
        raise ValueError("gramN must be a positive integer, got %r" % (gramN,))
    # Integer division: number of complete n-grams in the sequence.
    Ngram_num = len(sequence_string) // gramN
    # Step by gramN so each n-gram starts where the previous one ended.
    return [
        sequence_string[start_index:start_index + gramN]
        for start_index in range(0, Ngram_num * gramN, gramN)
    ]
# Trigram list of every frame-3 record, followed by the record lengths.
Ngram_2d_list = [
    get_Ngram_list(sequence_string=record, gramN=3)
    for record in record_list_according_2_frame3
]
print([len(record) for record in record_list_according_2_frame3])
In [ ]:
def generate_len_six_string_list(string):
    """Return every substring of length 6 of ``string``, in order of position.

    Windows overlap: one substring is produced per start position,
    including the last full window at the end of the string. A string
    shorter than 6 characters yields an empty list.
    """
    segment_length = 6
    # "+ 1" so the window ending exactly at the end of the string is kept.
    return [
        string[idx:idx + segment_length]
        for idx in range(len(string) - segment_length + 1)
    ]
In [ ]:
# Flatten a 2-dimensional (or deeper nested) list variable.
try:
    # Only available on Python 2.
    from compiler.ast import flatten
except ImportError:
    def flatten(seq):
        """Return a flat list of the items of arbitrarily nested lists/tuples."""
        flat_list = []
        for item in seq:
            if isinstance(item, (list, tuple)):
                flat_list.extend(flatten(item))
            else:
                flat_list.append(item)
        return flat_list

li = [[1, 2], [3], [4, 5, 2]]
print("li:%s" % li)
print("flatten(li):%s" % flatten(li))
In [ ]:
# Generate all substrings of length 6 for each record in "record_list".
length6_2d_list = [generate_len_six_string_list(record) for record in record_list]
# Flatten the per-record lists into one list of substrings.
length6_list = [
    length6_string
    for record_length6_list in length6_2d_list
    for length6_string in record_length6_list
]
print("len(length6_list):%s" % len(length6_list))
length6_set = set(length6_list)
print("len(length6_set):%s" % len(length6_set))
In [ ]:
# Frequency of every distinct substring of length 6: maps substring -> number
# of times it appears in length6_list. Counter tallies this in one pass
# instead of rescanning the whole list once per distinct substring.
from collections import Counter

length6_dict = dict(Counter(length6_list))
In [ ]:
# Highest frequency reached by any substring of length 6.
most_frequency_length6_value = max(length6_dict.values())
print("most_frequency_length6_value:%s" % most_frequency_length6_value)
In [ ]:
def generate_user_defined_length_string_list(string, segment_length):
    """Return every substring of ``segment_length`` characters of ``string``.

    Windows overlap: one substring is produced per start position,
    including the last full window at the end of the string. A string
    shorter than ``segment_length`` yields an empty list.

    Raises ValueError if ``segment_length`` is smaller than 1.
    """
    if segment_length < 1:
        raise ValueError(
            "segment_length must be a positive integer, got %r" % (segment_length,)
        )
    # "+ 1" so the window ending exactly at the end of the string is kept.
    return [
        string[idx:idx + segment_length]
        for idx in range(len(string) - segment_length + 1)
    ]
In [ ]:
from collections import Counter

segment_length = 12
# Generate all substrings of length 12 for each record in "record_list".
length12_2d_list = [
    generate_user_defined_length_string_list(record, segment_length)
    for record in record_list
]
# Flatten the per-record lists into one list of substrings.
length12_list = [
    length12_string
    for record_length12_list in length12_2d_list
    for length12_string in record_length12_list
]
print("len(length12_list):%s" % len(length12_list))
length12_set = set(length12_list)
print("len(length12_set):%s" % len(length12_set))
# Pair every distinct substring with its number of occurrences. (Pairing it
# with len(substring) is useless for ranking: that value is always 12.)
length12_and_length_tuple_list = list(Counter(length12_list).items())
# Most frequent first; ties are broken alphabetically so the pick is stable.
sorted_length12_and_length_tuple_list = sorted(
    length12_and_length_tuple_list, key=lambda tup: (-tup[1], tup[0])
)
most_frequency_length12_and_length_tuple = sorted_length12_and_length_tuple_list[0]
print("most_frequency_length12_and_length_tuple:%s" % str(most_frequency_length12_and_length_tuple))
print("most_frequency_length12_and_length_tuple[0:2]:%s" % str(most_frequency_length12_and_length_tuple[0:2]))
most_frequency_length12 = most_frequency_length12_and_length_tuple[0]
print("most_frequency_length12:%s" % most_frequency_length12)
# Count per record on the window lists, so overlapping occurrences are
# included (str.count only counts non-overlapping ones).
most_frequency_lenght12_count_in_each_record_list = [
    record_length12_list.count(most_frequency_length12)
    for record_length12_list in length12_2d_list
]
print("most_frequency_lenght12_count_in_each_record_list:%s" % most_frequency_lenght12_count_in_each_record_list)
most_frequency_length12_count_sum = sum(most_frequency_lenght12_count_in_each_record_list)
print("most_frequency_length12_count_sum:%s" % most_frequency_length12_count_sum)
In [ ]:
# CATCGCC
# Occurrences of the pattern in every record, counted at every start position
# so overlapping occurrences are included: this pattern begins and ends with
# "C", so two occurrences can share a character, which str.count would miss.
pattern_string = "CATCGCC"
pattern_count_list = [
    sum(
        1
        for start_index in range(len(record_string) - len(pattern_string) + 1)
        if record_string.startswith(pattern_string, start_index)
    )
    for record_string in record_list
]
print("pattern_count_list:%s" % pattern_count_list)
print("sum(pattern_count_list):%s" % sum(pattern_count_list))
In [ ]:
# GCGCGCA
# Occurrences of the pattern in every record, counted at every start position
# (same counting rule as the other pattern cells).
pattern_string = "GCGCGCA"
pattern_count_list = [
    sum(
        1
        for start_index in range(len(record_string) - len(pattern_string) + 1)
        if record_string.startswith(pattern_string, start_index)
    )
    for record_string in record_list
]
print("pattern_count_list:%s" % pattern_count_list)
print("sum(pattern_count_list):%s" % sum(pattern_count_list))
In [ ]:
# TGCGCGC
# Occurrences of the pattern in every record, counted at every start position
# (same counting rule as the other pattern cells).
pattern_string = "TGCGCGC"
pattern_count_list = [
    sum(
        1
        for start_index in range(len(record_string) - len(pattern_string) + 1)
        if record_string.startswith(pattern_string, start_index)
    )
    for record_string in record_list
]
print("pattern_count_list:%s" % pattern_count_list)
print("sum(pattern_count_list):%s" % sum(pattern_count_list))
In [ ]:
# CGCGCCG
# Occurrences of the pattern in every record, counted at every start position
# so overlapping occurrences are included: this pattern begins and ends with
# "CG", so two occurrences can share those characters, which str.count would
# miss.
pattern_string = "CGCGCCG"
pattern_count_list = [
    sum(
        1
        for start_index in range(len(record_string) - len(pattern_string) + 1)
        if record_string.startswith(pattern_string, start_index)
    )
    for record_string in record_list
]
print("pattern_count_list:%s" % pattern_count_list)
print("sum(pattern_count_list):%s" % sum(pattern_count_list))