In [1]:
cd "E:\research\multisequenceanddcafrustratometry"
In [17]:
families = pfam_get_list_of_families()
In [39]:
pfam_download_family_xml_files(families)
In [102]:
accessions = pfam_get_accessions(families)
In [127]:
domain_accessions = pfam_find_domains_matching_criteria(accessions)
In [128]:
len(domain_accessions)
Out[128]:
In [191]:
pfam_download_alignments(domain_accessions, "seed")
In [ ]:
pfam_download_alignments(domain_accessions, "full")
In [200]:
for accession in domain_accessions:
gzip_decompress("%s_full.aln.gz" % accession)
In [177]:
structures = pfam_parse_pdb_file('pdb_pfamA_reg.txt')
In [222]:
domain_structures = {}
for accession in domain_accessions:
domain_structures[accession] = structures[accession]
In [ ]:
longest_structures = pfam_find_longest_structure(domain_structures, download_pdbs=True)
In [235]:
len(list(set([x[0] for x in longest_domains.values()])))
Out[235]:
In [2]:
families = pfam_get_list_of_families()
In [3]:
accessions = pfam_get_accessions(families)
In [4]:
domain_accessions = pfam_find_domains_matching_criteria(accessions)
In [5]:
structures = pfam_parse_pdb_file('pdb_pfamA_reg.txt')
In [6]:
domain_structures = {}
for accession in domain_accessions:
domain_structures[accession] = structures[accession]
In [ ]:
longest_domains, no_pdb_families = pfam_find_longest_structure(domain_structures, download_pdbs=True)
In [34]:
no_pdb_families
Out[34]:
In [35]:
# Remove those domains that don't have any structures available in PDB format
for family in no_pdb_families:
domain_accessions.remove(family)
del domain_structures[family]
In [3]:
# Save dictionaries and lists to files
# pickle.dump(longest_domains, open("longest_domains.pkl", "wb"))
longest_domains = pickle.load(open("longest_domains.pkl", "rb"))
In [ ]:
io = PDBIO()
for domain in longest_domains.values():
pdb_id, selected_chain, selected_start, selected_end = domain
selected_start = int(selected_start)
selected_end = int(selected_end)
structure = parse_pdb("%s" % pdb_id)
structure_selector = pfam_is_in_structure_selection(selected_chain, selected_start, selected_end)
io.set_structure(structure)
io.save("%s%s_%d-%d.pdb" % (pdb_id, selected_chain, selected_start, selected_end), select=structure_selector)
In [18]:
too_short_structures = []
for accession, domain in longest_domains.items():
length = int(domain[3]) - int(domain[2]) + 1
if length < 50:
too_short_structures.append(accession)
for accession in too_short_structures:
del longest_domains[accession]
In [19]:
# Save dictionaries and lists to files
pickle.dump(longest_domains, open("longest_domains.pkl", "wb"))
longest_domains = pickle.load(open("longest_domains.pkl", "rb"))
In [24]:
for domain in longest_domains.values():
pdb_id, selected_chain, selected_start, selected_end = domain
structure = parse_pdb("%s%s_%d-%d" % (pdb_id, selected_chain, int(selected_start), int(selected_end)))
sequence = get_sequence_from_structure(structure)
output_file = open("%s%s_%d-%d.fasta" % (pdb_id, selected_chain, int(selected_start), int(selected_end)), "w")
output_file.write(">%s%s_%d-%d\n" % (pdb_id, selected_chain, int(selected_start), int(selected_end)))
output_file.write(sequence)
output_file.close()
In [26]:
for accession, domain in longest_domains.items():
pfam_download_hmm(accession)
In [89]:
invalid_accessions = []
for accession, domain in longest_domains.items():
pdb_id, selected_chain, selected_start, selected_end = domain
try:
pfam_align_sequence_to_family("%s%s_%s-%s.fasta" % (pdb_id, selected_chain, selected_start, selected_end), accession)
except:
invalid_accessions.append(accession)
In [92]:
# remove CA-only and EM structures
for accession in invalid_accessions:
del longest_domains[accession]
In [13]:
# Save dictionaries and lists to files
#pickle.dump(longest_domains, open("longest_domains.pkl", "wb"))
longest_domains = pickle.load(open("longest_domains.pkl", "rb"))
In [94]:
len(longest_domains)
Out[94]:
In [40]:
for accession, domain in longest_domains.items():
pdb_id, selected_chain, selected_start, selected_end = domain
pfam_filter_full_alignment_by_pdb_alignment(accession, "%s%s_%s-%s.aln" % (pdb_id, selected_chain, selected_start, selected_end), filter_percentage=0.05)
In [51]:
# number of domains
# lengths of domains
# number of sequences
# mean sequence entropies
In [42]:
len(longest_domains)
Out[42]:
In [44]:
lengths_of_domains = []
for domain in list(longest_domains.values()):
pdb_id, selected_chain, selected_start, selected_end = domain
structure = parse_pdb("%s%s_%d-%d" % (pdb_id, selected_chain, int(selected_start), int(selected_end)))
sequence = get_sequence_from_structure(structure)
lengths_of_domains.append(len(sequence))
In [50]:
num_sequences = []
mean_sequence_entropies = []
for domain in list(longest_domains.values()):
pdb_id, selected_chain, selected_start, selected_end = domain
# num_lines = count_lines_in_file("%s%s_%d-%d_filtered_0.05.seqs" % (pdb_id, selected_chain, int(selected_start), int(selected_end)))
frequency_matrix, sequence_entropy, number_of_sequences, sequence_length, mean_sequence_entropy = compute_average_sequence_and_sequence_entropy("%s%s_%d-%d_filtered_0.05.seqs" % (pdb_id, selected_chain, int(selected_start), int(selected_end)))
num_sequences.append(number_of_sequences)
mean_sequence_entropies.append(mean_sequence_entropy)
In [55]:
fig = plotly_histogram(lengths_of_domains)
py.iplot(fig)
Out[55]:
In [56]:
fig = plotly_histogram(num_sequences)
py.iplot(fig)
Out[56]:
In [57]:
fig = plotly_histogram(mean_sequence_entropies)
py.iplot(fig)
Out[57]:
In [81]:
fig = plotly_scatter(num_sequences, mean_sequence_entropies)
py.iplot(fig)
Out[81]:
In [82]:
fig = plotly_scatter(lengths_of_domains,num_sequences, color=mean_sequence_entropies)
py.iplot(fig)
Out[82]:
In [66]:
# total number of sequences
sum(num_sequences)
Out[66]:
In [83]:
# most number of sequences for a single domain
list(longest_domains.items())[np.argmax(num_sequences)]
Out[83]:
In [84]:
# longest single domain
list(longest_domains.items())[np.argmax(lengths_of_domains)]
Out[84]:
In [85]:
num_sequences[np.argmax(lengths_of_domains)]
Out[85]:
In [91]:
sorted_domains, sorted_lengths = sort_list_by_other_list(longest_domains.items(), lengths_of_domains)
In [95]:
list(zip(sorted_lengths[-50:], sorted_domains[-50:]))
Out[95]:
In [ ]: