In [5]:

    
%matplotlib inline
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os, sys, glob

TM Sequence analysis

A workbook to analyse the transmembrane (TM) sequences of TM-annotated Swiss-Prot proteins from the three domains of life (archaea, bacteria and eukaryotes).

All TM domains were re-annotated using TMHMM. The TM sequences (with ±3 flanking residues) and their orientations were then extracted from the TMHMM output using a trivial script.



In [2]:

    
# List the working directory to show the input FASTA files available to the notebook.
ls









    



archaea_swissprot_tm_clustered_clean_ii_tm_seqs.fasta
archaea_swissprot_tm_clustered_clean_io_tm_seqs.fasta
archaea_swissprot_tm_clustered_clean_oi_tm_seqs.fasta
archaea_swissprot_tm_clustered_clean_oo_tm_seqs.fasta
bacteria_swissprot_tm_clustered_clean_ii_tm_seqs.fasta
bacteria_swissprot_tm_clustered_clean_io_tm_seqs.fasta
bacteria_swissprot_tm_clustered_clean_oi_tm_seqs.fasta
bacteria_swissprot_tm_clustered_clean_oo_tm_seqs.fasta
eukaryote_swissprot_tm_clustered_clean_ii_tm_seqs.fasta
eukaryote_swissprot_tm_clustered_clean_io_tm_seqs.fasta
eukaryote_swissprot_tm_clustered_clean_oi_tm_seqs.fasta
eukaryote_swissprot_tm_clustered_clean_oo_tm_seqs.fasta
get_tm_seqs.py
TM Sequences Analysis Across Domains.ipynb



In [4]:

    
def get_seqs_only(filename):
    """
    Read a FASTA-style file and return its sequence lines without the headers.

    Lines beginning with ">" (the accession/header lines) are dropped, as are
    blank or whitespace-only lines. Every remaining line is stripped of
    surrounding whitespace and returned as its own entry, so a record whose
    sequence is wrapped over several lines yields several entries.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list of str
        The non-empty sequence lines, in file order.
    """
    seqs = []
    with open(filename, 'r') as fh:
        # Iterate over the handle directly instead of readlines() so the whole
        # file is not loaded into memory at once.
        for line in fh:
            if line.startswith(">"):
                continue
            seq = line.strip()
            # Skip blank lines (e.g. a trailing newline at end of file), which
            # would otherwise be stored as empty "sequences".
            if seq:
                seqs.append(seq)
    return seqs



In [11]:

    
# Build a dict keyed by "<first token>_<third-from-last token>" of each FASTA
# file name (e.g. 'archaea_ii'), mapping to that file's list of sequences.
seqs = {}
for fn in sorted(glob.glob("*.fasta")):
    parts = fn.split('_')
    name = '_'.join((parts[0], parts[-3]))
    seqs[name] = get_seqs_only(fn)



In [13]:

    
# Show the first 'archaea_ii' sequence as a quick sanity check of the load.
seqs['archaea_ii'][0]









    Out[13]:





'MISSYKYNPKLYFLSTFVVTYILWFTGAYLSFSSTYSGIYMLIMLPGLMAPFIISTILIAKSKNNELKKDFINRLFNLKLINLKTIPVVFLLMPAVILLSILLSIPFGGSISQFQFSGGFSFSTDFVPVLFLLLLAATFEELGWRGYAFDSLQSRYSLFKASILFGIFWSLWHFPLIFVNNSYQYEIFNQSIWYGLNFFLSILPMGIIITWMCLKNRKSIILAIIFHFLINLNQELLAITQDTKIIETGVLFLVAAAIILYDK'



In [ ]: