In [5]:
%matplotlib inline
import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os, sys, glob

TM Sequence analysis

A workbook to analyse the transmembrane (TM) sequences of TM-annotated Swiss-Prot proteins from the three domains of life (archaea, bacteria and eukaryotes).

All TM domains were re-annotated using TMHMM. The TM sequences (+/- 3 flanking residues) and their orientations were then extracted from the TMHMM output using a simple script.


In [2]:
ls


archaea_swissprot_tm_clustered_clean_ii_tm_seqs.fasta
archaea_swissprot_tm_clustered_clean_io_tm_seqs.fasta
archaea_swissprot_tm_clustered_clean_oi_tm_seqs.fasta
archaea_swissprot_tm_clustered_clean_oo_tm_seqs.fasta
bacteria_swissprot_tm_clustered_clean_ii_tm_seqs.fasta
bacteria_swissprot_tm_clustered_clean_io_tm_seqs.fasta
bacteria_swissprot_tm_clustered_clean_oi_tm_seqs.fasta
bacteria_swissprot_tm_clustered_clean_oo_tm_seqs.fasta
eukaryote_swissprot_tm_clustered_clean_ii_tm_seqs.fasta
eukaryote_swissprot_tm_clustered_clean_io_tm_seqs.fasta
eukaryote_swissprot_tm_clustered_clean_oi_tm_seqs.fasta
eukaryote_swissprot_tm_clustered_clean_oo_tm_seqs.fasta
get_tm_seqs.py
TM Sequences Analysis Across Domains.ipynb

In [4]:
def get_seqs_only(filename):
    """
    Read a FASTA-style file and return its sequence lines without the headers.

    Lines starting with ">" (the accession/header lines) are dropped. Every
    other non-blank line is stripped of surrounding whitespace and returned
    as one entry, in file order. Blank or whitespace-only lines are skipped
    so they never show up as empty "sequences" in the result.

    Note: each non-header line becomes its own entry, so a sequence wrapped
    over several lines yields one entry per line.

    Parameters
    ----------
    filename : str
        Path of the file to read.

    Returns
    -------
    list of str
        The sequence lines, headers and blank lines removed.
    """
    seqs = []
    with open(filename, 'r') as fh:
        # Iterate the handle directly: avoids loading the whole file at once.
        for line in fh:
            if line.startswith(">"):
                continue
            seq = line.strip()
            # Skip blank lines (e.g. a trailing newline at end of file).
            if seq:
                seqs.append(seq)
    return seqs

In [11]:
# Map a short label built from each FASTA filename to that file's sequences.
seqs = {}
for fasta_path in sorted(glob.glob("*.fasta")):
    fields = fasta_path.split('_')
    # First field is the domain (e.g. "archaea"); the third-from-last field is
    # the two-letter code (e.g. "ii") -- presumably the TM orientation.
    label = '_'.join((fields[0], fields[-3]))
    seqs[label] = get_seqs_only(fasta_path)

In [13]:
# Preview a single sequence from the archaea "ii" set. The original cell had
# an empty subscript (a syntax error); index 0 is assumed -- confirm.
seqs['archaea_ii'][0]


Out[13]:
'MISSYKYNPKLYFLSTFVVTYILWFTGAYLSFSSTYSGIYMLIMLPGLMAPFIISTILIAKSKNNELKKDFINRLFNLKLINLKTIPVVFLLMPAVILLSILLSIPFGGSISQFQFSGGFSFSTDFVPVLFLLLLAATFEELGWRGYAFDSLQSRYSLFKASILFGIFWSLWHFPLIFVNNSYQYEIFNQSIWYGLNFFLSILPMGIIITWMCLKNRKSIILAIIFHFLINLNQELLAITQDTKIIETGVLFLVAAAIILYDK'

In [ ]: