In [1]:
import os
import re

import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats



%pylab inline
%matplotlib inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
# One-letter codes of the 20 standard amino acids, in the order used by this notebook.
myAmino = list("RHKDESTNQCGPAVILMFYW")

In [3]:
# Load PixelDB.
# The data directory can be overridden with the PIXELDB_DIR environment
# variable; when it is unset the original location is used, so existing runs
# behave exactly as before.
path = os.environ.get("PIXELDB_DIR", "/media/vince/Postdoc/PixelDB/")
# os.path.join copes with a directory given with or without a trailing slash.
PixelDB = pd.read_csv(os.path.join(path, "PixelDB.csv"))
# Number of rows read (displayed as the cell output).
len(PixelDB["name"])


Out[3]:
1976

In [4]:
# Headline counts for the whole database.
cluster_column = PixelDB["cluster_number"]
# One entry per distinct unique_id, holding the number of rows that carry it.
binding_mode_sizes = PixelDB["unique_id"].value_counts()

print("Entries in DB:",len(cluster_column))
print("Receptor Cluster",len(cluster_column.value_counts()))
print("Unique Binding Mode",len(binding_mode_sizes))
print("Binding mode with 2 plus peptide",np.sum(binding_mode_sizes >= 2))


('Entries in DB:', 1976)
('Receptor Cluster', 479)
('Unique Binding Mode', 702)
('Binding mode with 2 plus peptide', 271)

In [5]:
# Summary statistics on PixelDB

In [6]:
# Keep every binding mode (unique_id) that has at least one entry whose
# longest continuous core AND longest continuous ECR are both longer than 3.
entry_qualifies = (PixelDB["longest_continuous_core"] > 3) & (PixelDB["longest_continuous_ecr"] > 3)
qualifying_ids = PixelDB.loc[entry_qualifies, "unique_id"].unique()

# Rows with a missing unique_id never match an id in an equality test, so the
# id-by-id filter never removed them; keep them here as well.
row_is_kept = PixelDB["unique_id"].isin(qualifying_ids) | PixelDB["unique_id"].isnull()
PixelDBecr = PixelDB[row_is_kept].copy()

In [7]:
# Entries whose longest continuous ECR is longer than 3, regardless of binding mode.
has_long_ecr = PixelDB["longest_continuous_ecr"] > 3
PixelDBoecr = PixelDB.loc[has_long_ecr]

In [ ]:


In [8]:
# Headline counts for the ECR subset.
ecr_binding_mode_sizes = PixelDBecr["unique_id"].value_counts()
ecr_longer_than_3 = PixelDBecr["longest_continuous_ecr"] > 3

print("Unique Binding Mode",len(ecr_binding_mode_sizes))
print("Exosite and ECR complexes",np.sum(ecr_longer_than_3))


('Unique Binding Mode', 124)
('Exosite and ECR complexes', 325)

In [ ]:


In [9]:
# Is there a link between receptor annotation (PFAM / uniprot / CATH) and binding mode?
# For every cluster holding more than one binding mode, take each pair of
# binding modes and check whether they share at least one annotation id; the
# per-cluster score is the fraction of pairs that do.

for dataset_label, torun in (("Full", PixelDB), ("ECR", PixelDBecr)):
    print(dataset_label)

    for test in ["PFAM","uniprot","CATH"]:
        CATHdbOver = []
        BindingMode = []
        for cluster_id in np.unique(torun["cluster_number"]):
            cluster_rows = torun[torun["cluster_number"] == cluster_id]

            n_modes = len(cluster_rows["unique_id"].value_counts())
            BindingMode.append(n_modes)
            if n_modes == 1:
                # A single binding mode gives no pair to compare.
                continue

            # One set of annotation ids per binding mode of this cluster.
            ids_per_mode = []
            for mode_id in np.unique(cluster_rows["unique_id"]):
                mode_ids = set()
                for cid in cluster_rows.loc[cluster_rows["unique_id"] == mode_id, test]:
                    if str(cid) == "nan":
                        continue
                    # Several ids can be packed in one cell, joined by "_".
                    mode_ids.update(cid.split("_"))
                ids_per_mode.append(mode_ids)

            pair_total = 0
            pair_shared = 0
            for first in range(len(ids_per_mode)):
                for second in range(first + 1, len(ids_per_mode)):
                    pair_total += 1
                    if ids_per_mode[first] & ids_per_mode[second]:
                        pair_shared += 1
            CATHdbOver.append(float(pair_shared)/float(pair_total))

        print(len(CATHdbOver))
        print("Average Overlap of:",test,np.mean(CATHdbOver))
        print("Median Overlap of:",test,np.median(CATHdbOver))
        print("Average Nb of Binding mode:",np.mean(BindingMode))
        print("Median Nb of Binding mode:",np.median(BindingMode))
        print(np.min(BindingMode),np.max(BindingMode))

        print("More than one binding mode",np.sum(np.array(BindingMode)  > 1))
        print("Total cluster",len(BindingMode))

        print()


Full
103
('Average Overlap of:', 'PFAM', 0.69063912129211225)
('Median Overlap of:', 'PFAM', 1.0)
('Average Nb of Binding mode:', 1.465553235908142)
('Median Nb of Binding mode:', 1.0)
(1, 32)
('More than one binding mode', 103)
('Total cluster', 479)
()
103
('Average Overlap of:', 'uniprot', 0.53466530207447838)
('Median Overlap of:', 'uniprot', 0.5)
('Average Nb of Binding mode:', 1.465553235908142)
('Median Nb of Binding mode:', 1.0)
(1, 32)
('More than one binding mode', 103)
('Total cluster', 479)
()
103
('Average Overlap of:', 'CATH', 0.9560795191863154)
('Median Overlap of:', 'CATH', 1.0)
('Average Nb of Binding mode:', 1.465553235908142)
('Median Nb of Binding mode:', 1.0)
(1, 32)
('More than one binding mode', 103)
('Total cluster', 479)
()
ECR
14
('Average Overlap of:', 'PFAM', 0.70000000000000007)
('Median Overlap of:', 'PFAM', 1.0)
('Average Nb of Binding mode:', 1.4090909090909092)
('Median Nb of Binding mode:', 1.0)
(1, 16)
('More than one binding mode', 14)
('Total cluster', 88)
()
14
('Average Overlap of:', 'uniprot', 0.46309523809523812)
('Median Overlap of:', 'uniprot', 0.34166666666666667)
('Average Nb of Binding mode:', 1.4090909090909092)
('Median Nb of Binding mode:', 1.0)
(1, 16)
('More than one binding mode', 14)
('Total cluster', 88)
()
14
('Average Overlap of:', 'CATH', 0.9642857142857143)
('Median Overlap of:', 'CATH', 1.0)
('Average Nb of Binding mode:', 1.4090909090909092)
('Median Nb of Binding mode:', 1.0)
(1, 16)
('More than one binding mode', 14)
('Total cluster', 88)
()

In [10]:
# How often does the same receptor chain of one PDB entry appear in several
# PixelDB entries? For each such chain, print the PDB id, the chain, the number
# of entries and the cluster number(s) of those entries, then report how many
# distinct clusters and how many chains are concerned.

for ikl in range(0,2):
    torun = PixelDB
    if ikl == 1:
        print("ECR")
        torun = PixelDBecr
    else:
        print("Full")
    tot = 0
    AllCluster = []
    for uniid in list(np.unique(torun["pdb_id"])):
        sdf = torun[torun["pdb_id"] == uniid]
        if (len(sdf) == 1):
            # A PDB id with a single entry cannot hold a repeated chain.
            continue
        for ch in list(np.unique(sdf["receptor_chain"])):
            ssdf = sdf[sdf["receptor_chain"] == ch]
            if (len(ssdf) == 1):
                continue
            # Report the clusters of this chain's entries (ssdf), not those of
            # every entry of the PDB id (sdf), so the cluster list matches the
            # entry count printed next to it.
            chain_clusters = np.unique(ssdf["cluster_number"])
            print(uniid,ch,len(ssdf),chain_clusters)
            AllCluster += list(chain_clusters)
            tot += 1
    print(len(list(set(AllCluster))))
    print("Same receptor engage in multiple peptide binding:",tot)


Full
('1A7C', 'A', 2, array([19]))
('1EE4', 'A', 2, array([8]))
('1KL3', 'AD', 2, array([47]))
('1LVM', 'A', 2, array([76]))
('1NX0', 'A', 2, array([108]))
('1OV3', 'AB', 2, array([272]))
('1Q1S', 'C', 2, array([8]))
('1Q1T', 'C', 2, array([8]))
('1TOQ', 'CG', 2, array([229]))
('2R28', 'BA', 2, array([189]))
('2W73', 'AB', 2, array([189]))
('2YNR', 'A', 2, array([8]))
('3AGY', 'B', 2, array([155]))
('3AP1', 'AB', 2, array([208]))
('3AV9', 'AB', 2, array([24]))
('3AVA', 'AB', 2, array([24]))
('3AVB', 'AB', 2, array([24]))
('3AVC', 'AB', 2, array([24]))
('3AVF', 'BA', 2, array([24]))
('3AVG', 'AB', 2, array([24]))
('3AVH', 'AB', 2, array([24]))
('3AVI', 'AB', 2, array([24]))
('3AVJ', 'AB', 2, array([24]))
('3AVK', 'AB', 2, array([24]))
('3AVL', 'AB', 2, array([24]))
('3AVM', 'AB', 2, array([24]))
('3AVN', 'AB', 2, array([24]))
('3C27', 'B', 2, array([3]))
('3CYY', 'BA', 2, array([255]))
('3H5R', 'AB', 2, array([125]))
('3H8D', 'CD', 2, array([354]))
('3L3Q', 'A', 2, array([8]))
('3O6Q', 'AC', 2, array([346]))
('3RF3', 'BA', 2, array([291]))
('3TWW', 'AB', 2, array([461]))
('3WNE', 'AB', 2, array([24]))
('3WNF', 'AB', 2, array([24]))
('3WNG', 'AB', 2, array([24]))
('3ZIN', 'A', 2, array([8]))
('3ZIO', 'A', 2, array([8]))
('3ZIP', 'A', 2, array([8]))
('3ZIQ', 'A', 2, array([8]))
('3ZIR', 'A', 2, array([8]))
('3ZKE', 'IK', 2, array([122]))
('3ZQI', 'AB', 2, array([262]))
('4B8O', 'A', 2, array([8]))
('4CY2', 'A', 2, array([11]))
('4DS1', 'AC', 2, array([122]))
('4GUS', 'A', 2, array([103]))
('4MZ5', 'E', 2, array([8]))
('4MZ6', 'E', 2, array([8]))
('4R6O', 'GE', 2, array([63]))
('4RXH', 'B', 2, array([8]))
('4YNL', 'AB', 2, array([347]))
('4Z0Y', 'DB', 2, array([130]))
('4Z0Z', 'CB', 2, array([130]))
25
('Same receptor engage in multiple peptide binding:', 56)
ECR
('1KL3', 'AD', 2, array([47]))
('1Q1S', 'C', 2, array([8]))
('1Q1T', 'C', 2, array([8]))
('2YNR', 'A', 2, array([8]))
('3C27', 'B', 2, array([3]))
('3L3Q', 'A', 2, array([8]))
('3ZIN', 'A', 2, array([8]))
('3ZIO', 'A', 2, array([8]))
('3ZIP', 'A', 2, array([8]))
('3ZIQ', 'A', 2, array([8]))
('3ZIR', 'A', 2, array([8]))
('4B8O', 'A', 2, array([8]))
('4MZ5', 'E', 2, array([8]))
('4MZ6', 'E', 2, array([8]))
('4RXH', 'B', 2, array([8]))
3
('Same receptor engage in multiple peptide binding:', 15)

In [11]:
# Display the `sdf` frame currently in the notebook namespace. It is not built
# in this cell: it is whatever the most recently executed loop left behind, so
# the rows shown depend on which cell ran last.
sdf


Out[11]:
name pdb_id pubmed_id resolution uniprot PFAM CATH receptor_chain receptor_length peptide_chain ... surface_ss interior_ss unique_id COREBINDING_aa COREBINDING_ss EXOSITE_aa EXOSITE_ss bs_loc_type mean_seq_iden_in_bm mean_seq_iden_not_bm
1467 5AJP_A_B_114_1.pdb 5AJP 25939779.0 1.65 Q10471 PF00535_PF00652 3.90.550.10 A 495 B ... C:15;E:1;G:11;H:14;T:28 B:7;C:69;E:115;G:20;H:108;T:87 114_1 A:1;F:2;G:0;H:1;K:1;L:1;N:0;Q:1;R:1;S:0;V:1;W:... C:6;E:4;G:0;H:0;T:1 A:1;F:0;G:1;H:1;K:0;L:0;N:1;Q:0;R:0;S:0;V:0;W:... C:2;E:2;G:2;H:0;T:0 ALA_192_A_COREBINDING;ALA_402_A_EXOSITE;ARG_28... 0.991919 -1.0

1 rows × 36 columns


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [12]:
# Receptor sequence identity inside a binding mode versus between binding
# modes, for clusters that hold more than one binding mode.
# Per cluster: scatter the per-entry values, then keep the cluster means for
# the histograms and the summary prints.

for ikl in range(0,2):
    torun = PixelDB
    if ikl == 1:
        print("ECR")
        torun = PixelDBecr
    else:
        print("Full")
    inbm = []
    outbm = []
    for uniid in list(np.unique(torun["cluster_number"])):
        sdf = torun[torun["cluster_number"] == uniid]
        # Keep only entries where both identities are strictly positive.
        # NOTE(review): -1.0 shows up in mean_seq_iden_not_bm; presumably it
        # marks "not available" -- confirm.
        sdf = sdf[sdf["mean_seq_iden_in_bm"] > 0]
        sdf = sdf[sdf["mean_seq_iden_not_bm"] > 0]

        if (len(sdf) == 0):
            continue

        if (len(sdf["unique_id"].value_counts()) == 1):
            continue
        plt.scatter(sdf["mean_seq_iden_in_bm"]*100,sdf["mean_seq_iden_not_bm"]*100)
        inbm.append(np.mean(sdf["mean_seq_iden_in_bm"]))
        outbm.append(np.mean(sdf["mean_seq_iden_not_bm"]))

    plt.xlabel("Mean sequence receptor within Binding mode")
    plt.ylabel("Mean sequence receptor not in Binding mode")
    # Diagonal: equal identity inside and between binding modes.
    plt.plot([-5,105],[-5,105])
    plt.xlim((-5,105))
    plt.ylim((-5,105))
    plt.show()

    print("Average seq iden in and out",np.mean(inbm),np.mean(outbm),len(inbm))
    plt.hist(inbm,20)
    plt.title("In binding mode")
    plt.show()
    plt.hist(outbm,20)
    plt.title("Between binding mode")
    plt.show()
    print("Median seq iden in and out",np.median(inbm),np.median(outbm),len(inbm))
    # The values are printed as (min, max); the label now says so.
    print("Min and max seq iden ",np.min(inbm),np.max(inbm),len(inbm))

    # Applies to the figures created from here on.
    figsize(10,10)
    plt.hist(np.array(inbm)*100)
    plt.xlim([0,100])
    plt.xlabel("Average sequence identity in binding mode (%)", fontsize=18 )
    plt.ylabel("Count", fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.show()

    plt.hist(np.array(outbm)*100)
    plt.xlim([0,100])
    plt.xlabel("Average sequence identity between binding mode (%)", fontsize=18 )
    plt.ylabel("Count", fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.show()

    # NOTE(review): this unconditional break stops after the "Full" pass, so
    # the "ECR" pass never runs. Kept as is; remove it to analyse PixelDBecr too.
    break


Full
('Average seq iden in and out', 0.79647664200254031, 0.60648915632317968, 40)
('Median seq iden in and out', 0.83052071224679525, 0.60353523261498387, 40)
('Max and min seq iden ', 0.37046378885187348, 0.99875930521091805, 40)

In [13]:
#sdf["unique_id"].value_counts()

In [ ]:


In [14]:
# Per-binding-mode summary for the three data sets: number of structures and
# number of distinct uniprot / CATH / PFAM ids per binding mode (unique_id).
# Each summary line prints mean, median, min and max.
ID_COLUMNS = ["uniprot", "CATH", "PFAM"]

for dataset_label, torun in (("Full", PixelDB), ("ECR", PixelDBecr), ("ECRdist", PixelDBoecr)):
    print(dataset_label)

    StrPerBindingMode = []
    # column name -> one distinct-id count per binding mode
    ids_per_mode = dict((column, []) for column in ID_COLUMNS)

    for uniid in np.unique(torun["unique_id"]):
        sdf = torun[torun["unique_id"] == uniid]
        StrPerBindingMode.append(len(sdf))

        for column in ID_COLUMNS:
            found = set()
            for cid in sdf[column]:
                # Missing annotations are skipped; a cell can pack several
                # ids joined by "_".
                if str(cid) != "nan":
                    found.update(cid.split("_"))
            ids_per_mode[column].append(len(found))

    print("Unique Binding Mode",len(np.unique(torun["unique_id"])))
    summary_rows = [
        ("Strc Per Binding Mode", StrPerBindingMode),
        ("Uniprot Per Binding Mode", ids_per_mode["uniprot"]),
        ("CATH Per Binding Mode", ids_per_mode["CATH"]),
        ("PFAM Per Binding Mode", ids_per_mode["PFAM"]),
    ]
    for title, values in summary_rows:
        print(title,np.mean(values),np.median(values),np.min(values),np.max(values))


Full
('Unique Binding Mode', 702)
('Strc Per Binding Mode', 2.8148148148148149, 1.0, 1, 174)
('Uniprot Per Binding Mode', 1.4800569800569801, 1.0, 0, 31)
('CATH Per Binding Mode', 1.3190883190883191, 1.0, 0, 7)
('PFAM Per Binding Mode', 1.1182336182336183, 1.0, 0, 10)
ECR
('Unique Binding Mode', 124)
('Strc Per Binding Mode', 8.4112903225806459, 4.0, 2, 174)
('Uniprot Per Binding Mode', 2.774193548387097, 2.0, 0, 31)
('CATH Per Binding Mode', 1.3548387096774193, 1.0, 0, 4)
('PFAM Per Binding Mode', 1.2096774193548387, 1.0, 0, 10)
ECRdist
('Unique Binding Mode', 124)
('Strc Per Binding Mode', 2.620967741935484, 1.0, 1, 29)
('Uniprot Per Binding Mode', 1.5241935483870968, 1.0, 0, 14)
('CATH Per Binding Mode', 1.3306451612903225, 1.0, 0, 4)
('PFAM Per Binding Mode', 1.0887096774193548, 1.0, 0, 10)

In [15]:
# Per-cluster summary for the three data sets (Full, ECR, ECRdist): number of
# structures and number of distinct uniprot / CATH / PFAM ids per receptor
# cluster (cluster_number). Each summary line prints mean, median, min and max.
# NOTE: the accumulators keep their "...PerBindingMode" names although every
# element here describes one cluster. They stay in the notebook namespace after
# this cell, and a later cell appends to CATHPerBindingMode.
for ikl in range(0,3):
    torun = PixelDB
    if ikl == 0:
        print("Full")
    if ikl == 1:
        torun = PixelDBecr
        print("ECR")
    if ikl == 2:
        torun = PixelDBoecr
        print("ECRdist")

    # Initialised but never filled in this cell.
    UniquePFAM = []
    UniqueUnip = []
    UniqueCATH = []
        
    # One value per cluster: number of distinct ids found in that cluster.
    UniprotPerBindingMode = []
    CATHPerBindingMode = []
    PFAMPerBindingMode = []

    # One value per cluster: number of rows (structures) in that cluster.
    StrPerBindingMode = []

    for uniid in list(np.unique(torun["cluster_number"])):
        sdf = torun[torun["cluster_number"] == uniid]
        
        StrPerBindingMode.append(len(sdf))
        
        # Collect the uniprot ids of the cluster; a cell can hold several ids
        # joined by "_", and missing values (str(cid) == "nan") are skipped.
        Uniuni = []
        for cid in sdf["uniprot"]:
            if str(cid) != "nan":
                Uniuni += cid.split("_")
        UniprotPerBindingMode.append(len(list(set(Uniuni))))
        
        
        # Same collection for the CATH column.
        CATHuni = []
        for cid in sdf["CATH"]:
            if str(cid) != "nan":
                CATHuni += cid.split("_")
        CATHPerBindingMode.append(len(list(set(CATHuni))))

        # Same collection for the PFAM column.
        PFAMuni = []
        for cid in sdf["PFAM"]:
            if str(cid) != "nan":
                PFAMuni += cid.split("_")
        PFAMPerBindingMode.append(len(list(set(PFAMuni))))

    print("Unique Cluster",len(np.unique(torun["cluster_number"])))
    print("Strc Per Cluster",np.mean(StrPerBindingMode),np.median(StrPerBindingMode),np.min(StrPerBindingMode),np.max(StrPerBindingMode))
    print("Uniprot Per Cluster",np.mean(UniprotPerBindingMode),np.median(UniprotPerBindingMode),np.min(UniprotPerBindingMode),np.max(UniprotPerBindingMode))
    print("CATH Per Cluster",np.mean(CATHPerBindingMode),np.median(CATHPerBindingMode),np.min(CATHPerBindingMode),np.max(CATHPerBindingMode))
    print("PFAM Per Cluster",np.mean(PFAMPerBindingMode),np.median(PFAMPerBindingMode),np.min(PFAMPerBindingMode),np.max(PFAMPerBindingMode))


Full
('Unique Cluster', 479)
('Strc Per Cluster', 4.1252609603340291, 1.0, 1, 288)
('Uniprot Per Cluster', 1.8455114822546972, 1.0, 0, 46)
('CATH Per Cluster', 1.3444676409185803, 1.0, 0, 7)
('PFAM Per Cluster', 1.2025052192066805, 1.0, 0, 10)
ECR
('Unique Cluster', 88)
('Strc Per Cluster', 11.852272727272727, 4.0, 2, 272)
('Uniprot Per Cluster', 3.5568181818181817, 2.0, 1, 42)
('CATH Per Cluster', 1.4090909090909092, 1.0, 0, 4)
('PFAM Per Cluster', 1.3181818181818181, 1.0, 0, 10)
ECRdist
('Unique Cluster', 88)
('Strc Per Cluster', 3.6931818181818183, 1.0, 1, 51)
('Uniprot Per Cluster', 1.9545454545454546, 1.0, 1, 16)
('CATH Per Cluster', 1.375, 1.0, 0, 4)
('PFAM Per Cluster', 1.2386363636363635, 1.0, 0, 10)

In [16]:
# Number of entries (rows) kept in the ECR subset.
len(PixelDBecr.index)


Out[16]:
1043

In [17]:
# Count the distinct CATH ids found anywhere in PixelDBecr.
# Start from an empty accumulator: without this reset the list still holds the
# ids left by whichever loop last assigned CATHuni, and it keeps growing each
# time the cell is re-run.
CATHuni = []
for cid in PixelDBecr["CATH"]:
    # Missing annotations are skipped; a cell can pack several ids joined by "_".
    if str(cid) != "nan":
        CATHuni += cid.split("_")

# NOTE(review): this appends a whole-data-set count to CATHPerBindingMode, a
# list that is filled elsewhere with one value per group -- confirm this is
# intended.
CATHPerBindingMode.append(len(list(set(CATHuni))))

In [18]:
# For every binding mode (unique_id) of PixelDBecr: print its
# sequence_alignment column and record how many distinct uniprot / CATH / PFAM
# ids it contains. Results go into the three "...ECR" lists created here, one
# value per binding mode.
UniprotPerBindingModeECR = []
CATHPerBindingModeECR = []
PFAMPerBindingModeECR = []

for uniid in list(np.unique(PixelDBecr["unique_id"])):
    sdf = PixelDBecr[PixelDBecr["unique_id"] == uniid]
    print(sdf["sequence_alignment"])

    # Count uniprot ids the same way as CATH and PFAM below: skip missing
    # values and split "_"-joined ids. Comparing np.unique(...) with the
    # string "nan" counted a missing value as an id.
    Uniuni = []
    for cid in sdf["uniprot"]:
        if str(cid) != "nan":
            Uniuni += cid.split("_")
    UniprotPerBindingModeECR.append(len(list(set(Uniuni))))

    CATHuni = []
    for cid in sdf["CATH"]:
        if str(cid) != "nan":
            CATHuni += cid.split("_")
    CATHPerBindingModeECR.append(len(list(set(CATHuni))))

    PFAMuni = []
    for cid in sdf["PFAM"]:
        if str(cid) != "nan":
            PFAMuni += cid.split("_")
    PFAMPerBindingModeECR.append(len(list(set(PFAMuni))))


1422    ----ACNDENYA
1423    GRHGAANDENY-
Name: sequence_alignment, dtype: object
1432    ARTMQTARKSTGGKAPRKQL------
1433    ARTMQTARKSTGGKAPRKQLATKAAR
Name: sequence_alignment, dtype: object
1435    ----TME-NLSRRLKVTGDLFD--IM------
1436    DGG-TME-NLSRRLKVTGDLFD--IMSG----
1437    --SST-MGQVGRQLAIIGD-DINRR---YDSE
Name: sequence_alignment, dtype: object
1438    -----LMRVQAHIRKRMVA
1439    ----SLMRVQAHIRKRMVA
1440    ---GSLLRVQAHIRKKMV-
1441    KNIPSLLRVQAHIRKKMV-
Name: sequence_alignment, dtype: object
690    --CGVPAIQ------P------------
691    --CGVPAIQ------PVL----------
692    --CGVPAIQ------PVLS---------
693    --CGVPAIQ------PVLS--G------
694    ADCGLRPLFEKKSLEDKT-ERELLESYI
Name: sequence_alignment, dtype: object
1453    ------P---PTLHELYDL
1454    DDYLWGLEAGEGISDLF-D
Name: sequence_alignment, dtype: object
1456    --------------CWTTRMSPPQQIC---------
1457    --------------CFTARMSPPQQIC---------
1458    ----KDIGAGPVASCFTTRMSPPQQICLN-------
1459    TQQAKDIGAGPVASCFTTRMSPPQQICLNSVVNTAL
Name: sequence_alignment, dtype: object
1464    -WDEDWDG-----------
1465    DWDEDWDGPKSSSYFKDSE
Name: sequence_alignment, dtype: object
1466    -----STCPA-------
1467    TT---PSPVPTTSTTSA
1468    --GTTPSPVPTTSTCSA
1469    --GTTPSPVPTTSTTSA
Name: sequence_alignment, dtype: object
1483    ---QEDIIRNIARHLAQVGDSMD--
1484    ----A-STKKLSECLKRIGDELDSN
1485    SESQEDIIRNIARHLAQVGDSMD-R
Name: sequence_alignment, dtype: object
695    ----------ARAE-VH---
696    ----------ARTAR-----
697    --------GCARSE-G----
698    --------GSARAE-PKM--
699    --------GAARAE-VHL--
700    --------GSARSE-GY---
701    --------GSARAE-VHL--
702    ---L-----CSRAR-PLV--
703    --------GAARAE-VYL-R
704    --QT----GSARSE-GY---
705    ----DGTCVAARTR-PV---
706    VNPT----GCARSE-PKMS-
707    LNPH----GAARAE-VY---
708    INPT----GCARSE-PKI--
Name: sequence_alignment, dtype: object
1488    RLEHFTKLRPKR-N-KKQQPT----
1489    YLAHPTRDRAKIQHSR-RPPTR---
1490    NLLHLTANRPKM-PGR-RLPGRFNG
Name: sequence_alignment, dtype: object
1504    -------------RR-RRIE-VNVELRKAKKDDQMLKRR-
1505    PRLSQYKSKYSSLEQSER-RRRLLELQKSKRLDYVNHARR
Name: sequence_alignment, dtype: object
726    --------S----KYI-TTIAGVMTLS--
727    ----------KYKQ-SVRLISLCQRLS--
728    ---R----R-RWKL-SFSIVSLCNHLTR-
729    -R-K------KWKQ-SVRLISLCQRLSR-
730    --------R-KW-QKTGHAVRAIGRLSS-
731    -R-R------RWKL-DFSIVSLCNHLTR-
732    -------A-IGFKK-LAEAVKFSAKLMGQ
733    ---R----R-KW-QKTGHAVRAIGRLSSS
734    ----RRREI-RFRV-LVKVVF-F------
735    FNAR----RK-L-KGAILTTMLAT-----
736    FNAR----RK-L-KGAILTTMLAT-----
Name: sequence_alignment, dtype: object
737    -----------KK-KATFRAITS-TLA-SSFK---R-R------
738    ----------ASPW-KSARLMVHTV-ATF--NSIK-----ER--
739    AR---------RKW-QKTGHAVRAI-GRL--S------------
740    --------RKKT-FKEVANAVKISA-S-L--M------------
741    -----------K-KKATFRAITS-TLA-SSFK---RRRS-----
742    --KKRFSF--K-K--S-FKLSGF-S-----------------FK
743    -----------H-M-GKVYAALMIF-D-FYKQ--NKTSRD----
Name: sequence_alignment, dtype: object
744    -------KFYATFLAAEYF-R-KF------KKR---
745    -------KFYATFLIQEYF-R-KF------KKR---
746    EVTVG--KFYATFLIQ--------------------
747    -------KFYATFLIQEHF-R-KF------MKRQEE
748    -----IPRLDTLILVKAMGHRK-RFGNPFR------
Name: sequence_alignment, dtype: object
749    --NHWQ-KIRTMVNL--PVISPFK------
750    ARK--EVI-RNKIRAIGK--M-ARVFSVLR
Name: sequence_alignment, dtype: object
1520    -LTLASKLK-----
1521    MLKLRQLQKKK-Q-
1522    MLKLRQLQK-KKQK
Name: sequence_alignment, dtype: object
1523    -S-MDDLLIRRLTDRNDKEA-HLNELF----------------
1524    MPRWKRHISEQLRRRDRLQRQAFEEIILQYN---------KLL
1525    DS-MDDLLIRRLTDRNDKEA-HLNELFQ--DNSGAIGGNI---
Name: sequence_alignment, dtype: object
1530    --DADTLLHFA--TES-TPD---------------------
1531    LGANDELISFKDEGE-QE--EKSSENSSAERDLADVKSSLV
Name: sequence_alignment, dtype: object
1541    A----RYGVSNTSINRK-
1542    A----RYGVSNTSINRKK
1543    -ARDSPYGLS-QGITK--
Name: sequence_alignment, dtype: object
755    --KR-RRHPS-------
756    -RKR-RRHPS-------
757    -RKR-RRHPSG------
758    --KR-RRHPSG------
759    ARKR-RRHPSG------
760    ----GRSRQPLVLGDNC
Name: sequence_alignment, dtype: object
761    AW---ASGN-LLTQAIRQQYYKPIDVDRMYGTIDSPKLEELF---
762    -IPAWA-SGNLLTQAIRQQYYKPIDVDRMYGTIDSPKLEELFNKS
763    -IPAWASGN-LLTQAIRQQYYKPIDVDRMYGTIDSPKLEELF---
764    PIPAWASGN-LLTQAIRQQYYKPIDVDRMYGTIDSPKLEELF---
Name: sequence_alignment, dtype: object
765    -----EEIFGEFE--
766    ------GEYVNIE-F
767    --PYP-EDYGDIEIG
768    YNPYP-EDYGDIEIG
Name: sequence_alignment, dtype: object
1544    ---SWYSY-PPPQR--A--------
1545    -TANILKPLMSPPSREEIMAT--LL
1546    RTANILKPLMSPPSREEIMATLL--
Name: sequence_alignment, dtype: object
1550    -PDWDFN----
1551    -EWGPWV----
1552    DDFGGFEAAET
Name: sequence_alignment, dtype: object
1556    -----KKPLDG--
1557    -----KGLGKGGA
1558    KSTGGKAPRKQ--
Name: sequence_alignment, dtype: object
1559    ----DHDAHA---
1560    GGGAVPTAKA---
1561    GGGGAPTAKAPSK
Name: sequence_alignment, dtype: object
774    -GSVVIVGRIILS---------
775    -GSVVIVGRIILS---------
776    KGSVVIVGRIVL----------
777    KGSVVIVGRIILS---------
778    KGSVVIVGRIVLSGKPAIIPA-
779    KGCVVIVGRIVLSGKPAIIPKK
780    KGSVVIVGRIVLSGKPAIIPKK
Name: sequence_alignment, dtype: object
781    -PSSTPC------------
782    -DDIVPC------------
783    TEDVVCC------------
784    QEREVPC------------
785    SECTTPC------------
786    -DELVYLLDGPGYDPIH--
787    -GRLVYLLDGPGYDPIHCD
Name: sequence_alignment, dtype: object
1575    ----DWDFLPP-
1576    SDDDFWVRVA-P
Name: sequence_alignment, dtype: object
1588    --AYDPARKLL----
1589    EKPYKEVTEDLLHLN
Name: sequence_alignment, dtype: object
789    --P----------TLQLPLA----
790    -------------KKDLRISC---
791    -------------NPGLKIPK---
792    -------K--P--DLRVVIPP---
793    -PK-------R--PTTLNLF----
794    -VV-------R--PGSLDLP----
795    ---P---R--P--TLQLPLA----
796    -PK-------R--PTTLNLF----
797    -------------AADLRISCNSK
798    -------R-RNLKGLNLNL-H---
799    ---R---RLQK---GNLPVR----
800    QRP--------RPTLQLPLA----
801    ----SLQN-RNTKNLSLDI-----
Name: sequence_alignment, dtype: object
802    --LSSL--AASSLAKRRQ-Q-------------
803    PQLKPI--ESSILAQRRV-R-------------
804    MKLSPP--SKSRLARRRA-L-------------
805    PQLKPI--EASILAARRV-R-------------
806    IKIKKIEDASNPLLLKRRKKARAL---------
807    -----------RLQERRG------SNVALMLDC
Name: sequence_alignment, dtype: object
808    ----RPPDLWIH
809    KRGNIPKPLNL-
Name: sequence_alignment, dtype: object
811    -RPYLPRP------
812    RPILLP-W-R----
813    -PRPLP-FP-----
814    NRLLLT-G------
815    NRLMLT-G------
816    KPPYLPRP------
817    RPPYLPRP------
818    -PPYLP-RPR----
819    RRPRLP-RPR----
820    RPPRLP-RPR----
821    -RPRLPRP-RP---
822    RPPRLPRP-RP---
823    -PPYLPRP-RPP--
824    --SYLP-RPTPP--
825    PRPYLP-RPRPPRP
Name: sequence_alignment, dtype: object
1607    GLRQAVTQ----
1608    ---APAKQLLNF
Name: sequence_alignment, dtype: object
1617    -GSIKK------
1618    SAKISKPLHIKT
Name: sequence_alignment, dtype: object
1625    VNFDD-IASSE----NLLHLTANRPKMPGRRLPG---
1626    -FIS-ELPS--EEGKKLEHFTKLRPKRNKKQQPTQAA
Name: sequence_alignment, dtype: object
836    ---CHPQNT------
837    ---SHPQNT------
838    ---CHPQF-C-----
839    --FSHPQNT------
840    -F-CHPQNT------
841    ---CHPQGPP----C
842    ---CHPQGPPC----
843    -C-CHPQCGAAYSC-
844    RC-CHPQCGAVEEC-
Name: sequence_alignment, dtype: object
1627    ----------MDDDFQL
1628    DLEMLAPYIPMDDDFQL
Name: sequence_alignment, dtype: object
1631    EISLPSDFEHTIHVGFDAVT-GEFT-----------
1632    EISAPSNFEHRVHTGFDQ-HEQKFTGLPRQWQSLIE
Name: sequence_alignment, dtype: object
850    ----------KQLSELL--------
851    ----------KILHRLLQ-------
852    -------H--QLLRYLL--------
853    -------S---LLKKLLD-------
854    ----------TLLQLLLG-----H-
855    -------H--KILHRLLQ-------
856    -------H--KILHRLLQ-E-----
857    -------K--I-LHRLLQDS-----
858    -------H--KILHRLL-QE-----
859    ----------HKLVQLLTT-----T
860    -----L----SLLQKLLL-A----T
861    --------AHKILHRLLQE------
862    ------RH--KILHRLLQE------
863    ----E-RH--KILHRLLQE------
864    -----ARH--KILHRLLQE------
865    ---E--RH--KILHRLLQEG-----
866    -------H--KILHRLLQEGSP---
867    ---E--RH--KILHRLLQEGSPS--
868    --LTE-RH--KILHRLLQEG-----
869    HSSLTERH--KILHRLLQ-------
Name: sequence_alignment, dtype: object
1650    WFEG-----YDNTFP-
1651    ----KSLTIYAQVQ-K
Name: sequence_alignment, dtype: object
870    ---SQRLVFNRPFLMFIVD---N-NILFLGKVNRP--------
871    --NSQRLVFNRPFLMFIVD---N-NILFLGKVNRP--------
872    --R-TIVRFNRPFLMIIVPT-DTQNIFFMSKVTNP-K------
873    --G-TIVRFNRPFLMIIVPT-DTQNIFFMSKVTNPKQ------
874    ---HPIIQIDRSFMLLILERS-TRSILFLGKVVNPTE------
875    -H-VLKFKVDHPFHFFIRHNK-SKTILFFGRFCCP-V------
876    -SIPPEVKFNKPFVFLMIEQN-TKSPLFMGKVVNP-T-Q----
877    --KPIILRFNQPFIIMIFDHF-TWSSLFLARVMNPV-------
878    --LHPIIQIDRSFMLLILERS-TRSILFLGKVVNPTEA-----
879    ---HPIIQIDRSFMLLILERS-TRSILFLGKVVNPTEA-----
880    ---PPVIKIDRPFHFMIYEET-SGMLLFLGRVVNPTLL-----
881    AVLYPQVIVDHPFFFLIRNRR-TGTILFMGRVMHPETM-----
882    ---TIRFSVDRPFHIVVRR---RGAILFLGSIADPH--DPGPA
Name: sequence_alignment, dtype: object
883    TVASS---------
884    TEAAAGDGGVMTGR
885    TEAAAGMGGVMTGR
886    TEAAAGTGGVMTGR
Name: sequence_alignment, dtype: object
0      -NLVPQ-----------V--A-T--V-
1      -VQQES---S----------F-V--M-
2      -IRYPK-----------T--F-G--W-
3      -GILGF-----------V--F-T--L-
4      -LPFEK-----------S--T-V--M-
5      -LPFDR-----------T-T--I--M-
6      -CINGV-----------V-W--T--V-
7      -EEFGR-----------A--A-S--F-
8      -LSSPV-----------T--K-S--F-
9      -RPQVP---L---------R--P--M-
10     -FQWMG-----------Y--E-L--W-
11     -FAPGN---Y----------P-A--W-
12     -KVAEL-----------V-W--F--L-
13     -SRYWA-----------I--R-T--R-
14     -GLCTL-----------V--A-M--L-
15     -CINGM-----------C--W-T--V-
16     -IMDQV---P----------F-S--V-
17     -FAPGF-----------F--P-Y--L-
18     -QFKDN-----------V--I-L--L-
19     -ALYNT---A----------A-A--L-
20     -SLFNT-----------V--A-T--L-
21     -KPIVV---L---------H--G--Y-
22     -VMAPR---T----------L-F--L-
23     -MHPAQ---T----------S-Q--W-
24     -SLFNT-----------V--A-T--LY
25     -RRIYD-----------L--I-E--L-
26     -NLVPT-----------V--A-T--V-
27     -ILMEH---I----------H-K--L-
28     -KTFPP---T----------E-P--K-
29     -SRRWR-----------R--W-N--R-
                  ...             
144    -ELAAIG--I----------L-T--V-
145    -EPLPQ-G--------Q-LT--A--Y-
146    -RPQVP--L-------RP-M--T--Y-
147    -RPHER-NGF----------T-V--L-
148    -GHAEE---Y--------G-AET--L-
149    -HPVGD--A-------D-YF--E--Y-
150    -HPVGQA--D-------Y--F-E--Y-
151    -KAFSP-EV--------I--P-M--F-
152    -HPVAE-----------ADYF-E--Y-
153    -HEEAV--S---------VDR-V--L-
154    -HPVGE--A-------D-YF--E--Y-
155    -KAFNP-EI--------I--P-M--F-
156    -ALPHA-----------I--L-R-L--
157    -KGFNP-EVI----------P-M--F-
158    -HPVGD-----------ADYF-E--Y-
159    -TIAMEL--I-------R--M-I--K-
160    -HPVAE--A-------D-YF--E--Y-
161    -EECDS-E--------LE-IK-R--Y-
162    -AIMPA-R----------FY----PK-
163    -CPSQE--P--------MSIY-V--Y-
164    -RRLLR--G--------H-N----QY-
165    -LPEPL-P----QG--Q-AT--A--Y-
166    -LPEPA-P------QGQ-LT--A--Y-
167    -LPEPL-A----QG--Q-LT--A--Y-
168    -LPEPL-P----QG--Q-LT--A--Y-
169    -LPEPL-P----QG--A-LT--A--Y-
170    -LPEAL-P----QG--Q-LT--A--Y-
171    -RVEDV-----------TNT--AEYW-
172    -LPAV-V-G-L-----SPGEQ-E--Y-
173    -FLNKDL-E-VD-G--H-FV--T--M-
Name: sequence_alignment, dtype: object
174    FPTK-D-V----AL
175    VPLR-P-----MTY
176    YTVK-Y-----PNL
177    IDWF-D-----GKD
178    KVIT-F-I----DL
179    SAPD-T-----RPA
180    IDWF-E-----GKE
181    GGKK-K-----YRL
182    ALYN-F-----ATM
183    GGRK-K-Y----KL
184    IDWF-D-----GKE
185    GGKK-K-Y----QL
186    FEAN-G-----NLI
187    EIIN-F-E----KL
188    EQYK-F-Y----SV
189    KAFS-P-----EVI
190    YTVK-F-----PNM
191    VNDI-F-----EAI
192    RYGF-V-----ANF
193    SQYY-Y-N----SL
194    IQQS-I-E----RI
195    SSIE-F-----ARL
196    GGKK-K-Y----KL
197    RGYV-Y-----QGL
198    SEIE-F-----ARL
199    INFD-F-N----TI
200    RAKF-K-----QLL
201    SIIG-F-E----KL
202    AVFN-F-----ATM
203    RYPL-T-----FGW
            ...      
240    KAPF-N-F---ATM
241    EGPR-N-Q---DWL
242    ASNE-N-A--E-TM
243    YQLE-N-Y---CGL
244    KGPA-N-F---ATM
245    RQAS-L-S--I-SV
246    EEYL-Q-A---FTY
247    IGPG-R-A---FYA
248    SYVN-T-N--M-GL
249    LYLV-C-G---ERV
250    ASNE-H---M-ETM
251    ASNE-N-M---ETM
252    ELKR-K-M---IYM
253    KAVA-N-F--A-TM
254    KVPR-N-Q---DWL
255    ASNE-N-W---ETM
256    KAVY-N-L---ATM
257    KALY-N-F---ATM
258    ASNED--M--E-TM
259    SSLE-N-F-R-AYV
260    SQLK-N-N-AK-EI
261    RYPL-T-F--GWCF
262    RYPL-TLG---WCF
263    SSLE-N-F-A-AYV
264    IGPG-R-A-F-YTI
265    RGPG-R-A-F-VTI
266    KRWII--L-G-LNK
267    EENLL--D-F-VRF
268    VGYP-K-VKEE-ML
269    SGVE-N-P-GGYCL
Name: sequence_alignment, dtype: object
284    --Y---FINILTL
285    YELDEKFDRL---
Name: sequence_alignment, dtype: object
1670    FAAAVSAFAANMLSSVLKSEATSS---------
1671    FAAAVSAFAANMLSSVLKSEATSSIIKSVGETA
Name: sequence_alignment, dtype: object
1678    NEKNGPIIQNN-----KFEYKEDTIK
1679    ---E-TLTGQYDKNLV-TTVEEE-Y-
Name: sequence_alignment, dtype: object
1680    ----PQPVDSWV
1681    PQPV---DSWV-
Name: sequence_alignment, dtype: object
1684    QKFIARNRAPRVQ-----------
1685    QKFIARNRAPRVQIEYDVELYGAE
Name: sequence_alignment, dtype: object
888    -------RTTPV---
889    -------DETNL---
890    -------GETRL---
891    -------LDVPV---
892    -------RETQV---
893    -------IESDV---
894    -----E-AQTRL---
895    ------EQVSAV---
896    -----V-QDTRL---
897    -V-----KESLV---
898    ----RW-QDTRL---
899    ----RH-PTSII---
900    -----SYLVTSV---
901    --T---RRETQL---
902    -----H-REMAVDCP
903    ---ATV-RTYSC---
904    -NS-RV-QDSII---
905    ANS-RF-PTSII---
Name: sequence_alignment, dtype: object
1688    -----GDYM-N-M-
1689    FPLKRHDKVDDLSK
Name: sequence_alignment, dtype: object
1693    DFSIVGSLP--R--
1694    -FSIVGSLPRDFEL
Name: sequence_alignment, dtype: object
1697    ARTKQTA------
1698    ARTKQTARKSTGG
Name: sequence_alignment, dtype: object
966    ------KRYDREFLLGFQ--------------------------
967    -----KKRYSREFLLGF---------------------------
968    -----RIIYDRKFLMECR--------------------------
969    -----KKRYDREFLLGFQ--------------------------
970    --------YDREFLLDFQF-------------------------
971    ----GRIIYDRKFLMECR--------------------------
972    -----RIIYDRKFLMECRN-------------------------
973    ----GRIIYDRKFLMECRN-------------------------
974    ----TRIIYDRKFLMECR--------------------------
975    PGG-TRIIYDRKFLLDRRNS------------------------
976    PGG-TRIIYDRKFLMECRN-SP----------------------
977    ---PHMIRYNRDTLMTARDT--KRAPIPDEMLQEINRVAPDILI
Name: sequence_alignment, dtype: object
978    ---R--RKLPEI-
979    -APT--YSPPLPP
980    -APT--YSPPLPP
981    -APT--YPPPLPP
982    -APT--YPPPPPP
983    SL--ARRPLPPLP
Name: sequence_alignment, dtype: object
984    -PPPVP-P-------
985    -PPPVP-PR------
986    PPPALPSSAP---S-
987    --PALPSSAPSG---
988    PPPPLPSGPA---YA
989    -PPVIA-PRP-EHTK
Name: sequence_alignment, dtype: object
993    MSL----P-GRWKPK-
994    ---QLINTNGSWHI-N
Name: sequence_alignment, dtype: object
995    VEQH---HRRTDND------
996    ----RKRIHIGP--GRAFYT
Name: sequence_alignment, dtype: object
1003    ----RTQPDGQSFR
1004    ----RGCADGQSFR
1005    ---QRESPDGQSFR
1006    ---QRSPPDGQSFR
1007    --VARPPPIGAEVP
1008    PHLQRPPPIGQSFR
Name: sequence_alignment, dtype: object
1009    ----LPTLPKLP------SLS-----
1010    ---HTPRLPTLP------KR-V----
1011    AFVHMPTLPNLDF----------HKT
1012    AFVHMPTLPNLD-FHKT---------
1013    PLYTSPSLPNITLGL--P--------
1014    TLVSMPPLPGLDLK--------GS--
Name: sequence_alignment, dtype: object
288    ----------------ILHRLL-------
289    ----------------ILHRLLQ------
290    -------------K--ILHRLLQ------
291    -------------G--LLWDLLT------
292    -------------K--ILHRLLQ------
293    -------------S--AFSRLYT------
294    ------------P---MLMNLL-------
295    -------------K--ILHRLL-Q-----
296    ------------GA---FQNLFQ------
297    ----------------SLIDLLAD-----
298    ---------------QSLINLLAD-----
299    --------S----A---FSRLYTR-----
300    -------------K--ILHRLLQD-----
301    -------------K--ILHRLLQD-----
302    -------------K--ILHRLLQD-----
303    ------------HK--ILHRLLQ------
304    -------------S--ELLKYLTT-----
305    -------------A--ALAALLAA-----
306    --------P----A--ILYALLSS-----
307    -------------K--ILHRLLQE-----
308    --------H----K--ILHRLLQD-----
309    --------P----A--ILYALLSS-----
310    --------H----K--ILHRLLQ------
311    ------H------K--ILHRLLQE-----
312    --------H----K--ILHRLLQD-----
313    --------H----K--ILHRLLQE-----
314    -------------K--ILHRLLQE-----
315    ------------HK--ILHRLLQE-----
316    -------------K--ILHRLLQD-----
317    ------------PS--LLKKLLLA-----
                   ...              
338    -----R--P----A--ILYALLSS-----
339    -----A--N----A--LLRYLLDKD----
340    -----N--H----P--MLMNLLK------
341    -------------S--LLLHLLKSQ----
342    --------H----K--ILHRLLQDS----
343    ------------HK--ILHRLLQD-S---
344    -----E--N----A--LLRYLLDK-----
345    ---G----N----A--ALRYLLGA-----
346    -----D--H----Q--LLRYLLDKD----
347    -----------KHK--ILHRLLQD-----
348    -----R--H---K---ILHRLLQ------
349    -----R--H---P---LLLRHLL------
350    ---E-R--H----K--ILHRLL-Q-----
351    ---K-N--H----P--MLMNLLK--D---
352    ----RP--C----S--ELLKYLTTN-D--
353    -K---N--H----P--MLMNLLK--D---
354    ----KN--H----P--MLMNLLK------
355    ------GLE----A--IIRKALM------
356    ------------HK--ILHRLLQD-SS--
357    --------H----K-K-LLQLLTCSS---
358    ----KN--H----P--MLMNLLK------
359    ---------K--HK--ILHRLLQD-SS--
360    ------------PS--LLKKLLLAP-A--
361    -K---N--H----P--MLMNLLK------
362    --------N----A--LLRYLLDRD--D-
363    ----KE--N---A---LLRYLLDK-----
364    T--E-R--H----K--ILHRLLQ------
365    --------H----K--ILHRLLQEGS-PS
366    SLTE-R--H----K--ILHRLLQE-----
367    SLTE-R--H----K--ILHRLLQE-----
Name: sequence_alignment, dtype: object
1033    ----DDHLL--
1034    ----DEDLLE-
1035    ---SDEDLLHI
1036    ---DDVPMVIA
1037    ---SDEDLLE-
1038    ----DEDLLHI
1039    --LLDDELMS-
1040    GFSDDVPMVIA
Name: sequence_alignment, dtype: object
1042    -K----G--LIDYYLM-------------
1043    -K----H--TLDIFFKPL-----------
1044    -----P-KHTLDIFFKPL-------T---
1045    ----RQT--SMTDFYHS------------
1046    KK----G--LIDYYLM-------------
1047    -----P-KHTLDIFFKPL-----------
1048    ---AFQA--KLDTFLWS------------
1049    ---RRQT--SMTDFYHSK-------RRLI
1050    --KRRQT--SMTDFFHS-KRRLIFS----
Name: sequence_alignment, dtype: object
1053    ---------KCVVM
1054    ---------GCVLS
1055    ---------KCVIM
1056    --------TKCVVM
1057    --------TKCVIF
1058    --------TKCVFM
1059    -----PTASACNIQ
1060    KKKSK---TKCVIM
Name: sequence_alignment, dtype: object
1061    -PTA---SACVLS
1062    D--DPTASACNIQ
Name: sequence_alignment, dtype: object
1074    ----E--MVRQARILAQATSDLVNAIKA--------
1075    -------ILEAAKSIAAATSALVKAASA--------
1076    ----V-VLINAVKDVAKALGDLISATK---------
1077    ---GR-PLLQAAKGLAGAVSELLRSAQP--------
1078    ----SRKLLSAAKILADATAKMVEAAK----G----
1079    ------PLLQAAKGLAGAVSELLRSAQPASA-----
1080    --TAKRQFVQSAKEVANSTANLVKTIKA--------
1081    ------DIDQMFSTLLGEMDLLTQSL-G---VDTLY
1082    RDDRRERIVAECNAVRQALQDLLSEYMG--------
Name: sequence_alignment, dtype: object
1084    ------ETFSDLWKLLP----
1085    ------ETFSDLWKLLP----
1086    ------LTFEHYWAQL----T
1087    -------TFSDLWKLLP----
1088    -------TFSDLWKLLPE---
1089    -------TFAEYWAQL---AS
1090    ------TSFAEYWNLLS-P--
1091    ------LTFEHWWAQL---TS
1092    ------ETFSDLWKLLPEN--
1093    CNCKAPETFLCYWRCLQ----
Name: sequence_alignment, dtype: object
1103    ------KILHRLLQD-
1104    -----HKILHRLLQ--
1105    G--LE-AIIRKALMGK
1106    SPGSR-EWFKDMLS--
Name: sequence_alignment, dtype: object
380    ----------------------------------------------...
381    ----------------------------------------------...
382    ----------------------------------------------...
383    ----------------------------------------------...
384    ----------------------------------------------...
385    ----------------------------------------------...
386    ----------------------------------------------...
387    ----------------------------------------------...
388    ----------------------------------------------...
389    --------------------------CY------------------...
390    ----------------------------------------------...
391    ----------------------------------------------...
392    ----------------------------------------------...
393    --------------------------I-------------------...
394    ----------------------------------------------...
395    ----------------------------------------------...
396    ----------------------------------------------...
397    ----------------------------------------------...
398    ----------------------------EPC---------------...
399    ----------------------------------------------...
400    ----------------------------------------------...
401    ----------------------------------------------...
402    ----------------------------------------------...
403    ----------------------------------------------...
404    ----------------------------------------------...
405    --------------------------SG------------------...
406    ----------------------------------------------...
407    ---------------------------------QECTPGQTKKQDC...
408    -CSPSGAICSGFGPPEQCCSGACVPHP-------------------...
409    ---------------------------E-VTCE---P-GTTFKDKC...
410    KCSPSGAICSGAGPPEQCCSGACVPHP-------------------...
Name: sequence_alignment, dtype: object
411    ------------ACGRR-----------------------------
412    -------------CGKK-LVT-------------------------
413    --------K--FQCGQK--T----------------------L---
414    -------------CGQK-T-L---------------------RP--
415    ------L-K--FQCGQK-T---------------------------
416    ------L-K--FQCGQK-TL--------------------------
417    -------TTCDGPCGVRFRQ----------------------N---
418    -----A------DCGLR-PLFEKKSLEDKTERELLESY--------
419    ---GEA------DCGLR-PLFEKKSLEDKTERELLESYI-------
420    ----EA------DCGLR-PLFEKKSLEDKTERELLESYI-------
421    -------------CGLR-PLFEKKQVQDQTEKELFESY-I------
422    ------------DCGLR-PLFEKKSLEDKTERELLESYI-------
423    -----A------DCGLR-PLFEKKSLEDKTERELLESYI-------
424    -------------CGLR-PLFEKKSLEDKTERELLESYI-------
425    ----EA------DCGLR-PLFEKKSLEDKTERELLESYI-D-----
426    ----EA------DCGLR-PLFEKKSLEDKTERELLESYID------
427    ----EA------DCGLR-PLFEKKSLEDKTERELLESYID-G----
428    -----A------DCGLR-PLFEKKSLEDKTERELLESY-I-D--GR
429    GSG-EA------DCGLR-PLFEKKSLEDKTERELLESYID-G--R-
Name: sequence_alignment, dtype: object
430    -------FEEIP--------
431    ------KYEPF---------
432    ------DFEEIP--------
433    ------DFEEIPEE------
434    -------FEGIPGE------
435    ------DFEEIP-E-Y----
436    ------DFEEIPGE------
437    -----GDFEEIPEE------
438    -------FEEIPEE------
439    -----GDFEEIP--------
440    -------YEPIPEEA-----
441    ------DFEEIPEE-Y----
442    ------DYEPIPEEAF----
443    ------DFEEIPGE-YL---
444    ------DFEEIPEE-YL---
445    -----GDFEEIPEE-YL---
446    ------DFEEIPEE-YLQ--
447    -----SDFEEFSLDDI--EQ
448    GGGGNGDYEPIPEEA-----
Name: sequence_alignment, dtype: object
449    ----SGKVPL-
450    ----SGKVPLS
451    DFLAEGGGV--
452    TVELQGVVP--
453    DFLAEGGGVR-
454    DFLAEGGGVR-
Name: sequence_alignment, dtype: object
459    GCQV-------------------------NYCP-----PVPCL---
460    --------------------ACSRYEVDCRGRG-----S-----AC
461    ----EDYAAIEASLSETFNT------AADPGRRLGEGSK----P--
Name: sequence_alignment, dtype: object
1124    KRWIIM-GLN-K--
1125    KRWIIL-GLN-K--
1126    -RYPL---TLGWCF
1127    -KLVALV-I-N-AV
Name: sequence_alignment, dtype: object
1140    PA-PFAAA----
1141    -KGEADALSLD-
1142    -KLKLLVVIRLK
Name: sequence_alignment, dtype: object
1149    -----GQVGRQLAIIGDDINR--------
1150    -A--ADPLGQALRAIGDEFETRFR-----
1151    -RPE-IWIAQEARRIGDEANAYYAR----
1152    -RPE-IWIAQEYRRIGDEFNAYYAR----
1153    -RPE-IWAAQELRRIGDEFNAYYR-----
1154    -RPE-IWIAQELRRIGDEFNAYYAR----
1155    -RPE-IWIAQELRRIGDEENAYYR-----
1156    GRPE-IWIAQELRRIGDEFNAYYA-----
1157    AS-T-KKLSECLKRIGDELDSNMELQRMI
Name: sequence_alignment, dtype: object
1158    ---------------------SHPQF-------
1159    ---------------------SHPQF-E-----
1160    ----------------------HPQF-E----K
1161    -------------------RCCHPQCGAVEEC-
1162    GHVVEGLAGELEQLRARLE--HHPQG-Q-----
Name: sequence_alignment, dtype: object
1163    ---HPQF-E-----
1164    ---HPQF-E----K
1165    -CCHPQCGAAYSC-
1166    RCCHPQCGAVEEC-
Name: sequence_alignment, dtype: object
1167    --------ETGTTNTATT--
1168    --------ETGTTNTATTAT
1169    SNPPCQTHETGTTNTATTAT
Name: sequence_alignment, dtype: object
466    -----A-------LDKWD-------
467    -----E-------LEKWAS------
468    -----E-------QDKWAS------
469    -----E-------ADKWQS------
470    -----E-------LDKWAG------
471    -----E-------LDHWAS------
472    -----E-------LDKWAN------
473    -----E-------LDKYAS------
474    -----E-------LDKWAS------
475    -----A-------LDKWAS------
476    -----E-------NDKWAS------
477    ----LE-------LDKWASL-----
478    ---LLE-------LDKWAS------
479    ----LE-------LDKWASLW----
480    ---LLE-------LDKWASLW----
481    --ELLE-------LDKWASL-----
482    --ELLE-------LDKWASLW----
483    EQELLE-------LDKWASLW----
484    ------DKKQKVHALFY----KLDI
Name: sequence_alignment, dtype: object
517    -LIN--TNGS-WHVN--------
518    QLIN--TNGS-WHIN--------
519    ----LPTPPTRE---PKKVAVVR
Name: sequence_alignment, dtype: object
523    VKAETRLNP--D--------LQPTE
524    ----NWFDITNWLWYIKKK------
525    ----NWFDITNWLWYIKKKK-----
Name: sequence_alignment, dtype: object
529    -AQ---SQRAP-DR-----
530    ETI---YNTT----LKY--
531    ---CKEWLST-AP----CG
Name: sequence_alignment, dtype: object
534    VQ-----------GSGAFGR-
535    --CGADSYEMEEDGVRK---C
Name: sequence_alignment, dtype: object
536    TDHG--A-E----
537    YTTSTRGDLAHVT
Name: sequence_alignment, dtype: object
540    --------------KKQKVHALFYK-
541    HFPICIFCCGCCHRSKCG--M--CCK
Name: sequence_alignment, dtype: object
544    --------------PGGGQIVGGVYLLPRR
545    SIQDLRRRFFLHHLIAEI------------
Name: sequence_alignment, dtype: object
546    PTSSE--QI----
547    -YLEDWIKYNNQK
Name: sequence_alignment, dtype: object
548    --LLTEVETPIR----------NEWG
549    GS------ATRELDELMASLSD----
Name: sequence_alignment, dtype: object
497    VCN---------PLTG--AL--LC------
498    ----AI-IGL-MVGGV---V----------
499    ---EEED--DD-MGFG---L----------
500    VCN---------PLTG--AL--LCSAAE--
501    ------------QLINTNG-SWH-----VN
Name: sequence_alignment, dtype: object
502    ----HQLDPAFG----
503    ----PKLEPW-KHP--
504    EPVDPKLEPW-KHPGS
Name: sequence_alignment, dtype: object
505    ----AEP--WTVRNEDL
506    KG-----VRI-GPGQ--
507    -FDSAEP--WTVRNED-
Name: sequence_alignment, dtype: object
508    ------LELDKWA-
509    -----LLELDKWA-
510    QLINTNGSWHI--N
Name: sequence_alignment, dtype: object
511    KLVF--FAEDV--------
512    ----NWWDITNWLWYIKKK
513    ----NWFDITNWLWYIKKK
Name: sequence_alignment, dtype: object
514    --MDWNM----HAA
515    TR-KSIHIGPG---
516    TR-KSIRIGPG---
Name: sequence_alignment, dtype: object
1192    GCCS-DPRCAW-R-------
1193    GC-CSTPPCAVLY---C---
1194    GCCS-LPPCALNNPKYC---
1195    GC-CSRPPCILNN---PDLC
Name: sequence_alignment, dtype: object
1212    ------SS-ETKRAARRPYK-----
1213    -----TK-PAIRRLARRGGV-----
1214    --QGITK-PAIRRLARRG-------
1215    ----LLSSSETKRAARRPYKPIAL-
1216    KL--LSSS-ETKRAARRPYKPIALR
Name: sequence_alignment, dtype: object
1224    -------------NGYENPTY--K-------
1225    -------------NGYENPTY--K-------
1226    --------------NFDNPVY--RK--T---
1227    --------S-I---NFDNPVY--Q----KTT
1228    VAPEERHLSKMQQNGYENPTYKFFEQM----
Name: sequence_alignment, dtype: object
1229    ----RLLEASADAN----
1230    ---Q-LTSYD--------
1231    ----QLTSYDCEVNAP-I
1232    -VVKLLLEHGADVSAQ--
1233    SVVEYLLQHGADVH----
1234    EVVKLLLEHGADVLAQD-
1235    EVVKLLLEHGADVDAQDK
Name: sequence_alignment, dtype: object
559    --------------GRPR----TTSFAE---
560    ---------G-----RPR----TTSFAE---
561    ----P---VL----AFQREGFGRQSMS----
562    -----IAA---G-R-TGR----RQAIHDI--
563    T-TYA-DF-IASGR-TGR----RNAI-----
564    -TTYA-DF-IASGR-TGR----RNAI---HD
565    T-TYA-DF-IASGR-TGR----RNAIHD---
566    T-TYA-DF-IASGR-TGR----RASIHD---
567    T-TYA-DF-IASGR-TGR----RNAIHD---
568    T-TYA-DF-IASGR-TGR----RNAIH----
569    T-TYA-DF-IASGR-TGR----RACIHD---
570    T-TYA-DF-IASGR-TGR----RNAIHD---
571    T-TYA-DF-IASGR-TGR----RNAIH----
Name: sequence_alignment, dtype: object
1236    ----F-SDIYKIREIADGLC--L-
1237    ------SDIYKIREIADGLC--L-
1238    ------WIAQELREIGDKFNAYYA
1239    -RP-EIWIAQEFRRIGDEFN-A--
1240    -QEQLLTLASILREDGKVFD----
1241    GSGTMENLSRRLKVTGDLFDIMSG
Name: sequence_alignment, dtype: object
1279    GSLHRVPLR-----------------------------------
1280    -AVVKVPLKKFKSIRETMKEKGL---------------------
1281    -AVVKVPLKKFKSIRETMKEKGLLGEFLRTHKYDPAWKYRFGDL
Name: sequence_alignment, dtype: object
1287    -----RGALLDQIRQGIQLNKT---------
1288    PS--PREQLMESIRKGKELKQI---------
1289    PS--PREQLMESIRKGKELKQI---------
1290    --TPQGEDMLNAIRRGVKLKKTTTNDRSAPR
Name: sequence_alignment, dtype: object
1293    ----------------VMEMEPE-T-MET--KSVID---S--
1294    -----------D----HMEMEPE-T-MET--KSVT-DYF---
1295    ----------------HMEMEPE-T-MET--KSVT-DYF---
1296    -------F--YMGT---CQDEPEQLD--DWNRI-AEL-----
1297    --Q-L-LH--SD----HMEMEPE-T-MET--KSVT-DYF-SK
1298    HPEPVASWMSEQ--RWAGEPEVMCT-LQH--KSIA-------
Name: sequence_alignment, dtype: object
1309    -----------WR-QDID-------
1310    L-----------DEETGEFL-----
1311    Q-----------NEENGEQE-----
1312    -MDLIDILWRQDI-DLGVSREVFDS
Name: sequence_alignment, dtype: object
1325    -NSTL-Q----
1326    VNSTLQ-----
1327    TSAVLQSG---
1328    -SAVLQSGF--
1329    TSAVLQSGFRK
Name: sequence_alignment, dtype: object
1335    S-----G---------SLANNIKKSTVIVKN
1336    --QSG-S----------LANNIKKSTVIVK-
1337    -IQ--SG---------SLANNIKKSTVIVKN
1338    -------RLHSEIQSGSLANNIKKSTVIVKN
Name: sequence_alignment, dtype: object
1339    -----EAQTRL
1340    -----YPTSII
1341    --QLAWFDTDL
1342    -N-SRWPTSIL
1343    AN-SRWPTSII
Name: sequence_alignment, dtype: object
1344    ----RNLF--GP
1345    -PVKRRLDLE--
1346    -SRHKKLMFK--
1347    PKPLKKLRFD--
Name: sequence_alignment, dtype: object
1349    -------KKVTFL-E-E-VTEYYIS-----------
1350    -------KKVTFL-E-E-VTEYYISGDE-DRK-G--
1351    -------KTVTWPEEGKLREYFYFELDETERVNV-N
1352    -------KKVTFL-E-E-VTEYYISGDE-DRK-GPW
1353    GAMGRKRKTVTWPEEGKLREYFYFELDETERVNV-N
Name: sequence_alignment, dtype: object
1368    --VFFAED-----
1369    --VFFAEDVGS--
1370    KLVFFAEDVGSNK
Name: sequence_alignment, dtype: object
1373    --LTGCGDIIA-E-----
1374    --YQGGGEEMA-L-P---
1375    --SYEGYEGYY-S--Q--
1376    PRDSYSGDALY-E--F--
1377    ----GGGEQLAINEL-IS
Name: sequence_alignment, dtype: object
1378    --------SFNLAPLGRR---
1379    ---E-L-EAYRLGPASA----
1380    ---GLA-LKYLLTPVN-----
1381    ------SH-FNLAPLGRRRV-
1382    ERLE-L-EAYRLGPAS----A
Name: sequence_alignment, dtype: object
614    -------------------------------RKRKFS---------...
615    -------------------------------RKRTWR---------...
616    -------------------------------RKRGYS---------...
617    -------------------------------RKRKWS---------...
618    -------------------------------QKRSFS---------...
619    ------------------------------LGKRKY----------...
620    ----------------------------L--GKRKRH---------...
621    ------------------------------VAKKYRN---------...
622    ----------------------------P-PKKKRK-V--------...
623    ----------------------------E-PSKRARPA----E---...
624    ------------------------------PVKKPKI----R----...
625    ----------------------------P-AAKRVKLD--------...
626    ----------------------------P-FKKKRRE----A----...
627    -------------------------AA-P-PKKKRKV----E----...
628    -----------------D--------G-P-TAKKLKTE----Q---...
629    ------------------------A-I-S-PSKRARP----AE---...
630    -------------------------GS-P-PKKKRKVG--------...
631    -------------------------------GKISKHWTGI-----...
632    ----------------DE---------EGGG-EEDQ-D-----FDL...
633    ---------------------DAQHAA-P-PKKKRKVE--------...
634    --SAKRKEP--E----PK--------G-S-TKKKAKT---------...
635    A--VKRPAA--T---KKA----------G-QAKKKKL---------...
636    SA-VKRPAA--T---KK---------A-G-QAKKKK-L----D---...
637    SA-VKRPAA--T----KK--------A-G-QAKKKKLD--------...
638    C--GKRSAE--G---SN--P------P-K-PLKKLRG---------...
639    P--RKRPL---E---WDE--D----EE-P-PRKRKRLW--------...
640    ---MSRRRHSDE---NDG--------G-Q-PHKRRKTS----D---...
641    R--KKRKTE--E---ES-PL------K-D-KAKKSKG---------...
642    ---MSRRRHSYE---NDG--------G-Q-PHKRRKTS----D---...
643    RR-RKRKREW-D---D---D-----DD-P-PKKRRRLD--------...
644    ---EKKKRT--VAEEDQL-HL----DG-Q-ENKRRRHD---S----...
Name: sequence_alignment, dtype: object
645    -------KKRKV-----
646    -------KKRKV-----
647    -------GKRKR-----
648    -------KKRKV-----
649    -------KKRREA----
650    -------SKRAR--PA-
651    ------KKKRKV-EY--
652    -----SPSKRAR--P--
653    -G--S--IIRKWN----
654    TV--L--GKRK------
655    SV--L--GKRKRH----
656    -------RKRTWRDAF-
657    ----H--RKRKFSDAF-
658    -R--Q--RKRKWSEAF-
659    --SQG--TKRSYEQME-
660    -------RKRGYSVAF-
661    --SRG--QKRSFSKAFG
Name: sequence_alignment, dtype: object
1387    -------LQQTQAQVDEVVDIMRVNVDKVLERD
1388    NLTSNRRLQQTQAQVDEVVDIMRVNVDKVLERD
Name: sequence_alignment, dtype: object
1392    --DLTVEKAADVTWEEEAEQTGVSHNLMITVDDDGTMRIKD-----...
1393    STDMWIERTADISWESDAEITGSSERVDVRLDDDGNFQLM------...
1394    GTDMWIERTADISWESDAEITGSSERVDVRLDDDGNFQLMN-----...
1395    ETDMWIERTADITWESDAEITGSSERVDVRLDDDGNFQLM------...
1396    --DMWIERTADISWESDAEITGSSERVDVRLDDDGNFQL-MNDPGA...
Name: sequence_alignment, dtype: object
1404    AAAAAAA----
1405    RPKPQQFFGLM
Name: sequence_alignment, dtype: object
1411    ----GLLDALDLAS
1412    -Q--GLLDALDLAS
1413    GHGQGLLDALDLAS
Name: sequence_alignment, dtype: object
663    ------VVKQNCLKLAT------------------
664    ----LQPFPQPELPY--------------------
665    ----QWIRVNIPKRI--------------------
666    -----AWRSDEALPLGS------------------
667    ----QVIILNHPGQISA------------------
668    ----AAYSDQATPLLLS------------------
669    ----STDYGILQINSRW------------------
670    ----SAVRLRSSVPGVR------------------
671    ----PKYVKQNTLKLAT------------------
672    ----PEVIPMFSALSEG------------------
673    ----RQLYPEWTEAQRL------------------
674    ----SGEGSFQPSQENP------------------
675    --A-DLIAYPKAATKF-------------------
676    ---GSDWRFLRGYHQY-------------------
677    --G-ELIGTLNAAKVPAD-----------------
678    ----LVEALYLVCGERGG-----------------
679    ---GSDARFLRGYHLYA------------------
680    --G-ELIGILNAAKVPAD-----------------
681    --V-SKWRMATPLLMQAL-----------------
682    -PV-SKMRMATPLLM--------------------
683    -PV-SKMRMATPLLMQA------------------
684    ----PEVIPMFSALSEG-A---------T------
685    -PV-SKMRMATPLLMQAL---------P-------
686    PVV-HFFKNIVTPRTPPP---------S-------
687    ----MNLPSTKVSWAAVG-----------GGGSLV
688    ----QHIRCNIPKRIGP-SKVATLVPR--------
Name: sequence_alignment, dtype: object

In [19]:
# Histogram of the CATHPerBindingMode values, using 100 bins.
plt.hist(CATHPerBindingMode, bins=100)
plt.show()



In [20]:
# Load the pdbmap table: tab-delimited, column names supplied explicitly,
# and the placeholder "unk" column discarded right after reading.
pdbmap_name = ["PDB", "Chain", "unk", "name", "PFAM", "uniprot", "range"]
pdbmap = pd.read_table(
    "/media/vince/Postdoc/PixelDB/PixelDB/other_files/pdbmap",
    names=pdbmap_name,
    delimiter="\t",
).drop("unk", axis=1)

# From here on pdbmap_name lists only the columns that were kept.
pdbmap_name = list(pdbmap.columns.values)

# Remove every ';' character from each remaining column.
for c in pdbmap_name:
    pdbmap[c] = pdbmap[c].str.replace(';', '')


---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-20-b69a39a8ddc2> in <module>()
      1 #Load PFAM
      2 pdbmap_name = ["PDB","Chain","unk","name","PFAM","uniprot","range"]
----> 3 pdbmap = pd.read_table("/media/vince/Postdoc/PixelDB/PixelDB/other_files/pdbmap",names=pdbmap_name,delimiter="\t")
      4 pdbmap = pdbmap.drop("unk",axis=1)
      5 pdbmap_name = list(pdbmap.columns.values)

/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.pyc in parser_f(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)
    527                     skip_blank_lines=skip_blank_lines)
    528 
--> 529         return _read(filepath_or_buffer, kwds)
    530 
    531     parser_f.__name__ = name

/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.pyc in _read(filepath_or_buffer, kwds)
    293 
    294     # Create the parser.
--> 295     parser = TextFileReader(filepath_or_buffer, **kwds)
    296 
    297     if (nrows is not None) and (chunksize is not None):

/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.pyc in __init__(self, f, engine, **kwds)
    610             self.options['has_index_names'] = kwds['has_index_names']
    611 
--> 612         self._make_engine(self.engine)
    613 
    614     def _get_options_with_defaults(self, engine):

/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.pyc in _make_engine(self, engine)
    745     def _make_engine(self, engine='c'):
    746         if engine == 'c':
--> 747             self._engine = CParserWrapper(self.f, **self.options)
    748         else:
    749             if engine == 'python':

/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.pyc in __init__(self, src, **kwds)
   1117         kwds['allow_leading_cols'] = self.index_col is not False
   1118 
-> 1119         self._reader = _parser.TextReader(src, **kwds)
   1120 
   1121         # XXX

pandas/parser.pyx in pandas.parser.TextReader.__cinit__ (pandas/parser.c:3246)()

pandas/parser.pyx in pandas.parser.TextReader._setup_parser_source (pandas/parser.c:6111)()

IOError: File /media/vince/Postdoc/PixelDB/PixelDB/other_files/pdbmap does not exist

In [ ]:
# Load the "cath-b-newest-all" listing: space-delimited, with the column
# names supplied explicitly.
cath_name = ["PDB", "v", "CATH", "range"]
cathb = pd.read_table(
    "/media/vince/Postdoc/PixelDB/PixelDB/other_files/cath-b-newest-all",
    names=cath_name,
    delimiter=" ",
)

In [ ]:
def _count_tokens(column):
    """Count the "_"-separated tokens found in the distinct values of a column.

    Every distinct, non-missing entry of ``column`` is split on "_" and each
    token occurrence adds one to that token's count.

    Parameters
    ----------
    column : pandas.Series
        Column of "_"-joined annotation strings; missing values are ignored.

    Returns
    -------
    dict
        Maps token -> number of occurrences over the distinct entries.
    """
    counts = dict()
    # Drop missing values before de-duplicating so that strings and NaN floats
    # are never compared with each other (sorting such a mix raises TypeError
    # on Python 3). sorted() keeps the insertion order deterministic.
    for entry in sorted(column.dropna().unique()):
        if str(entry) == "nan":
            continue
        for token in entry.split("_"):
            counts[token] = counts.get(token, 0) + 1
    return counts


def _print_top(counts, n=3):
    """Print the ``n`` keys of ``counts`` with the highest counts, one "key count" per line."""
    for key in sorted(counts, key=counts.get, reverse=True)[0:n]:
        # Explicit formatting prints "key count" on both Python 2 and Python 3
        # (the former `print w, x` statement is a SyntaxError on Python 3).
        print("%s %s" % (key, counts[key]))


# The reference totals do not depend on the PixelDB subset: compute them once.
n_pdb_pfam = len(pdbmap["PFAM"].value_counts())
n_pdb_uniprot = len(pdbmap["uniprot"].value_counts())
n_pdb_cath = len(cathb["CATH"].value_counts())

# Same report for the full table and for the two ECR-filtered subsets.
for subset_name, torun in (("Full", PixelDB), ("ECR", PixelDBecr), ("ECRdist", PixelDBoecr)):
    print(subset_name)

    UniquePFAM = _count_tokens(torun["PFAM"])
    UniqueCATH = _count_tokens(torun["CATH"])
    UniqueUniprot = _count_tokens(torun["uniprot"])

    print("PDB has this many unique PFAM",n_pdb_pfam)
    print("PixelDB has this many unique PFAM",len(UniquePFAM))
    print("Percentage = ",100.0*len(UniquePFAM) / float(n_pdb_pfam))

    print("PDB has this many unique uniprot",n_pdb_uniprot)
    print("PixelDB has this many unique uniprot",len(UniqueUniprot))
    print("Percentage = ",100.0*len(UniqueUniprot) / float(n_pdb_uniprot))

    print("PDB has this many unique CATH",n_pdb_cath)
    print("PixelDB has this many unique CATH",len(UniqueCATH))
    print("Percentage = ",100.0*len(UniqueCATH) / float(n_pdb_cath))

    print("Most frequence Cath")
    _print_top(UniqueCATH)
    print("Most frequence uniprot")
    _print_top(UniqueUniprot)
    print("Most frequence PFAM")
    _print_top(UniquePFAM)

In [ ]:
# Numeric columns summarised below for the full PixelDB table.
allCarac = ["resolution","receptor_length","peptide_length","size_of_binding_mode","longest_continuous_core","longest_continuous_ecr"]

for v in allCarac:
    # One mean per unique_id: Avg/Std/Median are taken over those means,
    # whereas Min/Max are taken over the raw column.
    AllMean = [np.mean(PixelDB[PixelDB["unique_id"] == uniid][v])
               for uniid in list(np.unique(PixelDB["unique_id"]))]
    summary = (v, np.mean(AllMean), np.std(AllMean), np.median(AllMean),
               np.min(PixelDB[v]), np.max(PixelDB[v]))
    print("%30s Avg:%6.2f Std:%6.2f Median:%6.2f Min:%6.2f Max:%7.2f" % summary)

In [ ]:
# Numeric columns summarised below for the PixelDBecr subset.
allCarac = ["resolution","receptor_length","peptide_length","size_of_binding_mode","longest_continuous_core","longest_continuous_ecr"]

for v in allCarac:
    # One mean per unique_id: Avg/Std/Median are taken over those means,
    # whereas Min/Max are taken over the raw column.
    AllMean = [np.mean(PixelDBecr[PixelDBecr["unique_id"] == uniid][v])
               for uniid in list(np.unique(PixelDBecr["unique_id"]))]
    summary = (v, np.mean(AllMean), np.std(AllMean), np.median(AllMean),
               np.min(PixelDBecr[v]), np.max(PixelDBecr[v]))
    print("%30s Avg:%6.2f Std:%6.2f Median:%6.2f Min:%6.2f Max:%7.2f" % summary)

In [ ]:
# Numeric columns summarised below for the rows with longest_continuous_ecr > 3.
allCarac = ["resolution","receptor_length","peptide_length","size_of_binding_mode","longest_continuous_core","longest_continuous_ecr"]
PixelDBoecr = PixelDB[PixelDB["longest_continuous_ecr"] > 3]

for v in allCarac:
    # One mean per unique_id: Avg/Std/Median are taken over those means,
    # whereas Min/Max are taken over the raw column.
    AllMean = [np.mean(PixelDBoecr[PixelDBoecr["unique_id"] == uniid][v])
               for uniid in list(np.unique(PixelDBoecr["unique_id"]))]
    summary = (v, np.mean(AllMean), np.std(AllMean), np.median(AllMean),
               np.min(PixelDBoecr[v]), np.max(PixelDBoecr[v]))
    print("%30s Avg:%6.2f Std:%6.2f Median:%6.2f Min:%6.2f Max:%7.2f" % summary)

In [ ]:
# Five entries with the largest peptide_length. DataFrame.sort() is deprecated
# and was removed in pandas 0.20; sort_values() is the supported equivalent.
PixelDB.sort_values(["peptide_length"],ascending=False).head(5)[["name","size_of_binding_mode","cluster_number","receptor_length","peptide_length"]]

In [ ]:
# Five entries with the smallest receptor_length. DataFrame.sort() is deprecated
# and was removed in pandas 0.20; sort_values() is the supported equivalent.
PixelDB.sort_values(["receptor_length"]).head(5)[["name","size_of_binding_mode","cluster_number","receptor_length","peptide_length"]]

In [ ]:
def bootstrap(AllCore,AllECR,it=10000,labels=None):
    """Bootstrap the percentage composition of the core and ECR values.

    In each of the ``it`` rounds and for each label, positions are drawn with
    replacement over the observations of that label; the same positions are
    applied to the core and the ECR sequence of the label. The resampled values
    are summed per label and the sums are rescaled so that they add up to 100
    within the round.

    Parameters
    ----------
    AllCore, AllECR : dict
        Map each label to a sequence of numeric values. Positions are drawn
        from the length of ``AllECR[label]`` and applied to both sequences, so
        ``AllCore[label]`` must be at least as long.
    it : int
        Number of bootstrap rounds.
    labels : sequence, optional
        Labels to process, in output order. When omitted, the notebook-level
        ``Label`` list is used, as in the original implementation.

    Returns
    -------
    tuple of dict
        ``(BtCore, BtECR)``; each maps a label to the list of its ``it``
        resampled percentages.
    """
    if labels is None:
        labels = Label

    BtECR = dict((lab, []) for lab in labels)
    BtCore = dict((lab, []) for lab in labels)

    for _ in range(it):
        ECRSum = []
        CoreSum = []
        for aa in labels:
            n_obs = len(AllECR[aa])
            # Resample over the observations of this label. The bound used to
            # be len(AllECR), i.e. the number of labels, which restricted the
            # draw to the first few observations (or ran past the end of the
            # sequence when there were more labels than observations).
            if n_obs:
                index = np.random.randint(n_obs, size=n_obs)
            else:
                # randint rejects an empty range; nothing to resample.
                index = np.zeros(0, dtype=int)
            ECRSum.append(np.sum(np.array(AllECR[aa])[index]))
            CoreSum.append(np.sum(np.array(AllCore[aa])[index]))

        # Float arrays so the normalisation is never an integer division.
        ECRPct = np.array(ECRSum, dtype=float)/np.sum(ECRSum)*100.0
        CorePct = np.array(CoreSum, dtype=float)/np.sum(CoreSum)*100.0

        for lab, ecr_pct, core_pct in zip(labels, ECRPct, CorePct):
            BtECR[lab].append(ecr_pct)
            BtCore[lab].append(core_pct)
    return(BtCore,BtECR)

In [ ]:
# Start from an empty results container.
AllMeanData = {}

In [ ]:
# Keep only the rows whose longest_continuous_ecr value is greater than 3.
has_long_ecr = PixelDB["longest_continuous_ecr"] > 3
PixelDBoecr = PixelDB[has_long_ecr]

In [ ]:
# For each per-position annotation column (`alin`), compare how its symbols are
# distributed over peptide positions marked "C" and positions marked "E" in
# core_ecr_alignment, per binding mode, then bootstrap and plot the result.
#
# NOTE: the misspellings in LegendLabel are intentional to keep -- these strings
# are the AllMeanData keys that other cells look up.
LegendLabel = ["Solvent Exposure [0-9]","Levy Classification","Stride Classifcation", "Amino Acid compostion"]
ToTest = ["percent_exposed_alignment","levy_alignment","stride","sequence_alignment"]

for (alin,LL) in zip(ToTest,LegendLabel):
    if alin not in list(PixelDBoecr.columns.values):
        continue
    AllMeanData[LL] = dict()
    print(alin)

    # Label has to stay a notebook-level name: bootstrap() reads it as a global.
    if alin == "sequence_alignment":
        Label = myAmino
    else:
        # Every symbol seen in the column, gaps ("-") excluded.
        Label = sorted(set(ch for v in PixelDBoecr[alin] for ch in v if ch != "-"))
    print(alin,Label)

    AllECR = dict((aa, []) for aa in Label)
    AllCore = dict((aa, []) for aa in Label)
    SumCore = 0.0
    SumECR = 0.0
    # Collected as plain records and turned into a DataFrame once; growing a
    # DataFrame with .append() per row is quadratic and gone from pandas 2.
    records = []

    for uniid in list(np.unique(PixelDBoecr["unique_id"])):
        sdf = PixelDBoecr[PixelDBoecr["unique_id"] == uniid]
        Tecr = dict()
        Tcore = dict()

        for (ecr,ali,lecr) in zip(np.array(sdf["core_ecr_alignment"]),np.array(sdf[alin]),np.array(sdf["longest_continuous_ecr"])):
            if lecr < 4:
                continue
            for i in range(0,len(ecr)):
                if ecr[i] == "E":
                    Tecr[ali[i]] = Tecr.get(ali[i], 0.0) + 1.0
                elif ecr[i] == "C":
                    Tcore[ali[i]] = Tcore.get(ali[i], 0.0) + 1.0

        # Totals count every E / C position, including symbols outside Label.
        Totecr = float(sum(Tecr.values()))
        Totcore = float(sum(Tcore.values()))
        SumECR += Totecr
        SumCore += Totcore

        for aa in Label:
            # A binding mode without any E (or C) position contributes 0
            # instead of raising ZeroDivisionError.
            ecr_frac = Tecr.get(aa, 0.0) / Totecr if Totecr else 0.0
            core_frac = Tcore.get(aa, 0.0) / Totcore if Totcore else 0.0
            AllECR[aa].append(ecr_frac)
            AllCore[aa].append(core_frac)
            records.append({'class': 'Core', 'AA': aa, 'percentage': core_frac})
            records.append({'class': 'ECR', 'AA': aa, 'percentage': ecr_frac})

    df = pd.DataFrame(records)
    print(SumCore,SumECR)

    (BtCore,BtECR) = bootstrap(AllCore,AllECR,it=10000)

    # Kept as plain lists: they are concatenated with "+" below.
    ECRMean = [np.mean(BtECR[aa]) for aa in Label]
    CoreMean = [np.mean(BtCore[aa]) for aa in Label]

    plt.scatter(CoreMean,ECRMean)
    for i in range(0,len(Label)):
        ttest = stats.ttest_rel(BtECR[Label[i]], BtCore[Label[i]])[1]
        print("%2d %2s P.val=%10.8f Core:%6.2f ECR:%6.2f" % (i,Label[i],ttest,CoreMean[i],ECRMean[i]))
        if Label[i] not in AllMeanData[LL]:
            AllMeanData[LL][Label[i]] = dict()
        AllMeanData[LL][Label[i]]["ECR"] = [ECRMean[i],np.std(BtECR[Label[i]]) ]
        AllMeanData[LL][Label[i]]["Core"] = [CoreMean[i],np.std(BtCore[Label[i]])]
        # Only annotate points whose paired t-test p-value is below 0.05.
        if ttest < 0.05:
            plt.text(CoreMean[i]+0.15,ECRMean[i]+0.15,Label[i])

    Lim = [-3,int(np.max(ECRMean+CoreMean)*1.1)]
    plt.plot(Lim,Lim,c="black")
    plt.xlim(Lim)
    plt.ylim(Lim)
    plt.xlabel("Core "+LL+" %")
    plt.ylabel("ECR "+LL+" %")
    plt.show()

    sns.boxplot(x="AA",hue="class",y="percentage",data=df)
    plt.show()

In [ ]:
# Core vs. ECR scatter with one colour per amino-acid group, stored in
# AllMeanData under "Amino Acid buried".
# NOTE(review): CoreMean / ECRMean / BtCore / BtECR are whatever the previously
# executed analysis cell left in the namespace; they are only consistent with
# myAmino if that cell ended on an amino-acid track -- confirm before trusting
# the figure.
Label = myAmino
Color = ["red"]*3+["blue"]*2+["purple"]*4+["black"]*3+["green"]*5+["yellow"]*3
LL= "Amino Acid buried"
AllMeanData[LL] = dict()
for i in range(0,len(Label)):
    aa = Label[i]
    plt.scatter(CoreMean[i],ECRMean[i],c=Color[i])
    ttest = stats.ttest_rel(BtECR[aa], BtCore[aa])[1]
    print("%2d %2s P.val=%10.8f Core:%6.2f ECR:%6.2f" % (i,aa,ttest,CoreMean[i],ECRMean[i]))

    if aa not in AllMeanData[LL]:
        AllMeanData[LL][aa] = dict()
    AllMeanData[LL][aa]["ECR"] = [ECRMean[i],np.std(BtECR[aa]) ]
    AllMeanData[LL][aa]["Core"] = [CoreMean[i],np.std(BtCore[aa])]

    # Every point is labelled here (no p-value filter).
    plt.text(CoreMean[i]+0.15,ECRMean[i]+0.15,aa,color=Color[i])
# Fixed axis range; the data-driven limit computed here before was dead code
# because it was overwritten immediately.
Lim = [-1,15]
plt.plot(Lim,Lim,c="black")
plt.xlim(Lim)
plt.ylim(Lim)
plt.xlabel("Core composition (%)")
plt.ylabel("ECR composition (%)")
plt.show()

In [ ]:


In [ ]:
# Amino-acid composition of four receptor regions, per binding mode.
# Each *_aa cell is parsed as "KEY:VALUE;KEY:VALUE;..." and the values are
# summed per key within a unique_id, then turned into fractions of the
# region total (0 when the region total is 0).
Label = myAmino
region_columns = [
    ("ECR", "EXOSITE_aa"),
    ("Surf", "surface_aa"),
    ("Inte", "interior_aa"),
    ("Core", "COREBINDING_aa"),
]
region_fractions = dict((region, dict((aa, []) for aa in myAmino)) for (region, _) in region_columns)
region_sums = dict((region, 0.0) for (region, _) in region_columns)

for uniid in list(np.unique(PixelDBoecr["unique_id"])):
    sdf = PixelDBoecr[PixelDBoecr["unique_id"] == uniid]
    for (region, column) in region_columns:
        tally = dict()
        mode_total = 0.0
        for v in sdf[column]:
            for item in v.split(";"):
                sp = item.split(":")
                weight = float(sp[1])
                tally[sp[0]] = tally.get(sp[0], 0) + weight
                mode_total += weight
                region_sums[region] += weight
        for aa in Label:
            if mode_total != 0:
                region_fractions[region][aa].append(tally.get(aa, 0.0) / float(mode_total))
            else:
                region_fractions[region][aa].append(0)

# Re-expose the results under the names the following cells expect.
AllECR = region_fractions["ECR"]
AllCore = region_fractions["Core"]
AllSurf = region_fractions["Surf"]
AllInte = region_fractions["Inte"]
SumECR = region_sums["ECR"]
SumCore = region_sums["Core"]
SumSurf = region_sums["Surf"]
SumInte = region_sums["Inte"]

(BtCore,BtECR) = bootstrap(AllCore,AllECR,it=10000)

(BtInte,BtSurf) = bootstrap(AllInte,AllSurf,it=10000)

ECRMean = [np.mean(BtECR[aa]) for aa in Label]
CoreMean = [np.mean(BtCore[aa]) for aa in Label]
SurfMean = [np.mean(BtSurf[aa]) for aa in Label]
InteMean = [np.mean(BtInte[aa]) for aa in Label]

In [ ]:
# Grand totals of the parsed values, in the order core-binding, exosite, surface, interior.
print(SumCore,SumECR,SumSurf,SumInte)

In [ ]:
# Record bootstrap mean / std of every region under AllMeanData["Binding Site"].
# The printed p-value is a paired t-test between the Surf and Core bootstrap samples.
Color = ["red"]*3+["blue"]*2+["purple"]*4+["black"]*3+["green"]*5+["yellow"]*3

LL= "Binding Site"
AllMeanData[LL] = dict()

region_stats = [
    ("ECR", ECRMean, BtECR),
    ("Core", CoreMean, BtCore),
    ("Surf", SurfMean, BtSurf),
    ("Inte", InteMean, BtInte),
]
row_fmt = "%2d %2s P.val=%10.8f Core:%4.2f ECR:%4.2f Surf:%4.2f Inte:%.2f"
for i in range(0,len(Label)):
    lab = Label[i]
    ttest = stats.ttest_rel(BtSurf[lab], BtCore[lab])[1]
    print(row_fmt % (i,lab,ttest,CoreMean[i],ECRMean[i],SurfMean[i],InteMean[i]))
    entry = AllMeanData[LL].setdefault(lab, dict())
    for (region, means, boot) in region_stats:
        entry[region] = [means[i],np.std(boot[lab])]

In [ ]:
# Pairwise comparison of the four region compositions (20 amino acids each):
# one scatter per ordered pair, then clustered / annotated heatmaps.
# NOTE: despite its name, AllMSE holds the Pearson correlation of each pair;
# the RMS difference is only printed.
AllMean = [CoreMean,ECRMean,SurfMean,InteMean]
MeanLab = ["CoreBinding","Exosite","Surface","Interior"]
AllMSE = dict()
AllComp = dict()
for (arr1,lab1) in zip(AllMean,MeanLab):
    AllComp[lab1] = dict((myAmino[i], arr1[i]) for i in range(0,20))
    AllMSE[lab1] = dict()
    for (arr2,lab2) in zip(AllMean,MeanLab):
        plt.scatter(arr1,arr2)
        plt.xlabel(lab1)
        plt.ylabel(lab2)
        for i in range(0,20):
            plt.text(arr1[i]+0.15,arr2[i]+0.15,Label[i],color=Color[i])
        plt.xlim(0,20)
        plt.ylim(0,20)
        plt.plot([0,100],[0,100])
        plt.show()
        rms_diff = np.sqrt(np.mean(np.power(np.array(arr1)-np.array(arr2),2)))
        pearson = np.corrcoef(arr1,arr2)[0][1]
        AllMSE[lab1][lab2] = pearson
        print(lab1,lab2,rms_diff,pearson)

comp_table = pd.DataFrame(AllComp)
sns.clustermap(pd.DataFrame(AllMSE))
plt.show()
sns.clustermap(comp_table)
plt.show()

# Rows re-ordered to follow myAmino.
sns.heatmap(comp_table.transpose()[myAmino].transpose(),annot=True)
plt.show()

In [ ]:


In [ ]:


In [ ]:
# Secondary-structure composition of four receptor regions, per binding mode.
# Each *_ss cell is parsed as "KEY:VALUE;KEY:VALUE;..." and the values are
# summed per key within a unique_id, then turned into fractions of the
# region total (0 when the region total is 0).
#
# The accumulators are keyed by the secondary-structure labels below. They used
# to be pre-seeded with the 20 amino-acid codes (copy-paste from the amino-acid
# cell), which left unrelated empty lists in every dictionary.
Label = ['B', 'C', 'E', 'G', 'H', 'T', 'b']
region_columns = [
    ("ECR", "EXOSITE_ss"),
    ("Surf", "surface_ss"),
    ("Inte", "interior_ss"),
    ("Core", "COREBINDING_ss"),
]
region_fractions = dict((region, dict((ss, []) for ss in Label)) for (region, _) in region_columns)
region_sums = dict((region, 0.0) for (region, _) in region_columns)

for uniid in list(np.unique(PixelDBoecr["unique_id"])):
    sdf = PixelDBoecr[PixelDBoecr["unique_id"] == uniid]
    for (region, column) in region_columns:
        tally = dict()
        mode_total = 0.0
        for v in sdf[column]:
            for item in v.split(";"):
                sp = item.split(":")
                weight = float(sp[1])
                tally[sp[0]] = tally.get(sp[0], 0) + weight
                mode_total += weight
                region_sums[region] += weight
        for ss in Label:
            if mode_total != 0:
                region_fractions[region][ss].append(tally.get(ss, 0.0) / float(mode_total))
            else:
                region_fractions[region][ss].append(0)

# Re-expose the results under the names the following cells expect.
AllECR = region_fractions["ECR"]
AllCore = region_fractions["Core"]
AllSurf = region_fractions["Surf"]
AllInte = region_fractions["Inte"]
SumECR = region_sums["ECR"]
SumCore = region_sums["Core"]
SumSurf = region_sums["Surf"]
SumInte = region_sums["Inte"]

(BtCore,BtECR) = bootstrap(AllCore,AllECR,it=10000)

(BtInte,BtSurf) = bootstrap(AllInte,AllSurf,it=10000)

ECRMean = [np.mean(BtECR[ss]) for ss in Label]
CoreMean = [np.mean(BtCore[ss]) for ss in Label]
SurfMean = [np.mean(BtSurf[ss]) for ss in Label]
InteMean = [np.mean(BtInte[ss]) for ss in Label]

In [ ]:
# Pairwise RMS difference between the four region compositions over the
# current Label set, shown as clustered and annotated heatmaps.
AllMean = [CoreMean,ECRMean,SurfMean,InteMean]
MeanLab = ["CoreBinding","Exosite","Surface","Interior"]
AllMSE = dict()
AllComp = dict()
for (arr1,lab1) in zip(AllMean,MeanLab):
    AllComp[lab1] = dict((Label[i], arr1[i]) for i in range(0,len(Label)))
    AllMSE[lab1] = dict()
    for (arr2,lab2) in zip(AllMean,MeanLab):
        rms_diff = np.sqrt(np.mean(np.power(np.array(arr1)-np.array(arr2),2)))
        AllMSE[lab1][lab2] = rms_diff
        print(lab1,lab2,rms_diff)

mse_table = pd.DataFrame(AllMSE)
comp_table = pd.DataFrame(AllComp)
sns.clustermap(mse_table)
plt.show()
sns.heatmap(mse_table,annot=True)
plt.show()
sns.clustermap(comp_table)
plt.show()

# Rows re-ordered to follow Label.
sns.heatmap(comp_table.transpose()[Label].transpose(),annot=True)
plt.show()

In [ ]:
# Record bootstrap mean / std of every region under AllMeanData["Binding Site SS"].
# The printed p-value is a paired t-test between the Surf and Core bootstrap samples.
Color = ["red"]*3+["blue"]*2+["purple"]*4+["black"]*3+["green"]*5+["yellow"]*3

LL= "Binding Site SS"
AllMeanData[LL] = dict()

region_stats = [
    ("ECR", ECRMean, BtECR),
    ("Core", CoreMean, BtCore),
    ("Surf", SurfMean, BtSurf),
    ("Inte", InteMean, BtInte),
]
row_fmt = "%2d %2s P.val=%10.8f Core:%4.2f ECR:%4.2f Surf:%4.2f Inte:%.2f"
for i in range(0,len(Label)):
    lab = Label[i]
    ttest = stats.ttest_rel(BtSurf[lab], BtCore[lab])[1]
    print(row_fmt % (i,lab,ttest,CoreMean[i],ECRMean[i],SurfMean[i],InteMean[i]))
    entry = AllMeanData[LL].setdefault(lab, dict())
    for (region, means, boot) in region_stats:
        entry[region] = [means[i],np.std(boot[lab])]

In [ ]:
# One grouped bar chart per entry of AllMeanData: for every label, one bar per
# region present (Core / ECR / Surf / Inte) with its bootstrap std drawn on top.
figsize(20,8)
# NOTE: this literal list is overwritten on the next line and has no effect.
Test = ["Amino Acid","Stride Classifcation","Solvent Exposure [0-9]","Binding Site","Binding Site SS"]
Test = AllMeanData.keys()
# NOTE(review): rebinding Label here replaces the label list used by bootstrap()
# and earlier cells -- re-run those cells before reusing them.
Label = Test

Order = ["Core","ECR","Surf","Inte"]
Col = ["darkred","darkblue","gray","black"]

pos = 0.0
for test in Test:
    # x position restarts at 1 for every figure
    pos = 0.0
    pos += 1.0
    
    sub = AllMeanData[test]
    #print(len(sub))
    Tlab = sorted(sub)
    # 20 labels is taken to mean "amino acids": use the myAmino ordering instead of alphabetical
    if len(Tlab) == 20:
        Tlab = myAmino
    Xlabpos = []
    
    for t in Tlab:
        Tpos = []
        
        for o in Order:
            if o not in sub[t]:
                continue
            plt.bar([pos],sub[t][o][0],color=Col[Order.index(o)])
            # vertical whisker from the mean up to mean + std
            # NOTE(review): the +0.4 offset presumably assumes left-edge-aligned bars -- confirm for the matplotlib version in use
            plt.plot([pos+0.4,pos+0.4],[sub[t][o][0],sub[t][o][0]+sub[t][o][1]],color="black")
            
            # horizontal cap at mean + std
            plt.plot([pos+0.1,pos+0.7],[sub[t][o][0]+sub[t][o][1]]*2,color="black")
            
            Tpos.append(pos)
            pos += 1.0
        # tick at the centre of this label's group of bars
        Xlabpos.append(np.mean(Tpos)+0.5)
        # one empty slot between groups
        pos += 1
        #print(t)
    #break
    plt.xticks(Xlabpos, Tlab)
    plt.title(test)
    plt.show()

In [ ]:
# List which analyses have been stored in AllMeanData so far.
AllMeanData.keys()

In [ ]:
# Switch the matplotlib style sheet for the figures drawn after this cell.
# NOTE(review): `mpl` is not imported explicitly in the visible cells; presumably it comes from %pylab -- confirm.
mpl.style.use('seaborn-whitegrid')

In [ ]:
# Grouped bar charts for three selected AllMeanData entries, each with its own
# x-axis caption. The keys in Test must match the AllMeanData keys exactly
# (including the "compostion" spelling).
figsize(20,8)
Test = ["Binding Site","Amino Acid compostion","Solvent Exposure [0-9]"]
# NOTE(review): Label is rebound to axis captions here, replacing the label list used by earlier cells.
Label = ["Receptor Amino Acid Composition (%)","Peptide Amino Acid Composition (%)","Solvent Exposure (%)"]
Order = ["Core","ECR","Surf","Inte"]
Col = ["darkred","darkblue","gray","black"]
Color = ["red"]*3+["blue"]*2+["purple"]*4+["black"]*3+["green"]*5+["yellow"]*3
pos = 0.0
for (test,lab) in zip(Test,Label):
    # x position restarts at 1 for every figure
    pos = 0.0
    pos += 1.0
    
    sub = AllMeanData[test]
    #print(len(sub))
    Tlab = sorted(sub)
    # 20 labels is taken to mean "amino acids": use the myAmino ordering instead of alphabetical
    if len(Tlab) == 20:
        Tlab = myAmino
    Xlabpos = []
    
    for t in Tlab:
        Tpos = []
        
        for o in Order:
            if o not in sub[t]:
                continue
            plt.bar([pos],sub[t][o][0],color=Col[Order.index(o)])
            # vertical whisker from the mean up to mean + std, drawn at the bar position
            plt.plot([pos,pos],[sub[t][o][0],sub[t][o][0]+sub[t][o][1]],color="black")
            
            # horizontal cap at mean + std
            plt.plot([pos-0.3,pos+0.3],[sub[t][o][0]+sub[t][o][1]]*2,color="black")
            
            Tpos.append(pos)
            pos += 1.0
        # tick at the centre of this label's group of bars
        Xlabpos.append(np.mean(Tpos))
        # one empty slot between groups
        pos += 1
        #print(t)
    #break
    if test == "Solvent Exposure [0-9]":
        # Replace the raw bin symbols by range captions.
        # NOTE(review): assumes exactly seven bins were stored for this test; xticks needs one caption per tick -- confirm.
        Tlab = ["[0-10[","[10-20[","[20-30[","[30-40[","[40-50[","[50-60[","[60-70["]
        
    plt.xticks(Xlabpos, Tlab)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.xlabel(lab,size=18)
    plt.ylabel("Residue distribution %",rotation=90)
    plt.show()
    #break

In [ ]:


In [ ]:


In [ ]:
#Relationship between buried and aa type
# For three selections ("All" on PixelDB; "Core" and "ECR" on PixelDBecr) build,
# per amino acid, the distribution of the exposure symbol at its positions.
# Every position is weighted by 1/size_of_binding_mode, and exposure symbols
# above 4 are pooled into a "5+" bin. One heatmap is drawn per selection and
# the underlying tables are kept in AllDF.
AllDF = []
for (t,name) in zip(range(0,3),["All","Core","ECR"]):
    Count = dict()
    Tot = dict()
    for aa in myAmino:
        Count[aa] = dict((str(i), 0) for i in range(0,9))
        Tot[aa] = 0
    torun = PixelDB if t == 0 else PixelDBecr
    for (seq,bury,ecr,size) in zip(torun["sequence_alignment"],torun["percent_exposed_alignment"],torun["core_ecr_alignment"],torun["size_of_binding_mode"]):
        weight = 1.0/float(size)
        for i in range(0,len(seq)):
            if seq[i] == "-":
                continue
            # "Core" drops positions marked E; "ECR" drops positions marked C.
            if t == 1 and ecr[i] == "E":
                continue
            if t == 2 and ecr[i] == "C":
                continue
            if seq[i] not in Count:
                # Residue outside myAmino: create both accumulators. Tot was
                # previously left out, so the update below raised KeyError.
                Count[seq[i]] = dict()
                Tot[seq[i]] = 0
            if bury[i] not in Count[seq[i]]:
                Count[seq[i]][bury[i]] = 0
            Count[seq[i]][bury[i]] += weight
            Tot[seq[i]] += weight
    NormC = dict()
    for aa in Tot:
        NormC[aa] = dict()
        print(aa,Tot[aa])
        for b in Count[aa]:
            bt = b
            if int(bt) > 4:
                bt = "5+"
            if bt not in NormC[aa]:
                NormC[aa][bt] = 0
            # Rounded percentage; a residue that never occurs contributes 0
            # instead of raising ZeroDivisionError.
            if Tot[aa]:
                NormC[aa][bt] += int(float(Count[aa][b]) / float(Tot[aa])*100.0+0.5)
    print(name)
    figsize(8,8)
    df = pd.DataFrame(NormC)
    # Keep only the 20 standard residues, in myAmino order.
    df = df[myAmino]
    AllDF.append(df)
    sns.heatmap(df[myAmino],vmax=70,annot=True)
    plt.title(name )
    plt.ylabel("Amino acid binned exposure distribution")
    plt.show()

In [ ]:
# Element-wise difference between the second and third tables appended to AllDF
# (named "Core" and "ECR" where AllDF is built).
sns.heatmap(AllDF[1]-AllDF[2],annot=True)

In [ ]:
# peptide_length against longest_continuous_ecr, restricted to the PixelDBoecr subset.
plt.scatter(PixelDBoecr["peptide_length"],PixelDBoecr["longest_continuous_ecr"])
plt.xlabel("Peptide Length")
plt.ylabel("ECR Length")
plt.show()

In [ ]:
# peptide_length against longest_continuous_ecr for every PixelDB entry.
plt.scatter(PixelDB["peptide_length"],PixelDB["longest_continuous_ecr"])
plt.xlabel("Peptide Length")
plt.ylabel("ECR Length")
plt.show()

In [ ]:
# Mean / min / max and a 25-bin histogram of peptide_length for the PixelDBecr
# rows whose longest_continuous_ecr is at least 4.
ecr_peptide_lengths = PixelDBecr[PixelDBecr["longest_continuous_ecr"]>=4]["peptide_length"]
print(ecr_peptide_lengths.mean())
print(ecr_peptide_lengths.min())
print(ecr_peptide_lengths.max())
plt.hist(ecr_peptide_lengths.values,25)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.xlabel("Peptide Length")
plt.show()

In [ ]:
# Nested counter: Count[longest_continuous_ecr][peptide_length] = number of PixelDBecr rows.
Count = dict()
for (ecr,pl) in zip(PixelDBecr["longest_continuous_ecr"],PixelDBecr["peptide_length"]):
    per_ecr = Count.setdefault(ecr, dict())
    per_ecr[pl] = per_ecr.get(pl, 0) + 1

In [ ]:
# Heatmap of the nested Count dictionary; combinations that never occur are filled with 0.
# NOTE(review): Count is a name reused by other cells -- this figure shows whichever cell assigned it last.
sns.heatmap(pd.DataFrame(Count).fillna(value=0).transpose(),annot=True)
plt.xlabel("Peptide Length")
plt.ylabel("ECR Length")
plt.show()

In [ ]:
#Find longest ecr
# Five entries with the largest longest_continuous_ecr.
# DataFrame.sort() has been removed from pandas; sort_values() is the replacement.
PixelDB.sort_values(["longest_continuous_ecr"],ascending=False).head(5)[["name","size_of_binding_mode","unique_id","receptor_length","peptide_length"]]

In [ ]:
#Search for traf
# Print every entry whose CATH annotation contains the identifier below.
tolook = "2.60.210.10"
# Escape the identifier so the dots match literally instead of "any character".
pattern = re.compile(re.escape(tolook))
for (v,uniid,seq) in zip(np.array(PixelDB["CATH"]),np.array(PixelDB["unique_id"]),np.array(PixelDB["sequence_alignment"])):
    # Missing annotation. The former test `v == nan` was always False because
    # NaN never compares equal to anything, itself included.
    if pd.isnull(v):
        continue
    if pattern.search(str(v)):
        print(v,uniid,seq)

In [ ]:
#ECR contact to receptor
# Over all PixelDB entries: count positions marked "e" or "E" in
# core_ecr_alignment (tot) and how many of them have an exposure symbol
# of at most myCut (low); print both and their ratio.
myCut = 3
tot = 0
low = 0
for (exp,seq) in zip(np.array(PixelDB["percent_exposed_alignment"]),np.array(PixelDB["core_ecr_alignment"])):
    for i in range(0,len(exp)):
        if seq[i] not in ("e", "E"):
            continue
        tot += 1
        if int(exp[i]) <= myCut:
            low += 1
print(tot,low)
print(float(low)/float(tot))

In [ ]:
# Show the column names available in PixelDB.
PixelDB.columns

In [ ]:
# Load the table with columns PDB, Chain, CP and Tot_cont.
# The separator is written as a raw string: "\s" is an invalid escape sequence
# in a normal Python 3 literal, while r"\s" hands pandas the same regex
# (a single whitespace character).
# NOTE(review): if the file pads columns with several spaces, r"\s+" is probably what is wanted -- confirm against the file.
carac = pd.read_table("/media/vince/Postdoc/PixelDB/PixelDB/other_files/AllCp.dat",names=["PDB","Chain","CP","Tot_cont"],delimiter=r"\s")

In [ ]:
# 20-bin histogram of the CP column, scaled by 100 to read as a percentage.
figsize(10,10)
plt.hist(carac["CP"]*100,20)
plt.xlabel("Peptide surface contacting a symmetry-related complex (%)", fontsize=18 )
plt.ylabel("Count", fontsize=18)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()

In [ ]:
# For each threshold i = 1..99, plot the fraction of carac rows whose CP*100 exceeds i
# (one scatter call per threshold).
for i in range(1,100):
    plt.scatter(i,np.sum(carac["CP"]*100 > i)/float(len(carac["CP"])))

In [ ]:
# For a few thresholds print: threshold, fraction of rows with CP*100 above it, and the row count.
for i in [5,10,15,20,50]:
    print(i,np.sum(carac["CP"]*100 > i)/float(len(carac["CP"])),np.sum(carac["CP"]*100 > i))

In [ ]:
#Most frequent CATH in receptor clusters with multiple binding modes
# For every receptor cluster with at least 10 distinct binding modes, print the
# cluster number, the number of binding modes and the CATH identifiers found
# in its annotations ("_"-separated inside one annotation).
for bla in list(np.unique(PixelDB["cluster_number"])):
    sdf = PixelDB[PixelDB["cluster_number"] == bla]
    n_modes = len(np.unique(sdf["unique_id"]))
    if n_modes < 10:
        continue
    UniqueCATH = dict()
    # dropna() removes missing annotations before de-duplicating, which also
    # avoids sorting a mix of strings and NaN.
    for uniid in sdf["CATH"].dropna().unique():
        if str(uniid) == "nan":
            continue
        for v in uniid.split("_"):
            # Membership must be tested on UniqueCATH itself; the old test used
            # UniquePFAM (a dictionary from another cell), which made the
            # increment below fail or depend on unrelated state.
            if v not in UniqueCATH:
                UniqueCATH[v] = 0
            UniqueCATH[v] += 1
    print(bla,n_modes,list(set(UniqueCATH)))

In [ ]:
#Does receptor cluster with multiple binding mode have more CATH
# One point per receptor cluster: number of distinct binding modes (x) against
# number of distinct CATH identifiers in its annotations (y).
for bla in list(np.unique(PixelDB["cluster_number"])):
    sdf = PixelDB[PixelDB["cluster_number"] == bla]
    UniqueCATH = dict()
    # dropna() removes missing annotations before de-duplicating, which also
    # avoids sorting a mix of strings and NaN.
    for uniid in sdf["CATH"].dropna().unique():
        if str(uniid) == "nan":
            continue
        for v in uniid.split("_"):
            # Membership must be tested on UniqueCATH itself; the old test used
            # UniquePFAM (a dictionary from another cell).
            if v not in UniqueCATH:
                UniqueCATH[v] = 0
            UniqueCATH[v] += 1
    plt.scatter(len(np.unique(sdf["unique_id"])),len(list(set(UniqueCATH))))
# Axis captions only need to be set once, after all points are drawn.
plt.xlabel("Number of binding mode")
plt.ylabel("Number of CATH")

In [ ]:
# Sorted list of the distinct unique_id values present in PixelDBoecr.
sorted(list(np.unique(PixelDBoecr["unique_id"])))

In [ ]:
# Print the binding-mode ids of every receptor cluster that has at least five of them.
for clu in list(np.unique(PixelDB["cluster_number"])):
    sdf = PixelDB[PixelDB["cluster_number"] == clu]
    mode_ids = np.unique(sdf["unique_id"])
    if len(mode_ids) >= 5:
        print(list(mode_ids))

In [ ]:
# Inspection only.
# NOTE(review): `sdf` is whatever the most recently executed cell left in the namespace, so this output depends on execution order.
np.unique(sdf["unique_id"])

In [ ]:
import glob

In [ ]:
#Get all simil Matrix
# Read every per-cluster similarity matrix and collect, over the strictly
# upper-triangular pairs with a value > 0:
#   AllPair        - all pair values pooled together
#   AverageSeqIden - mean pair value per matrix
#   Fract          - fraction of pairs above 0.7 per matrix
#   NormCount      - histogram in steps of 0.05 where each matrix contributes
#                    a total weight of 1
AllPair = []
NormCount = dict()
AllK = []
Fract = []
AverageSeqIden = []
for i in range(0,21):
    myK = float(i)/float(20)
    AllK.append(myK)
    NormCount[myK] = 0
for f in glob.glob('/media/vince/Postdoc/PixelDB/PixelDB/clusters/*/*_simil.CSV'):
    # Raw string: "\s" is an invalid escape sequence in a normal Python 3 literal.
    sdf = pd.read_table(f,sep=r"\s")
    if len(sdf) == 1:
        continue
    # Work on a float copy and select the pairs above the diagonal directly.
    # This replaces writing -1 into sdf.values in place (not guaranteed to
    # modify the frame) and masking sdf.stack(), which drops NaN cells and can
    # get out of step with the mask.
    simil = np.array(sdf.values, dtype=float)
    above_diag = np.triu(np.ones(simil.shape), k=1).astype('bool')
    pair_vals = simil[above_diag]
    allval = list(pair_vals[pair_vals > 0])
    if len(allval) == 0:
        # No usable pair: skip instead of dividing by zero below.
        continue
    AllPair += allval
    AverageSeqIden.append(np.mean(allval))
    toadd = 1.0 / float(len(allval))
    Fract.append(np.sum(np.array(allval) > 0.7) / float(len(allval)))
    for v in allval:
        # Lower edge of the 0.05-wide bin containing v.
        myK = int(v*20.0)/20.0
        NormCount[myK] += toadd

In [ ]:
# Mean of the per-matrix averages, then their histogram scaled by 100.
print(np.mean(AverageSeqIden))
plt.hist(np.array(AverageSeqIden)*100)
plt.xlabel("Average sequence idendity in receptor cluster (%)", fontsize=18 )
plt.ylabel("Count", fontsize=18)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()

In [ ]:
# Mean, median, minimum and maximum of the per-matrix average similarity.
print(np.mean(AverageSeqIden),np.median(AverageSeqIden),np.min(AverageSeqIden),np.max(AverageSeqIden))

In [ ]:
# 20-bin histogram of Fract: per matrix, the fraction of pairs with a value above 0.7.
plt.hist(Fract,20)
plt.title("Distribution of receptor within cluster >70% seq idendity")
plt.show()

In [ ]:
# Bar chart of the per-matrix-normalised histogram, bins in AllK order, x scaled by 100.
AllV = [NormCount[myK] for myK in AllK]
plt.bar(np.array(AllK)*100,AllV)

In [ ]:


In [ ]:
# 30-bin histogram of all pooled pair values, scaled by 100.
plt.hist(np.array(AllPair)*100,30)
plt.xlabel("Pairwise sequence identidy in cluster receptor of +2 complexes")
plt.ylabel("Count")
plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()

In [ ]:
# For every receptor cluster with more than one binding mode, look up the
# similarity of every receptor pair and split the values into
# "same binding mode" and "different binding modes":
#   AllIn  - per cluster, fraction of same-mode pairs above 0.7
#   AllBe  - per cluster, fraction of between-mode pairs above 0.7
#   AllBetweenBindingMode - all between-mode values pooled
#
# NOTE(review): `torun` is not defined in this cell; it is whatever an earlier
# cell left in the namespace -- confirm which table is intended.
AllIn = []
AllBe = []

AllBetweenBindingMode =[]

for clu in list(np.unique(torun["cluster_number"])):
    sdf = torun[torun["cluster_number"] == clu]
    if (len(sdf["unique_id"].value_counts()) == 1):
        continue

    simil_files = glob.glob("/media/vince/Postdoc/PixelDB/PixelDB/clusters/"+str(clu)+"/*_simil.CSV")
    if not simil_files:
        # Without this guard the matrix of the previously processed cluster
        # was silently reused.
        continue
    # As before, the last matching file wins. Raw string: "\s" is an invalid
    # escape sequence in a normal Python 3 literal.
    simildf = pd.read_table(simil_files[-1],sep=r"\s")

    allunid = list(np.unique(sdf["unique_id"]))
    InBindingMode = []
    BetweenMode = []
    for i in range(0,len(allunid)):
        df1 = sdf[sdf["unique_id"] == allunid[i]]
        # Matrix labels are built as "<pdb_id>_<receptor_chain>".
        id1 = df1["pdb_id"]+"_"+df1["receptor_chain"]
        for j in range(i,len(allunid)):
            df2 = sdf[sdf["unique_id"] == allunid[j]]
            id2 = df2["pdb_id"]+"_"+df2["receptor_chain"]
            # Column i1, row i2 of the similarity matrix.
            allval = [simildf[i1][i2] for i1 in id1 for i2 in id2]
            if i == j:
                InBindingMode += allval
            else:
                BetweenMode += allval
                AllBetweenBindingMode += allval
    fract = np.sum(np.array(InBindingMode) > 0.7) / float(len(InBindingMode))
    AllIn.append(fract)
    fract = np.sum(np.array(BetweenMode) > 0.7) / float(len(BetweenMode))
    AllBe.append(fract)

In [ ]:
# Inspection only: sub-matrix of simildf for the id1 columns and id2 rows.
# NOTE(review): simildf, id1 and id2 are leftovers from the last loop iteration of another cell.
simildf[id1].transpose()[id2]

In [ ]:
# Histogram of all pooled between-binding-mode similarity values.
plt.hist(AllBetweenBindingMode)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()

In [ ]:
# For thresholds 0.80 to 1.00 in steps of 0.01, count the between-mode values strictly above the threshold.
for i in range(80,101):
    print(i,np.sum(np.array(AllBetweenBindingMode) > (float(i)/100)))

In [ ]:
# Per-cluster fraction of pairs above 0.7: within a binding mode (AllIn) and
# between binding modes (AllBe), as 20-bin histograms on a 0..1 axis, followed by their means.
plt.hist(AllIn,20)
plt.xlim((0,1))
plt.title("In binding mode")
plt.show()
plt.hist(AllBe,20)
plt.title("Between binding mode")
plt.xlim((0,1))
plt.show()
print("Average fraction >70% in Binding Mode",np.mean(AllIn))
print("Average fraction >70%  Between Binding Mode",np.mean(AllBe))

In [ ]:
# Show the column names available in PixelDB.
PixelDB.columns

In [ ]:
# Count the core/ECR arrangement patterns over all PixelDB entries: gaps are
# removed from core_ecr_alignment and every run of the same symbol among
# C, E, c, e is collapsed to a single character (e.g. "EECCC" -> "EC").
AllArrang = dict()
for v in np.array(PixelDB["core_ecr_alignment"]):
    # One back-reference substitution collapses all runs at once; repeating
    # four separate substitutions until nothing changes gave the same result.
    v = re.sub(r'([CEce])\1+', r'\1', v.replace("-",""))
    AllArrang[v] = AllArrang.get(v, 0) + 1
# Most frequent pattern first. print() is called as a function, like everywhere
# else in this notebook, so the cell also parses under Python 3.
for w in sorted(AllArrang, key=AllArrang.get, reverse=True):
    print(w, AllArrang[w])

In [ ]:
# Same arrangement count restricted to PixelDBoecr; prints each gap-free
# alignment next to its collapsed pattern.
AllArrang = dict()
for raw in np.array(PixelDBoecr["core_ecr_alignment"]):
    vinit = raw.replace("-","")
    # Collapse every run of identical C / E / c / e symbols to one character.
    v = re.sub(r'([CEce])\1+', r'\1', vinit)
    print(vinit,v)
    AllArrang[v] = AllArrang.get(v, 0) + 1

In [ ]:
# Sum the counts of all arrangement patterns that start or end with "E",
# printing each of them, most frequent first.
tot = 0
for w in sorted(AllArrang, key=AllArrang.get, reverse=True):
    # startswith/endswith are safe on an empty pattern, where w[0] would raise.
    if w.startswith("E") or w.endswith("E"):
        tot += AllArrang[w]
        # print() as a function, consistent with the other cells and valid in Python 3.
        print(w, AllArrang[w])

In [ ]:
# Compare the accumulated `tot` with the number of rows in PixelDBoecr.
# NOTE(review): `tot` is also assigned by another cell; this line shows whichever ran last.
print(tot,len(PixelDBoecr))

In [ ]:
# Arrangement patterns for receptor cluster 1 only: gaps are removed from
# core_ecr_alignment and every run of the same symbol among C, E, c, e is
# collapsed to a single character.
AllArrang = dict()
sdf = PixelDB[PixelDB["cluster_number"] == 1]
for v in np.array(sdf["core_ecr_alignment"]):
    # One back-reference substitution collapses all runs at once.
    v = re.sub(r'([CEce])\1+', r'\1', v.replace("-",""))
    AllArrang[v] = AllArrang.get(v, 0) + 1
# Most frequent pattern first; print() as a function so the cell parses under Python 3.
for w in sorted(AllArrang, key=AllArrang.get, reverse=True):
    print(w, AllArrang[w])

In [ ]:


In [ ]:
#Amino acid composition
# Merge the bootstrap means of two AllMeanData entries into one table:
# AllAAComb[amino acid][category] = mean. The "Binding Site" regions are
# renamed so they do not collide with the "Core"/"ECR" keys of the second entry.
region_names = {
    "Core": "Core-binding",
    "ECR": "Exosite",
    "Inte": "Interior",
    "Surf": "NISR",
}
AllAAComb = dict()

sub = AllMeanData["Binding Site"]
for aa in sub:
    row = AllAAComb.setdefault(aa, dict())
    for t in sub[aa]:
        row[region_names.get(t, t)] = sub[aa][t][0]

sub = AllMeanData["Amino Acid compostion"]
for aa in sub:
    row = AllAAComb.setdefault(aa, dict())
    for t in sub[aa]:
        row[t] = sub[aa][t][0]

In [ ]:
# Table with one row per amino acid (myAmino order) and the six composition categories as columns.
df = pd.DataFrame(AllAAComb)[myAmino].transpose()[["Core","ECR","Core-binding","Exosite","NISR","Interior"]]

In [ ]:
# Display the current contents of df.
df

In [ ]:
# Column order used when plotting the composition table.
LabelOr = ["Core","ECR","Core-binding","Exosite","NISR","Interior"]

In [ ]:
# Annotated heatmap of df in the fixed LabelOr column order; clustering is
# disabled on both axes, so clustermap is used purely for its layout.
# NOTE: colcol is only referenced by the commented-out call below.
colcol = ["red","blue"]+["gray"]*4
LabelOr = ["Core","ECR","Core-binding","Exosite","NISR","Interior"]
sns.set(font_scale=1.4)
#sns.clustermap(df,row_cluster=False,row_colors = Color,col_colors=colcol)
pal = sns.light_palette("navy", as_cmap=True)
sns.clustermap(df[LabelOr],row_cluster=False,col_cluster=False,cmap=pal,figsize=(12,12),linewidths=.5,annot=True)

plt.show()

In [ ]:
# Annotated clustermap of df: rows keep their order, columns are clustered with
# the "Correlation" metric; cell values are printed with one decimal.
# NOTE: colcol is only referenced by the commented-out call below.
colcol = ["red","blue"]+["gray"]*4
sns.set(font_scale=1.9)
#sns.clustermap(df,row_cluster=False,row_colors = Color,col_colors=colcol)
pal = sns.light_palette("navy", as_cmap=True)
sns.clustermap(df,row_cluster=False,cmap=pal,figsize=(12,12),linewidths=.5,annot=True,metric="Correlation", fmt=".1f")

plt.show()

In [ ]:


In [ ]:
# Grouped bar charts for three selected AllMeanData entries; only the
# horizontal cap at mean + std is drawn (the vertical whisker is commented out).
# The keys in Test must match the AllMeanData keys exactly (including the
# "compostion" spelling).
figsize(20,8)
Test = ["Binding Site","Amino Acid compostion","Solvent Exposure [0-9]"]
# NOTE(review): Label is rebound to axis captions here, replacing the label list used by earlier cells.
Label = ["Receptor Amino Acid Composition (%)","Peptide Amino Acid Composition (%)","Solvent Exposure (%)"]
Order = ["Core","ECR","Surf","Inte"]
Col = ["darkred","darkblue","gray","black"]
Color = ["red"]*3+["blue"]*2+["purple"]*4+["black"]*3+["green"]*5+["yellow"]*3
pos = 0.0
for (test,lab) in zip(Test,Label):
    # x position restarts at 1 for every figure
    pos = 0.0
    pos += 1.0
    
    sub = AllMeanData[test]
    #print(len(sub))
    Tlab = sorted(sub)
    # 20 labels is taken to mean "amino acids": use the myAmino ordering instead of alphabetical
    if len(Tlab) == 20:
        Tlab = myAmino
    Xlabpos = []
    
    for t in Tlab:
        Tpos = []
        
        for o in Order:
            if o not in sub[t]:
                continue
            plt.bar([pos],sub[t][o][0],color=Col[Order.index(o)])
            #plt.plot([pos+0.4,pos+0.4],[sub[t][o][0],sub[t][o][0]+sub[t][o][1]],color="black")
            
            # horizontal cap at mean + std
            # NOTE(review): the +0.1/+0.7 offsets presumably assume left-edge-aligned bars -- confirm for the matplotlib version in use
            plt.plot([pos+0.1,pos+0.7],[sub[t][o][0]+sub[t][o][1]]*2,color="black")
            
            Tpos.append(pos)
            pos += 1.0
        # tick at the centre of this label's group of bars
        Xlabpos.append(np.mean(Tpos)+0.5)
        # one empty slot between groups
        pos += 1
        #print(t)
    #break
    if test == "Solvent Exposure [0-9]":
        # Replace the raw bin symbols by range captions.
        # NOTE(review): assumes exactly seven bins were stored for this test; xticks needs one caption per tick -- confirm.
        Tlab = ["[0-10[","[10-20[","[20-30[","[30-40[","[40-50[","[50-60[","[60-70["]
        
    plt.xticks(Xlabpos, Tlab)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.xlabel(lab,size=18)
    
    plt.show()
    #break