In [1]:
import pandas as pd
import numpy as np
import re
from scipy import stats
import seaborn as sns



%pylab inline
%matplotlib inline


Populating the interactive namespace from numpy and matplotlib

In [2]:
myAmino = ["R","H","K","D","E","S","T","N","Q","C","G","P","A","V","I","L","M","F","Y","W"]

In [4]:
#Load PixelDB
path = "/media/vince/Postdoc/PixelDB/PixelDB/"
PixelDB = pd.read_csv(path+"PixelDB.csv")
len(PixelDB["name"])


Out[4]:
1976

In [5]:
print("Entries in DB:",len(PixelDB["cluster_number"]))
print("Receptor Cluster",len(PixelDB["cluster_number"].value_counts()))
print("Unique Binding Mode",len(PixelDB["unique_id"].value_counts()))
print("Binding mode with 2 plus peptide",np.sum((PixelDB["unique_id"].value_counts()) >= 2))


('Entries in DB:', 1976)
('Receptor Cluster', 479)
('Unique Binding Mode', 702)
('Binding mode with 2 plus peptide', 271)

In [ ]:
#Some statistics on PixelDB

In [ ]:
# Keep only binding modes (unique_id) where at least one member complex has
# both a continuous core run and a continuous ECR run longer than 3 residues.
PixelDBecr = PixelDB.copy()
for uniid in list(np.unique(PixelDB["unique_id"])):
    sdf = PixelDB[PixelDB["unique_id"] == uniid]
    if not np.any((np.array(sdf["longest_continuous_core"]) > 3) & (np.array(sdf["longest_continuous_ecr"]) > 3)):
        PixelDBecr = PixelDBecr[PixelDBecr["unique_id"] != uniid]
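
The same selection can be written as a pandas groupby filter (a sketch under the same column names; PixelDBecr_alt is a hypothetical name, not part of the original run):

In [ ]:
# Sketch: keep binding modes where any member has core and ECR runs > 3 residues.
def has_long_core_and_ecr(group):
    return ((group["longest_continuous_core"] > 3) &
            (group["longest_continuous_ecr"] > 3)).any()

PixelDBecr_alt = PixelDB.groupby("unique_id").filter(has_long_core_and_ecr)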

In [ ]:
PixelDBoecr = PixelDB[PixelDB["longest_continuous_ecr"] > 3]

In [ ]:


In [ ]:
print("Unique Binding Mode",len(PixelDBecr["unique_id"].value_counts()))
print("Exosite and ECR complexes",np.sum(PixelDBecr["longest_continuous_ecr"] > 3))

In [ ]:


In [ ]:
#Is there a link between CATH/PFAM/UniProt annotations and binding modes?

for ikl in range(0,2):
    torun = PixelDB
    if ikl == 1:
        print("ECR")
        torun = PixelDBecr
    else:
        print("Full")

    
    
    

    for test in ["PFAM","uniprot","CATH"]:
        CATHdbOver = []
        BindingMode = []
        for uniid in list(np.unique(torun["cluster_number"])):
            sdf = torun[torun["cluster_number"] == uniid]



            BindingMode.append(len(sdf["unique_id"].value_counts()))
            if (len(sdf["unique_id"].value_counts()) == 1):
                continue
            AllCATH = []



            AllCATHUni = []
            for unid in np.unique(sdf["unique_id"]):

                CATHuni = []
                for cid in sdf[sdf["unique_id"] == unid][test]:
                    #print(cid.split("_"))
                    if str(cid) != "nan":
                        CATHuni += cid.split("_")
                        AllCATHUni += cid.split("_")

                AllCATH.append(list(set(CATHuni)))

            Tot = 0
            Over = 0

            for i in range(0,len(AllCATH)):
                for j in range(i+1,len(AllCATH)):
                    OverT = 0
                    for v in AllCATH[i]:
                        if v in AllCATH[j]:
                            OverT += 1
                    if (OverT != 0):
                        Over += 1
                    #else:
                    #    print(i,j,AllCATH[i],AllCATH[j])
                    Tot += 1
            CATHdbOver.append(float(Over)/float(Tot))
            #if (CATHdbOver[-1] < 0.5):
            #    print(uniid,Over,Tot)
            #    print(AllCATH)



            #if len(sdf["unique_id"].value_counts()) > 6:
            #    print(uniid,len(sdf["unique_id"].value_counts()))
            #    print(pd.Series(AllCATHUni).value_counts())
            #else:
            #    break


            #break
            #          
            #break
        print(len(CATHdbOver))
        print("Average Overlap of:",test,np.mean(CATHdbOver))
        print("Median Overlap of:",test,np.median(CATHdbOver))
        print("Average Nb of Binding mode:",np.mean(BindingMode))
        print("Median Nb of Binding mode:",np.median(BindingMode))
        print(np.min(BindingMode),np.max(BindingMode))
        
        print("More than one binding mode",np.sum(np.array(BindingMode)  > 1))
        print("Total cluster",len(BindingMode))
        
        print()
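
The per-cluster overlap computed above reduces to a small helper (a sketch; annotation_sets corresponds to the AllCATH list built in the loop):

In [ ]:
# Sketch: fraction of binding-mode pairs that share at least one annotation term.
def pair_overlap_fraction(annotation_sets):
    n = len(annotation_sets)
    pairs = 0
    shared = 0
    for i in range(n):
        for j in range(i + 1, n):
            pairs += 1
            if set(annotation_sets[i]) & set(annotation_sets[j]):
                shared += 1
    return float(shared) / pairs if pairs else 0.0

# Toy example: only the first pair shares a term, so this prints 0.333...
print(pair_overlap_fraction([["1.10.10.10"], ["1.10.10.10", "2.60.40.10"], ["3.30.70.330"]]))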

In [ ]:
#How often is the same receptor chain (same PDB) bound by multiple peptides?

for ikl in range(0,2):
    torun = PixelDB
    if ikl == 1:
        print("ECR")
        torun = PixelDBecr
    else:
        print("Full")
    tot = 0
    AllCluster = []
    for uniid in list(np.unique(torun["pdb_id"])):
        sdf = torun[torun["pdb_id"] == uniid]
        if (len(sdf) == 1):
            continue
        for ch in list(np.unique(sdf["receptor_chain"])):
            ssdf = sdf[sdf["receptor_chain"] == ch]
            if (len(ssdf) == 1):
                continue
            print(uniid,ch,len(ssdf),np.unique(sdf["cluster_number"]))
            AllCluster += list(np.unique(sdf["cluster_number"]))
            tot += 1
        #break
    print(len(list(set(AllCluster))))
    print("Same receptor engage in multiple peptide binding:",tot)
    #break

In [ ]:
sdf

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:
#Receptor sequence identity within vs. between binding modes

for ikl in range(0,2):
    torun = PixelDB
    if ikl == 1:
        print("ECR")
        torun = PixelDBecr
    else:
        print("Full")
    inbm = []
    outbm = []
    for uniid in list(np.unique(torun["cluster_number"])):
        sdf = torun[torun["cluster_number"] == uniid]
        sdf = sdf[sdf["mean_seq_iden_in_bm"] > 0]
        sdf = sdf[sdf["mean_seq_iden_not_bm"] > 0]
        
        
        
        if (len(sdf) == 0):
            continue
            
        #die
        if (len(sdf["unique_id"].value_counts()) == 1):
            continue
        plt.scatter(sdf["mean_seq_iden_in_bm"]*100,sdf["mean_seq_iden_not_bm"]*100)
        inbm.append(np.mean(sdf["mean_seq_iden_in_bm"]))
        outbm.append(np.mean(sdf["mean_seq_iden_not_bm"]))
        #print(inbm[-1],uniid)
        #if uniid == 7:
        #    die
        
    plt.xlabel("Mean sequence receptor within Binding mode")
    plt.ylabel("Mean sequence receptor not in Binding mode")
    plt.plot([-5,105],[-5,105])
    plt.xlim((-5,105))
    plt.ylim((-5,105))
    plt.show()
    #break
    print("Average seq iden in and out",np.mean(inbm),np.mean(outbm),len(inbm))
    plt.hist(inbm,20)
    plt.title("In binding mode")
    plt.show()
    plt.hist(outbm,20)
    plt.title("Between binding mode")
    plt.show()
    print("Median seq iden in and out",np.median(inbm),np.median(outbm),len(inbm))
    print("Max and min seq iden ",np.min(inbm),np.max(inbm),len(inbm))
    
    figsize(10,10)
    plt.hist(np.array(inbm)*100)
    plt.xlim([0,100])
    plt.xlabel("Average sequence idendity in binding mode (%)", fontsize=18 )
    plt.ylabel("Count", fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.show()
    
    plt.hist(np.array(outbm)*100)
    plt.xlim([0,100])
    plt.xlabel("Average sequence idendity between binding mode (%)", fontsize=18 )
    plt.ylabel("Count", fontsize=18)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.show()
    
    break

In [ ]:
#sdf["unique_id"].value_counts()

In [ ]:


In [ ]:
for ikl in range(0,3):
    torun = PixelDB
    if ikl == 0:
        print("Full")
    if ikl == 1:
        torun = PixelDBecr
        print("ECR")
    if ikl == 2:
        torun = PixelDBoecr
        print("ECRdist")

    UniquePFAM = []
    UniqueUnip = []
    UniqueCATH = []
        
    UniprotPerBindingMode = []
    CATHPerBindingMode = []
    PFAMPerBindingMode = []

    StrPerBindingMode = []

    for uniid in list(np.unique(torun["unique_id"])):
        sdf = torun[torun["unique_id"] == uniid]
        #print(sdf["sequence_alignment"])
        
        StrPerBindingMode.append(len(sdf))
        
        Uniuni = []
        for cid in sdf["uniprot"]:
            #print(cid.split("_"))
            if str(cid) != "nan":
                Uniuni += cid.split("_")
        UniprotPerBindingMode.append(len(list(set(Uniuni))))
        
        
        CATHuni = []
        for cid in sdf["CATH"]:
            #print(cid.split("_"))
            if str(cid) != "nan":
                CATHuni += cid.split("_")
        CATHPerBindingMode.append(len(list(set(CATHuni))))

        PFAMuni = []
        for cid in sdf["PFAM"]:
            #print(cid.split("_"))
            if str(cid) != "nan":
                PFAMuni += cid.split("_")
        PFAMPerBindingMode.append(len(list(set(PFAMuni))))

            #print("Binding mode with ECR and Core >= 4",len(PixelDBecr["unique_id"].value_counts()))
    print("Unique Binding Mode",len(np.unique(torun["unique_id"])))
    print("Strc Per Binding Mode",np.mean(StrPerBindingMode),np.median(StrPerBindingMode),np.min(StrPerBindingMode),np.max(StrPerBindingMode))
    print("Uniprot Per Binding Mode",np.mean(UniprotPerBindingMode),np.median(UniprotPerBindingMode),np.min(UniprotPerBindingMode),np.max(UniprotPerBindingMode))
    print("CATH Per Binding Mode",np.mean(CATHPerBindingMode),np.median(CATHPerBindingMode),np.min(CATHPerBindingMode),np.max(CATHPerBindingMode))
    print("PFAM Per Binding Mode",np.mean(PFAMPerBindingMode),np.median(PFAMPerBindingMode),np.min(PFAMPerBindingMode),np.max(PFAMPerBindingMode))

In [ ]:
for ikl in range(0,3):
    torun = PixelDB
    if ikl == 0:
        print("Full")
    if ikl == 1:
        torun = PixelDBecr
        print("ECR")
    if ikl == 2:
        torun = PixelDBoecr
        print("ECRdist")

    UniquePFAM = []
    UniqueUnip = []
    UniqueCATH = []
        
    UniprotPerBindingMode = []
    CATHPerBindingMode = []
    PFAMPerBindingMode = []

    StrPerBindingMode = []

    for uniid in list(np.unique(torun["cluster_number"])):
        sdf = torun[torun["cluster_number"] == uniid]
        #print(sdf["sequence_alignment"])
        
        StrPerBindingMode.append(len(sdf))
        
        Uniuni = []
        for cid in sdf["uniprot"]:
            #print(cid.split("_"))
            if str(cid) != "nan":
                Uniuni += cid.split("_")
        UniprotPerBindingMode.append(len(list(set(Uniuni))))
        
        
        CATHuni = []
        for cid in sdf["CATH"]:
            #print(cid.split("_"))
            if str(cid) != "nan":
                CATHuni += cid.split("_")
        CATHPerBindingMode.append(len(list(set(CATHuni))))

        PFAMuni = []
        for cid in sdf["PFAM"]:
            #print(cid.split("_"))
            if str(cid) != "nan":
                PFAMuni += cid.split("_")
        PFAMPerBindingMode.append(len(list(set(PFAMuni))))

            #print("Binding mode with ECR and Core >= 4",len(PixelDBecr["unique_id"].value_counts()))
    print("Unique Cluster",len(np.unique(torun["cluster_number"])))
    print("Strc Per Cluster",np.mean(StrPerBindingMode),np.median(StrPerBindingMode),np.min(StrPerBindingMode),np.max(StrPerBindingMode))
    print("Uniprot Per Cluster",np.mean(UniprotPerBindingMode),np.median(UniprotPerBindingMode),np.min(UniprotPerBindingMode),np.max(UniprotPerBindingMode))
    print("CATH Per Cluster",np.mean(CATHPerBindingMode),np.median(CATHPerBindingMode),np.min(CATHPerBindingMode),np.max(CATHPerBindingMode))
    print("PFAM Per Cluster",np.mean(PFAMPerBindingMode),np.median(PFAMPerBindingMode),np.min(PFAMPerBindingMode),np.max(PFAMPerBindingMode))

In [ ]:
len(PixelDBecr)

In [ ]:
for cid in PixelDBecr["CATH"]:
    if str(cid) != "nan":
        CATHuni += cid.split("_")
    
CATHPerBindingMode.append(len(list(set(CATHuni))))

In [ ]:
UniprotPerBindingModeECR = []
CATHPerBindingModeECR = []
PFAMPerBindingModeECR = []



for uniid in list(np.unique(PixelDBecr["unique_id"])):
    sdf = PixelDBecr[PixelDBecr["unique_id"] == uniid]
    print(sdf["sequence_alignment"])
    UniprotPerBindingMode.append(np.sum(np.unique(sdf["uniprot"]) != "nan"))
    CATHuni = []
    for cid in sdf["CATH"]:
        #print(cid.split("_"))
        if str(cid) != "nan":
            CATHuni += cid.split("_")
    CATHPerBindingModeECR.append(len(list(set(CATHuni))))
    
    PFAMuni = []
    for cid in sdf["PFAM"]:
        #print(cid.split("_"))
        if str(cid) != "nan":
            PFAMuni += cid.split("_")
    PFAMPerBindingModeECR.append(len(list(set(PFAMuni))))
    #print(list(set(CATHuni)))
    #die

In [ ]:
plt.hist(CATHPerBindingMode,100)
plt.show()

In [ ]:
#Load the PFAM/UniProt mapping (pdbmap)
pdbmap_name = ["PDB","Chain","unk","name","PFAM","uniprot","range"]

pdbmap = pd.read_table(path+"other_files/pdbmap",names=pdbmap_name,delimiter="\t")
pdbmap = pdbmap.drop("unk",axis=1)
pdbmap_name = list(pdbmap.columns.values)
for c in pdbmap_name:
    pdbmap[c] = pdbmap[c].str.replace(';', '')

In [ ]:
#cath-b-newest-all
cath_name = ["PDB","v","CATH","range"]
cathb = pd.read_table(path+"other_files/cath-b-newest-all",delimiter=" ",names=cath_name)

In [ ]:
for ikl in range(0,3):
    torun = PixelDB
    if ikl == 0:
        print("Full")
    if ikl == 1:
        torun = PixelDBecr
        print("ECR")
    if ikl == 2:
        torun = PixelDBoecr
        print("ECRdist")
    
    UniquePFAM = dict()
    for uniid in list(np.unique(torun["PFAM"])):
        if str(uniid) == "nan":
            continue
        for v in uniid.split("_"):
            #print(v)
            if v not in UniquePFAM:
                UniquePFAM[v] = 0
            UniquePFAM[v] += 1
        #break

    UniqueCATH = dict()
    for uniid in list(np.unique(torun["CATH"])):
        if str(uniid) == "nan":
            continue
        for v in uniid.split("_"):
            #print(v)
            if v not in UniqueCATH:
                UniqueCATH[v] = 0
            UniqueCATH[v] += 1

    UniqueUniprot = dict()
    for uniid in list(np.unique(torun["uniprot"])):
        if str(uniid) == "nan":
            continue
        for v in uniid.split("_"):
            #print(v)
            if v not in UniqueUniprot:
                UniqueUniprot[v] = 0
            UniqueUniprot[v] += 1


    print("PDB has this many unique PFAM",len(pdbmap["PFAM"].value_counts()))
    print("PixelDB has this many unique PFAM",len(UniquePFAM))
    print("Percentage = ",100.0*len(UniquePFAM) / float(len(pdbmap["PFAM"].value_counts())))
    #print("PixelDBecr has this many unique PFAM",len(PixelDBecr["PFAM"].value_counts()))

    print("PDB has this many unique uniprot",len(pdbmap["uniprot"].value_counts()))
    print("PixelDB has this many unique uniprot",len(UniqueUniprot))

    print("Percentage = ",100.0*len(UniqueUniprot) / float(len(pdbmap["uniprot"].value_counts())))

    #print("PixelDBecr has this many unique uniprot",len(PixelDBecr["uniprot"].value_counts()))
    print("PDB has this many unique CATH",len(cathb["CATH"].value_counts()))
    print("PixelDB has this many unique CATH",len(UniqueCATH))

    print("Percentage = ",100.0*len(UniqueCATH) / float(len(cathb["CATH"].value_counts())))
    
    print("Most frequence Cath")
    for w in sorted(UniqueCATH, key=UniqueCATH.get, reverse=True)[0:3]:
        print w, UniqueCATH[w]
    print("Most frequence uniprot")
    for w in sorted(UniqueUniprot, key=UniqueUniprot.get, reverse=True)[0:3]:
        print w, UniqueUniprot[w]
    print("Most frequence PFAM")
    for w in sorted(UniquePFAM, key=UniquePFAM.get, reverse=True)[0:3]:
        print w, UniquePFAM[w]

In [ ]:
allCarac = ["resolution","receptor_length","peptide_length","size_of_binding_mode","longest_continuous_core","longest_continuous_ecr"]

for v in allCarac:
    AllMean = []
    for uniid in list(np.unique(PixelDB["unique_id"])):
        sdf = PixelDB[PixelDB["unique_id"] == uniid]
        AllMean.append(np.mean(sdf[v]))
    print("%30s Avg:%6.2f Std:%6.2f Median:%6.2f Min:%6.2f Max:%7.2f" % (v,np.mean(AllMean),np.std(AllMean),np.median(AllMean),np.min(PixelDB[v]),np.max(PixelDB[v])))

In [ ]:
allCarac = ["resolution","receptor_length","peptide_length","size_of_binding_mode","longest_continuous_core","longest_continuous_ecr"]

for v in allCarac:
    AllMean = []
    for uniid in list(np.unique(PixelDBecr["unique_id"])):
        sdf = PixelDBecr[PixelDBecr["unique_id"] == uniid]
        AllMean.append(np.mean(sdf[v]))
    
    print("%30s Avg:%6.2f Std:%6.2f Median:%6.2f Min:%6.2f Max:%7.2f" % (v,np.mean(AllMean),np.std(AllMean),np.median(AllMean),np.min(PixelDBecr[v]),np.max(PixelDBecr[v])))

In [ ]:
allCarac = ["resolution","receptor_length","peptide_length","size_of_binding_mode","longest_continuous_core","longest_continuous_ecr"]
PixelDBoecr = PixelDB[PixelDB["longest_continuous_ecr"] > 3]
for v in allCarac:
    AllMean = []
    for uniid in list(np.unique(PixelDBoecr["unique_id"])):
        sdf = PixelDBoecr[PixelDBoecr["unique_id"] == uniid]
        AllMean.append(np.mean(sdf[v]))
    
    print("%30s Avg:%6.2f Std:%6.2f Median:%6.2f Min:%6.2f Max:%7.2f" % (v,np.mean(AllMean),np.std(AllMean),np.median(AllMean),np.min(PixelDBoecr[v]),np.max(PixelDBoecr[v])))

In [ ]:
PixelDB.sort(["peptide_length"],ascending=False).head(5)[["name","size_of_binding_mode","cluster_number","receptor_length","peptide_length"]]

In [ ]:
PixelDB.sort(["receptor_length"]).head(5)[["name","size_of_binding_mode","cluster_number","receptor_length","peptide_length"]]

In [ ]:
def bootstrap(AllCore,AllECR,it=10000):
    # Bootstrap the per-binding-mode fractions: resample binding modes with
    # replacement, recompute the percentage of each label, and collect the
    # resampled percentages. Relies on the global Label list.
    BtECR = dict()
    BtCore = dict()
    for lab in Label:
        BtECR[lab] = []
        BtCore[lab] = []
    for _ in range(0,it):
        ECRMean = []
        CoreMean = []
        for aa in Label:

            # Resample over binding modes, i.e. the entries of AllECR[aa]/AllCore[aa].
            index = np.random.randint(len(AllECR[aa]), size=len(AllECR[aa]))

            ECRMean.append(np.sum(np.array(AllECR[aa])[index]))
            CoreMean.append(np.sum(np.array(AllCore[aa])[index]))
        ECRMean = np.array(ECRMean)/np.sum(ECRMean)*100.0
        CoreMean = np.array(CoreMean)/np.sum(CoreMean)*100.0
        for i in range(0,len(ECRMean)):
            BtECR[Label[i]].append(ECRMean[i])
            BtCore[Label[i]].append(CoreMean[i])
    return(BtCore,BtECR)
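
A quick usage sketch for bootstrap (toy numbers, not PixelDB data; note the function reads the global Label list, which the later cells redefine before calling it):

In [ ]:
# Toy call: two labels, three binding modes each, 1000 resamples.
Label = ["A", "B"]
toy_core = {"A": [0.2, 0.3, 0.25], "B": [0.8, 0.7, 0.75]}
toy_ecr = {"A": [0.5, 0.4, 0.45], "B": [0.5, 0.6, 0.55]}
(BtCore_toy, BtECR_toy) = bootstrap(toy_core, toy_ecr, it=1000)
print(np.mean(BtCore_toy["A"]), np.mean(BtECR_toy["A"]))  # bootstrap means, in %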

In [ ]:
AllMeanData = dict()

In [ ]:
PixelDBoecr = PixelDB[PixelDB["longest_continuous_ecr"] > 3]

In [ ]:
LegendLabel = ["Solvent Exposure [0-9]","Levy Classification","Stride Classifcation", "Amino Acid compostion"]
ToTest = ["percent_exposed_alignment","levy_alignment","stride","sequence_alignment"]

#LegendLabel = ["Solvent Exposure [0-9]","Stride Classifcation"]
#ToTest = ["percent_exposed_alignment","stride"]


for (alin,LL) in zip(ToTest,LegendLabel):
    if alin not in list(PixelDBoecr.columns.values):
        continue
    AllMeanData[LL] = dict()
    print(alin)
    
    AllECR = dict()
    AllCore = dict()
    
    
    SumCore = 0.0
    SumECR = 0.0
    
    #Find Label
    Label = []
    for v in PixelDBoecr[alin]:
        for i in range(0,len(v)):
            if v[i] == "-":
                continue
            Label.append(v[i])
    Label = sorted(list(set(Label)))
    if alin == "sequence_alignment":
        Label = myAmino
    
    df = pd.DataFrame()
    
    print(alin,Label)
    count = 0
    for uniid in list(np.unique(PixelDBoecr["unique_id"])):
        
        Tecr = dict()
        Tcore = dict()
        Totecr = 0.0
        Totcore = 0.0
        
        sdf = PixelDBoecr[PixelDBoecr["unique_id"] == uniid]
        
        for (ecr,ali,lecr) in zip(np.array(sdf["core_ecr_alignment"]),np.array(sdf[alin]),np.array(sdf["longest_continuous_ecr"])):
            
            if lecr < 4:
                continue
            #print(ecr,ali[0])
            #ali = ali[0]
            #print(ali,lecr)
            for i in range(0,len(ecr)):
                #if alin == "sequence_alignment":
                #    if levy[i] != "C":
                #        continue
                if ecr[i] == "E":
                    if ali[i] not in Tecr:
                        Tecr[ali[i]] = 0.0
                    Tecr[ali[i]] += 1.0
                    Totecr += 1.0
                    SumECR += 1.0
                    
                if ecr[i] == "C":
                    if ali[i] not in Tcore:
                        Tcore[ali[i]] = 0.0
                    Tcore[ali[i]] += 1.0
                    Totcore += 1.0
                    SumCore += 1.0
                #print(ali[i],ecr[i])
            #break
        
        for aa in Label:
            if aa not in Tecr:
                Tecr[aa] = 0.0
            #print(aa,Tecr[aa],Totecr)
            if aa not in AllECR:
                AllECR[aa] = []
            
            AllECR[aa].append(Tecr[aa] / float(Totecr))

            if aa not in Tcore:
                Tcore[aa] = 0.0
            if aa not in AllCore:
                AllCore[aa] = []
            AllCore[aa].append(Tcore[aa] / float(Totcore))
            count += 1
            df = df.append({'class': 'Core', 'AA': aa, 'percentage': Tcore[aa] / float(Totcore)}, ignore_index=True)
            df = df.append({'class': 'ECR', 'AA': aa, 'percentage': Tecr[aa] / float(Totecr)}, ignore_index=True)  
        
        #break
    print(SumCore,SumECR)
    ECRMean = []
    CoreMean = []
    
    (BtCore,BtECR) = bootstrap(AllCore,AllECR,it=10000)
    
    
    for aa in Label:
        ECRMean.append(np.mean(BtECR[aa]))
        CoreMean.append(np.mean(BtCore[aa]))
        
        
    plt.scatter(CoreMean,ECRMean)
    for i in range(0,len(Label)):
        ttest = stats.ttest_rel(BtECR[Label[i]], BtCore[Label[i]])[1]
        print("%2d %2s P.val=%10.8f Core:%6.2f ECR:%6.2f" % (i,Label[i],ttest,CoreMean[i],ECRMean[i]))
        #print(i,Label[i],ttest,"CORE:%4.2f " % (CoreMean[i]),"ECR",ECRMean[i])
        if Label[i] not in AllMeanData[LL]:
            AllMeanData[LL][Label[i]] = dict()
        AllMeanData[LL][Label[i]]["ECR"] = [ECRMean[i],np.std(BtECR[Label[i]]) ]
        AllMeanData[LL][Label[i]]["Core"] = [CoreMean[i],np.std(BtCore[Label[i]])]
        if ttest < 0.05:
            plt.text(CoreMean[i]+0.15,ECRMean[i]+0.15,Label[i])
            
    Lim = [-3,int(np.max(ECRMean+CoreMean)*1.1)]
    #print(Lim)
    #Lim = [-1,10]
    plt.plot(Lim,Lim,c="black")
    plt.xlim(Lim)
    plt.ylim(Lim)
    plt.xlabel("Core "+LL+" %")
    plt.ylabel("ECR "+LL+" %")
    plt.show() 
    
    sns.boxplot(x="AA",hue="class",y="percentage",data=df)
    plt.show()

In [ ]:
Label = myAmino
Color = ["red"]*3+["blue"]*2+["purple"]*4+["black"]*3+["green"]*5+["yellow"]*3
LL= "Amino Acid buried"
AllMeanData[LL] = dict()
for i in range(0,len(Label)):
    plt.scatter(CoreMean[i],ECRMean[i],c=Color[i])
    ttest = stats.ttest_rel(BtECR[Label[i]], BtCore[Label[i]])[1]
    print("%2d %2s P.val=%10.8f Core:%6.2f ECR:%6.2f" % (i,Label[i],ttest,CoreMean[i],ECRMean[i]))
    
    if Label[i] not in AllMeanData[LL]:
        AllMeanData[LL][Label[i]] = dict()
    AllMeanData[LL][Label[i]]["ECR"] = [ECRMean[i],np.std(BtECR[Label[i]]) ]
    AllMeanData[LL][Label[i]]["Core"] = [CoreMean[i],np.std(BtCore[Label[i]])]
    
    
    #print(i,Label[i],ttest,"CORE:%4.2f " % (CoreMean[i]),"ECR",ECRMean[i])
    #if ttest < 0.05:
    plt.text(CoreMean[i]+0.15,ECRMean[i]+0.15,Label[i],color=Color[i])
Lim = [-3,int(np.max(ECRMean+CoreMean)*1.1)]
#print(Lim)
Lim = [-1,15]
plt.plot(Lim,Lim,c="black")
plt.xlim(Lim)
plt.ylim(Lim)
plt.xlabel("Core composition (%)")
plt.ylabel("ECR composition (%)")
plt.show()

In [ ]:


In [ ]:
# Receptor amino-acid composition of core-binding, exosite (ECR), non-interacting
# surface, and interior residues, aggregated per binding mode.
AllECR = dict()
AllCore = dict()
AllSurf = dict()
AllInte = dict()


SumCore = 0.0
SumECR = 0.0
SumSurf = 0.0
SumInte = 0.0

for aa in myAmino:
    AllECR[aa] = []
    AllCore[aa] = []
    AllSurf[aa] = []
    AllInte[aa] = []
Label = myAmino
for uniid in list(np.unique(PixelDBoecr["unique_id"])):
    sdf = PixelDBoecr[PixelDBoecr["unique_id"] == uniid]
    Tecr = dict()
    Tcore = dict()
    Tsurf = dict()
    Tinte = dict()
    Totecr = 0.0
    Totcore = 0.0
    Totsurf = 0.0
    Totinte = 0.0
    for v in sdf["EXOSITE_aa"]:
        for aa in v.split(";"):
            sp = aa.split(":")
            if sp[0] not in Tecr:
                Tecr[sp[0]] = 0
            Tecr[sp[0]] += float(sp[1])
            Totecr += float(sp[1])
            SumECR += float(sp[1])
    for v in sdf["surface_aa"]:
        for aa in v.split(";"):
            sp = aa.split(":")
            if sp[0] not in Tsurf:
                Tsurf[sp[0]] = 0
            Tsurf[sp[0]] += float(sp[1])
            Totsurf += float(sp[1])
            SumSurf += float(sp[1])
    for v in sdf["interior_aa"]:
        
        for aa in v.split(";"):
            sp = aa.split(":")
            if sp[0] not in Tinte:
                Tinte[sp[0]] = 0
            Tinte[sp[0]] += float(sp[1])
            Totinte += float(sp[1])
            SumInte += float(sp[1])
            
    for v in sdf["COREBINDING_aa"]:
        for aa in v.split(";"):
            sp = aa.split(":")
            if sp[0] not in Tcore:
                Tcore[sp[0]] = 0
            Tcore[sp[0]] += float(sp[1])
            Totcore += float(sp[1])
            SumCore += float(sp[1])        
    
    Label = myAmino
    for aa in Label:
        if aa not in Tecr:
            Tecr[aa] = 0.0
        #print(aa,Tecr[aa],Totecr)
        if aa not in AllECR:
            AllECR[aa] = []
        if (Totecr != 0):
            AllECR[aa].append(Tecr[aa] / float(Totecr))
        else:
            AllECR[aa].append(0)

        if aa not in Tcore:
            Tcore[aa] = 0.0
        if aa not in AllCore:
            AllCore[aa] = []
        if Totcore != 0:
            AllCore[aa].append(Tcore[aa] / float(Totcore))
        else:
            AllCore[aa].append(0)
            
            
        if aa not in Tsurf:
            Tsurf[aa] = 0.0
        if aa not in AllSurf:
            AllSurf[aa] = []
        if Totsurf != 0:
            AllSurf[aa].append(Tsurf[aa] / float(Totsurf))
        else:
            AllSurf[aa].append(0)
            
        if aa not in Tinte:
            Tinte[aa] = 0.0
        if aa not in AllInte:
            AllInte[aa] = []
        if Totinte != 0:
            AllInte[aa].append(Tinte[aa] / float(Totinte))
        else:
            AllInte[aa].append(0)
ECRMean = []
CoreMean = []
SurfMean = []
InteMean = []

(BtCore,BtECR) = bootstrap(AllCore,AllECR,it=10000)

(BtInte,BtSurf) = bootstrap(AllInte,AllSurf,it=10000)

Label = myAmino
for aa in Label:
    ECRMean.append(np.mean(BtECR[aa]))
    CoreMean.append(np.mean(BtCore[aa]))
    SurfMean.append(np.mean(BtSurf[aa]))
    InteMean.append(np.mean(BtInte[aa]))

In [ ]:
print(SumCore,SumECR,SumSurf,SumInte)

In [ ]:
Color = ["red"]*3+["blue"]*2+["purple"]*4+["black"]*3+["green"]*5+["yellow"]*3

LL= "Binding Site"
AllMeanData[LL] = dict()


for i in range(0,len(Label)):
    ttest = stats.ttest_rel(BtSurf[Label[i]], BtCore[Label[i]])[1]
    print("%2d %2s P.val=%10.8f Core:%4.2f ECR:%4.2f Surf:%4.2f Inte:%.2f" % (i,Label[i],ttest,CoreMean[i],ECRMean[i],SurfMean[i],InteMean[i]))
    if Label[i] not in AllMeanData[LL]:
        AllMeanData[LL][Label[i]] = dict()
    AllMeanData[LL][Label[i]]["ECR"] = [ECRMean[i],np.std(BtECR[Label[i]]) ]
    AllMeanData[LL][Label[i]]["Core"] = [CoreMean[i],np.std(BtCore[Label[i]])]    
    AllMeanData[LL][Label[i]]["Surf"] = [SurfMean[i],np.std(BtSurf[Label[i]])]
    AllMeanData[LL][Label[i]]["Inte"] = [InteMean[i],np.std(BtInte[Label[i]])]

In [ ]:
AllMean = [CoreMean,ECRMean,SurfMean,InteMean]
MeanLab = ["CoreBinding","Exosite","Surface","Interior"]
AllMSE = dict()
AllComp = dict()
for (arr1,lab1) in zip(AllMean,MeanLab):
    AllMSE[lab1] = dict()
    AllComp[lab1] = dict()
    for i in range(0,20):
        AllComp[lab1][myAmino[i]] = arr1[i]
    for (arr2,lab2) in zip(AllMean,MeanLab):
        plt.scatter(arr1,arr2)
        plt.xlabel(lab1)
        plt.ylabel(lab2)
        for i in range(0,20):
            plt.text(arr1[i]+0.15,arr2[i]+0.15,Label[i],color=Color[i])
        plt.xlim(0,20)
        plt.ylim(0,20)
        plt.plot([0,100],[0,100])
        plt.show()
        #AllMSE[lab1][lab2] = np.sqrt(np.mean(np.power(np.array(arr1)-np.array(arr2),2)))
        AllMSE[lab1][lab2] = np.corrcoef(arr1,arr2)[0][1]
        print(lab1,lab2,np.sqrt(np.mean(np.power(np.array(arr1)-np.array(arr2),2))),np.corrcoef(arr1,arr2)[0][1])
sns.clustermap(pd.DataFrame(AllMSE))
plt.show()
sns.clustermap(pd.DataFrame(AllComp))
plt.show()

sns.heatmap(pd.DataFrame(AllComp).transpose()[myAmino].transpose(),annot=True)
plt.show()
#print("Core vs Exosite %.2f" % (np.sqrt(np.mean(np.power(np.array(CoreMean)-np.array(ECRMean),2)))))
#print("Core vs Surface %.2f" % (np.sqrt(np.mean(np.power(np.array(CoreMean)-np.array(SurfMean),2)))))
#print("Exosite  vs Surface %.2f" % (np.sqrt(np.mean(np.power(np.array(ECRMean)-np.array(SurfMean),2)))))

In [ ]:


In [ ]:


In [ ]:
# Same breakdown using STRIDE secondary-structure classes instead of amino acids.
AllECR = dict()
AllCore = dict()
AllSurf = dict()
AllInte = dict()


SumCore = 0.0
SumECR = 0.0
SumSurf = 0.0
SumInte = 0.0

for aa in myAmino:
    AllECR[aa] = []
    AllCore[aa] = []
    AllSurf[aa] = []
    AllInte[aa] = []
Label = ['B', 'C', 'E', 'G', 'H', 'T', 'b']
for uniid in list(np.unique(PixelDBoecr["unique_id"])):
    sdf = PixelDBoecr[PixelDBoecr["unique_id"] == uniid]
    Tecr = dict()
    Tcore = dict()
    Tsurf = dict()
    Tinte = dict()
    Totecr = 0.0
    Totcore = 0.0
    Totsurf = 0.0
    Totinte = 0.0
    for v in sdf["EXOSITE_ss"]:
        for aa in v.split(";"):
            sp = aa.split(":")
            if sp[0] not in Tecr:
                Tecr[sp[0]] = 0
            Tecr[sp[0]] += float(sp[1])
            Totecr += float(sp[1])
            SumECR += float(sp[1])
    for v in sdf["surface_ss"]:
        for aa in v.split(";"):
            sp = aa.split(":")
            if sp[0] not in Tsurf:
                Tsurf[sp[0]] = 0
            Tsurf[sp[0]] += float(sp[1])
            Totsurf += float(sp[1])
            SumSurf += float(sp[1])
    for v in sdf["interior_ss"]:
        
        for aa in v.split(";"):
            sp = aa.split(":")
            if sp[0] not in Tinte:
                Tinte[sp[0]] = 0
            Tinte[sp[0]] += float(sp[1])
            Totinte += float(sp[1])
            SumInte += float(sp[1])
            
    for v in sdf["COREBINDING_ss"]:
        for aa in v.split(";"):
            sp = aa.split(":")
            if sp[0] not in Tcore:
                Tcore[sp[0]] = 0
            Tcore[sp[0]] += float(sp[1])
            Totcore += float(sp[1])
            SumCore += float(sp[1])        
    
    for aa in Label:
        if aa not in Tecr:
            Tecr[aa] = 0.0
        #print(aa,Tecr[aa],Totecr)
        if aa not in AllECR:
            AllECR[aa] = []
        if (Totecr != 0):
            AllECR[aa].append(Tecr[aa] / float(Totecr))
        else:
            AllECR[aa].append(0)

        if aa not in Tcore:
            Tcore[aa] = 0.0
        if aa not in AllCore:
            AllCore[aa] = []
        if Totcore != 0:
            AllCore[aa].append(Tcore[aa] / float(Totcore))
        else:
            AllCore[aa].append(0)
            
            
        if aa not in Tsurf:
            Tsurf[aa] = 0.0
        if aa not in AllSurf:
            AllSurf[aa] = []
        if Totsurf != 0:
            AllSurf[aa].append(Tsurf[aa] / float(Totsurf))
        else:
            AllSurf[aa].append(0)
            
        if aa not in Tinte:
            Tinte[aa] = 0.0
        if aa not in AllInte:
            AllInte[aa] = []
        if Totinte != 0:
            AllInte[aa].append(Tinte[aa] / float(Totinte))
        else:
            AllInte[aa].append(0)
ECRMean = []
CoreMean = []
SurfMean = []
InteMean = []

(BtCore,BtECR) = bootstrap(AllCore,AllECR,it=10000)

(BtInte,BtSurf) = bootstrap(AllInte,AllSurf,it=10000)

for aa in Label:
    ECRMean.append(np.mean(BtECR[aa]))
    CoreMean.append(np.mean(BtCore[aa]))
    SurfMean.append(np.mean(BtSurf[aa]))
    InteMean.append(np.mean(BtInte[aa]))

In [ ]:
AllMean = [CoreMean,ECRMean,SurfMean,InteMean]
MeanLab = ["CoreBinding","Exosite","Surface","Interior"]
AllMSE = dict()
AllComp = dict()
for (arr1,lab1) in zip(AllMean,MeanLab):
    AllMSE[lab1] = dict()
    AllComp[lab1] = dict()
    for i in range(0,len(Label)):
        AllComp[lab1][Label[i]] = arr1[i]
    for (arr2,lab2) in zip(AllMean,MeanLab):
        #plt.scatter(arr1,arr2)
        #plt.xlabel(lab1)
        #plt.ylabel(lab2)
        #for i in range(0,20):
        #    plt.text(arr1[i]+0.15,arr2[i]+0.15,Label[i],color=Color[i])
        #plt.show()
        AllMSE[lab1][lab2] = np.sqrt(np.mean(np.power(np.array(arr1)-np.array(arr2),2)))
        print(lab1,lab2,np.sqrt(np.mean(np.power(np.array(arr1)-np.array(arr2),2))))
sns.clustermap(pd.DataFrame(AllMSE))
plt.show()
sns.heatmap(pd.DataFrame(AllMSE),annot=True)
plt.show()
sns.clustermap(pd.DataFrame(AllComp))
plt.show()

sns.heatmap(pd.DataFrame(AllComp).transpose()[Label].transpose(),annot=True)
plt.show()
#print("Core vs Exosite %.2f" % (np.sqrt(np.mean(np.power(np.array(CoreMean)-np.array(ECRMean),2)))))
#print("Core vs Surface %.2f" % (np.sqrt(np.mean(np.power(np.array(CoreMean)-np.array(SurfMean),2)))))
#print("Exosite  vs Surface %.2f" % (np.sqrt(np.mean(np.power(np.array(ECRMean)-np.array(SurfMean),2)))))

In [ ]:
Color = ["red"]*3+["blue"]*2+["purple"]*4+["black"]*3+["green"]*5+["yellow"]*3

LL= "Binding Site SS"
AllMeanData[LL] = dict()


for i in range(0,len(Label)):
    ttest = stats.ttest_rel(BtSurf[Label[i]], BtCore[Label[i]])[1]
    print("%2d %2s P.val=%10.8f Core:%4.2f ECR:%4.2f Surf:%4.2f Inte:%.2f" % (i,Label[i],ttest,CoreMean[i],ECRMean[i],SurfMean[i],InteMean[i]))
    if Label[i] not in AllMeanData[LL]:
        AllMeanData[LL][Label[i]] = dict()
    AllMeanData[LL][Label[i]]["ECR"] = [ECRMean[i],np.std(BtECR[Label[i]]) ]
    AllMeanData[LL][Label[i]]["Core"] = [CoreMean[i],np.std(BtCore[Label[i]])]    
    AllMeanData[LL][Label[i]]["Surf"] = [SurfMean[i],np.std(BtSurf[Label[i]])]
    AllMeanData[LL][Label[i]]["Inte"] = [InteMean[i],np.std(BtInte[Label[i]])]

In [ ]:
figsize(20,8)
Test = ["Amino Acid","Stride Classifcation","Solvent Exposure [0-9]","Binding Site","Binding Site SS"]
Test = AllMeanData.keys()
Label = Test

Order = ["Core","ECR","Surf","Inte"]
Col = ["darkred","darkblue","gray","black"]

pos = 0.0
for test in Test:
    pos = 0.0
    pos += 1.0
    
    sub = AllMeanData[test]
    #print(len(sub))
    Tlab = sorted(sub)
    if len(Tlab) == 20:
        Tlab = myAmino
    Xlabpos = []
    
    for t in Tlab:
        Tpos = []
        
        for o in Order:
            if o not in sub[t]:
                continue
            plt.bar([pos],sub[t][o][0],color=Col[Order.index(o)])
            plt.plot([pos+0.4,pos+0.4],[sub[t][o][0],sub[t][o][0]+sub[t][o][1]],color="black")
            
            plt.plot([pos+0.1,pos+0.7],[sub[t][o][0]+sub[t][o][1]]*2,color="black")
            
            Tpos.append(pos)
            pos += 1.0
        Xlabpos.append(np.mean(Tpos)+0.5)
        pos += 1
        #print(t)
    #break
    plt.xticks(Xlabpos, Tlab)
    plt.title(test)
    plt.show()

In [ ]:
AllMeanData.keys()

In [ ]:
mpl.style.use('seaborn-whitegrid')

In [ ]:
figsize(20,8)
Test = ["Binding Site","Amino Acid compostion","Solvent Exposure [0-9]"]
Label = ["Receptor Amino Acid Composition (%)","Peptide Amino Acid Composition (%)","Solvent Exposure (%)"]
Order = ["Core","ECR","Surf","Inte"]
Col = ["darkred","darkblue","gray","black"]
Color = ["red"]*3+["blue"]*2+["purple"]*4+["black"]*3+["green"]*5+["yellow"]*3
pos = 0.0
for (test,lab) in zip(Test,Label):
    pos = 0.0
    pos += 1.0
    
    sub = AllMeanData[test]
    #print(len(sub))
    Tlab = sorted(sub)
    if len(Tlab) == 20:
        Tlab = myAmino
    Xlabpos = []
    
    for t in Tlab:
        Tpos = []
        
        for o in Order:
            if o not in sub[t]:
                continue
            plt.bar([pos],sub[t][o][0],color=Col[Order.index(o)])
            plt.plot([pos,pos],[sub[t][o][0],sub[t][o][0]+sub[t][o][1]],color="black")
            
            plt.plot([pos-0.3,pos+0.3],[sub[t][o][0]+sub[t][o][1]]*2,color="black")
            
            Tpos.append(pos)
            pos += 1.0
        Xlabpos.append(np.mean(Tpos))
        pos += 1
        #print(t)
    #break
    if test == "Solvent Exposure [0-9]":
        Tlab = ["[0-10[","[10-20[","[20-30[","[30-40[","[40-50[","[50-60[","[60-70["]
        
    plt.xticks(Xlabpos, Tlab)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.xlabel(lab,size=18)
    plt.ylabel("Residue distribution %",rotation=90)
    plt.show()
    #break

In [ ]:


In [ ]:


In [ ]:
#Relationship between burial (exposure bin) and amino-acid type
AllDF = []
for (t,name) in zip(range(0,3),["All","Core","ECR"]):
    Count = dict()
    Tot = dict()
    for aa in myAmino:
        Count[aa] = dict()
        Tot[aa] = 0
        for i in range(0,9):
            Count[aa][str(i)] = 0
    torun = PixelDBecr
    if t == 0:
        torun = PixelDB
    for (seq,bury,ecr,size) in zip(torun["sequence_alignment"],torun["percent_exposed_alignment"],torun["core_ecr_alignment"],torun["size_of_binding_mode"]):
        #print(seq,bury)
        for i in range(0,len(seq)):
            if seq[i] == "-":
                continue
            if t == 1:
                if ecr[i] == "E":
                    continue
            if t == 2:
                if ecr[i] == "C":
                    continue
            if seq[i] not in Count:
                Count[seq[i]] = dict()
            if bury[i] not in Count[seq[i]]:
                Count[seq[i]][bury[i]] = 0
            #print(i,seq[i],bury[i])
            Count[seq[i]][bury[i]] += 1.0/float(size)
            Tot[seq[i]] += 1/float(size)
        #break
    NormC = dict()
    for aa in Tot:
        NormC[aa] = dict()
        print(aa,Tot[aa])
        for b in Count[aa]:
            bt = b
            if int(bt) > 4:
                bt = "5+"
            if bt not in NormC[aa]:
                NormC[aa][bt] = 0
            
            NormC[aa][bt] += int(float(Count[aa][b]) / float(Tot[aa])*100.0+0.5)
    print(name)
    figsize(8,8)
    df = pd.DataFrame(NormC)
    df = df[myAmino]
    AllDF.append(df)
    sns.heatmap(df[myAmino],vmax=70,annot=True)
    plt.title(name )
    plt.ylabel("Amino acid binned exposure distribution")
    plt.show()

In [ ]:
sns.heatmap(AllDF[1]-AllDF[2],annot=True)

In [ ]:
plt.scatter(PixelDBoecr["peptide_length"],PixelDBoecr["longest_continuous_ecr"])
plt.xlabel("Peptide Length")
plt.ylabel("ECR Length")
plt.show()

In [ ]:
plt.scatter(PixelDB["peptide_length"],PixelDB["longest_continuous_ecr"])
plt.xlabel("Peptide Length")
plt.ylabel("ECR Length")
plt.show()

In [ ]:
print(PixelDBecr[PixelDBecr["longest_continuous_ecr"]>=4]["peptide_length"].mean())
print(PixelDBecr[PixelDBecr["longest_continuous_ecr"]>=4]["peptide_length"].min())
print(PixelDBecr[PixelDBecr["longest_continuous_ecr"]>=4]["peptide_length"].max())
plt.hist(PixelDBecr[PixelDBecr["longest_continuous_ecr"]>=4]["peptide_length"].values,25)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.xlabel("Peptide Length")
plt.show()

In [ ]:
Count = dict()
for (ecr,pl) in zip(PixelDBecr["longest_continuous_ecr"],PixelDBecr["peptide_length"]):
    if ecr not in Count:
        Count[ecr] = dict()
    if pl not in Count[ecr]:
        Count[ecr][pl]=0
    Count[ecr][pl] += 1
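
The same contingency table can be built in one call (a sketch; ecr_vs_len is a hypothetical name, and the result matches the heatmap drawn in the next cell):

In [ ]:
# Sketch: ECR length (rows) vs. peptide length (columns) counts via crosstab.
ecr_vs_len = pd.crosstab(PixelDBecr["longest_continuous_ecr"], PixelDBecr["peptide_length"])
ecr_vs_len.head()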

In [ ]:
sns.heatmap(pd.DataFrame(Count).fillna(value=0).transpose(),annot=True)
plt.xlabel("Peptide Length")
plt.ylabel("ECR Length")
plt.show()

In [ ]:
#Find the complexes with the longest continuous ECR
PixelDB.sort_values(["longest_continuous_ecr"],ascending=False).head(5)[["name","size_of_binding_mode","unique_id","receptor_length","peptide_length"]]

In [ ]:
#Search for TRAF (CATH 2.60.210.10)
tolook = "2.60.210.10"
for (v,uniid,seq) in zip(np.array(PixelDB["CATH"]),np.array(PixelDB["unique_id"]),np.array(PixelDB["sequence_alignment"])):
    if str(v) == "nan":
        continue
    if re.search(tolook,str(v)):
        print(v,uniid,seq)

In [ ]:
#Fraction of ECR residues in contact with the receptor (exposure bin <= myCut)
myCut = 3
tot = 0
low = 0
for (exp,seq) in zip(np.array(PixelDB["percent_exposed_alignment"]),np.array(PixelDB["core_ecr_alignment"])):
    #rint(exp,seq)
    for i in range(0,len(exp)):
        if seq[i] == "-":
            continue
        if (seq[i] == "e") or (seq[i] == "E"):
            tot += 1
            if int(exp[i]) <= myCut:
                low += 1
            #rint(i,exp[i],seq[i])
print(tot,low)
print(float(low)/float(tot))

In [ ]:
PixelDB.columns

In [ ]:
carac = pd.read_table(path+"other_files/AllCp.dat",names=["PDB","Chain","CP","Tot_cont"],sep=r"\s+")

In [ ]:
figsize(10,10)
plt.hist(carac["CP"]*100,20)
plt.xlabel("Peptide surface contacting a symmetry-related complex (%)", fontsize=18 )
plt.ylabel("Count", fontsize=18)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()

In [ ]:
for i in range(1,100):
    plt.scatter(i,np.sum(carac["CP"]*100 > i)/float(len(carac["CP"])))

In [ ]:
for i in [5,10,15,20,50]:
    print(i,np.sum(carac["CP"]*100 > i)/float(len(carac["CP"])),np.sum(carac["CP"]*100 > i))

In [ ]:
#CATH annotations of receptor clusters with 10 or more binding modes


for bla in list(np.unique(PixelDB["cluster_number"])):
    sdf = PixelDB[PixelDB["cluster_number"] == bla]
    if (len(np.unique(sdf["unique_id"])) < 10):
        continue
    UniqueCATH = dict()
    for uniid in list(np.unique(sdf["CATH"])):
        if str(uniid) == "nan":
            continue
        for v in uniid.split("_"):
            #print(v)
            if v not in UniqueCATH:
                UniqueCATH[v] = 0
            UniqueCATH[v] += 1
    print(bla,len(np.unique(sdf["unique_id"])),list(set(UniqueCATH)))
    #break

In [ ]:
#Do receptor clusters with multiple binding modes have more CATH annotations?

for bla in list(np.unique(PixelDB["cluster_number"])):
    sdf = PixelDB[PixelDB["cluster_number"] == bla]
    UniqueCATH = dict()
    for uniid in list(np.unique(sdf["CATH"])):
        if str(uniid) == "nan":
            continue
        for v in uniid.split("_"):
            #print(v)
            if v not in UniqueCATH:
                UniqueCATH[v] = 0
            UniqueCATH[v] += 1
    plt.scatter(len(np.unique(sdf["unique_id"])),len(list(set(UniqueCATH))))
    plt.xlabel("Number of binding mode")
    plt.ylabel("Number of CATH")
    #print(bla,len(np.unique(sdf["unique_id"])),len(list(set(UniqueCATH))))
    #break

In [ ]:
sorted(list(np.unique(PixelDBoecr["unique_id"])))

In [ ]:
for clu in list(np.unique(PixelDB["cluster_number"])):
    #print(clu)
    sdf = PixelDB[PixelDB["cluster_number"] == clu]
    if len(np.unique(sdf["unique_id"])) < 5:
        continue
    print(list(np.unique(sdf["unique_id"])))
    #break

In [ ]:
np.unique(sdf["unique_id"])

In [ ]:
import glob

In [ ]:
#Collect all pairwise similarity matrices (one per receptor cluster)
AllPair = []
NormCount = dict()
AllK = []
Fract = []
AverageSeqIden = []
for i in range(0,21):
    myK = float(i)/float(20)
    AllK.append(myK)
    NormCount[myK] = 0
for f in glob.glob('/media/vince/Postdoc/PixelDB/PixelDB/clusters/*/*_simil.CSV'):
    sdf = pd.read_table(f,sep=r"\s+")
    if len(sdf) == 1:
        continue
    np.fill_diagonal(sdf.values, -1)
    keep = np.triu(np.ones(sdf.shape)).astype('bool').reshape(sdf.size)
    allval = np.array(sdf.stack()[keep])
    allval = list(allval[allval > 0])
    AllPair += allval
    AverageSeqIden.append(np.mean(allval))
    toadd = 1.0 / float(len(allval))
    Fract.append(np.sum(np.array(allval) > 0.7) / float(len(allval)))
    #die
    for v in allval:
        myK = int(v*20.0)/20.0
        #print(int(v*20.0)/20.0,len(allval))
        NormCount[myK] += toadd
    #if len(sdf) == 5:
    #    break
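
The off-diagonal extraction above can also be written with np.triu_indices (a sketch; unlike the loop above it keeps exact-zero similarities, which are dropped there by the > 0 filter):

In [ ]:
# Sketch: strictly-upper-triangle values of a square similarity matrix.
def upper_triangle_values(simil_df):
    mat = simil_df.values
    iu = np.triu_indices(len(mat), k=1)  # indices strictly above the diagonal
    return mat[iu]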

In [ ]:
print(np.mean(AverageSeqIden))
plt.hist(np.array(AverageSeqIden)*100)
plt.xlabel("Average sequence idendity in receptor cluster (%)", fontsize=18 )
plt.ylabel("Count", fontsize=18)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()

In [ ]:
print(np.mean(AverageSeqIden),np.median(AverageSeqIden),np.min(AverageSeqIden),np.max(AverageSeqIden))

In [ ]:
plt.hist(Fract,20)
plt.title("Distribution of receptor within cluster >70% seq idendity")
plt.show()

In [ ]:
AllV = []
for myK in AllK:
    AllV.append(NormCount[myK])
plt.bar(np.array(AllK)*100,AllV)

In [ ]:


In [ ]:
plt.hist(np.array(AllPair)*100,30)
plt.xlabel("Pairwise sequence identidy in cluster receptor of +2 complexes")
plt.ylabel("Count")
plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()

In [ ]:
AllIn = []
AllBe = []


AllBetweenBindingMode =[]

for clu in list(np.unique(torun["cluster_number"])):
    sdf = torun[torun["cluster_number"] == clu]
    if (len(sdf["unique_id"].value_counts()) == 1):
            continue
    #if (len(sdf["unique_id"].value_counts()) != 2):
    #       continue
    for f in glob.glob("/media/vince/Postdoc/PixelDB/PixelDB/clusters/"+str(clu)+"/*_simil.CSV"):
        simildf = pd.read_table(f,sep=r"\s+")
        #print(f)
        
    allunid = list(np.unique(sdf["unique_id"]))
    InBindingMode = []
    BetweenMode = []
    for i in range(0,len(allunid)):
        unidd1 = allunid[i]
        for j in range(i,len(allunid)):
            unidd2 = allunid[j]
            df1 = sdf[sdf["unique_id"] == unidd1]
            df2 = sdf[sdf["unique_id"] == unidd2]

            id1 = df1["pdb_id"]+"_"+df1["receptor_chain"]
            id2 = df2["pdb_id"]+"_"+df2["receptor_chain"]
            allval = []
            for i1 in id1:
                for i2 in id2:
                    allval.append(simildf[i1].transpose()[i2])
            fract = np.sum(np.array(allval) > 0.7) / float(len(allval))
            #print(i,j,unidd1,unidd2,fract)
            if i == j:
                InBindingMode += allval
            else:
                BetweenMode += allval
                AllBetweenBindingMode += allval
    fract = np.sum(np.array(InBindingMode) > 0.7) / float(len(InBindingMode))
    #print("In binding mode",fract)
    AllIn.append(fract)
    fract = np.sum(np.array(BetweenMode) > 0.7) / float(len(BetweenMode))
    #print("Between binding mode",fract)
    AllBe.append(fract)
    #break

In [ ]:
simildf[id1].transpose()[id2]

In [ ]:
plt.hist(AllBetweenBindingMode)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()

In [ ]:
for i in range(80,101):
    print(i,np.sum(np.array(AllBetweenBindingMode) > (float(i)/100)))

In [ ]:
plt.hist(AllIn,20)
plt.xlim((0,1))
plt.title("In binding mode")
plt.show()
plt.hist(AllBe,20)
plt.title("Between binding mode")
plt.xlim((0,1))
plt.show()
print("Average fraction >70% in Binding Mode",np.mean(AllIn))
print("Average fraction >70%  Between Binding Mode",np.mean(AllBe))

In [ ]:
PixelDB.columns

In [ ]:
AllArrang = dict()
for v in np.array(PixelDB["core_ecr_alignment"]):
    vinit = v
    v = v.replace("-","")
    for i in range(0,len(v)):
        vi = v
        v = re.sub('C+', "C", v)
        v = re.sub('E+', "E", v)
        v = re.sub('e+', "e", v)
        v = re.sub('c+', "c", v)
        if v == vi:
            break
    if v not in AllArrang:
        AllArrang[v] = 0
    AllArrang[v] += 1
    #print(vinit,v)
for w in sorted(AllArrang, key=AllArrang.get, reverse=True):
    print(w, AllArrang[w])
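
The repeated re.sub loop above converges to collapsing every run of identical characters, which a single backreference regex does directly (a sketch; collapse_runs is a hypothetical helper):

In [ ]:
# Sketch: collapse runs of identical core/ECR symbols, e.g. "CCCCEEEECCee" -> "CECe".
def collapse_runs(core_ecr):
    return re.sub(r'(.)\1+', r'\1', core_ecr.replace("-", ""))

print(collapse_runs("---CCCCEEEECCee"))  # prints CECe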

In [ ]:
AllArrang = dict()
for v in np.array(PixelDBoecr["core_ecr_alignment"]):
    vinit = v
    v = v.replace("-","")
    for i in range(0,len(v)):
        vi = v
        v = re.sub('C+', "C", v)
        v = re.sub('E+', "E", v)
        v = re.sub('e+', "e", v)
        v = re.sub('c+', "c", v)
        if v == vi:
            break
    vinit = vinit.replace("-","")
    print(vinit,v)
    if v not in AllArrang:
        AllArrang[v] = 0
    AllArrang[v] += 1
    #print(vinit,v)

In [ ]:
tot = 0
for w in sorted(AllArrang, key=AllArrang.get, reverse=True):
    if (w[0] == "E") or (w[-1] == "E"):
        tot += AllArrang[w]
    #else:
        print(w, AllArrang[w])

In [ ]:
print(tot,len(PixelDBoecr))

In [ ]:
AllArrang = dict()
sdf = PixelDB[PixelDB["cluster_number"] == 1]
for v in np.array(sdf["core_ecr_alignment"]):
    vinit = v
    
    v = v.replace("-","")
    for i in range(0,len(v)):
        vi = v
        v = re.sub('C+', "C", v)
        v = re.sub('E+', "E", v)
        v = re.sub('e+', "e", v)
        v = re.sub('c+', "c", v)
        if v == vi:
            break
    if v not in AllArrang:
        AllArrang[v] = 0
    AllArrang[v] += 1
    #print(vinit,v)
for w in sorted(AllArrang, key=AllArrang.get, reverse=True):
    print(w, AllArrang[w])

In [ ]:


In [ ]:
#Amino acid composition
AllAAComb = dict()
sub = AllMeanData["Binding Site"]

for aa in sub:
    if aa not in AllAAComb:
        AllAAComb[aa] = dict()
    for t in sub[aa]:
        ti = t
        if t == "Core":
            ti = "Core-binding"
        if t == "ECR":
            ti = "Exosite"
        if t == "Inte":
            ti = "Interior"
        if t == "Surf":
            ti = "NISR"
        AllAAComb[aa][ti] = sub[aa][t][0]
sub = AllMeanData["Amino Acid compostion"]

for aa in sub:
    if aa not in AllAAComb:
        AllAAComb[aa] = dict()
    for t in sub[aa]:
        AllAAComb[aa][t] = sub[aa][t][0]

In [ ]:
df = pd.DataFrame(AllAAComb)[myAmino].transpose()[["Core","ECR","Core-binding","Exosite","NISR","Interior"]]

In [ ]:
df

In [ ]:
LabelOr = ["Core","ECR","Core-binding","Exosite","NISR","Interior"]

In [ ]:
colcol = ["red","blue"]+["gray"]*4
LabelOr = ["Core","ECR","Core-binding","Exosite","NISR","Interior"]
sns.set(font_scale=1.4)
#sns.clustermap(df,row_cluster=False,row_colors = Color,col_colors=colcol)
pal = sns.light_palette("navy", as_cmap=True)
sns.clustermap(df[LabelOr],row_cluster=False,col_cluster=False,cmap=pal,figsize=(12,12),linewidths=.5,annot=True)

plt.show()

In [ ]:
colcol = ["red","blue"]+["gray"]*4
sns.set(font_scale=1.9)
#sns.clustermap(df,row_cluster=False,row_colors = Color,col_colors=colcol)
pal = sns.light_palette("navy", as_cmap=True)
sns.clustermap(df,row_cluster=False,cmap=pal,figsize=(12,12),linewidths=.5,annot=True,metric="Correlation", fmt=".1f")

plt.show()

In [ ]:


In [ ]:
figsize(20,8)
Test = ["Binding Site","Amino Acid compostion","Solvent Exposure [0-9]"]
Label = ["Receptor Amino Acid Composition (%)","Peptide Amino Acid Composition (%)","Solvent Exposure (%)"]
Order = ["Core","ECR","Surf","Inte"]
Col = ["darkred","darkblue","gray","black"]
Color = ["red"]*3+["blue"]*2+["purple"]*4+["black"]*3+["green"]*5+["yellow"]*3
pos = 0.0
for (test,lab) in zip(Test,Label):
    pos = 0.0
    pos += 1.0
    
    sub = AllMeanData[test]
    #print(len(sub))
    Tlab = sorted(sub)
    if len(Tlab) == 20:
        Tlab = myAmino
    Xlabpos = []
    
    for t in Tlab:
        Tpos = []
        
        for o in Order:
            if o not in sub[t]:
                continue
            plt.bar([pos],sub[t][o][0],color=Col[Order.index(o)])
            #plt.plot([pos+0.4,pos+0.4],[sub[t][o][0],sub[t][o][0]+sub[t][o][1]],color="black")
            
            plt.plot([pos+0.1,pos+0.7],[sub[t][o][0]+sub[t][o][1]]*2,color="black")
            
            Tpos.append(pos)
            pos += 1.0
        Xlabpos.append(np.mean(Tpos)+0.5)
        pos += 1
        #print(t)
    #break
    if test == "Solvent Exposure [0-9]":
        Tlab = ["[0-10[","[10-20[","[20-30[","[30-40[","[40-50[","[50-60[","[60-70["]
        
    plt.xticks(Xlabpos, Tlab)
    plt.tick_params(axis='both', which='major', labelsize=18)
    plt.xlabel(lab,size=18)
    
    plt.show()
    #break

In [ ]: