In [1]:
import pandas as pd
import numpy as np
import re
from scipy import stats
import seaborn as sns
%pylab inline
%matplotlib inline
In [2]:
myAmino = ["R","H","K","D","E","S","T","N","Q","C","G","P","A","V","I","L","M","F","Y","W"]
In [4]:
#Load PixelDB
path = "/media/vince/Postdoc/PixelDB/PixelDB/"
PixelDB = pd.read_csv(path+"PixelDB.csv")
len(PixelDB["name"])
Out[4]:
In [5]:
print("Entries in DB:",len(PixelDB["cluster_number"]))
print("Receptor Cluster",len(PixelDB["cluster_number"].value_counts()))
print("Unique Binding Mode",len(PixelDB["unique_id"].value_counts()))
print("Binding mode with 2 plus peptide",np.sum((PixelDB["unique_id"].value_counts()) >= 2))
In [ ]:
#Some STAT on PixelDB
In [ ]:
# Build the "ECR" subset: keep only binding modes (unique_id) where at least
# one structure has BOTH a continuous core and a continuous ECR of length > 3.
PixelDBecr = PixelDB.copy()
for uniid in list(np.unique(PixelDB["unique_id"])):
    sdf = PixelDB[PixelDB["unique_id"] == uniid]
    has_long_core_and_ecr = np.sum(
        (np.array(sdf["longest_continuous_core"]) > 3)
        & (np.array(sdf["longest_continuous_ecr"]) > 3)
    ) > 0
    if not has_long_core_and_ecr:
        # No qualifying structure: drop every row of this binding mode.
        PixelDBecr = PixelDBecr[PixelDBecr["unique_id"] != uniid]
In [ ]:
PixelDBoecr = PixelDB[PixelDB["longest_continuous_ecr"] > 3]
In [ ]:
In [ ]:
print("Unique Binding Mode",len(PixelDBecr["unique_id"].value_counts()))
print("Exosite and ECR complexes",np.sum(PixelDBecr["longest_continuous_ecr"] > 3))
In [ ]:
In [ ]:
#Is there a link between CATHdb and Binding mode
for ikl in range(0,2):
torun = PixelDB
if ikl == 1:
print("ECR")
torun = PixelDBecr
else:
print("Full")
for test in ["PFAM","uniprot","CATH"]:
CATHdbOver = []
BindingMode = []
for uniid in list(np.unique(torun["cluster_number"])):
sdf = torun[torun["cluster_number"] == uniid]
BindingMode.append(len(sdf["unique_id"].value_counts()))
if (len(sdf["unique_id"].value_counts()) == 1):
continue
AllCATH = []
AllCATHUni = []
for unid in np.unique(sdf["unique_id"]):
CATHuni = []
for cid in sdf[sdf["unique_id"] == unid][test]:
#print(cid.split("_"))
if str(cid) != "nan":
CATHuni += cid.split("_")
AllCATHUni += cid.split("_")
AllCATH.append(list(set(CATHuni)))
Tot = 0
Over = 0
for i in range(0,len(AllCATH)):
for j in range(i+1,len(AllCATH)):
OverT = 0
for v in AllCATH[i]:
if v in AllCATH[j]:
OverT += 1
if (OverT != 0):
Over += 1
#else:
# print(i,j,AllCATH[i],AllCATH[j])
Tot += 1
CATHdbOver.append(float(Over)/float(Tot))
#if (CATHdbOver[-1] < 0.5):
# print(uniid,Over,Tot)
# print(AllCATH)
#if len(sdf["unique_id"].value_counts()) > 6:
# print(uniid,len(sdf["unique_id"].value_counts()))
# print(pd.Series(AllCATHUni).value_counts())
#else:
# break
#break
#
#break
print(len(CATHdbOver))
print("Average Overlap of:",test,np.mean(CATHdbOver))
print("Median Overlap of:",test,np.median(CATHdbOver))
print("Average Nb of Binding mode:",np.mean(BindingMode))
print("Median Nb of Binding mode:",np.median(BindingMode))
print(np.min(BindingMode),np.max(BindingMode))
print("More than one binding mode",np.sum(np.array(BindingMode) > 1))
print("Total cluster",len(BindingMode))
print()
In [ ]:
#How often same PDB different binding mode
for ikl in range(0,2):
torun = PixelDB
if ikl == 1:
print("ECR")
torun = PixelDBecr
else:
print("Full")
tot = 0
AllCluster = []
for uniid in list(np.unique(torun["pdb_id"])):
sdf = torun[torun["pdb_id"] == uniid]
if (len(sdf) == 1):
continue
for ch in list(np.unique(sdf["receptor_chain"])):
ssdf = sdf[sdf["receptor_chain"] == ch]
if (len(ssdf) == 1):
continue
print(uniid,ch,len(ssdf),np.unique(sdf["cluster_number"]))
AllCluster += list(np.unique(sdf["cluster_number"]))
tot += 1
#break
print(len(list(set(AllCluster))))
print("Same receptor engage in multiple peptide binding:",tot)
#break
In [ ]:
sdf
In [ ]:
In [ ]:
In [ ]:
In [ ]:
In [ ]:
#Is there a link between CATHdb and Binding mode
for ikl in range(0,2):
torun = PixelDB
if ikl == 1:
print("ECR")
torun = PixelDBecr
else:
print("Full")
inbm = []
outbm = []
for uniid in list(np.unique(torun["cluster_number"])):
sdf = torun[torun["cluster_number"] == uniid]
sdf = sdf[sdf["mean_seq_iden_in_bm"] > 0]
sdf = sdf[sdf["mean_seq_iden_not_bm"] > 0]
if (len(sdf) == 0):
continue
#die
if (len(sdf["unique_id"].value_counts()) == 1):
continue
plt.scatter(sdf["mean_seq_iden_in_bm"]*100,sdf["mean_seq_iden_not_bm"]*100)
inbm.append(np.mean(sdf["mean_seq_iden_in_bm"]))
outbm.append(np.mean(sdf["mean_seq_iden_not_bm"]))
#print(inbm[-1],uniid)
#if uniid == 7:
# die
plt.xlabel("Mean sequence receptor within Binding mode")
plt.ylabel("Mean sequence receptor not in Binding mode")
plt.plot([-5,105],[-5,105])
plt.xlim((-5,105))
plt.ylim((-5,105))
plt.show()
#break
print("Average seq iden in and out",np.mean(inbm),np.mean(outbm),len(inbm))
plt.hist(inbm,20)
plt.title("In binding mode")
plt.show()
plt.hist(outbm,20)
plt.title("Between binding mode")
plt.show()
print("Median seq iden in and out",np.median(inbm),np.median(outbm),len(inbm))
print("Max and min seq iden ",np.min(inbm),np.max(inbm),len(inbm))
figsize(10,10)
plt.hist(np.array(inbm)*100)
plt.xlim([0,100])
plt.xlabel("Average sequence idendity in binding mode (%)", fontsize=18 )
plt.ylabel("Count", fontsize=18)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()
plt.hist(np.array(outbm)*100)
plt.xlim([0,100])
plt.xlabel("Average sequence idendity between binding mode (%)", fontsize=18 )
plt.ylabel("Count", fontsize=18)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()
break
In [ ]:
#sdf["unique_id"].value_counts()
In [ ]:
In [ ]:
for ikl in range(0,3):
torun = PixelDB
if ikl == 0:
print("Full")
if ikl == 1:
torun = PixelDBecr
print("ECR")
if ikl == 2:
torun = PixelDBoecr
print("ECRdist")
UniquePFAM = []
UniqueUnip = []
UniqueCATH = []
UniprotPerBindingMode = []
CATHPerBindingMode = []
PFAMPerBindingMode = []
StrPerBindingMode = []
for uniid in list(np.unique(torun["unique_id"])):
sdf = torun[torun["unique_id"] == uniid]
#print(sdf["sequence_alignment"])
StrPerBindingMode.append(len(sdf))
Uniuni = []
for cid in sdf["uniprot"]:
#print(cid.split("_"))
if str(cid) != "nan":
Uniuni += cid.split("_")
UniprotPerBindingMode.append(len(list(set(Uniuni))))
CATHuni = []
for cid in sdf["CATH"]:
#print(cid.split("_"))
if str(cid) != "nan":
CATHuni += cid.split("_")
CATHPerBindingMode.append(len(list(set(CATHuni))))
PFAMuni = []
for cid in sdf["PFAM"]:
#print(cid.split("_"))
if str(cid) != "nan":
PFAMuni += cid.split("_")
PFAMPerBindingMode.append(len(list(set(PFAMuni))))
#print("Binding mode with ECR and Core >= 4",len(PixelDBecr["unique_id"].value_counts()))
print("Unique Binding Mode",len(np.unique(torun["unique_id"])))
print("Strc Per Binding Mode",np.mean(StrPerBindingMode),np.median(StrPerBindingMode),np.min(StrPerBindingMode),np.max(StrPerBindingMode))
print("Uniprot Per Binding Mode",np.mean(UniprotPerBindingMode),np.median(UniprotPerBindingMode),np.min(UniprotPerBindingMode),np.max(UniprotPerBindingMode))
print("CATH Per Binding Mode",np.mean(CATHPerBindingMode),np.median(CATHPerBindingMode),np.min(CATHPerBindingMode),np.max(CATHPerBindingMode))
print("PFAM Per Binding Mode",np.mean(PFAMPerBindingMode),np.median(PFAMPerBindingMode),np.min(PFAMPerBindingMode),np.max(PFAMPerBindingMode))
In [ ]:
for ikl in range(0,3):
torun = PixelDB
if ikl == 0:
print("Full")
if ikl == 1:
torun = PixelDBecr
print("ECR")
if ikl == 2:
torun = PixelDBoecr
print("ECRdist")
UniquePFAM = []
UniqueUnip = []
UniqueCATH = []
UniprotPerBindingMode = []
CATHPerBindingMode = []
PFAMPerBindingMode = []
StrPerBindingMode = []
for uniid in list(np.unique(torun["cluster_number"])):
sdf = torun[torun["cluster_number"] == uniid]
#print(sdf["sequence_alignment"])
StrPerBindingMode.append(len(sdf))
Uniuni = []
for cid in sdf["uniprot"]:
#print(cid.split("_"))
if str(cid) != "nan":
Uniuni += cid.split("_")
UniprotPerBindingMode.append(len(list(set(Uniuni))))
CATHuni = []
for cid in sdf["CATH"]:
#print(cid.split("_"))
if str(cid) != "nan":
CATHuni += cid.split("_")
CATHPerBindingMode.append(len(list(set(CATHuni))))
PFAMuni = []
for cid in sdf["PFAM"]:
#print(cid.split("_"))
if str(cid) != "nan":
PFAMuni += cid.split("_")
PFAMPerBindingMode.append(len(list(set(PFAMuni))))
#print("Binding mode with ECR and Core >= 4",len(PixelDBecr["unique_id"].value_counts()))
print("Unique Cluster",len(np.unique(torun["cluster_number"])))
print("Strc Per Cluster",np.mean(StrPerBindingMode),np.median(StrPerBindingMode),np.min(StrPerBindingMode),np.max(StrPerBindingMode))
print("Uniprot Per Cluster",np.mean(UniprotPerBindingMode),np.median(UniprotPerBindingMode),np.min(UniprotPerBindingMode),np.max(UniprotPerBindingMode))
print("CATH Per Cluster",np.mean(CATHPerBindingMode),np.median(CATHPerBindingMode),np.min(CATHPerBindingMode),np.max(CATHPerBindingMode))
print("PFAM Per Cluster",np.mean(PFAMPerBindingMode),np.median(PFAMPerBindingMode),np.min(PFAMPerBindingMode),np.max(PFAMPerBindingMode))
In [ ]:
len(PixelDBecr)
In [ ]:
# NOTE(review): this cell depends entirely on kernel state — `CATHuni` and
# `CATHPerBindingMode` are leftovers from a previous cell's loop, so under
# Restart & Run All it raises NameError (or silently appends to stale data).
# Confirm whether it is scratch work that should be deleted.
for cid in PixelDBecr["CATH"]:
if str(cid) != "nan":
CATHuni += cid.split("_")
CATHPerBindingMode.append(len(list(set(CATHuni))))
In [ ]:
# Annotation diversity (uniprot / CATH / PFAM) per binding mode, ECR subset.
# Bug fix: the loop appended to the full-DB lists (UniprotPerBindingMode,
# CATHPerBindingMode, PFAMPerBindingMode) from the earlier cells, polluting
# those results while the *ECR lists defined here stayed empty.
UniprotPerBindingModeECR = []
CATHPerBindingModeECR = []
PFAMPerBindingModeECR = []
for uniid in list(np.unique(PixelDBecr["unique_id"])):
    sdf = PixelDBecr[PixelDBecr["unique_id"] == uniid]
    print(sdf["sequence_alignment"])
    # Count distinct non-"nan" uniprot strings for this binding mode.
    UniprotPerBindingModeECR.append(np.sum(np.unique(sdf["uniprot"]) != "nan"))
    # CATH codes are "_"-joined inside a single cell; split before de-duplicating.
    CATHuni = []
    for cid in sdf["CATH"]:
        if str(cid) != "nan":
            CATHuni += cid.split("_")
    CATHPerBindingModeECR.append(len(list(set(CATHuni))))
    PFAMuni = []
    for cid in sdf["PFAM"]:
        if str(cid) != "nan":
            PFAMuni += cid.split("_")
    PFAMPerBindingModeECR.append(len(list(set(PFAMuni))))
In [ ]:
plt.hist(CATHPerBindingMode,100)
plt.show()
In [ ]:
#Load PFAM
pdbmap_name = ["PDB","Chain","unk","name","PFAM","uniprot","range"]
pdbmap = pd.read_table(path+"other_files/pdbmap",names=pdbmap_name,delimiter="\t")
pdbmap = pdbmap.drop("unk",axis=1)
pdbmap_name = list(pdbmap.columns.values)
for c in pdbmap_name:
pdbmap[c] = pdbmap[c].str.replace(';', '')
In [ ]:
#cath-b-newest-all
cath_name = ["PDB","v","CATH","range"]
cathb = pd.read_table(path+"other_files/cath-b-newest-all",delimiter=" ",names=cath_name)
In [ ]:
# Coverage of PixelDB annotations (PFAM / CATH / uniprot) relative to the whole
# PDB, for the full set and the two ECR-filtered subsets.
def _token_counts(column):
    """Count "_"-separated annotation tokens over the unique values of a column.

    Returns a dict token -> number of unique column entries containing it.
    Entries whose string form is "nan" (missing annotation) are skipped.
    """
    counts = dict()
    for entry in list(np.unique(column)):
        if str(entry) == "nan":
            continue
        for token in entry.split("_"):
            if token not in counts:
                counts[token] = 0
            counts[token] += 1
    return counts

for ikl in range(0, 3):
    torun = PixelDB
    if ikl == 0:
        print("Full")
    if ikl == 1:
        torun = PixelDBecr
        print("ECR")
    if ikl == 2:
        torun = PixelDBoecr
        print("ECRdist")
    # The three count dicts were built by three identical inline loops; now one helper.
    UniquePFAM = _token_counts(torun["PFAM"])
    UniqueCATH = _token_counts(torun["CATH"])
    UniqueUniprot = _token_counts(torun["uniprot"])
    print("PDB has this many unique PFAM", len(pdbmap["PFAM"].value_counts()))
    print("PixelDB has this many unique PFAM", len(UniquePFAM))
    print("Percentage = ", 100.0 * len(UniquePFAM) / float(len(pdbmap["PFAM"].value_counts())))
    print("PDB has this many unique uniprot", len(pdbmap["uniprot"].value_counts()))
    print("PixelDB has this many unique uniprot", len(UniqueUniprot))
    print("Percentage = ", 100.0 * len(UniqueUniprot) / float(len(pdbmap["uniprot"].value_counts())))
    print("PDB has this many unique CATH", len(cathb["CATH"].value_counts()))
    print("PixelDB has this many unique CATH", len(UniqueCATH))
    print("Percentage = ", 100.0 * len(UniqueCATH) / float(len(cathb["CATH"].value_counts())))
    # Bug fix: these three report loops used Python-2 `print w, d[w]` statements,
    # a SyntaxError under the Python 3 print() used everywhere else in this notebook.
    print("Most frequence Cath")
    for w in sorted(UniqueCATH, key=UniqueCATH.get, reverse=True)[0:3]:
        print(w, UniqueCATH[w])
    print("Most frequence uniprot")
    for w in sorted(UniqueUniprot, key=UniqueUniprot.get, reverse=True)[0:3]:
        print(w, UniqueUniprot[w])
    print("Most frequence PFAM")
    for w in sorted(UniquePFAM, key=UniquePFAM.get, reverse=True)[0:3]:
        print(w, UniquePFAM[w])
In [ ]:
# Summary statistics per structural characteristic, averaged within each
# binding mode (unique_id) so heavily-populated modes are not over-weighted.
allCarac = ["resolution","receptor_length","peptide_length","size_of_binding_mode","longest_continuous_core","longest_continuous_ecr"]
for v in allCarac:
# One mean per binding mode.
AllMean = []
for uniid in list(np.unique(PixelDB["unique_id"])):
sdf = PixelDB[PixelDB["unique_id"] == uniid]
AllMean.append(np.mean(sdf[v]))
# NOTE(review): Avg/Std/Median are over per-binding-mode means, but Min/Max
# are over the raw per-structure column (PixelDB[v]) — mixed granularity;
# confirm this is intentional.
print("%30s Avg:%6.2f Std:%6.2f Median:%6.2f Min:%6.2f Max:%7.2f" % (v,np.mean(AllMean),np.std(AllMean),np.median(AllMean),np.min(PixelDB[v]),np.max(PixelDB[v])))
In [ ]:
allCarac = ["resolution","receptor_length","peptide_length","size_of_binding_mode","longest_continuous_core","longest_continuous_ecr"]
for v in allCarac:
AllMean = []
for uniid in list(np.unique(PixelDBecr["unique_id"])):
sdf = PixelDBecr[PixelDBecr["unique_id"] == uniid]
AllMean.append(np.mean(sdf[v]))
print("%30s Avg:%6.2f Std:%6.2f Median:%6.2f Min:%6.2f Max:%7.2f" % (v,np.mean(AllMean),np.std(AllMean),np.median(AllMean),np.min(PixelDBecr[v]),np.max(PixelDBecr[v])))
In [ ]:
allCarac = ["resolution","receptor_length","peptide_length","size_of_binding_mode","longest_continuous_core","longest_continuous_ecr"]
PixelDBoecr = PixelDB[PixelDB["longest_continuous_ecr"] > 3]
for v in allCarac:
AllMean = []
for uniid in list(np.unique(PixelDBoecr["unique_id"])):
sdf = PixelDBoecr[PixelDBoecr["unique_id"] == uniid]
AllMean.append(np.mean(sdf[v]))
print("%30s Avg:%6.2f Std:%6.2f Median:%6.2f Min:%6.2f Max:%7.2f" % (v,np.mean(AllMean),np.std(AllMean),np.median(AllMean),np.min(PixelDBoecr[v]),np.max(PixelDBoecr[v])))
In [ ]:
# Longest peptides. Bug fix: DataFrame.sort() was removed in pandas 0.20 — use sort_values().
PixelDB.sort_values(["peptide_length"],ascending=False).head(5)[["name","size_of_binding_mode","cluster_number","receptor_length","peptide_length"]]
In [ ]:
# Shortest receptors. Bug fix: DataFrame.sort() was removed in pandas 0.20 — use sort_values().
PixelDB.sort_values(["receptor_length"]).head(5)[["name","size_of_binding_mode","cluster_number","receptor_length","peptide_length"]]
In [ ]:
def bootstrap(AllCore, AllECR, it=10000, labels=None):
    """Paired bootstrap of per-label composition percentages.

    Parameters
    ----------
    AllCore, AllECR : dict
        label -> list of per-binding-mode fractions; for each label both
        dicts are assumed to hold lists of the same length (paired samples).
    it : int
        Number of bootstrap iterations.
    labels : list, optional
        Labels to resample. Defaults to the module-level ``Label`` list,
        preserving backward compatibility with the calling cells.

    Returns
    -------
    (BtCore, BtECR) : dicts mapping label -> list of ``it`` bootstrap
        percentages (each iteration is normalized to sum to 100 across labels).

    Bug fixes vs. the original version:
    * resample indices were drawn from ``range(len(AllECR))`` — the number of
      labels — instead of ``range(len(AllECR[aa]))`` — the number of samples —
      so only the first ``len(AllECR)`` observations could ever be selected;
    * the iteration loop variable shadowed the ``it`` parameter.
    """
    if labels is None:
        labels = Label
    BtECR = dict()
    BtCore = dict()
    for lab in labels:
        BtECR[lab] = []
        BtCore[lab] = []
    for _ in range(it):
        ECRMean = []
        CoreMean = []
        for aa in labels:
            # Same indices for ECR and Core: the resampling is paired.
            index = np.random.randint(len(AllECR[aa]), size=len(AllECR[aa]))
            ECRMean.append(np.sum(np.array(AllECR[aa])[index]))
            CoreMean.append(np.sum(np.array(AllCore[aa])[index]))
        ECRMean = np.array(ECRMean) / np.sum(ECRMean) * 100.0
        CoreMean = np.array(CoreMean) / np.sum(CoreMean) * 100.0
        for i in range(len(ECRMean)):
            BtECR[labels[i]].append(ECRMean[i])
            BtCore[labels[i]].append(CoreMean[i])
    return (BtCore, BtECR)
In [ ]:
AllMeanData = dict()
In [ ]:
PixelDBoecr = PixelDB[PixelDB["longest_continuous_ecr"] > 3]
In [ ]:
LegendLabel = ["Solvent Exposure [0-9]","Levy Classification","Stride Classifcation", "Amino Acid compostion"]
ToTest = ["percent_exposed_alignment","levy_alignment","stride","sequence_alignment"]
#LegendLabel = ["Solvent Exposure [0-9]","Stride Classifcation"]
#ToTest = ["percent_exposed_alignment","stride"]
for (alin,LL) in zip(ToTest,LegendLabel):
if alin not in list(PixelDBoecr.columns.values):
continue
AllMeanData[LL] = dict()
print(alin)
AllECR = dict()
AllCore = dict()
SumCore = 0.0
SumECR = 0.0
#Find Label
Label = []
for v in PixelDBoecr[alin]:
for i in range(0,len(v)):
if v[i] == "-":
continue
Label.append(v[i])
Label = sorted(list(set(Label)))
if alin == "sequence_alignment":
Label = myAmino
df = pd.DataFrame()
print(alin,Label)
count = 0
for uniid in list(np.unique(PixelDBoecr["unique_id"])):
Tecr = dict()
Tcore = dict()
Totecr = 0.0
Totcore = 0.0
sdf = PixelDBoecr[PixelDBoecr["unique_id"] == uniid]
for (ecr,ali,lecr) in zip(np.array(sdf["core_ecr_alignment"]),np.array(sdf[alin]),np.array(sdf["longest_continuous_ecr"])):
if lecr < 4:
continue
#print(ecr,ali[0])
#ali = ali[0]
#print(ali,lecr)
for i in range(0,len(ecr)):
#if alin == "sequence_alignment":
# if levy[i] != "C":
# continue
if ecr[i] == "E":
if ali[i] not in Tecr:
Tecr[ali[i]] = 0.0
Tecr[ali[i]] += 1.0
Totecr += 1.0
SumECR += 1.0
if ecr[i] == "C":
if ali[i] not in Tcore:
Tcore[ali[i]] = 0.0
Tcore[ali[i]] += 1.0
Totcore += 1.0
SumCore += 1.0
#print(ali[i],ecr[i])
#break
for aa in Label:
if aa not in Tecr:
Tecr[aa] = 0.0
#print(aa,Tecr[aa],Totecr)
if aa not in AllECR:
AllECR[aa] = []
AllECR[aa].append(Tecr[aa] / float(Totecr))
if aa not in Tcore:
Tcore[aa] = 0.0
if aa not in AllCore:
AllCore[aa] = []
AllCore[aa].append(Tcore[aa] / float(Totcore))
count += 1
df = df.append({'class': 'Core', 'AA': aa, 'percentage': Tcore[aa] / float(Totcore)}, ignore_index=True)
df = df.append({'class': 'ECR', 'AA': aa, 'percentage': Tecr[aa] / float(Totecr)}, ignore_index=True)
#break
print(SumCore,SumECR)
ECRMean = []
CoreMean = []
(BtCore,BtECR) = bootstrap(AllCore,AllECR,it=10000)
for aa in Label:
ECRMean.append(np.mean(BtECR[aa]))
CoreMean.append(np.mean(BtCore[aa]))
plt.scatter(CoreMean,ECRMean)
for i in range(0,len(Label)):
ttest = stats.ttest_rel(BtECR[Label[i]], BtCore[Label[i]])[1]
print("%2d %2s P.val=%10.8f Core:%6.2f ECR:%6.2f" % (i,Label[i],ttest,CoreMean[i],ECRMean[i]))
#print(i,Label[i],ttest,"CORE:%4.2f " % (CoreMean[i]),"ECR",ECRMean[i])
if Label[i] not in AllMeanData[LL]:
AllMeanData[LL][Label[i]] = dict()
AllMeanData[LL][Label[i]]["ECR"] = [ECRMean[i],np.std(BtECR[Label[i]]) ]
AllMeanData[LL][Label[i]]["Core"] = [CoreMean[i],np.std(BtCore[Label[i]])]
if ttest < 0.05:
plt.text(CoreMean[i]+0.15,ECRMean[i]+0.15,Label[i])
Lim = [-3,int(np.max(ECRMean+CoreMean)*1.1)]
#print(Lim)
#Lim = [-1,10]
plt.plot(Lim,Lim,c="black")
plt.xlim(Lim)
plt.ylim(Lim)
plt.xlabel("Core "+LL+" %")
plt.ylabel("ECR "+LL+" %")
plt.show()
sns.boxplot(x="AA",hue="class",y="percentage",data=df)
plt.show()
In [ ]:
Label = myAmino
Color = ["red"]*3+["blue"]*2+["purple"]*4+["black"]*3+["green"]*5+["yellow"]*3
LL= "Amino Acid buried"
AllMeanData[LL] = dict()
for i in range(0,len(Label)):
plt.scatter(CoreMean[i],ECRMean[i],c=Color[i])
ttest = stats.ttest_rel(BtECR[Label[i]], BtCore[Label[i]])[1]
print("%2d %2s P.val=%10.8f Core:%6.2f ECR:%6.2f" % (i,Label[i],ttest,CoreMean[i],ECRMean[i]))
if Label[i] not in AllMeanData[LL]:
AllMeanData[LL][Label[i]] = dict()
AllMeanData[LL][Label[i]]["ECR"] = [ECRMean[i],np.std(BtECR[Label[i]]) ]
AllMeanData[LL][Label[i]]["Core"] = [CoreMean[i],np.std(BtCore[Label[i]])]
#print(i,Label[i],ttest,"CORE:%4.2f " % (CoreMean[i]),"ECR",ECRMean[i])
#if ttest < 0.05:
plt.text(CoreMean[i]+0.15,ECRMean[i]+0.15,Label[i],color=Color[i])
Lim = [-3,int(np.max(ECRMean+CoreMean)*1.1)]
#print(Lim)
Lim = [-1,15]
plt.plot(Lim,Lim,c="black")
plt.xlim(Lim)
plt.ylim(Lim)
plt.xlabel("Core composition (%)")
plt.ylabel("ECR composition (%)")
plt.show()
In [ ]:
In [ ]:
AllECR = dict()
AllCore = dict()
AllSurf = dict()
AllInte = dict()
SumCore = 0.0
SumECR = 0.0
SumSurf = 0.0
SumInte = 0.0
for aa in myAmino:
AllECR[aa] = []
AllCore[aa] = []
AllSurf[aa] = []
AllInte[aa] = []
Label = myAmino
for uniid in list(np.unique(PixelDBoecr["unique_id"])):
sdf = PixelDBoecr[PixelDBoecr["unique_id"] == uniid]
Tecr = dict()
Tcore = dict()
Tsurf = dict()
Tinte = dict()
Totecr = 0.0
Totcore = 0.0
Totsurf = 0.0
Totinte = 0.0
for v in sdf["EXOSITE_aa"]:
for aa in v.split(";"):
sp = aa.split(":")
if sp[0] not in Tecr:
Tecr[sp[0]] = 0
Tecr[sp[0]] += float(sp[1])
Totecr += float(sp[1])
SumECR += float(sp[1])
for v in sdf["surface_aa"]:
for aa in v.split(";"):
sp = aa.split(":")
if sp[0] not in Tsurf:
Tsurf[sp[0]] = 0
Tsurf[sp[0]] += float(sp[1])
Totsurf += float(sp[1])
SumSurf += float(sp[1])
for v in sdf["interior_aa"]:
for aa in v.split(";"):
sp = aa.split(":")
if sp[0] not in Tinte:
Tinte[sp[0]] = 0
Tinte[sp[0]] += float(sp[1])
Totinte += float(sp[1])
SumInte += float(sp[1])
for v in sdf["COREBINDING_aa"]:
for aa in v.split(";"):
sp = aa.split(":")
if sp[0] not in Tcore:
Tcore[sp[0]] = 0
Tcore[sp[0]] += float(sp[1])
Totcore += float(sp[1])
SumCore += float(sp[1])
Label = myAmino
for aa in Label:
if aa not in Tecr:
Tecr[aa] = 0.0
#print(aa,Tecr[aa],Totecr)
if aa not in AllECR:
AllECR[aa] = []
if (Totecr != 0):
AllECR[aa].append(Tecr[aa] / float(Totecr))
else:
AllECR[aa].append(0)
if aa not in Tcore:
Tcore[aa] = 0.0
if aa not in AllCore:
AllCore[aa] = []
if Totcore != 0:
AllCore[aa].append(Tcore[aa] / float(Totcore))
else:
AllCore[aa].append(0)
if aa not in Tsurf:
Tsurf[aa] = 0.0
if aa not in AllSurf:
AllSurf[aa] = []
if Totsurf != 0:
AllSurf[aa].append(Tsurf[aa] / float(Totsurf))
else:
AllSurf[aa].append(0)
if aa not in Tinte:
Tinte[aa] = 0.0
if aa not in AllInte:
AllInte[aa] = []
if Totinte != 0:
AllInte[aa].append(Tinte[aa] / float(Totinte))
else:
AllInte[aa].append(0)
ECRMean = []
CoreMean = []
SurfMean = []
InteMean = []
(BtCore,BtECR) = bootstrap(AllCore,AllECR,it=10000)
(BtInte,BtSurf) = bootstrap(AllInte,AllSurf,it=10000)
Label = myAmino
for aa in Label:
ECRMean.append(np.mean(BtECR[aa]))
CoreMean.append(np.mean(BtCore[aa]))
SurfMean.append(np.mean(BtSurf[aa]))
InteMean.append(np.mean(BtInte[aa]))
In [ ]:
print(SumCore,SumECR,SumSurf,SumInte)
In [ ]:
Color = ["red"]*3+["blue"]*2+["purple"]*4+["black"]*3+["green"]*5+["yellow"]*3
LL= "Binding Site"
AllMeanData[LL] = dict()
for i in range(0,len(Label)):
ttest = stats.ttest_rel(BtSurf[Label[i]], BtCore[Label[i]])[1]
print("%2d %2s P.val=%10.8f Core:%4.2f ECR:%4.2f Surf:%4.2f Inte:%.2f" % (i,Label[i],ttest,CoreMean[i],ECRMean[i],SurfMean[i],InteMean[i]))
if Label[i] not in AllMeanData[LL]:
AllMeanData[LL][Label[i]] = dict()
AllMeanData[LL][Label[i]]["ECR"] = [ECRMean[i],np.std(BtECR[Label[i]]) ]
AllMeanData[LL][Label[i]]["Core"] = [CoreMean[i],np.std(BtCore[Label[i]])]
AllMeanData[LL][Label[i]]["Surf"] = [SurfMean[i],np.std(BtSurf[Label[i]])]
AllMeanData[LL][Label[i]]["Inte"] = [InteMean[i],np.std(BtInte[Label[i]])]
In [ ]:
AllMean = [CoreMean,ECRMean,SurfMean,InteMean]
MeanLab = ["CoreBinding","Exosite","Surface","Interior"]
AllMSE = dict()
AllComp = dict()
for (arr1,lab1) in zip(AllMean,MeanLab):
AllMSE[lab1] = dict()
AllComp[lab1] = dict()
for i in range(0,20):
AllComp[lab1][myAmino[i]] = arr1[i]
for (arr2,lab2) in zip(AllMean,MeanLab):
plt.scatter(arr1,arr2)
plt.xlabel(lab1)
plt.ylabel(lab2)
for i in range(0,20):
plt.text(arr1[i]+0.15,arr2[i]+0.15,Label[i],color=Color[i])
plt.xlim(0,20)
plt.ylim(0,20)
plt.plot([0,100],[0,100])
plt.show()
#AllMSE[lab1][lab2] = np.sqrt(np.mean(np.power(np.array(arr1)-np.array(arr2),2)))
AllMSE[lab1][lab2] = np.corrcoef(arr1,arr2)[0][1]
print(lab1,lab2,np.sqrt(np.mean(np.power(np.array(arr1)-np.array(arr2),2))),np.corrcoef(arr1,arr2)[0][1])
sns.clustermap(pd.DataFrame(AllMSE))
plt.show()
sns.clustermap(pd.DataFrame(AllComp))
plt.show()
sns.heatmap(pd.DataFrame(AllComp).transpose()[myAmino].transpose(),annot=True)
plt.show()
#print("Core vs Exosite %.2f" % (np.sqrt(np.mean(np.power(np.array(CoreMean)-np.array(ECRMean),2)))))
#print("Core vs Surface %.2f" % (np.sqrt(np.mean(np.power(np.array(CoreMean)-np.array(SurfMean),2)))))
#print("Exosite vs Surface %.2f" % (np.sqrt(np.mean(np.power(np.array(ECRMean)-np.array(SurfMean),2)))))
In [ ]:
In [ ]:
In [ ]:
AllECR = dict()
AllCore = dict()
AllSurf = dict()
AllInte = dict()
SumCore = 0.0
SumECR = 0.0
SumSurf = 0.0
SumInte = 0.0
for aa in myAmino:
AllECR[aa] = []
AllCore[aa] = []
AllSurf[aa] = []
AllInte[aa] = []
Label = ['B', 'C', 'E', 'G', 'H', 'T', 'b']
for uniid in list(np.unique(PixelDBoecr["unique_id"])):
sdf = PixelDBoecr[PixelDBoecr["unique_id"] == uniid]
Tecr = dict()
Tcore = dict()
Tsurf = dict()
Tinte = dict()
Totecr = 0.0
Totcore = 0.0
Totsurf = 0.0
Totinte = 0.0
for v in sdf["EXOSITE_ss"]:
for aa in v.split(";"):
sp = aa.split(":")
if sp[0] not in Tecr:
Tecr[sp[0]] = 0
Tecr[sp[0]] += float(sp[1])
Totecr += float(sp[1])
SumECR += float(sp[1])
for v in sdf["surface_ss"]:
for aa in v.split(";"):
sp = aa.split(":")
if sp[0] not in Tsurf:
Tsurf[sp[0]] = 0
Tsurf[sp[0]] += float(sp[1])
Totsurf += float(sp[1])
SumSurf += float(sp[1])
for v in sdf["interior_ss"]:
for aa in v.split(";"):
sp = aa.split(":")
if sp[0] not in Tinte:
Tinte[sp[0]] = 0
Tinte[sp[0]] += float(sp[1])
Totinte += float(sp[1])
SumInte += float(sp[1])
for v in sdf["COREBINDING_ss"]:
for aa in v.split(";"):
sp = aa.split(":")
if sp[0] not in Tcore:
Tcore[sp[0]] = 0
Tcore[sp[0]] += float(sp[1])
Totcore += float(sp[1])
SumCore += float(sp[1])
for aa in Label:
if aa not in Tecr:
Tecr[aa] = 0.0
#print(aa,Tecr[aa],Totecr)
if aa not in AllECR:
AllECR[aa] = []
if (Totecr != 0):
AllECR[aa].append(Tecr[aa] / float(Totecr))
else:
AllECR[aa].append(0)
if aa not in Tcore:
Tcore[aa] = 0.0
if aa not in AllCore:
AllCore[aa] = []
if Totcore != 0:
AllCore[aa].append(Tcore[aa] / float(Totcore))
else:
AllCore[aa].append(0)
if aa not in Tsurf:
Tsurf[aa] = 0.0
if aa not in AllSurf:
AllSurf[aa] = []
if Totsurf != 0:
AllSurf[aa].append(Tsurf[aa] / float(Totsurf))
else:
AllSurf[aa].append(0)
if aa not in Tinte:
Tinte[aa] = 0.0
if aa not in AllInte:
AllInte[aa] = []
if Totinte != 0:
AllInte[aa].append(Tinte[aa] / float(Totinte))
else:
AllInte[aa].append(0)
ECRMean = []
CoreMean = []
SurfMean = []
InteMean = []
(BtCore,BtECR) = bootstrap(AllCore,AllECR,it=10000)
(BtInte,BtSurf) = bootstrap(AllInte,AllSurf,it=10000)
for aa in Label:
ECRMean.append(np.mean(BtECR[aa]))
CoreMean.append(np.mean(BtCore[aa]))
SurfMean.append(np.mean(BtSurf[aa]))
InteMean.append(np.mean(BtInte[aa]))
In [ ]:
AllMean = [CoreMean,ECRMean,SurfMean,InteMean]
MeanLab = ["CoreBinding","Exosite","Surface","Interior"]
AllMSE = dict()
AllComp = dict()
for (arr1,lab1) in zip(AllMean,MeanLab):
AllMSE[lab1] = dict()
AllComp[lab1] = dict()
for i in range(0,len(Label)):
AllComp[lab1][Label[i]] = arr1[i]
for (arr2,lab2) in zip(AllMean,MeanLab):
#plt.scatter(arr1,arr2)
#plt.xlabel(lab1)
#plt.ylabel(lab2)
#for i in range(0,20):
# plt.text(arr1[i]+0.15,arr2[i]+0.15,Label[i],color=Color[i])
#plt.show()
AllMSE[lab1][lab2] = np.sqrt(np.mean(np.power(np.array(arr1)-np.array(arr2),2)))
print(lab1,lab2,np.sqrt(np.mean(np.power(np.array(arr1)-np.array(arr2),2))))
sns.clustermap(pd.DataFrame(AllMSE))
plt.show()
sns.heatmap(pd.DataFrame(AllMSE),annot=True)
plt.show()
sns.clustermap(pd.DataFrame(AllComp))
plt.show()
sns.heatmap(pd.DataFrame(AllComp).transpose()[Label].transpose(),annot=True)
plt.show()
#print("Core vs Exosite %.2f" % (np.sqrt(np.mean(np.power(np.array(CoreMean)-np.array(ECRMean),2)))))
#print("Core vs Surface %.2f" % (np.sqrt(np.mean(np.power(np.array(CoreMean)-np.array(SurfMean),2)))))
#print("Exosite vs Surface %.2f" % (np.sqrt(np.mean(np.power(np.array(ECRMean)-np.array(SurfMean),2)))))
In [ ]:
Color = ["red"]*3+["blue"]*2+["purple"]*4+["black"]*3+["green"]*5+["yellow"]*3
LL= "Binding Site SS"
AllMeanData[LL] = dict()
for i in range(0,len(Label)):
ttest = stats.ttest_rel(BtSurf[Label[i]], BtCore[Label[i]])[1]
print("%2d %2s P.val=%10.8f Core:%4.2f ECR:%4.2f Surf:%4.2f Inte:%.2f" % (i,Label[i],ttest,CoreMean[i],ECRMean[i],SurfMean[i],InteMean[i]))
if Label[i] not in AllMeanData[LL]:
AllMeanData[LL][Label[i]] = dict()
AllMeanData[LL][Label[i]]["ECR"] = [ECRMean[i],np.std(BtECR[Label[i]]) ]
AllMeanData[LL][Label[i]]["Core"] = [CoreMean[i],np.std(BtCore[Label[i]])]
AllMeanData[LL][Label[i]]["Surf"] = [SurfMean[i],np.std(BtSurf[Label[i]])]
AllMeanData[LL][Label[i]]["Inte"] = [InteMean[i],np.std(BtInte[Label[i]])]
In [ ]:
figsize(20,8)
Test = ["Amino Acid","Stride Classifcation","Solvent Exposure [0-9]","Binding Site","Binding Site SS"]
Test = AllMeanData.keys()
Label = Test
Order = ["Core","ECR","Surf","Inte"]
Col = ["darkred","darkblue","gray","black"]
pos = 0.0
for test in Test:
pos = 0.0
pos += 1.0
sub = AllMeanData[test]
#print(len(sub))
Tlab = sorted(sub)
if len(Tlab) == 20:
Tlab = myAmino
Xlabpos = []
for t in Tlab:
Tpos = []
for o in Order:
if o not in sub[t]:
continue
plt.bar([pos],sub[t][o][0],color=Col[Order.index(o)])
plt.plot([pos+0.4,pos+0.4],[sub[t][o][0],sub[t][o][0]+sub[t][o][1]],color="black")
plt.plot([pos+0.1,pos+0.7],[sub[t][o][0]+sub[t][o][1]]*2,color="black")
Tpos.append(pos)
pos += 1.0
Xlabpos.append(np.mean(Tpos)+0.5)
pos += 1
#print(t)
#break
plt.xticks(Xlabpos, Tlab)
plt.title(test)
plt.show()
In [ ]:
AllMeanData.keys()
In [ ]:
mpl.style.use('seaborn-whitegrid')
In [ ]:
figsize(20,8)
Test = ["Binding Site","Amino Acid compostion","Solvent Exposure [0-9]"]
Label = ["Receptor Amino Acid Composition (%)","Peptide Amino Acid Composition (%)","Solvent Exposure (%)"]
Order = ["Core","ECR","Surf","Inte"]
Col = ["darkred","darkblue","gray","black"]
Color = ["red"]*3+["blue"]*2+["purple"]*4+["black"]*3+["green"]*5+["yellow"]*3
pos = 0.0
for (test,lab) in zip(Test,Label):
pos = 0.0
pos += 1.0
sub = AllMeanData[test]
#print(len(sub))
Tlab = sorted(sub)
if len(Tlab) == 20:
Tlab = myAmino
Xlabpos = []
for t in Tlab:
Tpos = []
for o in Order:
if o not in sub[t]:
continue
plt.bar([pos],sub[t][o][0],color=Col[Order.index(o)])
plt.plot([pos,pos],[sub[t][o][0],sub[t][o][0]+sub[t][o][1]],color="black")
plt.plot([pos-0.3,pos+0.3],[sub[t][o][0]+sub[t][o][1]]*2,color="black")
Tpos.append(pos)
pos += 1.0
Xlabpos.append(np.mean(Tpos))
pos += 1
#print(t)
#break
if test == "Solvent Exposure [0-9]":
Tlab = ["[0-10[","[10-20[","[20-30[","[30-40[","[40-50[","[50-60[","[60-70["]
plt.xticks(Xlabpos, Tlab)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.xlabel(lab,size=18)
plt.ylabel("Residue distribution %",rotation=90)
plt.show()
#break
In [ ]:
In [ ]:
In [ ]:
#Relationship between buried and aa type
AllDF = []
for (t,name) in zip(range(0,3),["All","Core","ECR"]):
Count = dict()
Tot = dict()
for aa in myAmino:
Count[aa] = dict()
Tot[aa] = 0
for i in range(0,9):
Count[aa][str(i)] = 0
torun = PixelDBecr
if t == 0:
torun = PixelDB
for (seq,bury,ecr,size) in zip(torun["sequence_alignment"],torun["percent_exposed_alignment"],torun["core_ecr_alignment"],torun["size_of_binding_mode"]):
#print(seq,bury)
for i in range(0,len(seq)):
if seq[i] == "-":
continue
if t == 1:
if ecr[i] == "E":
continue
if t == 2:
if ecr[i] == "C":
continue
if seq[i] not in Count:
Count[seq[i]] = dict()
if bury[i] not in Count[seq[i]]:
Count[seq[i]][bury[i]] = 0
#print(i,seq[i],bury[i])
Count[seq[i]][bury[i]] += 1.0/float(size)
Tot[seq[i]] += 1/float(size)
#break
NormC = dict()
for aa in Tot:
NormC[aa] = dict()
print(aa,Tot[aa])
for b in Count[aa]:
bt = b
if int(bt) > 4:
bt = "5+"
if bt not in NormC[aa]:
NormC[aa][bt] = 0
NormC[aa][bt] += int(float(Count[aa][b]) / float(Tot[aa])*100.0+0.5)
print(name)
figsize(8,8)
df = pd.DataFrame(NormC)
df = df[myAmino]
AllDF.append(df)
sns.heatmap(df[myAmino],vmax=70,annot=True)
plt.title(name )
plt.ylabel("Amino acid binned exposure distribution")
plt.show()
In [ ]:
sns.heatmap(AllDF[1]-AllDF[2],annot=True)
In [ ]:
plt.scatter(PixelDBoecr["peptide_length"],PixelDBoecr["longest_continuous_ecr"])
plt.xlabel("Peptide Length")
plt.ylabel("ECR Length")
plt.show()
In [ ]:
plt.scatter(PixelDB["peptide_length"],PixelDB["longest_continuous_ecr"])
plt.xlabel("Peptide Length")
plt.ylabel("ECR Length")
plt.show()
In [ ]:
print(PixelDBecr[PixelDBecr["longest_continuous_ecr"]>=4]["peptide_length"].mean())
print(PixelDBecr[PixelDBecr["longest_continuous_ecr"]>=4]["peptide_length"].min())
print(PixelDBecr[PixelDBecr["longest_continuous_ecr"]>=4]["peptide_length"].max())
plt.hist(PixelDBecr[PixelDBecr["longest_continuous_ecr"]>=4]["peptide_length"].values,25)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.xlabel("Peptide Length")
plt.show()
In [ ]:
Count = dict()
for (ecr,pl) in zip(PixelDBecr["longest_continuous_ecr"],PixelDBecr["peptide_length"]):
if ecr not in Count:
Count[ecr] = dict()
if pl not in Count[ecr]:
Count[ecr][pl]=0
Count[ecr][pl] += 1
In [ ]:
sns.heatmap(pd.DataFrame(Count).fillna(value=0).transpose(),annot=True)
plt.xlabel("Peptide Length")
plt.ylabel("ECR Length")
plt.show()
In [ ]:
#Find longest ecr
PixelDB.sort(["longest_continuous_ecr"],ascending=False).head(5)[["name","size_of_binding_mode","unique_id","receptor_length","peptide_length"]]
In [ ]:
#Search for traf
tolook = "2.60.210.10"
for (v,uniid,seq) in zip(np.array(PixelDB["CATH"]),np.array(PixelDB["unique_id"]),np.array(PixelDB["sequence_alignment"])):
if v == nan:
continue
if re.search(tolook,str(v)):
print(v,uniid,seq)
In [ ]:
#ECR contact to receptor
# Fraction of ECR residues ("e"/"E" in the core/ecr string) whose binned
# solvent-exposure digit is <= myCut, i.e. that are buried against the receptor.
myCut = 3
tot = 0
low = 0
exposures = np.array(PixelDB["percent_exposed_alignment"])
assignments = np.array(PixelDB["core_ecr_alignment"])
for (exp, seq) in zip(exposures, assignments):
    for i in range(len(exp)):
        label = seq[i]
        if label == "-":
            # Gap position in the alignment: nothing to classify.
            continue
        if label in ("e", "E"):
            tot += 1
            if int(exp[i]) <= myCut:
                low += 1
print(tot, low)
print(float(low) / float(tot))
In [ ]:
# Inspect the available PixelDB columns before the crystal-contact analysis.
PixelDB.columns
In [ ]:
# Load per-complex crystal-packing data: PDB id, chain, contact fraction CP,
# and total contact count.
# Fix: delimiter="\s" is a regex matching exactly ONE whitespace character, so
# any run of consecutive spaces in the whitespace-aligned file produces
# spurious empty fields (and pandas warns about the regex separator falling
# back to the python engine); r"\s+" collapses runs of whitespace as intended.
carac = pd.read_table(path + "other_files/AllCp.dat",
                      names=["PDB", "Chain", "CP", "Tot_cont"],
                      delimiter=r"\s+")
In [ ]:
# Distribution of crystal-contact fractions, shown as a percentage of the
# peptide surface in contact with a symmetry-related complex.
figsize(10,10)
cp_percent = carac["CP"] * 100
plt.hist(cp_percent, 20)
plt.xlabel("Peptide surface contacting a symmetry-related complex (%)", fontsize=18 )
plt.ylabel("Count", fontsize=18)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()
In [ ]:
# Survival curve: for every threshold from 1% to 99%, plot the fraction of
# complexes whose crystal-contact percentage exceeds it.
n_complexes = float(len(carac["CP"]))
for i in range(1, 100):
    frac_above = np.sum(carac["CP"] * 100 > i) / n_complexes
    plt.scatter(i, frac_above)
In [ ]:
# Report, for a handful of cutoffs, both the fraction and the raw count of
# complexes whose crystal-contact percentage exceeds the cutoff.
n_complexes = float(len(carac["CP"]))
for i in [5, 10, 15, 20, 50]:
    n_above = np.sum(carac["CP"] * 100 > i)
    print(i, n_above / n_complexes, n_above)
In [ ]:
#Most frequent PFAM in multiple binding mod
# For every receptor cluster with at least 10 distinct binding modes, list
# the CATH identifiers present in the cluster.
# Fix: the membership test consulted `UniquePFAM` (a leftover name from the
# PFAM version of this cell) instead of `UniqueCATH`; as written it either
# raises a NameError or consults a stale dict, so `UniqueCATH[v] += 1` can
# hit an uninitialised key (KeyError).
for bla in list(np.unique(PixelDB["cluster_number"])):
    sdf = PixelDB[PixelDB["cluster_number"] == bla]
    if (len(np.unique(sdf["unique_id"])) < 10):
        continue
    UniqueCATH = dict()
    for uniid in list(np.unique(sdf["CATH"])):
        if str(uniid) == "nan":
            continue
        for v in uniid.split("_"):
            if v not in UniqueCATH:
                UniqueCATH[v] = 0
            UniqueCATH[v] += 1
    print(bla, len(np.unique(sdf["unique_id"])), list(set(UniqueCATH)))
In [ ]:
#Does receptor cluster with multiple binding mode have more CATH
# Scatter: number of distinct binding modes vs. number of distinct CATH ids,
# one point per receptor cluster.
# Fix: same UniquePFAM/UniqueCATH mix-up as the cell above — the membership
# test must consult UniqueCATH for the counter initialisation to be correct.
for bla in list(np.unique(PixelDB["cluster_number"])):
    sdf = PixelDB[PixelDB["cluster_number"] == bla]
    UniqueCATH = dict()
    for uniid in list(np.unique(sdf["CATH"])):
        if str(uniid) == "nan":
            continue
        for v in uniid.split("_"):
            if v not in UniqueCATH:
                UniqueCATH[v] = 0
            UniqueCATH[v] += 1
    plt.scatter(len(np.unique(sdf["unique_id"])), len(list(set(UniqueCATH))))
plt.xlabel("Number of binding mode")
plt.ylabel("Number of CATH")
In [ ]:
# All binding-mode identifiers present in the ECR-containing subset, sorted.
sorted(list(np.unique(PixelDBoecr["unique_id"])))
In [ ]:
# Print the binding-mode identifiers of every receptor cluster that has at
# least five distinct binding modes.
# Note: `sdf` is deliberately kept as the loop variable name because the next
# cell inspects the leaked value.
for clu in list(np.unique(PixelDB["cluster_number"])):
    sdf = PixelDB[PixelDB["cluster_number"] == clu]
    mode_ids = np.unique(sdf["unique_id"])
    if len(mode_ids) < 5:
        continue
    print(list(mode_ids))
In [ ]:
# NOTE(review): relies on `sdf` leaking from the loop in the previous cell —
# this shows the binding modes of whichever cluster happened to be processed
# last, which breaks under Restart & Run All if that cell changes.
np.unique(sdf["unique_id"])
In [ ]:
import glob
In [ ]:
#Get all simil Matrix
# Aggregate, over every per-cluster receptor similarity matrix on disk:
#  - AllPair: all pairwise sequence identities (upper triangle, > 0)
#  - AverageSeqIden: per-cluster mean pairwise identity
#  - Fract: per-cluster fraction of pairs above 70% identity
#  - NormCount: histogram over 0.05-wide identity bins, where each cluster
#    contributes a total weight of 1 regardless of its size
AllPair = []
NormCount = dict()
AllK = []
Fract = []
AverageSeqIden = []
# Pre-create the 21 bin keys 0.00, 0.05, ..., 1.00.
for i in range(0,21):
myK = float(i)/float(20)
AllK.append(myK)
NormCount[myK] = 0
# NOTE(review): hardcoded absolute path — breaks on any other machine;
# should be derived from the `path` variable defined at the top.
for f in glob.glob('/media/vince/Postdoc/PixelDB/PixelDB/clusters/*/*_simil.CSV'):
sdf = pd.read_table(f,sep="\s")
# Singleton clusters have no receptor pairs to compare.
if len(sdf) == 1:
continue
# Mark self-similarities with -1 so the `> 0` filter below discards them.
np.fill_diagonal(sdf.values, -1)
# Boolean mask selecting the upper triangle (incl. diagonal) of the matrix.
keep = np.triu(np.ones(sdf.shape)).astype('bool').reshape(sdf.size)
allval = np.array(sdf.stack()[keep])
allval = list(allval[allval > 0])
AllPair += allval
AverageSeqIden.append(np.mean(allval))
# Normalisation weight so this cluster sums to 1 in the histogram.
toadd = 1.0 / float(len(allval))
Fract.append(np.sum(np.array(allval) > 0.7) / float(len(allval)))
#die
for v in allval:
# Snap identity v down to its 0.05-wide bin key.
myK = int(v*20.0)/20.0
#print(int(v*20.0)/20.0,len(allval))
NormCount[myK] += toadd
#if len(sdf) == 5:
# break
In [ ]:
# Distribution of the average pairwise sequence identity within each
# receptor cluster.
# Fix: corrected the user-facing axis-label typo "idendity" -> "identity".
print(np.mean(AverageSeqIden))
plt.hist(np.array(AverageSeqIden)*100)
plt.xlabel("Average sequence identity in receptor cluster (%)", fontsize=18 )
plt.ylabel("Count", fontsize=18)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()
In [ ]:
# Mean / median / min / max of within-cluster average sequence identity.
print(np.mean(AverageSeqIden),np.median(AverageSeqIden),np.min(AverageSeqIden),np.max(AverageSeqIden))
In [ ]:
# Per-cluster fraction of receptor pairs above 70% sequence identity.
# Fix: corrected the user-facing title typo "idendity" -> "identity".
plt.hist(Fract, 20)
plt.title("Distribution of receptor within cluster >70% seq identity")
plt.show()
In [ ]:
# Bar chart of the cluster-normalised identity histogram built above; bins
# are plotted on a 0-100 percent axis.
AllV = [NormCount[myK] for myK in AllK]
plt.bar(np.array(AllK) * 100, AllV)
In [ ]:
In [ ]:
# All pairwise identities pooled over clusters that contain 2+ receptors.
# Fix: corrected the user-facing axis-label typo "identidy" -> "identity".
plt.hist(np.array(AllPair)*100, 30)
plt.xlabel("Pairwise sequence identity in cluster receptor of +2 complexes")
plt.ylabel("Count")
plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()
In [ ]:
# For each multi-binding-mode receptor cluster, split pairwise receptor
# identities into within-binding-mode and between-binding-mode sets and
# record the fraction of each above 70% identity.
AllIn = []
AllBe = []
AllBetweenBindingMode =[]
# NOTE(review): `torun` leaks from the Full/ECR loop near the top of the
# notebook — whichever assignment ran last (PixelDBecr) is what is analysed
# here; confirm this is intended.
for clu in list(np.unique(torun["cluster_number"])):
sdf = torun[torun["cluster_number"] == clu]
# Clusters with a single binding mode have no between-mode pairs.
if (len(sdf["unique_id"].value_counts()) == 1):
continue
#if (len(sdf["unique_id"].value_counts()) != 2):
# continue
for f in glob.glob("/media/vince/Postdoc/PixelDB/PixelDB/clusters/"+str(clu)+"/*_simil.CSV"):
simildf = pd.read_table(f,sep="\s")
#print(f)
allunid = list(np.unique(sdf["unique_id"]))
InBindingMode = []
BetweenMode = []
# Compare every unordered pair of binding modes, including i == j,
# which yields the within-binding-mode identities.
for i in range(0,len(allunid)):
unidd1 = allunid[i]
for j in range(i,len(allunid)):
unidd2 = allunid[j]
df1 = sdf[sdf["unique_id"] == unidd1]
df2 = sdf[sdf["unique_id"] == unidd2]
# Row/column labels of the similarity matrix are "<pdb>_<chain>".
id1 = df1["pdb_id"]+"_"+df1["receptor_chain"]
id2 = df2["pdb_id"]+"_"+df2["receptor_chain"]
allval = []
for i1 in id1:
for i2 in id2:
allval.append(simildf[i1].transpose()[i2])
# NOTE(review): this per-pair fraction is immediately overwritten by the
# aggregated fractions below; kept only for the commented-out debug print.
fract = np.sum(np.array(allval) > 0.7) / float(len(allval))
#print(i,j,unidd1,unidd2,fract)
if i == j:
InBindingMode += allval
else:
BetweenMode += allval
AllBetweenBindingMode += allval
# Fraction of within-binding-mode pairs above 70% identity.
fract = np.sum(np.array(InBindingMode) > 0.7) / float(len(InBindingMode))
#print("In binding mode",fract)
AllIn.append(fract)
# Fraction of between-binding-mode pairs above 70% identity.
fract = np.sum(np.array(BetweenMode) > 0.7) / float(len(BetweenMode))
#print("Between binding mode",fract)
AllBe.append(fract)
#break
In [ ]:
# NOTE(review): debugging leftover — depends on simildf/id1/id2 leaking from
# the last iteration of the previous cell's loops; breaks on a fresh run and
# should probably be deleted.
simildf[id1].transpose()[id2]
In [ ]:
# Histogram of all between-binding-mode pairwise receptor identities.
between_mode_vals = AllBetweenBindingMode
plt.hist(between_mode_vals)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()
In [ ]:
# Count between-binding-mode pairs above each identity threshold 0.80-1.00.
between_arr = np.array(AllBetweenBindingMode)
for i in range(80, 101):
    threshold = float(i) / 100
    print(i, np.sum(between_arr > threshold))
In [ ]:
# Compare the within- vs. between-binding-mode distributions of the
# per-cluster fraction of receptor pairs above 70% identity.
for fractions, plot_title in ((AllIn, "In binding mode"),
                              (AllBe, "Between binding mode")):
    plt.hist(fractions, 20)
    plt.xlim((0, 1))
    plt.title(plot_title)
    plt.show()
print("Average fraction >70% in Binding Mode", np.mean(AllIn))
print("Average fraction >70% Between Binding Mode", np.mean(AllBe))
In [ ]:
# Re-list the PixelDB columns before the core/ECR arrangement analysis.
PixelDB.columns
In [ ]:
# Tally the distinct core/ECR arrangement patterns over all complexes: gaps
# ('-') are removed and each run of identical labels (C/E/e/c) is collapsed
# to a single character, iterating until a fixed point is reached.
# Fix: the report loop used Python 2 `print w, AllArrang[w]` statements,
# which are a SyntaxError under Python 3; converted to print() calls,
# consistent with the rest of the notebook.
AllArrang = dict()
for v in np.array(PixelDB["core_ecr_alignment"]):
    vinit = v
    v = v.replace("-", "")
    # Collapse repeated labels until the string stops changing.
    for i in range(0, len(v)):
        vi = v
        v = re.sub('C+', "C", v)
        v = re.sub('E+', "E", v)
        v = re.sub('e+', "e", v)
        v = re.sub('c+', "c", v)
        if v == vi:
            break
    if v not in AllArrang:
        AllArrang[v] = 0
    AllArrang[v] += 1
    #print(vinit,v)
# Report patterns from most to least frequent.
for w in sorted(AllArrang, key=AllArrang.get, reverse=True):
    print(w, AllArrang[w])
In [ ]:
# Same run-collapsing of core/ECR labels, restricted to the ECR-containing
# subset; prints each gap-stripped string next to its collapsed pattern.
AllArrang = dict()
for v in np.array(PixelDBoecr["core_ecr_alignment"]):
    vinit = v
    v = v.replace("-", "")
    # Iterate the substitutions to a fixed point (len(v) is a safe bound).
    for _ in range(len(v)):
        before = v
        v = re.sub('C+', "C", v)
        v = re.sub('E+', "E", v)
        v = re.sub('e+', "e", v)
        v = re.sub('c+', "c", v)
        if v == before:
            break
    vinit = vinit.replace("-", "")
    print(vinit, v)
    AllArrang[v] = AllArrang.get(v, 0) + 1
In [ ]:
# Count how many collapsed patterns start or end with a core-binding-flanked
# exosite label 'E', while printing every pattern with its frequency.
# Fix: Python 2 `print w, AllArrang[w]` statement (SyntaxError in Python 3)
# converted to a print() call, consistent with the rest of the notebook.
tot = 0
for w in sorted(AllArrang, key=AllArrang.get, reverse=True):
    if (w[0] == "E") or (w[-1] == "E"):
        tot += AllArrang[w]
    #else:
    print(w, AllArrang[w])
In [ ]:
# Terminal-exosite pattern count vs. total ECR-containing complexes.
print(tot,len(PixelDBoecr))
In [ ]:
# Core/ECR arrangement patterns restricted to receptor cluster 1, collapsed
# the same way as the full-database cell above.
# Fix: Python 2 `print w, AllArrang[w]` statements (SyntaxError in Python 3)
# converted to print() calls.
AllArrang = dict()
sdf = PixelDB[PixelDB["cluster_number"] == 1]
for v in np.array(sdf["core_ecr_alignment"]):
    vinit = v
    v = v.replace("-", "")
    for i in range(0, len(v)):
        vi = v
        v = re.sub('C+', "C", v)
        v = re.sub('E+', "E", v)
        v = re.sub('e+', "e", v)
        v = re.sub('c+', "c", v)
        if v == vi:
            break
    if v not in AllArrang:
        AllArrang[v] = 0
    AllArrang[v] += 1
    #print(vinit,v)
for w in sorted(AllArrang, key=AllArrang.get, reverse=True):
    print(w, AllArrang[w])
In [ ]:
In [ ]:
#Amino acid composition
# Merge the receptor binding-site composition and the peptide amino-acid
# composition into one table, renaming the binding-site categories to the
# display names used in the heatmaps below.
AllAAComb = dict()
display_name = {"Core": "Core-binding", "ECR": "Exosite",
                "Inte": "Interior", "Surf": "NISR"}
sub = AllMeanData["Binding Site"]
for aa in sub:
    if aa not in AllAAComb:
        AllAAComb[aa] = dict()
    for t in sub[aa]:
        ti = display_name.get(t, t)
        AllAAComb[aa][ti] = sub[aa][t][0]
# NOTE: "compostion" is the key actually present in AllMeanData; do not fix.
sub = AllMeanData["Amino Acid compostion"]
for aa in sub:
    if aa not in AllAAComb:
        AllAAComb[aa] = dict()
    for t in sub[aa]:
        AllAAComb[aa][t] = sub[aa][t][0]
In [ ]:
# Amino acids as rows (canonical order), composition categories as columns.
df = pd.DataFrame(AllAAComb)[myAmino].transpose()[["Core","ECR","Core-binding","Exosite","NISR","Interior"]]
In [ ]:
# Display the combined composition table.
df
In [ ]:
# Fixed column display order for the composition heatmaps below.
LabelOr = ["Core","ECR","Core-binding","Exosite","NISR","Interior"]
In [ ]:
# Composition heatmap with fixed row/column order (clustering disabled on
# both axes), rendered with a sequential navy palette.
colcol = ["red","blue"]+["gray"]*4
LabelOr = ["Core","ECR","Core-binding","Exosite","NISR","Interior"]
sns.set(font_scale=1.4)
#sns.clustermap(df,row_cluster=False,row_colors = Color,col_colors=colcol)
pal = sns.light_palette("navy", as_cmap=True)
sns.clustermap(df[LabelOr], row_cluster=False, col_cluster=False,
               cmap=pal, figsize=(12, 12), linewidths=.5, annot=True)
plt.show()
In [ ]:
# Composition heatmap with columns clustered (rows keep their fixed order).
# NOTE(review): metric="Correlation" is capitalised — scipy lower-cases
# metric names so this appears to work, but the conventional spelling is
# "correlation"; confirm before changing.
colcol = ["red","blue"]+["gray"]*4
sns.set(font_scale=1.9)
#sns.clustermap(df,row_cluster=False,row_colors = Color,col_colors=colcol)
pal = sns.light_palette("navy", as_cmap=True)
sns.clustermap(df, row_cluster=False, cmap=pal, figsize=(12, 12),
               linewidths=.5, annot=True, metric="Correlation", fmt=".1f")
plt.show()
In [ ]:
In [ ]:
# Grouped bar panels: for each of three measurements in AllMeanData, draw
# one bar per (category, location) pair. Each sub[t][o] looks like a
# [value, delta] pair — the bar height is the value and a short horizontal
# tick is drawn at value + delta (presumably an error bar; confirm).
figsize(20,8)
Test = ["Binding Site","Amino Acid compostion","Solvent Exposure [0-9]"]
Label = ["Receptor Amino Acid Composition (%)","Peptide Amino Acid Composition (%)","Solvent Exposure (%)"]
Order = ["Core","ECR","Surf","Inte"]
Col = ["darkred","darkblue","gray","black"]
# NOTE(review): Color is assigned but never used in this cell (leftover from
# the commented-out clustermap calls above).
Color = ["red"]*3+["blue"]*2+["purple"]*4+["black"]*3+["green"]*5+["yellow"]*3
pos = 0.0
for (test,lab) in zip(Test,Label):
# Reset the running x-position for each panel.
pos = 0.0
pos += 1.0
sub = AllMeanData[test]
#print(len(sub))
Tlab = sorted(sub)
# 20 categories means the keys are amino acids; use the canonical order.
if len(Tlab) == 20:
Tlab = myAmino
Xlabpos = []
for t in Tlab:
Tpos = []
for o in Order:
if o not in sub[t]:
continue
# Bar for the mean value, coloured by location; horizontal tick above it
# marks value + delta.
plt.bar([pos],sub[t][o][0],color=Col[Order.index(o)])
#plt.plot([pos+0.4,pos+0.4],[sub[t][o][0],sub[t][o][0]+sub[t][o][1]],color="black")
plt.plot([pos+0.1,pos+0.7],[sub[t][o][0]+sub[t][o][1]]*2,color="black")
Tpos.append(pos)
pos += 1.0
# Centre the group label under its bars.
Xlabpos.append(np.mean(Tpos)+0.5)
pos += 1
#print(t)
#break
# The exposure panel gets explicit bin-range labels instead of digit keys.
if test == "Solvent Exposure [0-9]":
Tlab = ["[0-10[","[10-20[","[20-30[","[30-40[","[40-50[","[50-60[","[60-70["]
plt.xticks(Xlabpos, Tlab)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.xlabel(lab,size=18)
plt.show()
#break
In [ ]: