In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import os
from small_script.myFunctions import *
%matplotlib inline
%load_ext autoreload
%autoreload 2
In [24]:
In [3]:
data = pd.read_csv("/Users/weilu/Research/server/april_2019/database/Q_iter3_with_rg_1ccr", index_col=0)
a = data.groupby("Run").apply(lambda grp: grp.nlargest(1, "index")).reset_index(drop=True)
In [20]:
In [79]:
pre = "/Users/weilu/Research/server/april_2019/database/"
dataset = {"old":"1R69, 1UTG, 3ICB, 256BA, 4CPV, 1CCR, 2MHR, 1MBA, 2FHA".split(", "),
"new":"1FC2C, 1ENH, 2GB1, 2CRO, 1CTF, 4ICB".split(", "),
"test":["t089", "t120", "t251", "top7", "1ubq", "t0766", "t0778", "t0782", "t0792", "t0803", "t0815", "t0833", "t0842", "t0844"]}
dataset["combined"] = dataset["old"] + dataset["new"]
simulation_location_list = ["iter3_with_rg", "iter2_with_rg", "iter1_with_rg"]
pdb_list = dataset["combined"]
cluster_folder = "/Users/weilu/Research/server/april_2019/cluster_results/"
for simulation_location in simulation_location_list:
for p in pdb_list:
name = p.lower()[:4]
targetFolder = f"{cluster_folder}{simulation_location}_{name}"
os.system(f"mkdir -p {cluster_folder}{simulation_location}_{name}")
data = pd.read_csv(f"{pre}/Q_{simulation_location}_{name}", index_col=0)
a = data.groupby("Run").apply(lambda grp: grp.nlargest(1, "index")).reset_index(drop=True)
for i, item in a.iterrows():
# print(i)
fileLocation = f"{pre}/{simulation_location}_{name}_{int(item['Run'])}/frame{int(item['index'])}.pdb"
os.system(f"cp {fileLocation} {targetFolder}/run{i}.pdb")
In [90]:
%%time
all_data = []
for p in pdb_list:
name = p.lower()[:4]
for simulation_location in simulation_location_list:
t1 = f"{cluster_folder}{simulation_location}_{name}"
data = pd.read_csv(f"{pre}/Q_{simulation_location}_{name}", index_col=0)
a = data.groupby("Run").apply(lambda grp: grp.nlargest(1, "index")).reset_index(drop=True)
for i, item in a.iterrows():
for simulation_location_2 in simulation_location_list:
t2 = f"{cluster_folder}{simulation_location_2}_{name}"
data2 = pd.read_csv(f"{pre}/Q_{simulation_location_2}_{name}", index_col=0)
b = data2.groupby("Run").apply(lambda grp: grp.nlargest(1, "index")).reset_index(drop=True)
for j, item2 in b.iterrows():
p1 = f"{t1}/run{i}.pdb"
p2 = f"{t2}/run{j}.pdb"
get_aligned_info(p1, p2)
all_data.append([name, simulation_location, i, simulation_location_2, j, aligned_length, rmsd, tmscore, seqid])
# break
break
break
In [147]:
plt.rcParams['figure.figsize'] = [16.18033, 10]
In [91]:
data = pd.DataFrame(all_data, columns=["protein", "sim1", "i", "sim2", "j", "aligned_length", "rmsd", "tmscore", "seqid"])
data.to_csv(f"{simulation_location}_{name}.csv")
In [ ]:
for simulation_location in simulation_location_list:
for p in pdb_list:
name = p.lower()[:4]
targetFolder = f"{cluster_folder}{simulation_location}_{name}"
os.system(f"mkdir -p {cluster_folder}{simulation_location}_{name}")
for i, item in a.iterrows():
# print(i)
get_aligned_info(p1, p2)
In [35]:
simulation_location = "iter3_with_rg"
name = "1ccr"
cluster_folder = "/Users/weilu/Research/server/april_2019/cluster_results/"
targetFolder = f"{cluster_folder}{simulation_location}_{name}"
os.system(f"mkdir -p {cluster_folder}{simulation_location}_{name}")
for i, item in a.iterrows():
# print(i)
fileLocation = f"{pre}/{simulation_location}_{name}_{int(item['Run'])}/frame{int(item['index'])}.pdb"
os.system(f"cp {fileLocation} {targetFolder}/run{i}.pdb")
# print(f"cp {fileLocation} {cluster_folder}{simulation_location}_{name}/run{i}.pdb")
In [48]:
cmd = f"~/opt/TMalign/TMalign {targetFolder}/run0.pdb {targetFolder}/run1.pdb | grep -A 1 'Aligned'"
output = getFromTerminal(cmd)
print(output)
In [81]:
def get_aligned_info(p1, p2):
cmd = f"~/opt/TMalign/TMalign {p1} {p2} | grep -A 1 'Aligned'"
output = getFromTerminal(cmd)
line = output.split("\n")[0]
line2 = output.split("\n")[1]
aligned_length,rmsd,seqid = line.split(",")
aligned_length = int(aligned_length.split("=")[1])
rmsd = float(rmsd.split("=")[1])
tmscore = float(line2.split(",")[0].split("=")[-1].strip().split(" ")[0])
seqid = float(seqid.split("=")[-1])
# print("aligned_length, rmsd, tmscore, seqid")
# print(aligned_length, rmsd, tmscore, seqid)
In [76]:
p1 = f"{targetFolder}/run0.pdb"
p2 = f"{targetFolder}/run1.pdb"
get_aligned_info(p1, p2)
In [134]:
complete = pd.read_csv("/Users/weilu/Research/server/april_2019/cluster_apr12.csv", index_col=0)
complete["sim1_i"] = complete["sim1"] + "_" + complete["i"].astype(str)
complete["sim2_j"] = complete["sim2"] + "_" + complete["j"].astype(str)
In [156]:
a = complete.query("protein == '1fc2'")
_table = a.pivot(index="sim1_i", columns="sim2_j", values="tmscore")
ax = sns.heatmap(_table)
In [153]:
a = complete.query("protein == '1fc2'")
_table = a.pivot(index="sim1_i", columns="sim2_j", values="rmsd")
ax = sns.heatmap(_table, cmap="seismic")
In [165]:
info
Out[165]:
In [159]:
for p in pdb_list:
name = p.lower()[:4]
a = complete.query(f"protein == '{name}'")
_table = a.pivot(index="sim1_i", columns="sim2_j", values="tmscore")
print(name)
ax = sns.heatmap(_table)
ax.set_title(name+)
plt.figure()
In [150]:
for p in pdb_list:
name = p.lower()[:4]
a = complete.query(f"protein == '{name}'")
_table = a.pivot(index="sim1_i", columns="sim2_j", values="rmsd")
ax = sns.heatmap(_table)
plt.figure()
In [ ]:
In [ ]:
In [ ]:
In [106]:
info = pd.read_csv("/Users/weilu/Research/library/training_set_length_info.csv", index_col=0)
In [95]:
import glob
a = glob.glob("/Users/weilu/Research/server/april_2019/cluster_results/*.csv")
dd = []
for f in a:
tmp = pd.read_csv(f, index_col=0)
dd.append(tmp)
# data.join(info.set_index("protein"), on="protein")
In [97]:
In [103]:
In [117]:
complete.to_csv("/Users/weilu/Research/server/april_2019/cluster_apr12.csv")
In [119]:
In [ ]: