In [1]:
# written in python3
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
from datetime import datetime
import seaborn as sns
%matplotlib inline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that pulls named columns out of a DataFrame as an ndarray."""

    def __init__(self, attribute_names):
        # Column labels to extract in transform().
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self

    def transform(self, X):
        # Select the configured columns and hand back the raw numpy values.
        return X.loc[:, self.attribute_names].values
plt.rcParams['figure.figsize'] = (10,6.180) #golden ratio
def my_transform(data, label, degree, FEATURES):
    """Build a training matrix from `data`.

    The returned ndarray has the standardized, degree-`degree` polynomial
    expansion of `FEATURES` in the leading columns, with the raw `label`
    column appended last (so callers split it off with [:, -1]).
    """
    numeric_pipeline = Pipeline([
        ("selector", DataFrameSelector(FEATURES)),
        ("std_scaler", StandardScaler()),
        ("poly", PolynomialFeatures(degree=degree, include_bias=False)),
    ])
    # The label goes through untouched; it is only selected, never scaled.
    label_pipeline = Pipeline([
        ("selector", DataFrameSelector([label])),
    ])
    combined = FeatureUnion(transformer_list=[
        ("num_pipeline", numeric_pipeline),
        ("cat_pipeline", label_pipeline),
    ])
    return combined.fit_transform(data)
In [70]:
import glob
a = glob.glob("/Users/weilu/Research/frustration_selection/tr*")
print(len(a))
pdb_list = [i.split("/")[-1] for i in a]
In [101]:
all_ = []
for pdb in pdb_list:
    # Each protein folder holds per-frame quality metrics (Q, RMSD, GDT)
    # plus AWSEM and Rosetta energy tables; rows are aligned frame-by-frame.
    base = f"/Users/weilu/Research/frustration_selection/{pdb}"
    q = np.loadtxt(f"{base}/q.txt")
    rmsd = np.loadtxt(f"{base}/rmsd.txt")
    gdt = np.loadtxt(f"{base}/gdt.txt")
    awsem_column = ['Step', 'Chain', 'Shake', 'Chi', 'Rama', 'Excluded', 'DSSP',
                    'P_AP', 'Water', 'Burial', 'Helix', 'AMH-Go', 'Frag_Mem',
                    'Vec_FM', 'Membrane', 'SSB', 'Electro', 'QGO', 'VTotal']
    # raw strings for the whitespace regex avoid the invalid-escape warning
    awsem = pd.read_csv(f"{base}/awsem_energy.txt", sep=r"\s+", names=awsem_column)
    rosetta = pd.read_csv(f"{base}/rosetta_energy.txt", sep=r"\s+")
    data = pd.concat([awsem, rosetta], axis=1)
    # every source must describe the same set of frames
    assert len(awsem) == len(rosetta) == len(q) == len(rmsd) == len(gdt)
    data["Q"] = q
    data["RMSD"] = rmsd
    data["GDT"] = gdt
    data["Protein"] = pdb
    all_.append(data)
data = pd.concat(all_)
# Drop constant columns: they carry no information for the classifier.
drop_col = [col for col in data.columns if len(data[col].unique()) == 1]
data = data.drop(drop_col, axis=1)
def extract_frame(description):
    """Parse the leading integer frame index out of a Rosetta description string."""
    return int(description.split("_")[0])
data["description"] = data["description"].apply(extract_frame)
In [135]:
def choose_top(data, col="RMSD", n=5, ascending=True):
    """Mark the top-`n` rows of `data` by dense rank on `col`.

    Adds a boolean `chosen` column. With method='dense', tied values share
    a rank, so more than `n` rows may be marked when values repeat.
    """
    # Call .rank() on the Series directly; the original invoked the unbound
    # pd.DataFrame.rank with a Series as `self`, which works only by accident.
    return data.assign(chosen=data[col].rank(ascending=ascending, method="dense") <= n)
In [154]:
folder_list = ["tr894", "tr884", "tr922", "tr882", "tr896", "tr872", "tr594", "tr862", "tr869", "tr898", "tr885", "tr866", "tr868", "tr891", "tr895", "tr870", "tr921", "tr877", "tr948", "tr947"]
In [138]:
raw_data_all = data.groupby("Protein").apply(choose_top, n=100, col="RMSD").reset_index(drop=True)
In [193]:
# train_name_list = ["tr872", "tr885", "tr948"]
# train_name_list = ["tr862", "tr872", "tr885", "tr866", "tr868" , "tr895", "tr896", "tr870", "tr921", "tr891", "tr948"]
# train_name_list = ["tr870"]
# train_name_list = ["tr891"]
# train_name_list = ["tr882"]
# train_name_list = ["tr894"]
# train_name_list = ["tr872"]
# train_name_list = ["tr869"]
# train_name_list = ["tr884"]
# train_name_list = ["tr866", "tr884"]
# train_name_list = ["tr870", "tr872"]
# train_name_list = ["tr866", "tr947"]
# train_name_list = ["tr872"]
# train_name_list = ["tr884", "tr872"]
# train_name_list = ["tr866"]
# train_name_list = ["tr947"]
# train_name_list = ["tr894"]
# train_name_list = ["tr885"]
train_name_list = ["tr884"]
# select for training.
raw_data = raw_data_all.reset_index(drop=True).query(f'Protein in {train_name_list}')
In [194]:
raw_data_all.columns
Out[194]:
In [195]:
# FEATURES = ["eigenvalues", "entropy", "pca"]
# FEATURES = ["eigenvalues", "entropy", "diffRMSD"]
# FEATURES = ["eigenvalues", "entropy"]
FEATURES = [
# "biasQ",
'score',
'VTotal',
'fa_atr', 'fa_rep', 'fa_sol',
'fa_intra_rep', 'fa_intra_sol_xover4', 'lk_ball_wtd', 'fa_elec',
'pro_close', 'hbond_sr_bb', 'hbond_lr_bb', 'hbond_bb_sc', 'hbond_sc',
'omega', 'fa_dun', 'p_aa_pp', 'ref', 'rama_prepro', 'allatom_rms', 'maxsub', 'maxsub2.0'
# 'RMSD', # test
# 'Qw',
# 'Burial',
# 'Water',
# 'Rama',
# 'DSSP',
# 'P_AP',
# 'Helix',
# 'Frag_Mem'
]
# FEATURES = ["eigenvalues"]
# LABEL = "diffRMSD"
# LABEL = "RMSD"
LABEL = "chosen"
DEGREE = 1
def pred_from_raw(a):
    """Append the classifier's positive-class probability to each row of `a`.

    Relies on the module-level `clf`, LABEL, DEGREE and FEATURES.
    """
    transformed = my_transform(a, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
    # Last column is the label; everything before it is the feature matrix.
    features = transformed[:, :-1]
    positive_prob = clf.predict_proba(features)[:, 1]
    return a.assign(prob=positive_prob)
# data = my_transform(raw_data, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
# data = raw_data.groupby('name').apply(my_transform, label=LABEL, degree=DEGREE, FEATURES=FEATURES)[0]
data = np.concatenate(raw_data.groupby('Protein').apply(my_transform,
label=LABEL, degree=DEGREE, FEATURES=FEATURES).values)
train_y = data[:,-1]
train_set = data[:,:-1]
# clf = svm.SVC(probability=True)
# p = 0.01
# clf = LogisticRegression(random_state=27, class_weight={0:p, 1:(1-p)})
clf = LogisticRegression(random_state=27, solver="liblinear")
clf.fit(train_set, train_y)
filtered = raw_data_all.groupby("Protein").apply(pred_from_raw).reset_index(drop=True)
picked_n = 1
best = raw_data_all.groupby("Protein").apply(choose_top, col="RMSD"
, n=1, ascending=True).reset_index(drop=True).query("chosen==True")
# if True:
picked_1 = filtered.groupby("Protein").apply(choose_top, col="prob"
, n=1, ascending=False).reset_index(drop=True).query("chosen==True")
# if False:
picked_5 = filtered.groupby("Protein").apply(choose_top, col="prob"
, n=5, ascending=False).reset_index(drop=True).query("chosen==True")
picked = picked_5.groupby("Protein").apply(choose_top, col="RMSD"
, n=1, ascending=True).reset_index(drop=True).query("chosen==True")
worst = filtered.groupby("Protein").apply(choose_top, col="RMSD"
, n=1, ascending=False).reset_index(drop=True).query("chosen==True")
# init = raw_data_all.groupby("Protein").apply(choose_top, col="i"
# , n=1, ascending=True).reset_index(drop=True).query("chosen==True")
all_results = pd.concat([best.assign(result='best'),
picked_1.assign(result='picked'),
# picked.assign(result='picked_5'),
# init.assign(result='init'),
worst.assign(result='worst')
], sort=False)
# all_results = pd.concat([best.assign(result='best'),
# picked.assign(result='picked')])
# picked.to_csv("/Users/weilu/Desktop/picked.csv
# sns.set(rc={'figure.figsize':(20,30)})
# plt.figure(figsize=(15,8))
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'Protein', 'RMSD').add_legend(fontsize=20)
# fg.set(ylim=(0, 10))
Out[195]:
In [198]:
for pdb in folder_list:
print(pdb, round(10*picked_1.query(f"Protein=='{pdb}'")["RMSD"].values[0], 3))
In [197]:
clf.coef_
Out[197]:
In [ ]:
In [ ]:
# train_name_list = ["tr872", "tr885", "tr948"]
# train_name_list = ["tr862", "tr872", "tr885", "tr866", "tr868" , "tr895", "tr896", "tr870", "tr921", "tr891", "tr948"]
# train_name_list = ["tr870"]
# train_name_list = ["tr891"]
# train_name_list = ["tr882"]
# train_name_list = ["tr894"]
# train_name_list = ["tr872"]
# train_name_list = ["tr869"]
# train_name_list = ["tr884"]
# train_name_list = ["tr866", "tr884"]
# train_name_list = ["tr870", "tr872"]
# train_name_list = ["tr866", "tr947"]
# train_name_list = ["tr872"]
# train_name_list = ["tr884", "tr872"]
train_name_list = ["tr866"]
# train_name_list = ["tr947"]
# select for training.
raw_data = raw_data_all.reset_index(drop=True).query(f'Protein in {train_name_list}')
In [152]:
# FEATURES = ["eigenvalues", "entropy", "pca"]
# FEATURES = ["eigenvalues", "entropy", "diffRMSD"]
# FEATURES = ["eigenvalues", "entropy"]
FEATURES = [
# "biasQ",
'score',
'VTotal',
# 'RMSD', # test
# 'Qw',
# 'Burial',
# 'Water',
# 'Rama',
# 'DSSP',
# 'P_AP',
# 'Helix',
# 'Frag_Mem'
]
# FEATURES = ["eigenvalues"]
# LABEL = "diffRMSD"
# LABEL = "RMSD"
LABEL = "chosen"
DEGREE = 1
def pred_from_raw(a):
    # NOTE(review): byte-identical redefinition of pred_from_raw from the
    # earlier cell; the later definition silently shadows the earlier one.
    # Uses the module-level LABEL, DEGREE, FEATURES and the fitted `clf`.
    data = my_transform(a, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
    test_y = data[:,-1]  # label column; computed but unused here
    test_set = data[:,:-1]
    prob= clf.predict_proba(test_set)[:,1]  # positive-class probability
    return a.assign(prob=prob)
# data = my_transform(raw_data, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
# data = raw_data.groupby('name').apply(my_transform, label=LABEL, degree=DEGREE, FEATURES=FEATURES)[0]
data = np.concatenate(raw_data.groupby('Protein').apply(my_transform,
label=LABEL, degree=DEGREE, FEATURES=FEATURES).values)
train_y = data[:,-1]
train_set = data[:,:-1]
# clf = svm.SVC(probability=True)
# p = 0.01
# clf = LogisticRegression(random_state=27, class_weight={0:p, 1:(1-p)})
clf = LogisticRegression(random_state=27)
clf.fit(train_set, train_y)
filtered = raw_data_all.groupby("Protein").apply(pred_from_raw).reset_index(drop=True)
picked_n = 1
best = raw_data_all.groupby("Protein").apply(choose_top, col="RMSD"
, n=1, ascending=True).reset_index(drop=True).query("chosen==True")
# if True:
picked_1 = filtered.groupby("Protein").apply(choose_top, col="prob"
, n=1, ascending=False).reset_index(drop=True).query("chosen==True")
# if False:
picked_5 = filtered.groupby("Protein").apply(choose_top, col="prob"
, n=5, ascending=False).reset_index(drop=True).query("chosen==True")
picked = picked_5.groupby("Protein").apply(choose_top, col="RMSD"
, n=1, ascending=True).reset_index(drop=True).query("chosen==True")
worst = filtered.groupby("Protein").apply(choose_top, col="RMSD"
, n=1, ascending=False).reset_index(drop=True).query("chosen==True")
# init = raw_data_all.groupby("Protein").apply(choose_top, col="i"
# , n=1, ascending=True).reset_index(drop=True).query("chosen==True")
all_results = pd.concat([best.assign(result='best'),
picked_1.assign(result='picked'),
# picked.assign(result='picked_5'),
# init.assign(result='init'),
worst.assign(result='worst')
], sort=False)
# all_results = pd.concat([best.assign(result='best'),
# picked.assign(result='picked')])
# picked.to_csv("/Users/weilu/Desktop/picked.csv
# sns.set(rc={'figure.figsize':(20,30)})
# plt.figure(figsize=(15,8))
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'Protein', 'RMSD').add_legend(fontsize=20)
# fg.set(ylim=(0, 10))
Out[152]:
In [156]:
picked_1
Out[156]:
In [160]:
picked_1["Protein"].unique()
Out[160]:
In [161]:
folder_list
Out[161]:
In [166]:
for pdb in folder_list:
print(pdb, 10*picked_1.query(f"Protein=='{pdb}'")["RMSD"].values[0])
In [153]:
clf.coef_
Out[153]:
In [149]:
clf.coef_
Out[149]:
In [ ]:
In [132]:
g = sns.FacetGrid(data, col="Protein", col_wrap=4)
g = g.map(plt.hist, "RMSD")
In [128]:
In [129]:
data
Out[129]:
In [121]:
data.groupby("Protein").head(1)
Out[121]:
In [120]:
data.groupby("Protein").apply(choose_top, n=1).query("chosen == True")
Out[120]:
In [111]:
data.shape
Out[111]:
In [112]:
data.columns
Out[112]:
In [113]:
drop_col
Out[113]:
In [116]:
data["description"].unique()
Out[116]:
In [82]:
rosetta.columns
Out[82]:
In [83]:
rosetta.drop("maxsub", axis=1)
Out[83]:
In [78]:
Out[78]:
In [73]:
rosetta
Out[73]:
In [72]:
awsem
Out[72]:
In [4]:
import glob
a = glob.glob("/Users/weilu/Research/frustration_selection/q_and_rmsd/tr*")
In [7]:
In [29]:
pdb = pdb_list[0]
fileLocation = f"/Users/weilu/Research/frustration_selection/{pdb}/q.txt"
q = np.loadtxt(fileLocation)
In [30]:
q
Out[30]:
In [31]:
len(q)
Out[31]:
In [ ]:
In [32]:
fileLocation = f"/Users/weilu/Research/frustration_selection/{pdb}/rmsd.txt"
rmsd = np.loadtxt(fileLocation)
In [34]:
len(rmsd)
Out[34]:
In [35]:
fileLocation = f"/Users/weilu/Research/frustration_selection/{pdb}/gdt.txt"
gdt = np.loadtxt(fileLocation)
In [36]:
len(gdt)
Out[36]:
In [45]:
gdt
Out[45]:
In [59]:
fileLocation = f"/Users/weilu/Research/frustration_selection/{pdb}/awsem_energy.txt"
awsem_column = ['Step', 'Chain', 'Shake', 'Chi', 'Rama', 'Excluded', 'DSSP', 'P_AP', 'Water', 'Burial', 'Helix', 'AMH-Go', 'Frag_Mem', 'Vec_FM', 'Membrane', 'SSB', 'Electro', 'QGO', 'VTotal']
awsem = pd.read_csv(fileLocation, sep="\s+", names=awsem_column)
In [60]:
len(awsem)
Out[60]:
In [39]:
fileLocation = f"/Users/weilu/Research/frustration_selection/{pdb}/rosetta_energy.txt"
rosetta = pd.read_csv(fileLocation, sep="\s+")
In [62]:
len(rosetta)
Out[62]:
In [63]:
awsem.columns
Out[63]:
In [64]:
awsem.dtypes
Out[64]:
In [65]:
rosetta.columns
Out[65]:
In [67]:
rosetta.dtypes
Out[67]:
In [18]:
len(gdt)
Out[18]:
In [15]:
len(rmsd)
Out[15]:
In [12]:
len(q)
Out[12]:
In [2]:
# read energy, rw, bias, rmsd data from location
def read_data(name):
    """Load energy, bias, Rw, Qw, RMSD and PCA traces for one protein run.

    Returns one row per frame; `folder` holds the original frame index and
    `name` identifies the protein.
    """
    name_list = ["Step", "Chain", "Shake", "Chi", "Rama", "Excluded", "DSSP",
                 "P_AP", "Water", "Burial", "Helix", "AMH_Go", "Frag_Mem",
                 "Vec_FM", "Membrane", "SSB", "VTotal"]
    # you probably want to change the location below
    location = f"/Users/weilu/Research/server/dec_2018/structure_selection_2/{name}/"
    # raw strings for the regex separators avoid invalid-escape warnings
    RMSD = pd.read_table(location + "rmsd.xvg", names=["i", "RMSD"], sep=r"\s+")
    bias = pd.read_table(location + "bias.log", names=["i", "biasQ", "bias"], sep=r"\s+").drop("i", axis=1)
    awsem = pd.read_table(location + "awsem.log", names=name_list)
    rw = pd.read_table(location + "rwplusScore.txt", names=["i", "Rw"], sep=r"\s+").drop("i", axis=1)
    qw = pd.read_table(location + "Qw.out", names=["i", "Qw"], sep=r"\s+", comment="#").drop("i", axis=1)
    pc = pd.read_table(location + "pcarmsd_scaled.txt", names=["i", "pc", "pc2"], sep=r"\s+", comment="#").drop("i", axis=1)
    # All tables are row-aligned by frame, so a positional concat is safe here.
    raw_data = pd.concat([RMSD, rw, bias, qw, awsem, pc], axis=1)
    return raw_data.assign(name=name).reset_index().rename(columns={"index": "folder"})
def choose_top(data, col="RMSD", n=5, ascending=True):
    """Mark the top-`n` rows of `data` by dense rank on `col` (boolean `chosen`).

    NOTE(review): redefines the earlier choose_top with identical behavior;
    ties share a rank (method='dense'), so >n rows may be marked on ties.
    """
    # Use Series.rank directly instead of the unbound pd.DataFrame.rank.
    return data.assign(chosen=data[col].rank(ascending=ascending, method="dense") <= n)
# read the pmf, rc.
# def read_data_2(name):
# # name = "tr894"
# # location = f"/Users/weilu/Research/server/sep_2018/03_week/{name}/"
# # location = f"/Users/weilu/Research/server/oct_2018/01_week/{name}/"
# location = f"/Users/weilu/Research/server/dec_2018/structure_selection/{name}/"
# rw = pd.read_table(location+"rc_rwplus", names=["pc","rw"], sep="\s+")
# rmsd = pd.read_table(location+"rc_rmsdlowerBound", names=["pc", "rmsd"], sep="\s+")
# awsem = pd.read_table(location+"rc_awsemEne", names=["pc", "awsem"], sep="\s+")
# qw = pd.read_table(location+"rc_QwhigherBound", names=["pc", "qw"], sep="\s+")
# freeE = pd.read_table(location+"pmf3000"
# , names=["pc", "f", "remove1", "remove2"], sep="\s+").drop(["remove1", "remove2"], axis=1)
# raw_data = freeE.merge(rw, on="pc").merge(awsem, on="pc").merge(qw, on="pc").merge(rmsd, on="pc").assign(name=name)
# return raw_data
def read_data_2(name):
    """Load the free-energy profile (pmf) plus Rw and RMSD reaction-coordinate
    tables for one protein, merged on the shared coordinate `pc`."""
    location = f"/Users/weilu/Research/server/dec_2018/structure_selection_2/{name}/"
    # raw strings for the regex separators avoid invalid-escape warnings
    rw = pd.read_table(location + "rc_rwplus", names=["pc", "rw"], sep=r"\s+")
    rmsd = pd.read_table(location + "rc_rmsdlowerBound", names=["pc", "rmsd"], sep=r"\s+")
    freeE = pd.read_table(location + "pmf3000",
                          names=["pc", "f", "remove1", "remove2"],
                          sep=r"\s+").drop(["remove1", "remove2"], axis=1)
    # Inner-merge everything on `pc` and tag rows with the protein name.
    return freeE.merge(rw, on="pc").merge(rmsd, on="pc").assign(name=name)
In [3]:
# folder_list = ["tr894", "tr882", "tr594", "tr898", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
folder_list = ["tr884-halfDIHE", "tr872-halfDIHE", "tr948-halfDIHE", "tr898", "tr947", "tr894", "tr882", "tr594", "tr869", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = ["tr884-halfDIHE", "tr872-halfDIHE", "tr898", "tr947", "tr894", "tr882", "tr594", "tr869", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = [ "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = ["tr862", "tr872", "tr885", "tr866", "tr868" , "tr895", "tr896", "tr870", "tr921", "tr891", "tr948"]
# "tr877","tr884", "tr922"
# "tr869"
# folder_list = ["tr894"]
# read all data
# tr884-halfDIHE
# tr872-halfDIHE
# tr948-halfDIHE
data_list = []
for name in folder_list:
tmp = read_data_2(name)
data_list.append(tmp)
raw_data_all = pd.concat(data_list)
n = 1
raw_data_all = raw_data_all.reset_index(drop=True).groupby("name").apply(choose_top, n=n, col="rmsd").reset_index(drop=True)
# train_name_list = ["tr872", "tr885", "tr948"]
# train_name_list = ["tr862", "tr872", "tr885", "tr866", "tr868" , "tr895", "tr896", "tr870", "tr921", "tr891", "tr948"]
# train_name_list = ["tr870"]
# train_name_list = ["tr891"]
# train_name_list = ["tr882"]
# train_name_list = ["tr894"]
# train_name_list = ["tr872"]
# train_name_list = ["tr869"]
# train_name_list = ["tr884"]
# train_name_list = ["tr866", "tr884"]
# train_name_list = ["tr870", "tr872"]
# train_name_list = ["tr866", "tr947"]
# train_name_list = ["tr872"]
# train_name_list = ["tr884", "tr872"]
train_name_list = ["tr866"]
# train_name_list = ["tr947"]
# select for training.
raw_data = raw_data_all.reset_index(drop=True).query(f'name in {train_name_list}')
In [4]:
# FEATURES = ["eigenvalues", "entropy", "pca"]
# FEATURES = ["eigenvalues", "entropy", "diffRMSD"]
# FEATURES = ["eigenvalues", "entropy"]
FEATURES = ["f",
'rw',
'awsem',
# 'RMSD', # test
# 'Burial',
# 'Water',
# 'Rama',
# 'DSSP',
# 'P_AP',
# 'Helix',
# 'Frag_Mem'
]
# FEATURES = ["eigenvalues"]
# LABEL = "diffRMSD"
# LABEL = "RMSD"
LABEL = "rmsd"
# LABEL = "qw"
DEGREE = 1
def pred_from_raw(a, clf):
    """Append the regressor `clf`'s predicted RMSD to every row of `a`.

    The column keeps the historical misspelling `prediceted_rmsd` because
    downstream cells select it by that exact name.
    """
    transformed = my_transform(a, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
    # Drop the trailing label column before predicting.
    features = transformed[:, :-1]
    predicted = clf.predict(features)
    return a.assign(prediceted_rmsd=predicted)
def assign_lowest_f(a):
    """Broadcast the group's minimum free energy onto every row as `lowest_f`.

    Series.min() skips NaN, matching the original sort_values().iloc[0]
    (NaNs sort last by default) without the O(n log n) sort.
    """
    return a.assign(lowest_f=a["f"].min())
In [5]:
raw_data_all = raw_data_all.reset_index(drop=True).groupby("name").apply(assign_lowest_f).reset_index(drop=True)
In [6]:
# # data = my_transform(raw_data, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
# # data = raw_data.groupby('name').apply(my_transform, label=LABEL, degree=DEGREE, FEATURES=FEATURES)[0]
# data = np.concatenate(raw_data.groupby('name').apply(my_transform,
# label=LABEL, degree=DEGREE, FEATURES=FEATURES).values)
# train_y = data[:,-1]
# train_set = data[:,:-1]
# from sklearn import svm
# # clf = svm.SVC(probability=True)
# clf = LinearRegression()
# clf.fit(train_set, train_y)
# y_pred_svm = clf.predict(train_set)
# raw_data_all = raw_data_all.reset_index(drop=True).groupby("name").apply(pred_from_raw, clf).reset_index(drop=True)
# # raw_data_all = raw_data_all.reset_index(drop=True).groupby("name").apply(assign_lowest_f).reset_index(drop=True)
# picked_n = 1
# best = raw_data_all.groupby("name").apply(choose_top, col="rmsd"
# , n=picked_n, ascending=True).reset_index(drop=True).query("chosen==True")
# picked = raw_data_all.groupby("name").apply(choose_top, col="prediceted_rmsd"
# , n=picked_n, ascending=True).reset_index(drop=True).query("chosen==True")
# # init = raw_data_all.query("i == 0.0")
# all_results = pd.concat([best.assign(result='best'),
# picked.assign(result='picked')])
# picked_keep = picked.copy()
In [7]:
from scipy.interpolate import interp1d
f_dic = {}
for name in folder_list:
a = raw_data_all.query(f"name == '{name}'")
# print(name ,a.shape)
x = a["pc"].values
y = a["f"].values
f_dic[name] = interp1d(x, y, fill_value="extrapolate")
In [8]:
# g = sns.FacetGrid(raw_data_all, col="name", col_wrap=4)
# g = g.map(plt.plot, "pc", "f")
# plt.ylim([0,1])
In [9]:
# g = sns.FacetGrid(raw_data_all, col="name", col_wrap=4)
# g = g.map(plt.plot, "pc", "f")
# g = g.map(plt.plot, "pc", "prediceted_rmsd")
In [10]:
# raw_data_all.query("name == 'tr594'").plot("pc", "f")
In [11]:
# g = sns.FacetGrid(raw_data_all, col="name", col_wrap=4)
# g = g.map(plt.plot, "pc", "rmsd")
# plt.ylim([0,1])
In [12]:
f_dic["tr594"](raw_data_all["pc"]).shape
Out[12]:
In [13]:
def choose_top(data, col="RMSD", n=5, ascending=True):
    """Mark exactly the top-`n` rows of `data` by `col` with a boolean `chosen`.

    NOTE(review): this shadows the earlier 'dense' variant; method='first'
    breaks ties by row order, so exactly `n` rows are marked.
    """
    # Use Series.rank directly instead of the unbound pd.DataFrame.rank.
    return data.assign(chosen=data[col].rank(ascending=ascending, method="first") <= n)
# WIDTH = 100
# WIDTH = 0.1
# WIDTH = 1
# WIDTH = 0.2
# def with_in_range(data, width=WIDTH):
# return data.assign(inrange= (data["pc"] < (data["pc_center"]+width)) & (data["pc"] > (data["pc_center"]-width)))
def with_in_range(data, width=5):
    """Flag rows whose interpolated free energy lies strictly in (0, width).

    Adds a boolean `inrange` column. Looks up the group's protein name in
    the module-level `f_dic` of interp1d objects.
    """
    name = data["name"].iloc[0]
    # Evaluate the interpolator once instead of twice per call.
    f_values = f_dic[name](data["pc"])
    return data.assign(inrange=(0 < f_values) & (f_values < width))
In [14]:
# folder_list = ["tr898", "tr869", "tr947", "tr894", "tr882", "tr594", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = ["tr884-halfDIHE", "tr872-halfDIHE", "tr898", "tr947", "tr894", "tr882", "tr594", "tr869", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = ["tr872-halfDIHE", "tr898", "tr947", "tr894", "tr882", "tr594", "tr869", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# "tr898"
# folder_list = ["tr894", "tr882", "tr594", "tr898", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = ["tr894", "tr882", "tr594", "tr869", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = [ "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = ["tr862", "tr872", "tr885", "tr866", "tr868" , "tr895", "tr896", "tr870", "tr921", "tr891", "tr948"]
# "tr877","tr884", "tr922"
# "tr869"
# folder_list = ["tr894"]
# folder_list = ["tr866"]
# define top based on RMSD or Qw
# best_metric = "RMSD"
best_metric = "Qw"
if best_metric == "Qw":
isAscending = False
else:
isAscending = True
data_list = []
for name in folder_list:
# print(name)
tmp = read_data(name)
data_list.append(tmp)
raw_data_all_2 = pd.concat(data_list).dropna()
n = 25
raw_data_all_2 = raw_data_all_2.reset_index(drop=True).groupby("name").\
apply(choose_top, n=n, col=best_metric, ascending=isAscending).reset_index(drop=True)
raw_data = raw_data_all_2.reset_index(drop=True).query(f'name in {train_name_list}').dropna()
# a = raw_data_all_2.dropna().merge(picked_keep[["pc", "name"]].rename(columns={"pc":"pc_center"}),on="name")
a = raw_data_all_2.dropna()
filtered = a.groupby("name").apply(with_in_range).query("inrange == True").reset_index(drop=True)
In [19]:
filtered.reset_index(drop=True).to_csv("/Users/weilu/Research/server/dec_2018/structure_selection_3/filtered.csv")
In [15]:
filtered
Out[15]:
In [ ]:
In [16]:
filtered.shape
Out[16]:
In [17]:
a.shape
Out[17]:
In [53]:
# FEATURES = ["eigenvalues", "entropy", "pca"]
# FEATURES = ["eigenvalues", "entropy", "diffRMSD"]
# FEATURES = ["eigenvalues", "entropy"]
FEATURES = [
"biasQ",
'Rw',
'VTotal',
# 'RMSD', # test
# 'Qw',
# 'Burial',
# 'Water',
# 'Rama',
# 'DSSP',
# 'P_AP',
# 'Helix',
# 'Frag_Mem'
]
# FEATURES = ["eigenvalues"]
# LABEL = "diffRMSD"
# LABEL = "RMSD"
LABEL = "chosen"
DEGREE = 1
def pred_from_raw(a):
    # NOTE(review): third byte-identical redefinition of pred_from_raw in
    # this notebook; it shadows all earlier versions when this cell runs.
    # Uses the module-level LABEL, DEGREE, FEATURES and the fitted `clf`.
    data = my_transform(a, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
    test_y = data[:,-1]  # label column; computed but unused here
    test_set = data[:,:-1]
    prob= clf.predict_proba(test_set)[:,1]  # positive-class probability
    return a.assign(prob=prob)
# data = my_transform(raw_data, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
# data = raw_data.groupby('name').apply(my_transform, label=LABEL, degree=DEGREE, FEATURES=FEATURES)[0]
data = np.concatenate(raw_data.groupby('name').apply(my_transform,
label=LABEL, degree=DEGREE, FEATURES=FEATURES).values)
train_y = data[:,-1]
train_set = data[:,:-1]
# clf = svm.SVC(probability=True)
# p = 0.01
# clf = LogisticRegression(random_state=27, class_weight={0:p, 1:(1-p)})
clf = LogisticRegression(random_state=27)
clf.fit(train_set, train_y)
filtered = filtered.reset_index(drop=True).groupby("name").apply(pred_from_raw).reset_index(drop=True)
picked_n = 1
best = raw_data_all_2.groupby("name").apply(choose_top, col="RMSD"
, n=1, ascending=True).reset_index(drop=True).query("chosen==True")
# if True:
picked_1 = filtered.groupby("name").apply(choose_top, col="prob"
, n=1, ascending=False).reset_index(drop=True).query("chosen==True")
# if False:
picked_5 = filtered.groupby("name").apply(choose_top, col="prob"
, n=5, ascending=False).reset_index(drop=True).query("chosen==True")
picked = picked_5.groupby("name").apply(choose_top, col="RMSD"
, n=1, ascending=True).reset_index(drop=True).query("chosen==True")
worst = filtered.groupby("name").apply(choose_top, col="RMSD"
, n=1, ascending=False).reset_index(drop=True).query("chosen==True")
init = raw_data_all_2.groupby("name").apply(choose_top, col="i"
, n=1, ascending=True).reset_index(drop=True).query("chosen==True")
all_results = pd.concat([best.assign(result='best'),
picked_1.assign(result='picked'), init.assign(result='init')
, worst.assign(result='worst')
], sort=False)
# all_results = pd.concat([best.assign(result='best'),
# picked.assign(result='picked')])
# picked.to_csv("/Users/weilu/Desktop/picked.csv
# sns.set(rc={'figure.figsize':(20,30)})
# plt.figure(figsize=(15,8))
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'name', 'RMSD').add_legend(fontsize=20)
# fg.set(ylim=(0, 10))
Out[53]:
In [56]:
all_results.query("name == 'tr872-halfDIHE' or name == 'tr872'").query("result == 'picked_top1'")
Out[56]:
In [55]:
all_results.query("name == 'tr948-halfDIHE' or name == 'tr948'").query("result == 'picked_top1'")
Out[55]:
In [52]:
all_results.query("name == 'tr884-halfDIHE' or name == 'tr884'").query("result == 'picked_top1'")
Out[52]:
In [54]:
all_results = pd.concat([picked_1.assign(result="picked_top1"), picked_5.assign(result='picked_top5'),
picked.assign(result='picked_best'),
best.assign(result='best'),
init.assign(result='init'),
worst.assign(result='worst'),
], sort=False)
In [30]:
all_results.reset_index(drop=True).to_csv("/Users/weilu/Research/server/dec_2018/structure_selection_2/all_results.csv")
In [32]:
all_results = pd.read_csv("/Users/weilu/Research/server/dec_2018/structure_selection_2/all_results.csv", index_col=0)
In [62]:
all_results.query("result == 'picked_top5'").groupby("name").apply(pd.sort_values, "prob")
In [69]:
all_results.query("result == 'picked_top5'").sort_values(["name", "prob"], ascending=False)
Out[69]:
In [41]:
for i, line in all_results.query("result == 'picked_top5'").reset_index(drop=True).iterrows():
print(i, line["name"], line["folder"])
os.system("")
In [23]:
all_results = pd.concat([picked_1.assign(result="picked_top1"), picked.assign(result='picked_top5'),
best.assign(result='best'),
init.assign(result='init'),
worst.assign(result='worst'),
], sort=False)
In [25]:
picked.shape
Out[25]:
In [32]:
all_results.reindex(columns=my_reorder(all_results.columns, ["name", "RMSD", "result"])) .reset_index(drop=True).to_csv("/Users/weilu/Desktop/selection_result.csv")
In [28]:
def my_reorder(a, first):
    """Return the labels in `first` followed by the remaining entries of `a`,
    preserving their original order."""
    return list(first) + [col for col in a if col not in first]
In [31]:
all_results.reindex(columns=my_reorder(all_results.columns, ["name", "RMSD", "result"]))
Out[31]:
In [ ]:
picked
In [19]:
# Per-protein comparison of the initial structure vs. the picked structure.
count = 0  # number of proteins where the pick improved on the initial RMSD
total = 0  # cumulative RMSD improvement across all proteins
for name in folder_list:
    init_tmp = init.query(f"name == '{name}'")["RMSD"].iloc[0]
    picked_tmp = picked.query(f"name == '{name}'")["RMSD"].iloc[0]
    improved = picked_tmp < init_tmp
    print(name, init_tmp, picked_tmp, round(init_tmp - picked_tmp, 3), improved)
    total += init_tmp - picked_tmp
    count += improved  # bool adds as 0/1
print("improved: ", count, len(folder_list), total)
In [26]:
count = 0
total = 0
for name in folder_list:
init_tmp = init.query(f"name == '{name}'")["RMSD"].iloc[0]
picked_tmp = picked_1.query(f"name == '{name}'")["RMSD"].iloc[0]
improved = picked_tmp < init_tmp
print(name, init_tmp, picked_tmp, round(init_tmp - picked_tmp, 3), improved)
total += init_tmp - picked_tmp
count += improved
print("improved: ", count, len(folder_list), total)
In [181]:
filtered = a
# FEATURES = ["eigenvalues", "entropy", "pca"]
# FEATURES = ["eigenvalues", "entropy", "diffRMSD"]
# FEATURES = ["eigenvalues", "entropy"]
FEATURES = [
"biasQ",
'Rw',
'VTotal',
# 'RMSD', # test
# 'Qw',
# 'Burial',
# 'Water',
# 'Rama',
# 'DSSP',
# 'P_AP',
# 'Helix',
# 'Frag_Mem'
]
# FEATURES = ["eigenvalues"]
# LABEL = "diffRMSD"
# LABEL = "RMSD"
LABEL = "chosen"
DEGREE = 1
def pred_from_raw(a):
    # NOTE(review): fourth byte-identical redefinition of pred_from_raw in
    # this notebook; consider keeping a single definition near the top.
    # Uses the module-level LABEL, DEGREE, FEATURES and the fitted `clf`.
    data = my_transform(a, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
    test_y = data[:,-1]  # label column; computed but unused here
    test_set = data[:,:-1]
    prob= clf.predict_proba(test_set)[:,1]  # positive-class probability
    return a.assign(prob=prob)
# data = my_transform(raw_data, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
# data = raw_data.groupby('name').apply(my_transform, label=LABEL, degree=DEGREE, FEATURES=FEATURES)[0]
data = np.concatenate(raw_data.groupby('name').apply(my_transform,
label=LABEL, degree=DEGREE, FEATURES=FEATURES).values)
train_y = data[:,-1]
train_set = data[:,:-1]
# clf = svm.SVC(probability=True)
# p = 0.01
# clf = LogisticRegression(random_state=27, class_weight={0:p, 1:(1-p)})
clf = LogisticRegression(random_state=27)
clf.fit(train_set, train_y)
filtered = filtered.reset_index(drop=True).groupby("name").apply(pred_from_raw).reset_index(drop=True)
picked_n = 1
best = raw_data_all_2.groupby("name").apply(choose_top, col="RMSD"
, n=1, ascending=True).reset_index(drop=True).query("chosen==True")
if True:
picked = filtered.groupby("name").apply(choose_top, col="prob"
, n=1, ascending=False).reset_index(drop=True).query("chosen==True")
if False:
picked = filtered.groupby("name").apply(choose_top, col="prob"
, n=5, ascending=False).reset_index(drop=True).query("chosen==True")
picked = picked.groupby("name").apply(choose_top, col="RMSD"
, n=1, ascending=True).reset_index(drop=True).query("chosen==True")
worst = filtered.groupby("name").apply(choose_top, col="RMSD"
, n=1, ascending=False).reset_index(drop=True).query("chosen==True")
init = raw_data_all_2.groupby("name").apply(choose_top, col="i"
, n=1, ascending=True).reset_index(drop=True).query("chosen==True")
all_results = pd.concat([best.assign(result='best'),
picked.assign(result='picked'), init.assign(result='init')
, worst.assign(result='worst')
], sort=False)
# all_results = pd.concat([best.assign(result='best'),
# picked.assign(result='picked')])
# picked.to_csv("/Users/weilu/Desktop/picked.csv
# sns.set(rc={'figure.figsize':(20,30)})
# plt.figure(figsize=(15,8))
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'name', 'RMSD').add_legend(fontsize=20)
# fg.set(ylim=(0, 10))
Out[181]:
In [182]:
count = 0
total = 0
for name in folder_list:
init_tmp = init.query(f"name == '{name}'")["RMSD"].iloc[0]
picked_tmp = picked.query(f"name == '{name}'")["RMSD"].iloc[0]
improved = picked_tmp < init_tmp
print(name, init_tmp, picked_tmp, round(init_tmp - picked_tmp, 3), improved)
total += init_tmp - picked_tmp
count += improved
print("improved: ", count, len(folder_list), total)
In [339]:
all_results.query("name == 'tr594'")
Out[339]:
In [35]:
clf.coef_
Out[35]:
In [221]:
# Metric used for the comparison plot; higher Qw is better, so rank it
# descending, while distance-like metrics (e.g. RMSD) rank ascending.
Plot_Metric = "Qw"
# BUG FIX: the original tested `if Plot_Metric:`, which is always truthy for
# a non-empty string, so isAscending could never be True for other metrics.
# Compare against "Qw" as the analogous best_metric cell above does.
isAscending = Plot_Metric != "Qw"
picked_n = 1
best = raw_data_all_2.groupby("name").apply(choose_top, col=Plot_Metric
, n=1, ascending=isAscending).reset_index(drop=True).query("chosen==True")
picked = filtered.groupby("name").apply(choose_top, col="prob"
, n=1, ascending=False).reset_index(drop=True).query("chosen==True")
worst = filtered.groupby("name").apply(choose_top, col=Plot_Metric
, n=1, ascending=False).reset_index(drop=True).query("chosen==True")
init = raw_data_all_2.groupby("name").apply(choose_top, col="i"
, n=1, ascending=True).reset_index(drop=True).query("chosen==True")
all_results = pd.concat([best.assign(result='best'),
picked.assign(result='picked'), init.assign(result='init')
# , worst.assign(result='worst')
], sort=False)
# all_results = pd.concat([best.assign(result='best'),
# picked.assign(result='picked')])
# picked.to_csv("/Users/weilu/Desktop/picked.csv
# sns.set(rc={'figure.figsize':(20,30)})
# plt.figure(figsize=(15,8))
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'name', Plot_Metric).add_legend(fontsize=20)
fg.set(ylim=(0, 1))
Out[221]:
In [170]:
picked["init_RMSD"] = init["RMSD"].values
picked["diff_RMSD"] = init["RMSD"].values - picked["RMSD"].values
out = picked[["name", "RMSD", "init_RMSD", "diff_RMSD", "folder"]].reset_index(drop=True)
In [206]:
fg = sns.FacetGrid(data=filtered, hue='name', height=8, aspect=1.63)
fg.map(plt.scatter, 'Qw', 'RMSD').add_legend(fontsize=20)
Out[206]:
In [35]:
filtered.plot.scatter("prob", "RMSD")
Out[35]:
In [25]:
out
Out[25]:
In [13]:
raw_data_all_2.plot("RMSD", "Rw")
Out[13]:
In [14]:
raw_data_all_2.plot("RMSD", "pc")
Out[14]:
In [15]:
out
Out[15]:
In [16]:
out
Out[16]:
In [70]:
all_results
Out[70]:
In [13]:
# out.to_csv("/Users/weilu/Desktop/picked_3.csv")
In [26]:
clf.coef_
Out[26]:
In [14]:
clf.coef_
Out[14]:
In [15]:
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'name', 'RMSD').add_legend(fontsize=20)
fg.set(ylim=(0, 10))
Out[15]:
In [16]:
filtered["name"].unique().shape
Out[16]:
In [17]:
picked[["RMSD", "name"]]
Out[17]:
In [18]:
# picked.to_csv("/Users/weilu/Desktop/picked_2.csv")
In [19]:
name ="tr894"
name_list = ["Step" , "Chain" , "Shake" , "Chi" , "Rama", "Excluded", "DSSP", "P_AP", "Water" ,"Burial", "Helix", "AMH_Go", "Frag_Mem", "Vec_FM", "Membrane", "SSB","VTotal"]
# you probably want to change the location below
# location = f"/Users/weilu/Research/server/sep_2018/03_week/02_week/{name}/"
location = f"/Users/weilu/Research/server/nov_2018/structure_selection/{name}/"
RMSD = pd.read_table(location+"rmsd-angstrom.xvg", names=["i", "RMSD"], sep="\s+")
bias = pd.read_table(location+"bias.log", names=["i", "biasQ", "bias"], sep="\s+").drop("i", axis=1)
awsem = pd.read_table(location+"awsem.log", names=name_list)
rw = pd.read_table(location+"rwplusScore.txt", names=["i", "Rw"], sep="\s+").drop("i", axis=1)
# pc location
# location = f"/Users/weilu/Research/server/sep_2018/03_week/{name}/"
# location = f"/Users/weilu/Research/server/oct_2018/01_week/{name}/"
pc = pd.read_table(location+"pcarmsd_scaled.txt", names=["i", "pc", "pc2"], sep="\s+", comment="#").drop("i", axis=1)
raw_data = pd.concat([RMSD, rw, bias, awsem, pc], axis=1)
raw_data.assign(name=name).reset_index().rename(columns={"index":"folder"})
Out[19]:
In [ ]: