In [2]:
# written in python3
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
from datetime import datetime
import seaborn as sns
%matplotlib inline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that selects named columns from a pandas DataFrame.

    Bridges DataFrames into sklearn pipelines: transform() returns the
    underlying numpy array of the requested columns.
    """
    def __init__(self, attribute_names):
        # Column labels to extract in transform().
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        # Stateless transformer: nothing to learn.
        return self
    def transform(self, X):
        # Select the configured columns and return their numpy values.
        return X[self.attribute_names].values
# Default all figures to a 10 x 6.18 (~golden-ratio) aspect.
plt.rcParams['figure.figsize'] = (10,6.180) #golden ratio
def my_transform(data, label, degree, FEATURES):
    """Build a feature matrix from `data`: scaled polynomial features + label.

    The numeric FEATURES columns are standardized and expanded to the given
    polynomial `degree` (no bias column). The `label` column is passed
    through untouched and ends up as the LAST column of the returned array.

    Returns a 2-D numpy array (rows of `data` x [poly features..., label]).
    """
    feature_pipeline = Pipeline([
        ('selector', DataFrameSelector(FEATURES)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
    ])
    # The label goes through a selector only — no scaling, so it can be
    # split back off as the last column by callers.
    label_pipeline = Pipeline([
        ('selector', DataFrameSelector([label])),
    ])
    combined = FeatureUnion(transformer_list=[
        ("num_pipeline", feature_pipeline),
        ("cat_pipeline", label_pipeline),
    ])
    return combined.fit_transform(data)
In [5]:
# read energy, rw, bias, rmsd data from location
def read_data(name, location=None):
    """Load per-snapshot results for one target and combine them row-wise.

    Reads RMSD, umbrella bias, AWSEM energy terms, RWplus score, Qw and PCA
    reaction-coordinate files from `location` and concatenates them column
    by column (one row per snapshot).

    Parameters
    ----------
    name : str
        Target folder name, e.g. "tr872".
    location : str, optional
        Directory holding the result files. Defaults to the standard
        structure_selection path for `name` (backward compatible).

    Returns
    -------
    pd.DataFrame with the original row index kept in a "folder" column and
    the target name in a "name" column.
    """
    # name="tr872"
    name_list = ["Step", "Chain", "Shake", "Chi", "Rama", "Excluded", "DSSP",
                 "P_AP", "Water", "Burial", "Helix", "AMH_Go", "Frag_Mem",
                 "Vec_FM", "Membrane", "SSB", "VTotal"]
    # you probably want to change the location below
    # location = f"/Users/weilu/Research/server/sep_2018/03_week/02_week/{name}/"
    if location is None:
        location = f"/Users/weilu/Research/server/dec_2018/structure_selection/{name}/"
    # sep must be a raw string: "\s" is an invalid escape in a plain string.
    RMSD = pd.read_table(location+"rmsd.xvg", names=["i", "RMSD"], sep=r"\s+")
    bias = pd.read_table(location+"bias.log", names=["i", "biasQ", "bias"], sep=r"\s+").drop("i", axis=1)
    awsem = pd.read_table(location+"awsem.log", names=name_list)
    rw = pd.read_table(location+"rwplusScore.txt", names=["i", "Rw"], sep=r"\s+").drop("i", axis=1)
    qw = pd.read_table(location+"Qw.out", names=["i", "Qw"], sep=r"\s+").drop("i", axis=1)
    # pc location
    # location = f"/Users/weilu/Research/server/sep_2018/03_week/{name}/"
    # location = f"/Users/weilu/Research/server/oct_2018/01_week/{name}/"
    pc = pd.read_table(location+"pcarmsd_scaled.txt", names=["i", "pc", "pc2"], sep=r"\s+", comment="#").drop("i", axis=1)
    raw_data = pd.concat([RMSD, rw, bias, qw, awsem, pc], axis=1)
    return raw_data.assign(name=name).reset_index().rename(columns={"index": "folder"})
def choose_top(data, col="RMSD", n=5, ascending=True):
    """Mark the top-n rows of `data` by `col` in a boolean "chosen" column.

    Dense ranking is used, so tied values share a rank and more than `n`
    rows may be marked. NOTE: redefined later in the notebook with
    method='first', which picks exactly n rows.
    """
    ranks = data[col].rank(ascending=ascending, method='dense')
    return data.assign(chosen=ranks <= n)
# read the pmf, rc.
# def read_data_2(name):
# # name = "tr894"
# # location = f"/Users/weilu/Research/server/sep_2018/03_week/{name}/"
# # location = f"/Users/weilu/Research/server/oct_2018/01_week/{name}/"
# location = f"/Users/weilu/Research/server/dec_2018/structure_selection/{name}/"
# rw = pd.read_table(location+"rc_rwplus", names=["pc","rw"], sep="\s+")
# rmsd = pd.read_table(location+"rc_rmsdlowerBound", names=["pc", "rmsd"], sep="\s+")
# awsem = pd.read_table(location+"rc_awsemEne", names=["pc", "awsem"], sep="\s+")
# qw = pd.read_table(location+"rc_QwhigherBound", names=["pc", "qw"], sep="\s+")
# freeE = pd.read_table(location+"pmf3000"
# , names=["pc", "f", "remove1", "remove2"], sep="\s+").drop(["remove1", "remove2"], axis=1)
# raw_data = freeE.merge(rw, on="pc").merge(awsem, on="pc").merge(qw, on="pc").merge(rmsd, on="pc").assign(name=name)
# return raw_data
def read_data_2(name, location=None):
    """Load per-bin data along the pc reaction coordinate for one target.

    Reads RWplus, RMSD lower bound, AWSEM energy, Qw upper bound and the
    free-energy profile (pmf3000), then inner-merges them on the "pc"
    reaction coordinate.

    Parameters
    ----------
    name : str
        Target folder name, e.g. "tr894".
    location : str, optional
        Directory holding the rc_* files. Defaults to the standard
        structure_selection path for `name` (backward compatible).

    Returns
    -------
    pd.DataFrame with columns pc, f, rw, awsem, qw, rmsd, name.
    """
    # name = "tr894"
    # location = f"/Users/weilu/Research/server/sep_2018/03_week/{name}/"
    # location = f"/Users/weilu/Research/server/oct_2018/01_week/{name}/"
    if location is None:
        location = f"/Users/weilu/Research/server/dec_2018/structure_selection/{name}/"
    # sep must be a raw string: "\s" is an invalid escape in a plain string.
    rw = pd.read_table(location+"rc_rwplus", names=["pc", "rw"], sep=r"\s+")
    rmsd = pd.read_table(location+"rc_rmsdlowerBound", names=["pc", "rmsd"], sep=r"\s+")
    awsem = pd.read_table(location+"rc_awsemEne", names=["pc", "awsem"], sep=r"\s+")
    qw = pd.read_table(location+"rc_QwhigherBound", names=["pc", "qw"], sep=r"\s+")
    freeE = pd.read_table(location+"pmf3000",
                          names=["pc", "f", "remove1", "remove2"],
                          sep=r"\s+").drop(["remove1", "remove2"], axis=1)
    raw_data = freeE.merge(rw, on="pc").merge(awsem, on="pc").merge(qw, on="pc").merge(rmsd, on="pc").assign(name=name)
    return raw_data
In [6]:
# --- Load the pc/free-energy data for every target ---
# Earlier folder lists kept as history; only the LAST assignment takes effect.
# folder_list = ["tr894", "tr882", "tr594", "tr898", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
folder_list = ["tr884-halfDIHE", "tr872-halfDIHE", "tr948-halfDIHE", "tr898", "tr947", "tr894", "tr882", "tr594", "tr869", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# NOTE: the line below overwrites the list above (drops "tr948-halfDIHE").
folder_list = ["tr884-halfDIHE", "tr872-halfDIHE", "tr898", "tr947", "tr894", "tr882", "tr594", "tr869", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = [ "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = ["tr862", "tr872", "tr885", "tr866", "tr868" , "tr895", "tr896", "tr870", "tr921", "tr891", "tr948"]
# "tr877","tr884", "tr922"
# "tr869"
# folder_list = ["tr894"]
# read all data
# tr884-halfDIHE
# tr872-halfDIHE
# tr948-halfDIHE
data_list = []
for name in folder_list:
    tmp = read_data_2(name)
    data_list.append(tmp)
raw_data_all = pd.concat(data_list)
# Per target, flag the n lowest-rmsd bins as "chosen" (dense-rank choose_top).
n = 1
raw_data_all = raw_data_all.reset_index(drop=True).groupby("name").apply(choose_top, n=n, col="rmsd").reset_index(drop=True)
# Training-set candidates tried so far; only the uncommented one is used.
# train_name_list = ["tr872", "tr885", "tr948"]
# train_name_list = ["tr862", "tr872", "tr885", "tr866", "tr868" , "tr895", "tr896", "tr870", "tr921", "tr891", "tr948"]
# train_name_list = ["tr870"]
# train_name_list = ["tr891"]
# train_name_list = ["tr882"]
# train_name_list = ["tr894"]
# train_name_list = ["tr872"]
# train_name_list = ["tr869"]
# train_name_list = ["tr884"]
# train_name_list = ["tr866", "tr884"]
# train_name_list = ["tr870", "tr872"]
# train_name_list = ["tr866", "tr947"]
# train_name_list = ["tr872"]
# train_name_list = ["tr884", "tr872"]
train_name_list = ["tr866"]
# train_name_list = ["tr947"]
# select for training.
raw_data = raw_data_all.reset_index(drop=True).query(f'name in {train_name_list}')
In [4]:
# --- Feature/label configuration for the pc-profile regression below ---
# FEATURES = ["eigenvalues", "entropy", "pca"]
# FEATURES = ["eigenvalues", "entropy", "diffRMSD"]
# FEATURES = ["eigenvalues", "entropy"]
# Columns used as model inputs: free energy, RWplus score, AWSEM energy.
FEATURES = ["f",
            'rw',
            'awsem',
            # 'RMSD', # test
            # 'Burial',
            # 'Water',
            # 'Rama',
            # 'DSSP',
            # 'P_AP',
            # 'Helix',
            # 'Frag_Mem'
            ]
# FEATURES = ["eigenvalues"]
# LABEL = "diffRMSD"
# LABEL = "RMSD"
LABEL = "rmsd"
# NOTE: this reassignment wins — the label actually used is "qw".
LABEL = "qw"
DEGREE = 1
def pred_from_raw(a, clf):
    """Predict the label for every row of `a` using the fitted regressor `clf`.

    Relies on the module-level LABEL/DEGREE/FEATURES configuration and on
    my_transform(), which puts the label in the LAST column.

    Returns a copy of `a` with the prediction in a new column. The column
    name keeps the original misspelling "prediceted_rmsd" because later
    cells refer to it by that exact name.
    """
    data = my_transform(a, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
    # Everything before the last column is the feature matrix; the unused
    # label column (data[:, -1]) is simply dropped here.
    test_set = data[:, :-1]
    prediceted_rmsd = clf.predict(test_set)
    return a.assign(prediceted_rmsd=prediceted_rmsd)
def assign_lowest_f(a):
    """Attach the group's minimum free energy as a constant "lowest_f" column."""
    smallest_f = a["f"].sort_values().iloc[0]
    return a.assign(lowest_f=smallest_f)
In [5]:
# Tag every row with its target's minimum free energy (adds "lowest_f").
raw_data_all = raw_data_all.reset_index(drop=True).groupby("name").apply(assign_lowest_f).reset_index(drop=True)
In [6]:
# # data = my_transform(raw_data, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
# # data = raw_data.groupby('name').apply(my_transform, label=LABEL, degree=DEGREE, FEATURES=FEATURES)[0]
# data = np.concatenate(raw_data.groupby('name').apply(my_transform,
# label=LABEL, degree=DEGREE, FEATURES=FEATURES).values)
# train_y = data[:,-1]
# train_set = data[:,:-1]
# from sklearn import svm
# # clf = svm.SVC(probability=True)
# clf = LinearRegression()
# clf.fit(train_set, train_y)
# y_pred_svm = clf.predict(train_set)
# raw_data_all = raw_data_all.reset_index(drop=True).groupby("name").apply(pred_from_raw, clf).reset_index(drop=True)
# # raw_data_all = raw_data_all.reset_index(drop=True).groupby("name").apply(assign_lowest_f).reset_index(drop=True)
# picked_n = 1
# best = raw_data_all.groupby("name").apply(choose_top, col="rmsd"
# , n=picked_n, ascending=True).reset_index(drop=True).query("chosen==True")
# picked = raw_data_all.groupby("name").apply(choose_top, col="prediceted_rmsd"
# , n=picked_n, ascending=True).reset_index(drop=True).query("chosen==True")
# # init = raw_data_all.query("i == 0.0")
# all_results = pd.concat([best.assign(result='best'),
# picked.assign(result='picked')])
# picked_keep = picked.copy()
In [7]:
from scipy.interpolate import interp1d
# One interpolator per target: maps reaction coordinate pc -> free energy f,
# extrapolating beyond the sampled pc range.
f_dic = {}
for name in folder_list:
    profile = raw_data_all.query(f"name == '{name}'")
    f_dic[name] = interp1d(profile["pc"].values, profile["f"].values,
                           fill_value="extrapolate")
In [8]:
# g = sns.FacetGrid(raw_data_all, col="name", col_wrap=4)
# g = g.map(plt.plot, "pc", "f")
# plt.ylim([0,1])
In [9]:
# g = sns.FacetGrid(raw_data_all, col="name", col_wrap=4)
# g = g.map(plt.plot, "pc", "f")
# g = g.map(plt.plot, "pc", "prediceted_rmsd")
In [10]:
# raw_data_all.query("name == 'tr594'").plot("pc", "f")
In [11]:
# g = sns.FacetGrid(raw_data_all, col="name", col_wrap=4)
# g = g.map(plt.plot, "pc", "rmsd")
# plt.ylim([0,1])
In [12]:
# Sanity check: tr594's interpolator evaluates at every pc value (one f per row).
f_dic["tr594"](raw_data_all["pc"]).shape
Out[12]:
In [13]:
def choose_top(data, col="RMSD", n=5, ascending=True):
    """Mark the top-n rows of `data` by `col` in a boolean "chosen" column.

    Redefines the earlier choose_top: method='first' breaks ties by order
    of appearance, so exactly n rows are chosen.
    """
    ranks = data[col].rank(ascending=ascending, method='first')
    return data.assign(chosen=ranks <= n)
# WIDTH = 100
# WIDTH = 0.1
# WIDTH = 1
# WIDTH = 0.2
# def with_in_range(data, width=WIDTH):
# return data.assign(inrange= (data["pc"] < (data["pc_center"]+width)) & (data["pc"] > (data["pc_center"]-width)))
def with_in_range(data, width=5):
    """Flag rows whose interpolated free energy lies strictly in (0, width).

    Adds a boolean "inrange" column. All rows of `data` must belong to the
    same target; its pc -> f interpolator is looked up in the global f_dic.
    """
    name = data["name"].iloc[0]
    # Evaluate the interpolator once instead of twice (original called it
    # separately for each side of the comparison).
    f_values = f_dic[name](data["pc"])
    return data.assign(inrange=(0 < f_values) & (f_values < width))
In [14]:
# --- Load the per-snapshot data and restrict to low-free-energy snapshots ---
folder_list = ["tr898", "tr869", "tr947", "tr894", "tr882", "tr594", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# "tr898"
# folder_list = ["tr894", "tr882", "tr594", "tr898", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = ["tr894", "tr882", "tr594", "tr869", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = [ "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = ["tr862", "tr872", "tr885", "tr866", "tr868" , "tr895", "tr896", "tr870", "tr921", "tr891", "tr948"]
# "tr877","tr884", "tr922"
# "tr869"
# folder_list = ["tr894"]
# folder_list = ["tr866"]
# define top based on RMSD or Qw
# best_metric = "RMSD"
best_metric = "Qw"
# Qw is a similarity score (bigger is better), so rank descending for it.
if best_metric == "Qw":
    isAscending = False
else:
    isAscending = True
data_list = []
for name in folder_list:
    tmp = read_data(name)
    data_list.append(tmp)
raw_data_all_2 = pd.concat(data_list).dropna()
# Mark the best n snapshots per target as the positive class for training.
n = 25
raw_data_all_2 = raw_data_all_2.reset_index(drop=True).groupby("name").\
    apply(choose_top, n=n, col=best_metric, ascending=isAscending).reset_index(drop=True)
raw_data = raw_data_all_2.reset_index(drop=True).query(f'name in {train_name_list}').dropna()
# a = raw_data_all_2.dropna().merge(picked_keep[["pc", "name"]].rename(columns={"pc":"pc_center"}),on="name")
a = raw_data_all_2.dropna()
# Keep only snapshots whose interpolated free energy is in (0, 5).
filtered = a.groupby("name").apply(with_in_range).query("inrange == True").reset_index(drop=True)
In [15]:
# How many snapshots survive the free-energy window filter.
filtered.shape
Out[15]:
In [16]:
# Total snapshot count before filtering, for comparison with filtered.shape.
a.shape
Out[16]:
In [17]:
# --- Train a logistic-regression "good snapshot" classifier and select structures ---
# FEATURES = ["eigenvalues", "entropy", "pca"]
# FEATURES = ["eigenvalues", "entropy", "diffRMSD"]
# FEATURES = ["eigenvalues", "entropy"]
# Per-snapshot inputs: umbrella bias coordinate, RWplus score, total AWSEM energy.
FEATURES = [
            "biasQ",
            'Rw',
            'VTotal',
            # 'RMSD', # test
            # 'Qw',
            # 'Burial',
            # 'Water',
            # 'Rama',
            # 'DSSP',
            # 'P_AP',
            # 'Helix',
            # 'Frag_Mem'
            ]
# FEATURES = ["eigenvalues"]
# LABEL = "diffRMSD"
# LABEL = "RMSD"
# Binary target: was the snapshot in the top-25 by Qw (the "chosen" flag)?
LABEL = "chosen"
DEGREE = 1
def pred_from_raw(a):
    """Attach P(chosen) from the global classifier `clf` as a "prob" column.

    NOTE: redefines the earlier two-argument pred_from_raw; this version
    closes over the global clf fitted below in this same cell.
    """
    data = my_transform(a, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
    test_y = data[:,-1]
    test_set = data[:,:-1]
    # Column 1 of predict_proba is the probability of the positive class.
    prob= clf.predict_proba(test_set)[:,1]
    return a.assign(prob=prob)
# data = my_transform(raw_data, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
# data = raw_data.groupby('name').apply(my_transform, label=LABEL, degree=DEGREE, FEATURES=FEATURES)[0]
# Build the training matrix per target, then stack; label is the last column.
data = np.concatenate(raw_data.groupby('name').apply(my_transform,
                      label=LABEL, degree=DEGREE, FEATURES=FEATURES).values)
train_y = data[:,-1]
train_set = data[:,:-1]
# clf = svm.SVC(probability=True)
# p = 0.01
# clf = LogisticRegression(random_state=27, class_weight={0:p, 1:(1-p)})
clf = LogisticRegression(random_state=27)
clf.fit(train_set, train_y)
# Score every filtered snapshot with the fitted classifier.
filtered = filtered.reset_index(drop=True).groupby("name").apply(pred_from_raw).reset_index(drop=True)
picked_n = 1
# "best": the truly lowest-RMSD snapshot per target (oracle baseline).
best = raw_data_all_2.groupby("name").apply(choose_top, col="RMSD"
                                            , n=1, ascending=True).reset_index(drop=True).query("chosen==True")
# if True:
# "picked_1": single highest-probability snapshot per target.
picked_1 = filtered.groupby("name").apply(choose_top, col="prob"
                                          , n=1, ascending=False).reset_index(drop=True).query("chosen==True")
# if False:
# "picked": best-RMSD snapshot among the top-5 by probability.
picked = filtered.groupby("name").apply(choose_top, col="prob"
                                        , n=5, ascending=False).reset_index(drop=True).query("chosen==True")
picked = picked.groupby("name").apply(choose_top, col="RMSD"
                                      , n=1, ascending=True).reset_index(drop=True).query("chosen==True")
# "worst": highest-RMSD filtered snapshot per target (pessimistic bound).
worst = filtered.groupby("name").apply(choose_top, col="RMSD"
                                       , n=1, ascending=False).reset_index(drop=True).query("chosen==True")
# "init": the first snapshot (i == 0), i.e. the starting structure.
init = raw_data_all_2.groupby("name").apply(choose_top, col="i"
                                            , n=1, ascending=True).reset_index(drop=True).query("chosen==True")
all_results = pd.concat([best.assign(result='best'),
                         picked.assign(result='picked'), init.assign(result='init')
                         , worst.assign(result='worst')
                         ], sort=False)
# all_results = pd.concat([best.assign(result='best'),
#                          picked.assign(result='picked')])
# picked.to_csv("/Users/weilu/Desktop/picked.csv
# sns.set(rc={'figure.figsize':(20,30)})
# plt.figure(figsize=(15,8))
# Compare per-target RMSD of each selection strategy in one figure.
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'name', 'RMSD').add_legend(fontsize=20)
# fg.set(ylim=(0, 10))
Out[17]:
In [23]:
# Rebuild the comparison table including both top-1 and top-5 picks.
all_results = pd.concat([picked_1.assign(result="picked_top1"), picked.assign(result='picked_top5'),
                         best.assign(result='best'),
                         init.assign(result='init'),
                         worst.assign(result='worst'),
                         ], sort=False)
In [25]:
# One picked row per target is expected.
picked.shape
Out[25]:
In [32]:
# WARNING: my_reorder is defined in a LATER cell (In[28] below) — this cell
# fails under Restart & Run All unless that cell is executed first.
all_results.reindex(columns=my_reorder(all_results.columns, ["name", "RMSD", "result"])) .reset_index(drop=True).to_csv("/Users/weilu/Desktop/selection_result.csv")
In [28]:
def my_reorder(a, first):
    """Return the column labels of `a` with those in `first` moved to the front.

    `first` is copied (not mutated); all remaining labels of `a` are then
    appended in their original order.
    """
    ordered = first.copy()
    ordered.extend(col for col in a if col not in first)
    return ordered
In [31]:
# Preview the table with name/RMSD/result up front (same order as the CSV export).
all_results.reindex(columns=my_reorder(all_results.columns, ["name", "RMSD", "result"]))
Out[31]:
In [ ]:
# Inspect the selected snapshots.
picked
In [19]:
# Per-target improvement of the top-5 pick over the initial structure.
# NOTE: this summary loop is copy-pasted twice more below (In[26], In[182]);
# consider extracting a function taking (init, picked, folder_list).
count = 0
total = 0
for name in folder_list:
    init_tmp = init.query(f"name == '{name}'")["RMSD"].iloc[0]
    picked_tmp = picked.query(f"name == '{name}'")["RMSD"].iloc[0]
    improved = picked_tmp < init_tmp
    print(name, init_tmp, picked_tmp, round(init_tmp - picked_tmp, 3), improved)
    total += init_tmp - picked_tmp
    count += improved
print("improved: ", count, len(folder_list), total)
In [26]:
# Same summary as above but for the top-1 pick (picked_1) — duplicated code.
count = 0
total = 0
for name in folder_list:
    init_tmp = init.query(f"name == '{name}'")["RMSD"].iloc[0]
    picked_tmp = picked_1.query(f"name == '{name}'")["RMSD"].iloc[0]
    improved = picked_tmp < init_tmp
    print(name, init_tmp, picked_tmp, round(init_tmp - picked_tmp, 3), improved)
    total += init_tmp - picked_tmp
    count += improved
print("improved: ", count, len(folder_list), total)
In [181]:
# --- Variant of the training/selection cell above, WITHOUT the free-energy
# filter (filtered is reset to the full snapshot set `a`). ---
filtered = a
# FEATURES = ["eigenvalues", "entropy", "pca"]
# FEATURES = ["eigenvalues", "entropy", "diffRMSD"]
# FEATURES = ["eigenvalues", "entropy"]
FEATURES = [
            "biasQ",
            'Rw',
            'VTotal',
            # 'RMSD', # test
            # 'Qw',
            # 'Burial',
            # 'Water',
            # 'Rama',
            # 'DSSP',
            # 'P_AP',
            # 'Helix',
            # 'Frag_Mem'
            ]
# FEATURES = ["eigenvalues"]
# LABEL = "diffRMSD"
# LABEL = "RMSD"
LABEL = "chosen"
DEGREE = 1
def pred_from_raw(a):
    """Attach P(chosen) from the global classifier `clf` as a "prob" column."""
    data = my_transform(a, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
    test_y = data[:,-1]
    test_set = data[:,:-1]
    prob= clf.predict_proba(test_set)[:,1]
    return a.assign(prob=prob)
# data = my_transform(raw_data, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
# data = raw_data.groupby('name').apply(my_transform, label=LABEL, degree=DEGREE, FEATURES=FEATURES)[0]
data = np.concatenate(raw_data.groupby('name').apply(my_transform,
                      label=LABEL, degree=DEGREE, FEATURES=FEATURES).values)
train_y = data[:,-1]
train_set = data[:,:-1]
# clf = svm.SVC(probability=True)
# p = 0.01
# clf = LogisticRegression(random_state=27, class_weight={0:p, 1:(1-p)})
clf = LogisticRegression(random_state=27)
clf.fit(train_set, train_y)
filtered = filtered.reset_index(drop=True).groupby("name").apply(pred_from_raw).reset_index(drop=True)
picked_n = 1
best = raw_data_all_2.groupby("name").apply(choose_top, col="RMSD"
                                            , n=1, ascending=True).reset_index(drop=True).query("chosen==True")
# Hard-coded strategy toggles: only the `if True:` branch (top-1 pick) runs;
# the `if False:` branch (best RMSD among top-5) is disabled.
if True:
    picked = filtered.groupby("name").apply(choose_top, col="prob"
                                            , n=1, ascending=False).reset_index(drop=True).query("chosen==True")
if False:
    picked = filtered.groupby("name").apply(choose_top, col="prob"
                                            , n=5, ascending=False).reset_index(drop=True).query("chosen==True")
    picked = picked.groupby("name").apply(choose_top, col="RMSD"
                                          , n=1, ascending=True).reset_index(drop=True).query("chosen==True")
worst = filtered.groupby("name").apply(choose_top, col="RMSD"
                                       , n=1, ascending=False).reset_index(drop=True).query("chosen==True")
init = raw_data_all_2.groupby("name").apply(choose_top, col="i"
                                            , n=1, ascending=True).reset_index(drop=True).query("chosen==True")
all_results = pd.concat([best.assign(result='best'),
                         picked.assign(result='picked'), init.assign(result='init')
                         , worst.assign(result='worst')
                         ], sort=False)
# all_results = pd.concat([best.assign(result='best'),
#                          picked.assign(result='picked')])
# picked.to_csv("/Users/weilu/Desktop/picked.csv
# sns.set(rc={'figure.figsize':(20,30)})
# plt.figure(figsize=(15,8))
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'name', 'RMSD').add_legend(fontsize=20)
# fg.set(ylim=(0, 10))
Out[181]:
In [182]:
# Improvement summary for the unfiltered variant — third copy of this loop.
count = 0
total = 0
for name in folder_list:
    init_tmp = init.query(f"name == '{name}'")["RMSD"].iloc[0]
    picked_tmp = picked.query(f"name == '{name}'")["RMSD"].iloc[0]
    improved = picked_tmp < init_tmp
    print(name, init_tmp, picked_tmp, round(init_tmp - picked_tmp, 3), improved)
    total += init_tmp - picked_tmp
    count += improved
print("improved: ", count, len(folder_list), total)
In [339]:
# Drill into one target's selection results.
all_results.query("name == 'tr594'")
Out[339]:
In [35]:
# Learned logistic-regression weights, one per (polynomial) feature.
clf.coef_
Out[35]:
In [221]:
# Plot the selection comparison in terms of a chosen metric (Qw here).
Plot_Metric = "Qw"
# BUG FIX: the original tested `if Plot_Metric:`, which is True for ANY
# non-empty string. The intent (cf. the best_metric check earlier) is to
# rank descending only when the metric is Qw, where bigger is better.
if Plot_Metric == "Qw":
    isAscending = False
else:
    isAscending = True
picked_n = 1
# Oracle best, classifier top-1 pick, and initial structure per target.
best = raw_data_all_2.groupby("name").apply(choose_top, col=Plot_Metric
                                            , n=1, ascending=isAscending).reset_index(drop=True).query("chosen==True")
picked = filtered.groupby("name").apply(choose_top, col="prob"
                                        , n=1, ascending=False).reset_index(drop=True).query("chosen==True")
worst = filtered.groupby("name").apply(choose_top, col=Plot_Metric
                                       , n=1, ascending=False).reset_index(drop=True).query("chosen==True")
init = raw_data_all_2.groupby("name").apply(choose_top, col="i"
                                            , n=1, ascending=True).reset_index(drop=True).query("chosen==True")
all_results = pd.concat([best.assign(result='best'),
                         picked.assign(result='picked'), init.assign(result='init')
                         # , worst.assign(result='worst')
                         ], sort=False)
# Qw is bounded in [0, 1], hence the fixed y-limits.
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'name', Plot_Metric).add_legend(fontsize=20)
fg.set(ylim=(0, 1))
Out[221]:
In [170]:
# Side-by-side comparison table: picked RMSD vs the initial structure's RMSD.
# NOTE: relies on picked and init having the same per-target row order.
picked["init_RMSD"] = init["RMSD"].values
picked["diff_RMSD"] = init["RMSD"].values - picked["RMSD"].values
out = picked[["name", "RMSD", "init_RMSD", "diff_RMSD", "folder"]].reset_index(drop=True)
In [206]:
# Scatter Qw vs RMSD for all filtered snapshots, colored by target.
fg = sns.FacetGrid(data=filtered, hue='name', height=8, aspect=1.63)
fg.map(plt.scatter, 'Qw', 'RMSD').add_legend(fontsize=20)
Out[206]:
In [35]:
# Does higher classifier probability correlate with lower RMSD?
filtered.plot.scatter("prob", "RMSD")
Out[35]:
In [25]:
# Display the picked-vs-init comparison table.
out
Out[25]:
In [13]:
# RMSD vs RWplus score across all snapshots.
raw_data_all_2.plot("RMSD", "Rw")
Out[13]:
In [14]:
# RMSD vs the pc reaction coordinate across all snapshots.
raw_data_all_2.plot("RMSD", "pc")
Out[14]:
In [15]:
# Re-display the comparison table (duplicate inspection cell).
out
Out[15]:
In [16]:
# Re-display the comparison table (duplicate inspection cell).
out
Out[16]:
In [70]:
# Inspect the full comparison table.
all_results
Out[70]:
In [13]:
# out.to_csv("/Users/weilu/Desktop/picked_3.csv")
In [26]:
# Learned classifier weights (duplicate inspection cell).
clf.coef_
Out[26]:
In [14]:
# Learned classifier weights (duplicate inspection cell).
clf.coef_
Out[14]:
In [15]:
# Per-target RMSD comparison of all selection strategies, y clipped to 10 A.
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'name', 'RMSD').add_legend(fontsize=20)
fg.set(ylim=(0, 10))
Out[15]:
In [16]:
# Number of distinct targets that survived filtering.
filtered["name"].unique().shape
Out[16]:
In [17]:
# Quick look at the picked RMSD per target.
picked[["RMSD", "name"]]
Out[17]:
In [18]:
# picked.to_csv("/Users/weilu/Desktop/picked_2.csv")
In [19]:
# One-off inspection of the NOVEMBER data for tr894 (different month folder
# and RMSD file name than read_data above; Qw file not loaded here).
name = "tr894"
name_list = ["Step", "Chain", "Shake", "Chi", "Rama", "Excluded", "DSSP",
             "P_AP", "Water", "Burial", "Helix", "AMH_Go", "Frag_Mem",
             "Vec_FM", "Membrane", "SSB", "VTotal"]
# you probably want to change the location below
# location = f"/Users/weilu/Research/server/sep_2018/03_week/02_week/{name}/"
location = f"/Users/weilu/Research/server/nov_2018/structure_selection/{name}/"
# sep must be a raw string: "\s" is an invalid escape in a plain string.
RMSD = pd.read_table(location+"rmsd-angstrom.xvg", names=["i", "RMSD"], sep=r"\s+")
bias = pd.read_table(location+"bias.log", names=["i", "biasQ", "bias"], sep=r"\s+").drop("i", axis=1)
awsem = pd.read_table(location+"awsem.log", names=name_list)
rw = pd.read_table(location+"rwplusScore.txt", names=["i", "Rw"], sep=r"\s+").drop("i", axis=1)
# pc location
# location = f"/Users/weilu/Research/server/sep_2018/03_week/{name}/"
# location = f"/Users/weilu/Research/server/oct_2018/01_week/{name}/"
pc = pd.read_table(location+"pcarmsd_scaled.txt", names=["i", "pc", "pc2"], sep=r"\s+", comment="#").drop("i", axis=1)
raw_data = pd.concat([RMSD, rw, bias, awsem, pc], axis=1)
# Bare expression: display the combined table with the row index as "folder".
raw_data.assign(name=name).reset_index().rename(columns={"index": "folder"})
Out[19]:
In [ ]: