In [158]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
from datetime import datetime
import seaborn as sns
%matplotlib inline
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['figure.figsize'] = (10,6.180) #golden ratio
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as a 300-dpi PNG.

    The file is written to ``<PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.png``.

    Parameters
    ----------
    fig_id : str
        File name without extension.
    tight_layout : bool, default True
        Call ``plt.tight_layout()`` before saving.
    """
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)
In [159]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# Create a class to select numerical or categorical columns
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks columns out of a DataFrame and hands back their raw values."""

    def __init__(self, attribute_names):
        # Column label(s) used to index the incoming frame in transform().
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[attribute_names].values``."""
        selected = X[self.attribute_names]
        return selected.values
class RemoveFirstFrame(BaseEstimator, TransformerMixin):
    """Transformer that drops rows whose ``Step`` satisfies ``Step % frame == 1``.

    Parameters
    ----------
    frame : int
        Modulus applied to the ``Step`` column in :meth:`transform`.
    """

    def __init__(self, frame):
        self.frame = frame

    def fit(self, X, y=None):
        """Nothing to learn; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return the rows of ``X`` for which ``Step % frame != 1``."""
        # Read the modulus from the instance: the bare name `frame` is not
        # defined in this scope and would raise NameError (or pick up an
        # unrelated global).
        return X.query(f"Step % {self.frame} != 1")
def choose_top_rw(data, n=5):
    """Flag the ``n`` rows with the lowest ``Rw`` in a boolean ``chosen`` column."""
    return choose_top(data, col="Rw", n=n, ascending=True)


def choose_top_vtotal(data, n=5):
    """Flag the ``n`` rows with the lowest ``VTotal`` in a boolean ``chosen`` column."""
    return choose_top(data, col="VTotal", n=n, ascending=True)


def choose_top(data, col="Qw", n=5, ascending=False):
    """Return a copy of ``data`` with a boolean ``chosen`` column marking the top ``n`` rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to rank; it is not modified.
    col : str, default "Qw"
        Column to rank on.
    n : int, default 5
        Number of rows to flag.
    ascending : bool, default False
        ``False`` flags the largest values, ``True`` the smallest.

    Ties are broken by order of appearance (``method='first'``), so exactly
    ``min(n, len(data))`` rows are flagged when ``col`` has no missing values.
    """
    # Rank the column itself instead of calling DataFrame.rank unbound on a Series.
    ranks = data[col].rank(ascending=ascending, method='first')
    return data.assign(chosen=ranks <= n)
In [160]:
# Build `all_data`: one row per structure, all proteins stacked, from the
# per-protein AWSEM log and the Rw / RMSD / Qw / GDT score files.
# protein_list = ["T0766", "1MBA", "T0784", "T0792", "T0803", "T0815", "T0833", "T0251"]
protein_list = ["T0766", "T0784", "T0803", "T0833", "T0251"]
name_list = ["Step" , "Chain" , "Shake" , "Chi" , "Rama", "Excluded", "DSSP", "P_AP", "Water" ,"Burial", "Helix", "AMH_Go", "Frag_Mem", "Vec_FM", "Membrane", "SSB" , "Electro.", "QGO" ,"VTotal"]
# All input files live in one directory; name it once instead of repeating it in every path.
data_dir = "/Users/weilu/Research/davinci/structure_selector_mar21"
# Columns removed from the merged per-protein table.
remove_columns = ['i', 'i2', 'i3', 'Step', "Shake", "Excluded", "AMH_Go", "Membrane", "Vec_FM", "SSB", "Electro."]
all_data_list = []
for protein in protein_list:
    awsem = pd.read_table(f"{data_dir}/{protein}_awsem.log", names=name_list)
    # Raw strings for the regex separator: "\s" in a plain literal is an invalid escape sequence.
    rw = pd.read_table(f"{data_dir}/{protein}_rw.txt", names=["i", "Rw"], sep=r"\s+")
    rmsd = pd.read_table(f"{data_dir}/{protein}_rmsd.txt", names=["i2", "Rmsd"], sep=r"\s+")
    qw = pd.read_table(f"{data_dir}/{protein}_qw.txt", names=["i3", "Qw"], sep=r"\s+")
    gdt = pd.read_table(f"{data_dir}/{protein}_GDT.txt", names=["gdt"+str(i) for i in range(1,21)], sep=r"\s+")
    # Give every table a fresh 0..N-1 index so the column-wise concat pairs rows by position.
    rw = rw.reset_index(drop=True)
    awsem = awsem.reset_index(drop=True)
    rmsd = rmsd.reset_index(drop=True)
    qw = qw.reset_index(drop=True)
    gdt["GDT"] = (gdt["gdt2"] + gdt["gdt4"] + gdt["gdt8"] + gdt["gdt16"])*25
    gdt = gdt.reset_index(drop=True)
    data = pd.concat([rw, qw, rmsd, gdt["GDT"], awsem], axis=1)
    # reset_index() without drop=True keeps the row number as an "index" column.
    data = data.drop(remove_columns, axis=1).reset_index().assign(Name=protein)
    all_data_list.append(data)
all_data = pd.concat(all_data_list).reset_index(drop=True)
In [215]:
print(all_data.shape)
all_data.dropna().shape
Out[215]:
In [162]:
# all_data.to_csv("/Users/weilu/Research/data/test_data/test_data_mar21_2.csv")
In [203]:
# Load the older saved table and derive `raw_test_data_old` from it.
raw_test_data = pd.read_csv("/Users/weilu/Research/data/test_data/test_data_4.csv")
# Keep one row per distinct (Qw, Rw, VTotal) triple.
raw_test_data_2 = raw_test_data.drop_duplicates(subset=['Qw', 'Rw', "VTotal"])
# isGood: True for the 50 highest-GDT rows within each Name (rank < 51, ties broken by order of appearance).
raw_test_data_2 = raw_test_data_2.assign(isGood=raw_test_data_2.groupby("Name")["GDT"].rank(ascending=False, method='first') < 51)
# VwithoutGo = VTotal - QGO. The operands come from the un-deduplicated frame; assign() aligns them on the index.
raw_test_data_old = raw_test_data_2.assign(VwithoutGo = raw_test_data.VTotal - raw_test_data.QGO)
In [204]:
# Load the mar21 table, label it the same way as the older one, then merge the two.
# NOTE: `raw_test_data` is rebound several times below; statement order matters.
raw_test_data = pd.read_csv("/Users/weilu/Research/data/test_data/test_data_mar21_2.csv")
# Keep one row per distinct (Qw, Rw, VTotal) triple.
raw_test_data_2 = raw_test_data.drop_duplicates(subset=['Qw', 'Rw', "VTotal"])
# isGood: True for the 50 highest-GDT rows within each Name.
raw_test_data_2 = raw_test_data_2.assign(isGood=raw_test_data_2.groupby("Name")["GDT"].rank(ascending=False, method='first') < 51)
# VwithoutGo = VTotal - QGO, taken from the frame as loaded and aligned on the index by assign().
raw_test_data = raw_test_data_2.assign(VwithoutGo = raw_test_data.VTotal - raw_test_data.QGO)
# Step is the saved "index" column shifted to start at 1.
raw_test_data["Step"] = raw_test_data_2["index"]+1
# raw_test_data = raw_test_data_2
# Rows of the older table for proteins that are not in protein_list.
a = raw_test_data_old.query("Name not in @protein_list").reset_index(drop=True)
# Stack old and new rows; 'Unnamed: 0' is presumably the index column written by to_csv — TODO confirm.
raw_test_data = pd.concat([a, raw_test_data.drop('Unnamed: 0', axis=1)])
# Discard rows labelled 'T251' and every row with Step == 1.
raw_test_data = raw_test_data.query("Name != 'T251'").reset_index(drop=True)
raw_test_data = raw_test_data.query("Step != 1").reset_index(drop=True)
In [205]:
raw_test_data.shape
Out[205]:
In [166]:
raw_test_data["Name"].unique()
Out[166]:
In [207]:
# raw_test_data[["Name", "Step", "Qw", "GDT"]].to_csv("/Users/weilu/Desktop/steps_qw_gdt.csv")
In [4]:
# all_data = pd.read_csv("/Users/weilu/Research/data/test_data/test_data_mar03.csv")
# all_data = all_data.assign(VwithoutGo = all_data.VTotal - all_data.QGO)
# raw_test_data_2 = all_data
# raw_test_data_2 = all_data.assign(isGood=raw_test_data_2.groupby("Name")["Qw"].rank(ascending=False, method='first') < 6)
# raw_test_data = raw_test_data_2.assign(VwithoutGo = raw_test_data_2.VTotal - raw_test_data_2.QGO)
In [184]:
# Numeric columns fed to the model; the commented entries are columns that are currently switched off.
# my_transform / my_transform_predict bind this list as a default argument when
# they are defined, so they must be re-defined after editing it.
FEATURES = ['Rw',
# 'VTotal',
'QGO',
'VwithoutGo',
# 'Burial',
# 'Water',
# 'Rama',
# 'DSSP',
# 'P_AP',
# 'Helix',
# 'Frag_Mem'
]
# Number of rows each selector keeps per protein (passed as `n` to the choose_top* helpers).
n = 5
def my_transform(data, label, degree, FEATURES=FEATURES):
    """Turn ``data`` into a model matrix whose last column is the label.

    The ``FEATURES`` columns are standardised and expanded with polynomial
    terms up to ``degree`` (no bias column); the ``label`` column is appended
    unchanged after them. Every step is fit on ``data`` itself on each call.

    Returns the array produced by ``FeatureUnion.fit_transform``.
    """
    numeric_branch = Pipeline(steps=[
        ('selector', DataFrameSelector(FEATURES)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
    ])
    label_branch = Pipeline(steps=[
        ('selector', DataFrameSelector([label])),
    ])
    # Order matters: callers slice the label off as the final column.
    combined = FeatureUnion(transformer_list=[
        ("num_pipeline", numeric_branch),
        ("cat_pipeline", label_branch),
    ])
    return combined.fit_transform(data)
def my_transform_predict(data, degree, FEATURES=FEATURES):
    """Build the feature matrix for ``data`` without a label column.

    Selects the ``FEATURES`` columns, standardises them and adds polynomial
    terms up to ``degree`` (no bias column). The scaler is fit on ``data``
    itself on each call.
    """
    feature_steps = [
        ('selector', DataFrameSelector(FEATURES)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=degree, include_bias=False)),
    ]
    return Pipeline(feature_steps).fit_transform(data)
In [185]:
g = sns.FacetGrid(raw_test_data, col="Name", hue="isGood", col_wrap=4)
g = g.map(plt.scatter, "Rw", "Qw")
In [186]:
g = sns.FacetGrid(raw_test_data, col="Name", hue="isGood", col_wrap=4)
g = g.map(plt.scatter, "VwithoutGo", "Qw")
In [187]:
g = sns.FacetGrid(raw_test_data, col="Name", hue="isGood", col_wrap=4)
g = g.map(plt.scatter, "QGO", "Qw")
In [188]:
raw_data_T0784 = raw_test_data.groupby("Name").get_group("T0784")
# raw_data_T0792 = raw_test_data.groupby("Name").get_group("T0792")
# raw_data = pd.concat([raw_data_T0784, raw_data_T0792])
# raw_data = raw_data_T0792
# raw_data = raw_test_data.groupby("Name").get_group("1mba")
raw_data = raw_data_T0784
In [189]:
# FEATURES = ["Rw", "VTotal", "QGO"]
# FEATURES = ["Rw", "VTotal", "QGO", "Burial", "Frag_Mem", "Water"]
# FEATURES = list(raw_test_data.columns[2:-3])
def train_and_test(raw_data, label="Qw", degree=1, p=0.1):
    """Split ``raw_data`` 80/20, stratified on ``isGood``, and transform both parts.

    Each part is passed through ``my_transform`` separately, so scaling is
    fit independently on the training rows and on the test rows.

    Returns ``(train_set, train_y, test_set, test_y)``, where the ``*_y``
    arrays are the last column of the transformed matrix (the ``label``
    column) and the ``*_set`` arrays are all remaining columns.
    ``p`` is accepted but not used.
    """
    from sklearn.model_selection import StratifiedShuffleSplit

    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=142)
    # n_splits=1, so the generator yields exactly one (train, test) index pair.
    train_index, test_index = next(splitter.split(raw_data, raw_data["isGood"]))

    train_matrix = my_transform(raw_data.iloc[train_index], label, degree)
    test_matrix = my_transform(raw_data.iloc[test_index], label, degree)

    return (train_matrix[:, :-1], train_matrix[:, -1],
            test_matrix[:, :-1], test_matrix[:, -1])
In [190]:
# Fit a logistic-regression classifier for `isGood` on `raw_data`, then score
# every protein in raw_test_data and keep the n highest-probability rows each.
label = "isGood"
degree = 1
p = 0.1
train_set, train_y, test_set, test_y = train_and_test(raw_data, label=label, degree=degree)
log_clf = LogisticRegression(random_state=140, penalty='l2')
# log_clf = LogisticRegression(random_state=14, class_weight={0:p, 1:(1-p)}, penalty='l1')
log_clf.fit(train_set, train_y)
y_pred = log_clf.predict(train_set)

# n = 100
prediction_list = []
for name, data in raw_test_data.groupby("Name"):
    print(name)
    # my_transform re-fits the scaler on this protein's rows; the label is the last column.
    X = my_transform(data, label, degree)
    eval_set, eval_y = X[:, :-1], X[:, -1]
    # Probability of the positive class for every row of this protein.
    test = log_clf.predict_proba(eval_set)[:, 1]
    one = data.assign(prediction=test)
    prediction_list.append(one)

t = pd.concat(prediction_list)
best_by_prediction = (
    t.groupby("Name")
    .apply(choose_top, n=n, col="prediction")
    .query("chosen==True")
)
In [191]:
print(*(zip(FEATURES, log_clf.coef_[0])))
In [175]:
print(*(zip(FEATURES, log_clf.coef_[0])))
In [192]:
# Baseline selectors: per protein, flag the n lowest-Rw, lowest-VTotal and
# lowest-QGO rows, and keep only the flagged rows in the top_* frames.
n = 5

chosen_by_rw = raw_test_data.groupby("Name").apply(choose_top_rw, n)
top_rw = chosen_by_rw.query("chosen==True")

chosen_by_vtotal = raw_test_data.groupby("Name").apply(choose_top_vtotal, n)
top_vtotal = chosen_by_vtotal.query("chosen==True")

chosen_by_qgo = raw_test_data.groupby("Name").apply(choose_top, n=n, col="QGO", ascending=True)
top_qgo = chosen_by_qgo.query("chosen==True")
In [193]:
top_qgo_old = top_qgo
In [208]:
# Compare, per protein, the GDT of the n rows picked by each selector.
# "Best" ranks on GDT itself; "Rw", "Awsem" and "Selected" reuse top_rw,
# top_vtotal and best_by_prediction from earlier cells.
label = "GDT"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str, columns={label: "Best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str, columns={label: "Selected"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "QGo"})  # not plotted
# Columns are joined by row position; only a2's Name column is kept, which
# assumes every selector lists the same proteins in the same order — TODO confirm.
final3 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["Selected"]], axis=1).melt(id_vars="Name", value_name=label, var_name=" ")
order = ["T0251", "T0833", "T0815", "T0803", "T0792", "T0784", "T0766", "1MBA"]
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.pointplot(x="Name", y=label, data=final3, hue=" ", errwidth=0, order=order)
# plt.savefig("/Users/weilu/Desktop/test.png", dpi=300)
# plt.ylim([0.4,1])
Out[208]:
In [209]:
# Compare, per protein, the Qw of the n rows picked by each selector.
# "Best" ranks on Qw itself; "Rw", "Awsem" and "Selected" reuse top_rw,
# top_vtotal and best_by_prediction from earlier cells.
label = "Qw"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str, columns={label: "Best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str, columns={label: "Selected"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "QGo"})  # not plotted
# Columns are joined by row position; only a2's Name column is kept, which
# assumes every selector lists the same proteins in the same order — TODO confirm.
final3 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["Selected"]], axis=1).melt(id_vars="Name", value_name=label, var_name=" ")
order = ["T0251", "T0833", "T0815", "T0803", "T0792", "T0784", "T0766", "1MBA"]
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.pointplot(x="Name", y=label, data=final3, hue=" ", errwidth=0, order=order)
# plt.savefig("/Users/weilu/Desktop/test.png", dpi=300)
# plt.ylim([0.4,1])
Out[209]:
In [211]:
# Compare, per protein, the Rmsd of the n rows picked by each selector.
# "Rw", "Awsem" and "Selected" reuse top_rw, top_vtotal and
# best_by_prediction from earlier cells.
label = "Rmsd"
# For Rmsd the reference "Best" rows are the n smallest values, so rank in
# ascending order (choose_top defaults to descending, which would pick the
# n largest Rmsd instead).
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label, ascending=True).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str, columns={label: "Best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str, columns={label: "Selected"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "QGo"})  # not plotted
# Columns are joined by row position; only a2's Name column is kept, which
# assumes every selector lists the same proteins in the same order — TODO confirm.
final3 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["Selected"]], axis=1).melt(id_vars="Name", value_name=label, var_name=" ")
order = ["T0251", "T0833", "T0815", "T0803", "T0792", "T0784", "T0766", "1MBA"]
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.pointplot(x="Name", y=label, data=final3, hue=" ", errwidth=0, order=order)
# plt.savefig("/Users/weilu/Desktop/test.png", dpi=300)
# plt.ylim([0.4,1])
Out[211]:
In [178]:
# Compare, per protein, the GDT of the n rows picked by each selector and
# write the figure to disk.
# "Best" ranks on GDT itself; "Rw", "Awsem" and "Selected" reuse top_rw,
# top_vtotal and best_by_prediction from earlier cells.
label = "GDT"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str, columns={label: "Best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str, columns={label: "Selected"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "QGo"})  # not plotted
# Columns are joined by row position; only a2's Name column is kept, which
# assumes every selector lists the same proteins in the same order — TODO confirm.
final3 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["Selected"]], axis=1).melt(id_vars="Name", value_name=label, var_name=" ")
order = ["T0251", "T0833", "T0815", "T0803", "T0792", "T0784", "T0766", "1MBA"]
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.pointplot(x="Name", y=label, data=final3, hue=" ", errwidth=0, order=order)
plt.savefig("/Users/weilu/Desktop/test.png", dpi=300)
# plt.ylim([0.4,1])
In [195]:
a = best_by_prediction.reset_index(drop=True)[["Name", "Qw", "GDT", "prediction", "Step"]].groupby("Name").apply(lambda x: x.sort_values("prediction", ascending=False))
In [196]:
import itertools
In [198]:
a.reset_index(drop=True).to_csv("/Users/weilu/Desktop/selected.csv")
In [197]:
a.reset_index(drop=True).set_index("Name")
Out[197]:
In [153]:
In [154]:
raw_test_data.groupby("Name").apply(choose_top, n=1, col="Step", ascending=True).query("chosen==True")[["Name", "GDT", "Qw", "Step"]]
Out[154]:
In [80]:
best_by_prediction.reset_index(drop=True)[["Name", "GDT", "Qw"]].groupby("Name").mean()
Out[80]:
In [69]:
raw_test_data.groupby("Name").apply(choose_top, n=1, col=label).query("chosen==True")[["GDT"]]
Out[69]:
In [70]:
raw_test_data.groupby("Name").apply(choose_top, n=1, col="Qw").query("chosen==True")[["Qw"]]
Out[70]:
In [22]:
# Per-protein GDT comparison of the rows picked by each selector (older run; the plot order uses the "T251" label).
# NOTE(review): x and y are passed positionally to sns.pointplot; recent seaborn
# releases accept them by keyword only — confirm against the installed version.
label = "GDT"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Selected"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"QGo"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# final2 = pd.concat([a2, e2["QGo"], b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# Selector columns are joined by row position, then melted to long form for plotting.
final3 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["Selected"]], axis=1).melt(id_vars="Name", value_name=label, var_name=" ")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
order = ["T251", "T0833", "T0815", "T0803", "T0792", "T0784", "T0766", "1MBA"]
sns.pointplot("Name", label, data=final3, hue=" ", errwidth=0, order=order)
# plt.savefig("/Users/weilu/Desktop/test.png", dpi=300)
# plt.ylim([0.4,1])
Out[22]:
In [24]:
# Per-protein Qw comparison of the rows picked by each selector; saves the figure as figure6_qw.png.
# NOTE(review): x and y are passed positionally to sns.pointplot; recent seaborn
# releases accept them by keyword only — confirm against the installed version.
label = "Qw"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Selected"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"QGo"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# final2 = pd.concat([a2, e2["QGo"], b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# Selector columns are joined by row position, then melted to long form for plotting.
final3 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["Selected"]], axis=1).melt(id_vars="Name", value_name=label, var_name=" ")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
order = ["T251", "T0833", "T0815", "T0803", "T0792", "T0784", "T0766", "1MBA"]
sns.pointplot("Name", label, data=final3, hue=" ", errwidth=0, order=order)
plt.savefig("/Users/weilu/Desktop/figure6_qw.png", dpi=300)
# plt.ylim([0.4,1])
In [16]:
initial = raw_test_data.groupby("Name").apply(choose_top, n=1, col='Step', ascending=True).query("chosen==True")
f2 = initial.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Initial"})
In [18]:
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Best"})
In [19]:
f2.melt(id_vars="Name", value_name=label, var_name=" ")
Out[19]:
In [20]:
# Per-protein GDT comparison of the rows picked by each selector, with the
# values from `f2` (built in an earlier cell) overlaid as black strip-plot points.
# NOTE(review): x and y are passed positionally to sns.pointplot / sns.stripplot;
# recent seaborn releases accept them by keyword only — confirm against the installed version.
label = "GDT"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"Selected"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"QGo"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# final2 = pd.concat([a2, e2["QGo"], b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# Selector columns are joined by row position, then melted to long form for plotting.
final3 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["Selected"]], axis=1).melt(id_vars="Name", value_name=label, var_name=" ")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
order = ["T251", "T0833", "T0815", "T0803", "T0792", "T0784", "T0766", "1MBA"]
sns.pointplot("Name", label, data=final3, hue=" ", errwidth=0, order=order)
# plt.clf()
# NOTE(review): `f2` was built with whatever `label` was current in that earlier
# cell; melting it under value_name=label assumes that was also "GDT" — confirm.
t = f2.melt(id_vars="Name", value_name=label, var_name=" ")
sns.stripplot("Name", label, data=t, order=order, color="black")
# plt.savefig("/Users/weilu/Desktop/test.png", dpi=300)
# plt.ylim([0.4,1])
Out[20]:
In [55]:
best_old = best
In [146]:
# best_by_prediction.to_csv("/Users/weilu/Research/data/structure_selector_mar03/old_best_by_prediction.csv")
In [162]:
# Build `all_data` from the mar03 files: first 2000 rows of each per-protein
# table, merged column-wise, all proteins stacked.
protein_list = ["T0766", "1mba", "T0784", "T0792", "T0803", "T0815", "T0833", "T0251"]
name_list = ["Step" , "Chain" , "Shake" , "Chi" , "Rama", "Excluded", "DSSP", "P_AP", "Water" ,"Burial", "Helix", "AMH_Go", "Frag_Mem", "Vec_FM", "Membrane", "SSB" , "Electro.", "QGO" ,"VTotal"]
# All input files live in one directory; name it once instead of repeating it in every path.
data_dir = "/Users/weilu/Research/davinci/structure_selector_mar03"
# Columns removed from the merged per-protein table.
remove_columns = ['i', 'i2', 'i3', 'Step', "Shake", "Excluded", "AMH_Go", "Membrane", "Vec_FM", "SSB", "Electro."]
# File prefixes whose rows are stored under a different Name; others keep their prefix.
display_names = {"T0251": "T251", "1mba": "1MBA"}
all_data_list = []
for protein in protein_list:
    awsem = pd.read_table(f"{data_dir}/{protein}_awsem.log", names=name_list)
    # Raw strings for the regex separator: "\s" in a plain literal is an invalid escape sequence.
    rw = pd.read_table(f"{data_dir}/{protein}_rw.txt", names=["i", "Rw"], sep=r"\s+")
    rmsd = pd.read_table(f"{data_dir}/{protein}_rmsd.txt", names=["i2", "Rmsd"], sep=r"\s+")
    qw = pd.read_table(f"{data_dir}/{protein}_qw.txt", names=["i3", "Qw"], sep=r"\s+")
    # Truncate to the first 2000 rows and renumber so the column-wise concat pairs rows by position.
    rw = rw[:2000].reset_index(drop=True)
    awsem = awsem[:2000].reset_index(drop=True)
    rmsd = rmsd[:2000].reset_index(drop=True)
    qw = qw[:2000].reset_index(drop=True)
    data = pd.concat([rw, awsem, rmsd, qw], axis=1)
    # reset_index() without drop=True keeps the row number as an "index" column.
    data = data.drop(remove_columns, axis=1).reset_index().assign(Name=display_names.get(protein, protein))
    all_data_list.append(data)
all_data = pd.concat(all_data_list).reset_index(drop=True)
In [16]:
a = raw_test_data.query("Name=='1MBA'").assign(T='old')
In [17]:
b = all_data.query("Name=='1MBA'").assign(T='new')
In [18]:
c = pd.concat([a,b])
In [ ]:
b
In [19]:
g = sns.FacetGrid(c, col="Name", hue="T", col_wrap=1)
g = g.map(plt.scatter, "Rw", "Qw", alpha=0.1)
In [23]:
g = sns.FacetGrid(raw_test_data, col="Name", hue="isGood", col_wrap=4)
g = g.map(plt.scatter, "QGO", "Qw")
In [22]:
g = sns.FacetGrid(all_data, col="Name", col_wrap=4)
g = g.map(plt.scatter, "QGO", "Qw")
In [163]:
# all_data.to_csv("/Users/weilu/Research/data/test_data/test_data_mar05.csv")
In [79]:
all_data = pd.read_csv("/Users/weilu/Research/data/test_data/test_data_mar05.csv")
all_data = all_data.assign(VwithoutGo = all_data.VTotal - all_data.QGO)
In [80]:
# Score every protein in all_data with the already-fitted log_clf and keep the
# n highest-probability rows per protein.
prediction_list = []
for name, data in all_data.groupby("Name"):
    print(name)
    # Feature matrix only (no label column); the scaler is re-fit on this protein's rows.
    eval_set = X = my_transform_predict(data, degree=1)
    # Probability of the positive class for every row of this protein.
    test = log_clf.predict_proba(eval_set)[:, 1]
    one = data.assign(prediction=test)
    prediction_list.append(one)

t = pd.concat(prediction_list)
best_by_prediction_new = (
    t.groupby("Name")
    .apply(choose_top, n=n, col="prediction")
    .query("chosen==True")
    .drop('Unnamed: 0', axis=1)
)
In [34]:
best_by_prediction_new
Out[34]:
In [37]:
top_qgo
Out[37]:
In [63]:
best_old.query("Name == '1MBA'")
Out[63]:
In [62]:
best.query("Name == '1MBA'")
Out[62]:
In [82]:
top_qgo_old.query("Name == '1MBA'")
Out[82]:
In [83]:
top_qgo.query("Name == '1MBA'")
Out[83]:
In [81]:
# Recompute the baseline selections on `all_data` and compare Qw per protein
# against the classifier picks in best_by_prediction_new.
# NOTE: this rebinds raw_test_data, top_rw, top_vtotal and top_qgo, so cells
# run afterwards see the all_data-based versions.
raw_test_data = all_data
chosen_by_rw = raw_test_data.groupby("Name").apply(choose_top_rw, n)
chosen_by_vtotal = raw_test_data.groupby("Name").apply(choose_top_vtotal, n)
chosen_by_qgo = raw_test_data.groupby("Name").apply(choose_top, n=n, col="QGO", ascending=True)
top_rw = chosen_by_rw.query("chosen==True")
top_vtotal = chosen_by_vtotal.query("chosen==True")
top_qgo = chosen_by_qgo.query("chosen==True")
label = "Qw"
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction_new.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"QGo"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# Selector columns are joined by row position, then melted to long form ("variable"/"value").
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# final2 = pd.concat([a2, e2["QGo"], b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
# sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)
order = ["T251", "T0833", "T0815", "T0803", "T0792", "T0784", "T0766", "1MBA"]
# NOTE(review): x and y are passed positionally; recent seaborn releases accept
# them by keyword only — confirm against the installed version.
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0, order=order)
plt.ylim([0.4,1])
Out[81]:
In [27]:
# Recompute the baseline selections on `all_data` and compare Rmsd per protein
# against the classifier picks in best_by_prediction_new.
# NOTE: this rebinds raw_test_data, top_rw, top_vtotal and top_qgo, so cells
# run afterwards see the all_data-based versions.
raw_test_data = all_data
chosen_by_rw = raw_test_data.groupby("Name").apply(choose_top_rw, n)
chosen_by_vtotal = raw_test_data.groupby("Name").apply(choose_top_vtotal, n)
chosen_by_qgo = raw_test_data.groupby("Name").apply(choose_top, n=n, col="QGO", ascending=True)
top_rw = chosen_by_rw.query("chosen==True")
top_vtotal = chosen_by_vtotal.query("chosen==True")
top_qgo = chosen_by_qgo.query("chosen==True")
label = "Rmsd"
# ascending=True: the reference "best" rows are the n smallest Rmsd values per protein.
best = raw_test_data.groupby("Name").apply(choose_top, n=n, col=label, ascending=True).query("chosen==True")
a2 = best.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction_new.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
e2 = top_qgo.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"QGo"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# Selector columns are joined by row position, then melted to long form ("variable"/"value").
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# final2 = pd.concat([a2, e2["QGo"], b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
# NOTE(review): x and y are passed positionally; recent seaborn releases accept
# them by keyword only — confirm against the installed version.
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)
Out[27]:
In [30]:
t.groupby("Name").apply(choose_top, n=n, col=label, ascending=True).query("chosen==True")
Out[30]:
In [28]:
best_by_prediction
Out[28]:
In [126]:
best_by_prediction.to_csv("/Users/weilu/Research/data/structure_selector_mar03/best_by_prediction.csv")
In [171]:
a = pd.read_csv("/Users/weilu/Research/data/structure_selector_mar03/old_best_by_prediction.csv")
In [168]:
a = pd.read_csv("/Users/weilu/Research/data/structure_selector_mar03/best_by_prediction.csv")
In [165]:
# For each protein in `a`, print its name followed by every value of its
# "index" column, one per line.
for name, data in a.groupby("Name"):
    print(name)
    # print(data["index"])
    for i in data["index"]:
        print(i)
In [ ]:
In [138]:
# Per-protein GDT comparison of the rows picked by each selector (point plot, error bars hidden).
# NOTE(review): `best_by_GDT` is not defined anywhere in the visible part of this
# notebook; the cell presumably relies on leftover kernel state — confirm before re-running.
# NOTE(review): x and y are passed positionally to sns.pointplot; recent seaborn
# releases accept them by keyword only — confirm against the installed version.
label = "GDT"
a2 = best_by_GDT.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)
Out[138]:
In [122]:
# Per-protein GDT comparison of the rows picked by each selector (point plot, error bars hidden).
# NOTE(review): `best_by_GDT` is not defined anywhere in the visible part of this
# notebook; the cell presumably relies on leftover kernel state — confirm before re-running.
# NOTE(review): x and y are passed positionally to sns.pointplot; recent seaborn
# releases accept them by keyword only — confirm against the installed version.
label = "GDT"
a2 = best_by_GDT.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)
Out[122]:
In [114]:
# Per-protein GDT comparison of the rows picked by each selector, drawn as box plots.
# NOTE(review): `best_by_GDT` is not defined anywhere in the visible part of this
# notebook; the cell presumably relies on leftover kernel state — confirm before re-running.
# NOTE(review): x and y are passed positionally to sns.boxplot; recent seaborn
# releases accept them by keyword only — confirm against the installed version.
label = "GDT"
a2 = best_by_GDT.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.boxplot("Name","value", data=final2, hue="variable")
Out[114]:
In [99]:
# Per-protein Qw comparison of the rows picked by each selector (point plot, error bars hidden).
# The "best" column holds the Qw of the rows in `best_by_GDT`.
# NOTE(review): `best_by_GDT` is not defined anywhere in the visible part of this
# notebook; the cell presumably relies on leftover kernel state — confirm before re-running.
# NOTE(review): x and y are passed positionally to sns.pointplot; recent seaborn
# releases accept them by keyword only — confirm against the installed version.
label = "Qw"
a2 = best_by_GDT.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str,columns={label:"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str,columns={label:"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)
Out[99]:
In [97]:
# Per-protein GDT comparison of the rows picked by each selector (column name hard-coded as "GDT").
# NOTE(review): `best_by_GDT` is not defined anywhere in the visible part of this
# notebook; the cell presumably relies on leftover kernel state — confirm before re-running.
# NOTE(review): x and y are passed positionally to sns.pointplot; recent seaborn
# releases accept them by keyword only — confirm against the installed version.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str,columns={"GDT":"Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str,columns={"GDT":"prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
sns.pointplot("Name","value", data=final2, hue="variable", errwidth=0)
Out[97]:
In [12]:
# T0784
# Compare the GDT of each selection strategy, one column per strategy.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# NOTE(review): concat(axis=1) pairs rows by the reset index, so the four frames are
# presumably ordered identically by Name — confirm upstream.
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
# x/y must be keywords: seaborn 0.12+ no longer accepts them positionally.
sns.pointplot(x="Name", y="value", data=final2, hue="variable", errwidth=0)
Out[12]:
In [15]:
# T0784
# Compare the GDT of each selection strategy, one column per strategy.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# NOTE(review): concat(axis=1) pairs rows by the reset index, so the four frames are
# presumably ordered identically by Name — confirm upstream.
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
# x/y must be keywords: seaborn 0.12+ no longer accepts them positionally.
sns.pointplot(x="Name", y="value", data=final2, hue="variable", errwidth=0)
Out[15]:
In [17]:
# T0784
# Compare the chosen metric of each selection strategy, one column per strategy.
label = "Qw"
a2 = best_by_GDT.reset_index(drop=True)[["Name", label]].rename(index=str, columns={label: "best"})
b2 = top_rw.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "Rw"})
c2 = top_vtotal.reset_index(drop=True)[[label, "Name"]].rename(index=str, columns={label: "Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", label]].rename(index=str, columns={label: "prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# NOTE(review): concat(axis=1) pairs rows by the reset index, so the four frames are
# presumably ordered identically by Name — confirm upstream.
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
# x/y must be keywords: seaborn 0.12+ no longer accepts them positionally.
sns.pointplot(x="Name", y="value", data=final2, hue="variable", errwidth=0)
Out[17]:
In [ ]:
# T0784
# Compare the GDT of each selection strategy, one column per strategy.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# NOTE(review): concat(axis=1) pairs rows by the reset index, so the four frames are
# presumably ordered identically by Name — confirm upstream.
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
# x/y must be keywords: seaborn 0.12+ no longer accepts them positionally.
sns.pointplot(x="Name", y="value", data=final2, hue="variable", errwidth=0)
In [ ]:
# 1mba
# Compare the GDT of each selection strategy, one column per strategy.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# NOTE(review): concat(axis=1) pairs rows by the reset index, so the four frames are
# presumably ordered identically by Name — confirm upstream.
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
# x/y must be keywords: seaborn 0.12+ no longer accepts them positionally.
sns.pointplot(x="Name", y="value", data=final2, hue="variable", errwidth=0)
In [ ]:
# t0792
# Compare the GDT of each selection strategy, one column per strategy.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# NOTE(review): concat(axis=1) pairs rows by the reset index, so the four frames are
# presumably ordered identically by Name — confirm upstream.
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
# x/y must be keywords: seaborn 0.12+ no longer accepts them positionally.
sns.pointplot(x="Name", y="value", data=final2, hue="variable", errwidth=0)
In [ ]:
# Compare the GDT of each selection strategy, one column per strategy.
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "prediction"})
# final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# NOTE(review): concat(axis=1) pairs rows by the reset index, so the four frames are
# presumably ordered identically by Name — confirm upstream.
final2 = pd.concat([a2, b2["Rw"], c2["Awsem"], d2["prediction"]], axis=1).melt(id_vars="Name")
# sns.pointplot("Name","value", data=final2, hue="variable", hue_order=["prediction", "Awsem", "Rw", "best"])
# sns.stripplot("value", "Name", data=final2, hue="variable")
# x/y must be keywords: seaborn 0.12+ no longer accepts them positionally.
sns.pointplot(x="Name", y="value", data=final2, hue="variable", errwidth=0)
In [ ]:
# Box plot of "value" per "Name", split by "variable".
# x/y are passed as keywords: seaborn 0.12+ no longer accepts them positionally.
sns.boxplot(x="Name", y="value", data=final2, hue="variable")
In [60]:
# Swarm plot of "value" per "Name", coloured by "variable", with marker size 10.
sns.swarmplot(x='Name', y='value', data=final2, hue="variable", size=10)
Out[60]:
In [51]:
# Swarm plot of "value" per "Name", coloured by "variable" (default marker size).
sns.swarmplot(x='Name', y='value', data=final2, hue="variable")
Out[51]:
In [52]:
# Jittered strip plot with "Name" on the x-axis and "value" on the y-axis.
# x/y are passed as keywords: seaborn 0.12+ no longer accepts them positionally.
sns.stripplot(x="Name", y="value", data=final2, hue="variable", jitter=True)
Out[52]:
In [ ]:
# Jittered strip plot with "value" on the x-axis and "Name" on the y-axis.
# x/y are passed as keywords: seaborn 0.12+ no longer accepts them positionally.
sns.stripplot(x="value", y="Name", data=final2, hue="variable", jitter=True)
In [ ]:
# Compare the GDT of each selection strategy; the frames are joined on "Name".
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "prediction"})
final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# x/y must be keywords: seaborn 0.12+ no longer accepts them positionally.
sns.pointplot(x="Name", y="value", data=final2, hue="variable")
# sns.stripplot("value", "Name", data=final2, hue="variable")
In [ ]:
In [ ]:
# Compare the GDT of each selection strategy; the frames are joined on "Name".
a2 = best_by_GDT.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "best"})
b2 = top_rw.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Rw"})
c2 = top_vtotal.reset_index(drop=True)[["GDT", "Name"]].rename(index=str, columns={"GDT": "Awsem"})
d2 = best_by_prediction.reset_index(drop=True)[["Name", "GDT"]].rename(index=str, columns={"GDT": "prediction"})
final2 = a2.merge(b2, on="Name").merge(c2, on="Name").merge(d2, on="Name").melt(id_vars="Name")
# x/y must be keywords: seaborn 0.12+ no longer accepts them positionally.
sns.pointplot(x="Name", y="value", data=final2, hue="variable")
# sns.stripplot("value", "Name", data=final2, hue="variable")
In [ ]:
# Count the positions where y_pred differs from train_y.
np.sum(y_pred != train_y)
In [ ]:
In [ ]:
# Inspect the dimensions of `prediction`.
prediction.shape
In [ ]:
# Inspect the dimensions of `raw_test_data`.
raw_test_data.shape
In [ ]:
# Inspect the dimensions of `y_pred`.
y_pred.shape
In [ ]:
# Indices of the n largest eval_y values, ordered from largest to smallest.
eval_y.argsort()[-n:][::-1]
In [ ]:
# For every target, flag the n structures the regressor scores highest and compare
# them with the 50 structures whose true label value is highest.
n = 10
for name, data in raw_test_data.groupby("Name"):
    print(name)
    X = full_pipeline.fit_transform(data)
    # Last column is the label; everything before it is a feature.
    eval_set, eval_y = X[:, :-1], X[:, -1]
    test = regr.predict(eval_set)

    # Predicted positives: the n highest regressor outputs.
    position_of_top_n = np.argsort(test)[-n:][::-1]
    threshold = test[position_of_top_n[-1]]
    predict_y = np.zeros(eval_y.shape[0])
    predict_y[position_of_top_n] = 1

    # Reference positives: the 50 highest true values.
    position_of_top_n = np.argsort(eval_y)[-50:][::-1]
    threshold = eval_y[position_of_top_n[-1]]
    test_y = np.zeros(eval_y.shape[0])
    test_y[position_of_top_n] = 1

    plt.figure()
    plt.scatter(test, eval_y)
    plt.show()
    # predict_y = (test > threshold)
    # print(threshold)
    print(confusion_matrix(test_y, predict_y))
In [ ]:
# Scatter y_pred (x-axis) against train_y (y-axis).
plt.scatter(y_pred, train_y)
In [ ]:
In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# log_clf = LogisticRegression(random_state=142)
# rnd_clf = RandomForestClassifier(random_state=432)
# svm_clf = SVC(probability=True, random_state=412)
# Class-weighted base models: class 0 is weighted p, class 1 is weighted 1 - p.
log_clf = LogisticRegression(class_weight={0: p, 1: (1 - p)}, random_state=142)
rnd_clf = RandomForestClassifier(class_weight={0: p, 1: (1 - p)}, random_state=432)
svm_clf = SVC(class_weight={0: p, 1: (1 - p)}, probability=True, random_state=412)
voting_clf = VotingClassifier(
    voting='soft',
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
)
log_clf.fit(train_set, train_y)
rnd_clf.fit(train_set, train_y)
svm_clf.fit(train_set, train_y)
voting_clf.fit(train_set, train_y)

# check on training set
n = 10
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    # y_pred = clf.predict(train_set)
    prob = clf.predict_proba(train_set)[:, 1]
    # Mark the n samples with the highest class-1 probability as predicted positives.
    position_of_top_n = np.argsort(prob)[-n:][::-1]
    threshold = prob[position_of_top_n[-1]]
    predict_y = np.zeros(train_y.shape[0])
    predict_y[position_of_top_n] = 1
    # predict_y = (test > threshold)
    # print(threshold)
    cm = confusion_matrix(train_y, predict_y)
    # print(clf.__class__.__name__, "\n", accuracy_score(train_y, predict_y))
    print(clf.__class__.__name__, "\n", cm)

time_stamp = datetime.today().strftime('%d_%h_%H%M%S')
for name, data in raw_test_data.groupby("Name"):
    print(name)
    X = full_pipeline.fit_transform(data)
    eval_set, eval_y = X[:, :-1], X[:, -1]
    test = log_clf.predict_proba(eval_set)[:, 1]
    position_of_top_n = np.argsort(test)[-n:][::-1]
    threshold = test[position_of_top_n[-1]]
    predict_y = np.zeros(eval_y.shape[0])
    predict_y[position_of_top_n] = 1
    # One CSV per target: a "Result" header followed by one probability per line.
    with open(f"/Users/weilu/Research/data/structure_selector/p{p}_poly{PolynomialDegree}_{name}.csv", "w") as f:
        f.write("Result\n")
        for i in test:
            f.write(f"{i}\n")
    # with open(f"/Users/weilu/Research/data/structure_selector/{name}_results_{time_stamp}.csv", "w") as f:
    #     f.write("Result\n")
    #     for i in test:
    #         f.write(str(i) + "\n")
    # predict_y = (test > threshold)
    # print(threshold)
    print(confusion_matrix(eval_y, predict_y))
print(f"p{p}_poly{PolynomialDegree}")
In [ ]:
# Load the decoy table, drop rows that repeat the same (Qw, Rw, VTotal) triple and
# label each structure by its per-target GDT rank.
raw_test_data = pd.read_csv("/Users/weilu/Research/data/test_data/test_data_4.csv")
raw_test_data_2 = raw_test_data.drop_duplicates(subset=["Qw", "Rw", "VTotal"])
# NOTE(review): `rank < 50` marks ranks 1..49 as good (49 structures per target) —
# confirm that this, rather than the top 50, is intended.
raw_test_data_2 = raw_test_data_2.assign(
    isGood=lambda df: df.groupby("Name")["GDT"].rank(ascending=False, method="first") < 50
)
raw_test_data = raw_test_data_2
raw_data_T0784, raw_data_T0792 = (
    raw_test_data.groupby("Name").get_group(target) for target in ("T0784", "T0792")
)
# raw_data = pd.concat([raw_data_T0784, raw_data_T0792])
raw_data = raw_data_T0792
In [ ]:
# FEATURES = ["Rw", "VTotal", "QGO"]
# FEATURES = ["Rw", "VTotal", "QGO", "Burial", "Frag_Mem", "Water"]
# FEATURES = list(raw_test_data.columns[2:-3])
FEATURES = [
    'Rw', 'VTotal', 'QGO', 'Burial', 'Water',
    'Rama', 'DSSP', 'P_AP', 'Helix', 'Frag_Mem',
]
# LABEL = "Qw"
LABEL = "isGood"
PolynomialDegree = 2
p = 0.1
num_attribs = FEATURES
cat_attribs = [LABEL]
frame = 201

# Features are selected, standardised and polynomially expanded; the label column is
# passed through unchanged and ends up as the last column of the transformed array.
num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attribs)),
    ('std_scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False)),
])
cat_pipeline = Pipeline(steps=[('selector', DataFrameSelector(cat_attribs))])
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
my_full_pipeline = Pipeline(steps=[
    # ('removeFirstFrame', RemoveFirstFrame(frame)),
    ('featureSelection', full_pipeline),
])

from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the splitter yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(raw_data, raw_data["isGood"]))
strat_train_set = raw_data.iloc[train_index]
strat_test_set = raw_data.iloc[test_index]
# strat_test_set[LABEL].value_counts() / len(strat_test_set)
X_train = my_full_pipeline.fit_transform(strat_train_set)
X_test = my_full_pipeline.fit_transform(strat_test_set)
train_set, train_y = X_train[:, :-1], X_train[:, -1]
test_set, test_y = X_test[:, :-1], X_test[:, -1]

# log_clf = LogisticRegression(random_state=142)
# rnd_clf = RandomForestClassifier(random_state=432)
# svm_clf = SVC(probability=True, random_state=412)
log_clf = LogisticRegression(class_weight={0: p, 1: (1 - p)}, random_state=142)
log_clf.fit(train_set, train_y)

# check on training set
n = 5
clf = log_clf
# y_pred = clf.predict(train_set)
prob = clf.predict_proba(train_set)[:, 1]
position_of_top_n = np.argsort(prob)[-n:][::-1]
threshold = prob[position_of_top_n[-1]]
predict_y = np.zeros(train_y.shape[0])
predict_y[position_of_top_n] = 1
# predict_y = (test > threshold)
# print(threshold)
cm = confusion_matrix(train_y, predict_y)
# print(clf.__class__.__name__, "\n", accuracy_score(train_y, predict_y))
print(clf.__class__.__name__, "\n", cm)

# Per-target check: the n highest-probability structures against the isGood label.
for name, data in raw_test_data.groupby("Name"):
    print(name)
    X = full_pipeline.fit_transform(data)
    eval_set, eval_y = X[:, :-1], X[:, -1]
    test = log_clf.predict_proba(eval_set)[:, 1]
    position_of_top_n = np.argsort(test)[-n:][::-1]
    threshold = test[position_of_top_n[-1]]
    predict_y = np.zeros(eval_y.shape[0])
    predict_y[position_of_top_n] = 1
    print(confusion_matrix(eval_y, predict_y))
print(f"p{p}_poly{PolynomialDegree}")
In [ ]:
# strat_train_set is an .iloc slice of raw_data, so assigning a column in place raises
# SettingWithCopyWarning; build a new frame carrying the extra column "a" instead.
strat_train_set = strat_train_set.assign(a=prob)
In [ ]:
# Scatter plot of column "a" (x-axis) against "GDT" (y-axis) for the training split.
strat_train_set.plot("a", "GDT", kind="scatter")
In [ ]:
# Display the contents of `prob`.
prob
In [ ]:
# concat(axis=1) aligns on index labels. A bare pd.Series(prob) carries a fresh
# 0..n-1 index that does not match the shuffled labels of strat_train_set, so the
# probabilities would land on the wrong rows; reuse the frame's index instead.
pd.concat([strat_train_set, pd.Series(prob, index=strat_train_set.index)], axis=1)
In [ ]:
def compute_with_my_score_function(p=0.9, PolynomialDegree=3):
    """Train a class-weighted logistic regression and score its top-n picks on T0784.

    The model is fitted on the stratified training split of the notebook-level
    ``raw_data`` and evaluated on ``raw_data_T0784``: the ``n = 10`` structures with
    the highest class-1 probability are marked as predicted positives and the
    precision of that selection is computed from the confusion matrix.

    Parameters
    ----------
    p : float
        Weight given to class 0; class 1 receives ``1 - p``.
    PolynomialDegree : int
        Degree of the polynomial feature expansion.

    Returns
    -------
    tuple
        ``(cl_name, p, PolynomialDegree, my_evaluation)`` where ``my_evaluation`` is
        the precision multiplied once per counted target name.
    """
    FEATURES = ['Rw',
                'VTotal',
                'QGO',
                'Burial',
                'Water',
                'Rama',
                'DSSP',
                'P_AP',
                'Helix',
                'Frag_Mem']
    # LABEL = "Qw"
    LABEL = "isGood"
    n = 10  # number of top-ranked structures treated as predicted positives
    cl_name = "lr"

    # Scaled, polynomially expanded features followed by the untouched label column.
    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(FEATURES)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False))
    ])
    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector([LABEL]))
    ])
    full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
    my_full_pipeline = Pipeline([
        # ('removeFirstFrame', RemoveFirstFrame(frame)),
        ('featureSelection', full_pipeline)
    ])

    # n_splits=1, so the splitter yields exactly one (train, test) index pair; only
    # the training part is used by this function.
    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_index, _ = next(split.split(raw_data, raw_data["isGood"]))
    X_train = my_full_pipeline.fit_transform(raw_data.iloc[train_index])
    train_y = X_train[:, -1]
    train_set = X_train[:, :-1]

    # log_clf = LogisticRegression(random_state=142)
    # rnd_clf = RandomForestClassifier(random_state=432)
    # svm_clf = SVC(probability=True, random_state=412)
    clf = LogisticRegression(random_state=142, class_weight={0: p, 1: (1 - p)})
    clf.fit(train_set, train_y)

    # The validation score does not depend on the group being iterated (it always
    # uses raw_data_T0784), so it is computed once instead of refitting the
    # pipeline and the classifier for every target name.
    # NOTE(review): every target is therefore scored with the T0784 precision —
    # confirm this is intended rather than scoring each group's own data.
    validation_data = my_full_pipeline.fit_transform(raw_data_T0784)
    validation_y = validation_data[:, -1]
    validation_set = validation_data[:, :-1]
    test = clf.predict_proba(validation_set)[:, 1]
    position_of_top_n = test.argsort()[-n:][::-1]
    predict_y = np.zeros(len(validation_y))
    predict_y[position_of_top_n] = 1
    cm = confusion_matrix(validation_y, predict_y)
    # precision = true positives / all predicted positives
    precision = cm[1][1] / (cm[1][1] + cm[0][1])

    my_evaluation = 1.0
    another_evaluation = 0.0
    for name, _ in raw_test_data.groupby("Name"):
        # NOTE(review): the flattened source does not show whether the sum below was
        # inside this `if`; it is assumed to be. It only affects the printed value.
        if name != "T0766" and name != "T0833":
            my_evaluation *= precision
            another_evaluation += precision
    print("classifier:", cl_name, ", p:",p, ", degree", PolynomialDegree, ", score", my_evaluation, ", another score", another_evaluation)
    return (cl_name, p, PolynomialDegree, my_evaluation)
In [ ]:
def myGridSerach(p_list=None, degree_list=None, score_function=None):
    """Evaluate every (p, degree) combination and return the collected results.

    Parameters
    ----------
    p_list : sequence of float, optional
        Class-0 weights to try. Defaults to ``[0.9, 0.8, 0.7, 0.5, 0.1]``.
    degree_list : sequence of int, optional
        Polynomial degrees to try. Defaults to ``[3, 2, 1]``.
    score_function : callable, optional
        Called as ``score_function(p, degree)``. Defaults to
        ``compute_with_my_score_function``.

    Returns
    -------
    list
        One entry per combination, with ``p`` varying slowest. The list used to be
        built and then discarded; it is now returned.
    """
    # Other grids tried previously:
    # p_list = [0.1, 0.8, 0.9, 0.95]; degree_list = [1, 2, 3]
    # p_list = [0.1, 0.15, 0.2, 0.5, 0.7, 0.8, 0.85, 0.9, 0.95]; degree_list = [1, 2, 3, 4]
    if p_list is None:
        p_list = [0.9, 0.8, 0.7, 0.5, 0.1]
    if degree_list is None:
        degree_list = [3, 2, 1]
    if score_function is None:
        # Resolved at call time so the notebook-level function is picked up.
        score_function = compute_with_my_score_function
    return [score_function(p, degree) for p in p_list for degree in degree_list]
In [ ]:
# Run the grid search with the grids defined inside myGridSerach.
myGridSerach()
In [ ]:
# Evaluate a single configuration: p=0.1, PolynomialDegree=1.
compute_with_my_score_function(0.1, 1)
In [ ]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
# Column selector used as the first pipeline step, so the remaining steps receive
# a plain array rather than a DataFrame.
class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pick the configured columns of a DataFrame and return their values as an array."""

    def __init__(self, attribute_names):
        # Column label(s) to keep.
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Stateless: there is nothing to learn, so the instance is returned as is."""
        return self

    def transform(self, X):
        """Return ``X[self.attribute_names].values``."""
        selected = X[self.attribute_names]
        return selected.values
class RemoveFirstFrame(BaseEstimator, TransformerMixin):
    """Drop the rows whose ``Step`` value leaves remainder 1 when divided by ``frame``.

    Parameters
    ----------
    frame : int
        Modulus applied to the ``Step`` column.
    """

    def __init__(self, frame):
        self.frame = frame

    def fit(self, X, y=None):
        """Stateless: there is nothing to learn, so the instance is returned as is."""
        return self

    def transform(self, X):
        """Return the rows of ``X`` for which ``Step % frame != 1``."""
        # Use the value given to the constructor. The previous code interpolated the
        # notebook-level global `frame`, ignoring `self.frame` and raising NameError
        # whenever that global was not defined.
        return X.query(f"Step % {self.frame} != 1")
In [ ]:
# Preprocessing: the feature columns are selected, standardised and polynomially
# expanded, and the label column is appended unchanged as the last output column.
num_attribs = FEATURES
cat_attribs = [LABEL]
frame = 201

num_pipeline = Pipeline(steps=[
    ('selector', DataFrameSelector(num_attribs)),
    ('std_scaler', StandardScaler()),
    ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False)),
])
cat_pipeline = Pipeline(steps=[('selector', DataFrameSelector(cat_attribs))])
full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])
my_full_pipeline = Pipeline(steps=[
    # ('removeFirstFrame', RemoveFirstFrame(frame)),
    ('featureSelection', full_pipeline),
])
In [ ]:
from sklearn.model_selection import StratifiedShuffleSplit
# Single stratified 80/20 split on the isGood label.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the splitter yields exactly one (train, test) index pair.
train_index, test_index = next(split.split(raw_data, raw_data["isGood"]))
strat_train_set = raw_data.iloc[train_index]
strat_test_set = raw_data.iloc[test_index]
# strat_test_set[LABEL].value_counts() / len(strat_test_set)
X_train = my_full_pipeline.fit_transform(strat_train_set)
X_test = my_full_pipeline.fit_transform(strat_test_set)
# The label is the last column of the transformed arrays.
train_set, train_y = X_train[:, :-1], X_train[:, -1]
test_set, test_y = X_test[:, :-1], X_test[:, -1]
In [ ]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
# log_clf = LogisticRegression(random_state=142)
# rnd_clf = RandomForestClassifier(random_state=432)
# svm_clf = SVC(probability=True, random_state=412)
# Class-weighted base models: class 0 is weighted p, class 1 is weighted 1 - p.
log_clf = LogisticRegression(class_weight={0: p, 1: (1 - p)}, random_state=142)
rnd_clf = RandomForestClassifier(class_weight={0: p, 1: (1 - p)}, random_state=432)
svm_clf = SVC(class_weight={0: p, 1: (1 - p)}, probability=True, random_state=412)
voting_clf = VotingClassifier(
    voting='soft',
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
)
# Each model is fitted on the same training split.
log_clf.fit(train_set, train_y)
rnd_clf.fit(train_set, train_y)
svm_clf.fit(train_set, train_y)
voting_clf.fit(train_set, train_y)
In [ ]:
# check on training set
n = 10
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    # y_pred = clf.predict(train_set)
    prob = clf.predict_proba(train_set)[:, 1]
    # Mark the n samples with the highest class-1 probability as predicted positives.
    position_of_top_n = np.argsort(prob)[-n:][::-1]
    threshold = prob[position_of_top_n[-1]]
    predict_y = np.zeros(train_y.shape[0])
    predict_y[position_of_top_n] = 1
    # predict_y = (test > threshold)
    # print(threshold)
    cm = confusion_matrix(train_y, predict_y)
    # print(clf.__class__.__name__, "\n", accuracy_score(train_y, predict_y))
    print(clf.__class__.__name__, "\n", cm)
In [ ]:
time_stamp = datetime.today().strftime('%d_%h_%H%M%S')
# Score every target with the logistic model, save the probabilities and print the
# confusion matrix of the n highest-probability structures against the label.
for name, data in raw_test_data.groupby("Name"):
    print(name)
    X = full_pipeline.fit_transform(data)
    eval_set, eval_y = X[:, :-1], X[:, -1]
    test = log_clf.predict_proba(eval_set)[:, 1]
    position_of_top_n = np.argsort(test)[-n:][::-1]
    threshold = test[position_of_top_n[-1]]
    predict_y = np.zeros(eval_y.shape[0])
    predict_y[position_of_top_n] = 1
    # One CSV per target: a "Result" header followed by one probability per line.
    with open(f"/Users/weilu/Research/data/structure_selector/p{p}_poly{PolynomialDegree}_{name}.csv", "w") as f:
        f.write("Result\n")
        for i in test:
            f.write(f"{i}\n")
    # with open(f"/Users/weilu/Research/data/structure_selector/{name}_results_{time_stamp}.csv", "w") as f:
    #     f.write("Result\n")
    #     for i in test:
    #         f.write(str(i) + "\n")
    # predict_y = (test > threshold)
    # print(threshold)
    print(confusion_matrix(eval_y, predict_y))
In [ ]:
time_stamp = datetime.today().strftime('%d_%h_%H%M%S')
# Score every target with the logistic model and print the confusion matrix of the
# n highest-probability structures against the label.
for name, data in raw_test_data.groupby("Name"):
    print(name)
    X = my_full_pipeline.fit_transform(data)
    eval_set, eval_y = X[:, :-1], X[:, -1]
    test = log_clf.predict_proba(eval_set)[:, 1]
    position_of_top_n = np.argsort(test)[-n:][::-1]
    threshold = test[position_of_top_n[-1]]
    predict_y = np.zeros(eval_y.shape[0])
    predict_y[position_of_top_n] = 1
    # predict_y = (test > threshold)
    # print(threshold)
    print(confusion_matrix(eval_y, predict_y))
In [ ]: