In [1]:
# written in python3
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
from datetime import datetime
import seaborn as sns
%matplotlib inline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC
class DataFrameSelector(BaseEstimator, TransformerMixin):
    # Select the given DataFrame columns and return them as a NumPy array.
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
plt.rcParams['figure.figsize'] = (10,6.180) #golden ratio
def my_transform(data, label, degree, FEATURES):
    # LABEL = "Qw"
    LABEL = label
    PolynomialDegree = degree
    num_attribs = FEATURES
    cat_attribs = [LABEL]
    # Numeric features: select, standardize, then expand with polynomial terms.
    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('std_scaler', StandardScaler()),
        ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False))
    ])
    # The label column is passed through unchanged.
    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs))
    ])
    # cat_pipeline is listed last, so the label ends up as the last column.
    full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])
    # Note: fit_transform refits the scaler on whatever `data` is passed in,
    # so each protein group is scaled independently.
    return full_pipeline.fit_transform(data)
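
# A minimal sanity sketch of `my_transform` (not from the original run; `toy`
# is a hypothetical frame): degree-2 polynomials on 2 scaled features give 5
# feature columns, and the label is appended as the final column.
# toy = pd.DataFrame({'x1': [0.0, 1.0, 2.0], 'x2': [1.0, 0.0, 1.0], 'chosen': [0, 1, 0]})
# out = my_transform(toy, label='chosen', degree=2, FEATURES=['x1', 'x2'])
# assert out.shape == (3, 6)  # 5 polynomial features + 1 label column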
In [2]:
def choose_top(data, col="RMSD", n=5, ascending=True):
    # Flag the top-n rows of `col` using a dense rank (ties share a rank).
    return data.assign(chosen=data[col].rank(ascending=ascending, method='dense') <= n)
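
# Quick demo of `choose_top` semantics (a sketch, not from the original run):
# with dense ranking, ties share a rank, so more than n rows can be flagged.
# demo = pd.DataFrame({'RMSD': [2.0, 1.0, 1.0, 3.0]})
# choose_top(demo, n=1)  # both RMSD == 1.0 rows get chosen=True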
folder_list = ["tr894", "tr884", "tr922", "tr882", "tr896", "tr872", "tr594", "tr862", "tr869", "tr898", "tr885", "tr866", "tr868", "tr891", "tr895", "tr870", "tr921", "tr877", "tr948", "tr947"]
raw_data_all = pd.read_csv("/Users/weilu/Research/frustration_selection/data.csv", index_col=0)
In [3]:
train_name_list = ["tr884"]
# select for training.
raw_data = raw_data_all.reset_index(drop=True).query(f'Protein in {train_name_list}')
In [4]:
raw_data_all.columns
In [5]:
# FEATURES = ["eigenvalues", "entropy", "pca"]
# FEATURES = ["eigenvalues", "entropy", "diffRMSD"]
# FEATURES = ["eigenvalues", "entropy"]
FEATURES = [
    # "biasQ",
    # 'score',
    # 'VTotal',
    'DSSP', 'P_AP', 'Water', 'Burial', 'Helix', 'Electro',
    'fa_atr', 'fa_rep', 'fa_sol',
    'fa_intra_rep', 'fa_intra_sol_xover4', 'lk_ball_wtd', 'fa_elec',
    'pro_close', 'hbond_sr_bb', 'hbond_lr_bb', 'hbond_bb_sc', 'hbond_sc',
    'omega', 'fa_dun', 'p_aa_pp', 'rama_prepro', 'allatom_rms', 'maxsub', 'maxsub2.0',
    # 'RMSD', # test
    # 'Qw',
    # 'Burial',
    # 'Water',
    # 'Rama',
    # 'DSSP',
    # 'P_AP',
    # 'Helix',
    # 'Frag_Mem'
]
# FEATURES = ["eigenvalues"]
# LABEL = "diffRMSD"
# LABEL = "RMSD"
LABEL = "chosen"
DEGREE = 1
def pred_from_raw(a):
    # Transform one protein's frames and score them with the global `clf`
    # (defined below; this function is only called after `clf.fit`).
    data = my_transform(a, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
    test_y = data[:, -1]
    test_set = data[:, :-1]
    prob = clf.predict_proba(test_set)[:, 1]
    return a.assign(prob=prob)
# data = my_transform(raw_data, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
# data = raw_data.groupby('name').apply(my_transform, label=LABEL, degree=DEGREE, FEATURES=FEATURES)[0]
data = np.concatenate(raw_data.groupby('Protein').apply(
    my_transform, label=LABEL, degree=DEGREE, FEATURES=FEATURES).values)
train_y = data[:, -1]    # the label is the last column (cat_pipeline comes last)
train_set = data[:, :-1]
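
# Sanity check (a sketch, not from the original run): with DEGREE == 1 the
# polynomial step just passes the scaled features through, so the feature
# matrix should have exactly len(FEATURES) columns.
# assert train_set.shape[1] == len(FEATURES)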
# clf = svm.SVC(probability=True)
# p = 0.01
# clf = LogisticRegression(random_state=27, class_weight={0:p, 1:(1-p)})
clf = LogisticRegression(random_state=1127, solver="lbfgs")
clf.fit(train_set, train_y)
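
# `confusion_matrix` is imported above but never used; a minimal in-sample
# check would look like this (a sketch, not part of the original analysis):
# print(confusion_matrix(train_y, clf.predict(train_set)))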
filtered = raw_data_all.groupby("Protein").apply(pred_from_raw).reset_index(drop=True)
picked_n = 1
# Ground truth: the lowest-RMSD frame per protein.
best = raw_data_all.groupby("Protein").apply(
    choose_top, col="RMSD", n=1, ascending=True).reset_index(drop=True).query("chosen==True")
# Model pick: the frame with the highest predicted probability.
picked_1 = filtered.groupby("Protein").apply(
    choose_top, col="prob", n=picked_n, ascending=False).reset_index(drop=True).query("chosen==True")
# Best of top 5: take the 5 highest-probability frames, then the lowest RMSD among them.
picked_5 = filtered.groupby("Protein").apply(
    choose_top, col="prob", n=5, ascending=False).reset_index(drop=True).query("chosen==True")
picked = picked_5.groupby("Protein").apply(
    choose_top, col="RMSD", n=1, ascending=True).reset_index(drop=True).query("chosen==True")
# Worst case: the highest-RMSD frame per protein.
worst = filtered.groupby("Protein").apply(
    choose_top, col="RMSD", n=1, ascending=False).reset_index(drop=True).query("chosen==True")
# init = raw_data_all.groupby("Protein").apply(
#     choose_top, col="i", n=1, ascending=True).reset_index(drop=True).query("chosen==True")
all_results = pd.concat([best.assign(result='best'),
                         picked_1.assign(result='picked'),
                         # picked.assign(result='picked_5'),
                         # init.assign(result='init'),
                         worst.assign(result='worst')
                         ], sort=False)
# all_results = pd.concat([best.assign(result='best'),
#                          picked.assign(result='picked')])
# picked.to_csv("/Users/weilu/Desktop/picked.csv")
# sns.set(rc={'figure.figsize':(20,30)})
# plt.figure(figsize=(15,8))
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'Protein', 'RMSD').add_legend(fontsize=20)
# fg.set(ylim=(0, 10))
Out[5]:
[figure: RMSD per Protein, one line each for best, picked, and worst]
In [11]:
def my_reorder(a, first):
    # Move the columns in `first` to the front and keep the rest in order.
    new_order = first.copy()
    for col in a:
        if col not in first:
            new_order.append(col)
    return new_order
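
# Example (a sketch): my_reorder(['a', 'Protein', 'description'], ['Protein', 'description'])
# returns ['Protein', 'description', 'a'].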
In [17]:
picked = picked.reindex(columns=my_reorder(picked.columns, ["Protein", "description"])).reset_index(drop=True)
In [18]:
picked.to_csv("/Users/weilu/Research/frustration_selection/pick.csv")
In [6]:
for pdb in folder_list:
    # RMSD is scaled by 10 here (presumably a nm-to-Angstrom conversion).
    print(pdb, round(10 * picked_1.query(f"Protein=='{pdb}'")["RMSD"].values[0], 3))
In [7]:
clf.coef_
In [8]:
g = sns.FacetGrid(raw_data_all, col="Protein", col_wrap=4)
g = g.map(plt.hist, "RMSD")
In [9]:
# import glob
# a = glob.glob("/Users/weilu/Research/frustration_selection/tr*")
# print(len(a))
# pdb_list = [i.split("/")[-1] for i in a]
# all_ = []
# for pdb in pdb_list:
#     # print(pdb)
#     fileLocation = f"/Users/weilu/Research/frustration_selection/{pdb}/q.txt"
#     q = np.loadtxt(fileLocation)
#     fileLocation = f"/Users/weilu/Research/frustration_selection/{pdb}/rmsd.txt"
#     rmsd = np.loadtxt(fileLocation)
#     fileLocation = f"/Users/weilu/Research/frustration_selection/{pdb}/gdt.txt"
#     gdt = np.loadtxt(fileLocation)
#     fileLocation = f"/Users/weilu/Research/frustration_selection/{pdb}/awsem_energy.txt"
#     awsem_column = ['Step', 'Chain', 'Shake', 'Chi', 'Rama', 'Excluded', 'DSSP', 'P_AP', 'Water', 'Burial', 'Helix', 'AMH-Go', 'Frag_Mem', 'Vec_FM', 'Membrane', 'SSB', 'Electro', 'QGO', 'VTotal']
#     awsem = pd.read_csv(fileLocation, sep="\s+", names=awsem_column)
#     fileLocation = f"/Users/weilu/Research/frustration_selection/{pdb}/rosetta_energy.txt"
#     rosetta = pd.read_csv(fileLocation, sep="\s+")
#     data = pd.concat([awsem, rosetta], axis=1)
#     assert len(awsem) == len(rosetta) == len(q) == len(rmsd) == len(gdt)
#     data["Q"] = q
#     data["RMSD"] = rmsd
#     data["GDT"] = gdt
#     data["Protein"] = pdb
#     all_.append(data)
#     # print(pdb, len(data))
# data = pd.concat(all_)
# drop_col = []
# for col in data.columns:
#     # print(col, len(data[col].unique()))
#     if len(data[col].unique()) == 1:
#         drop_col.append(col)
# data = data.drop(drop_col, axis=1)
# def extract_frame(data):
#     return int(data.split("_")[0])
# data["description"] = data["description"].apply(extract_frame)
# raw_data_all = data.groupby("Protein").apply(choose_top, n=100, col="RMSD").reset_index(drop=True)
# # raw_data_all.to_csv("/Users/weilu/Research/frustration_selection/data.csv")