In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
from datetime import datetime
import seaborn as sns
%matplotlib inline

plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['figure.figsize'] = (10, 6.18)  # golden ratio (10/6.18 ≈ 1.618)
# Where to save the figures (the two values below are placeholders;
# assumed layout: <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>/<fig_id>.png)
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "structure_selection"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    os.makedirs(os.path.dirname(path), exist_ok=True)  # create the target directory if missing
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)
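
A quick usage sketch for save_fig (assumes the placeholder PROJECT_ROOT_DIR and CHAPTER_ID above; the figure content is arbitrary):

In [ ]:
plt.plot([0, 1], [0, 1])
save_fig("demo_line")  # writes ./images/<CHAPTER_ID>/demo_line.png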

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


# Create a class to select numerical or categorical columns 
# since Scikit-Learn doesn't handle DataFrames yet
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
class RemoveFirstFrame(BaseEstimator, TransformerMixin):
    # Drop the first frame of every stretch of `frame` steps
    def __init__(self, frame):
        self.frame = frame
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X.query(f"Step % {self.frame} != 1")
def choose_top_rw(data, n=5):
    # flag the n structures with the lowest Rw (rank ascends by default)
    return data.assign(chosen=data.Rw.rank(method='first') <= n)
def choose_top_vtotal(data, n=5):
    # flag the n structures with the lowest VTotal
    return data.assign(chosen=data.VTotal.rank(method='first') <= n)
def choose_top(data, col="Qw", n=5, ascending=False):
    # flag the n structures with the highest (ascending=False) values of `col`
    return data.assign(chosen=data[col].rank(ascending=ascending, method='first') <= n)
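
A toy check of choose_top (a minimal sketch; the values are fabricated for illustration):

In [ ]:
toy = pd.DataFrame({"Name": ["a", "a", "b", "b"], "Qw": [0.2, 0.9, 0.1, 0.8]})
# within each Name group, flag the single highest-Qw row
toy.groupby("Name").apply(choose_top, col="Qw", n=1)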

In [3]:
FEATURES = ['Rw',
#      'VTotal',
#      'QGO',
#      'VwithoutGo',
     'Burial',
     'Water',
     'Rama',
#      'DSSP',
#      'P_AP',
     'Helix',
#      'Frag_Mem'
               ]
n = 5
def my_transform(data, label, degree, FEATURES=FEATURES):
    # Scale and polynomially expand the numeric features, then append
    # the label, untouched, as the last column of the returned array.
    LABEL = label
    PolynomialDegree = degree

    num_attribs = FEATURES
    cat_attribs = [LABEL]
    num_pipeline = Pipeline([
            ('selector', DataFrameSelector(num_attribs)),
            ('std_scaler', StandardScaler()),
            ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False))
        ])
    cat_pipeline = Pipeline([
            ('selector', DataFrameSelector(cat_attribs))
        ])

    full_pipeline = FeatureUnion(transformer_list=[
            ("num_pipeline", num_pipeline),
            ("cat_pipeline", cat_pipeline),
        ])
    return full_pipeline.fit_transform(data)

def my_transform_predict(data, degree, FEATURES=FEATURES):
    # Same numeric pipeline as my_transform, without the label column.
    # NB: the scaler is refit on `data` itself, so the features are
    # standardized against the evaluation set, not the training set.
    PolynomialDegree = degree

    num_attribs = FEATURES
    num_pipeline = Pipeline([
            ('selector', DataFrameSelector(num_attribs)),
            ('std_scaler', StandardScaler()),
            ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False))
        ])
    return num_pipeline.fit_transform(data)
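
To make the slicing in the training cell below concrete, here is the column layout my_transform produces (a minimal sketch on fabricated rows): the FeatureUnion stacks the scaled polynomial features first and appends the untouched label last.

In [ ]:
demo = pd.DataFrame({"Rw": [1., 2.], "Burial": [0., 1.], "Water": [1., 0.],
                     "Rama": [0., 2.], "Helix": [1., 1.], "Qw": [0.3, 0.7]})
X = my_transform(demo, label="Qw", degree=1)
X.shape, X[:, -1]  # (2, 6): five scaled features, then the raw Qw label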

In [4]:
raw_test_data = pd.read_csv("/Users/weilu/Research/data/test_data/complete_data_mar27.csv", index_col=0)
raw_data_T0784 = raw_test_data.groupby("Name").get_group("T0784")
# raw_data_T0792 = raw_test_data.groupby("Name").get_group("T0792")
# raw_data = pd.concat([raw_data_T0784, raw_data_T0792])
# raw_data = raw_data_T0792
# raw_data = raw_test_data.groupby("Name").get_group("1mba")
raw_data = raw_data_T0784

In [17]:
# FEATURES = ["Rw", "VTotal", "QGO"]
# FEATURES = ["Rw", "VTotal", "QGO", "Burial", "Frag_Mem", "Water"]
# FEATURES = list(raw_test_data.columns[2:-3])
def train_and_test(raw_data, label="Qw", degree=1, p=0.1):
    # my_full_pipeline = Pipeline([
    # #         ('removeFirstFrame', RemoveFirstFrame(frame)),
    #         ('featureSelection', full_pipeline)
    # ])

    from sklearn.model_selection import StratifiedShuffleSplit

    split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=142)
    for train_index, test_index in split.split(raw_data, raw_data["isGood"]):
        strat_train_set = raw_data.iloc[train_index]
        strat_test_set = raw_data.iloc[test_index]
    # strat_test_set[LABEL].value_counts() / len(strat_test_set)
    # NB: my_transform refits the scaler on each split independently,
    # so test features are standardized against test-set statistics.
    X_train = my_transform(strat_train_set, label, degree)
    X_test = my_transform(strat_test_set, label, degree)
    train_y = X_train[:, -1]   # the label is the last column
    train_set = X_train[:, :-1]
    test_y = X_test[:, -1]
    test_set = X_test[:, :-1]
    return (train_set, train_y, test_set, test_y)
label = "isGood"
degree = 1
p = 0.1  # only used by the class-weighted variant commented out below
train_set, train_y, test_set, test_y = train_and_test(raw_data, label=label, degree=degree)
log_clf = LogisticRegression(random_state=140, penalty='l2')

# log_clf = LogisticRegression(random_state=14, class_weight={0:p, 1:(1-p)}, penalty='l1')
log_clf.fit(train_set, train_y)
# y_pred = log_clf.predict(train_set)


Out[17]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=140, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
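
The metrics imported in the second cell are not exercised anywhere below; a minimal hold-out evaluation sketch for the fitted classifier:

In [ ]:
y_pred = log_clf.predict(test_set)
print("accuracy:", accuracy_score(test_y, y_pred))
print(confusion_matrix(test_y, y_pred))
# 3-fold cross-validated accuracy on the training split
print(cross_val_score(log_clf, train_set, train_y, cv=3, scoring="accuracy"))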

In [37]:
# protein_list = ["T0949", "T0950", "T0953S1", "T0953S2", "T0954", "T0955", "T0956", "T0957S1", "T0957S2", "T0958", "T0960"]
# name_list = ["Step" , "Chain" , "Shake" , "Chi" , "Rama", "Excluded", "DSSP", "P_AP", "Water" ,"Burial", "Helix", "AMH_Go", "Frag_Mem", "Vec_FM", "Membrane", "SSB" , "Electro.", "QGO" ,"VTotal"]
# all_data_list = []
# for protein in protein_list:
#     for i in range(1,4):
#         if protein == "T0957S2" and i == 2:
#             continue
#         awsem = pd.read_table(f"/Users/weilu/Research/server/casp13/result/{protein}/model.{i}/awsem_energy/awsem.log", names=name_list)
#         rw = pd.read_table(f"/Users/weilu/Research/server/casp13/result/{protein}/model.{i}/lowTstructure/rwplusScore.short.txt", names=["i", "Rw"], sep="\s+")
#         rw = rw.reset_index(drop=True)
#         awsem = awsem.reset_index().drop("Step", axis=1).rename(columns={"index":"Step"}).assign(model=i)
#         data = pd.concat([rw, awsem], axis=1)
#         remove_columns = ['i', "Shake", "Excluded", "Membrane", "Vec_FM", "SSB", "Electro."]
#         all_data_list.append(data.drop(remove_columns, axis=1).assign(Name=protein))
# data = pd.concat(all_data_list)

In [40]:
protein_list = ["T0958", "T0960"]
name_list = ["Step" , "Chain" , "Shake" , "Chi" , "Rama", "Excluded", "DSSP", "P_AP", "Water" ,"Burial", "Helix", "AMH_Go", "Frag_Mem", "Vec_FM", "Membrane", "SSB" , "Electro.", "QGO" ,"VTotal"]
all_data_list = []
for protein in protein_list:
    for i in range(1,4):
        if protein == "T0957S2" and i == 2:
            continue
        awsem = pd.read_table(f"/Users/weilu/Research/server/casp13/result/{protein}/model.{i}/awsem_energy/awsem.log", names=name_list)
        rw = pd.read_table(f"/Users/weilu/Research/server/casp13/result/{protein}/model.{i}/lowTstructure/rwplusScore.short.txt", names=["i", "Rw"], sep=r"\s+")
        rw = rw.reset_index(drop=True)
        awsem = awsem.reset_index().drop("Step", axis=1).rename(columns={"index":"Step"}).assign(model=i)
        data = pd.concat([rw, awsem], axis=1)
        remove_columns = ['i', "Shake", "Excluded", "Membrane", "Vec_FM", "SSB", "Electro."]
        all_data_list.append(data.drop(remove_columns, axis=1).assign(Name=protein))
data = pd.concat(all_data_list)
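
A quick sanity check on the assembled frame (each Name/model pair should contribute one row per saved low-T structure):

In [ ]:
data.groupby(["Name", "model"]).size()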

In [41]:
prediction_list = []
for name, one in data.groupby("Name"):
    print(name)
#     X = full_pipeline.fit_transform(data)
#     X = my_transform(data, label, degree)
#     eval_y = X[:,-1]
#     eval_set = X[:,:-1]
    X = my_transform_predict(one, degree)
    eval_set = X
    test = log_clf.predict_proba(eval_set)[:, 1]  # probability of the positive class
    one = one.assign(prediction=test)
    prediction_list.append(one)
#     prediction_list.append(pd.Series(test))
t = pd.concat(prediction_list)
# t = raw_test_data.assign(prediction=prediction.values)
best_by_prediction = t.drop_duplicates(subset=['prediction', 'Rw', "VTotal"]).groupby("Name").apply(choose_top, n=10, col="prediction").query("chosen==True")


T0958
T0960
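
Column 1 of predict_proba corresponds to the positive class; the ordering can be confirmed on the fitted estimator:

In [ ]:
log_clf.classes_  # column order of predict_proba; column 1 is class 1 ("isGood")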

In [42]:
best_by_prediction.reset_index(drop=True).groupby("Name").count()


Out[42]:
       Rw  Step  Chain  Chi  Rama  DSSP  P_AP  Water  Burial  Helix  AMH_Go  Frag_Mem  QGO  VTotal  model  prediction  chosen
Name
T0958  10    10     10   10    10    10    10     10      10     10      10        10   10      10     10          10      10
T0960  10    10     10   10    10    10    10     10      10     10      10        10   10      10     10          10      10

In [43]:
t.drop_duplicates(subset=['prediction', 'Rw', "VTotal"]).rename(columns={"prediction":"score"}).\
    to_csv("/Users/weilu/Dropbox/IAAWSEM/casp13/selection/all_atom_refinement_energy_rw_and_score_by_ml_may31.csv")

In [44]:
import shutil
for name, one in best_by_prediction.reset_index(drop=True).groupby("Name"):
    print(name)
#     print(a.sort_values("prediction", ascending=False).reset_index())
    for i, a in one.sort_values("prediction", ascending=False).reset_index().iterrows():
        print(name, "model", a["model"], "Step", a["Step"])
        from_loc = f"/Users/weilu/Research/server/casp13/result/{name}/model.{a['model']}/lowTstructure/lowTstructure{a['Step']}.pdb"
        to_loc = f"/Users/weilu/Dropbox/IAAWSEM/casp13/selection/selected/{name}_chosen_{i}.pdb"
        shutil.copy(from_loc, to_loc)  # safer than shelling out to `cp`


T0958
T0958 model 2 Step 16
T0958 model 2 Step 20
T0958 model 2 Step 19
T0958 model 2 Step 27
T0958 model 2 Step 34
T0958 model 2 Step 0
T0958 model 2 Step 15
T0958 model 2 Step 12
T0958 model 2 Step 33
T0958 model 3 Step 0
T0960
T0960 model 2 Step 0
T0960 model 1 Step 0
T0960 model 2 Step 1
T0960 model 2 Step 6
T0960 model 3 Step 0
T0960 model 2 Step 4
T0960 model 2 Step 3
T0960 model 2 Step 11
T0960 model 3 Step 10
T0960 model 3 Step 5

In [ ]:
t.drop_duplicates(subset=['prediction', 'Rw', "VTotal"])

In [25]:
fg = sns.FacetGrid(data=t.query("Name == 'T0950'"), hue='model', aspect=1.61, size=10)
fg.map(plt.plot, 'Step', 'Rw').add_legend()


Out[25]:
<seaborn.axisgrid.FacetGrid at 0x1a22433390>

In [11]:
fg = sns.FacetGrid(data=t.query("Name == 'T0949'"), hue='model', aspect=1.61, size=10)
fg.map(plt.plot, 'Step', 'Rw').add_legend()


Out[11]:
<seaborn.axisgrid.FacetGrid at 0x1a174c0400>

In [ ]:
# n = 100
prediction_list = []
for name, data in raw_test_data.groupby("Name"):
    print(name)
#     X = full_pipeline.fit_transform(data)
#     X = my_transform(data, label, degree)
#     eval_y = X[:,-1]
#     eval_set = X[:,:-1]
    X = my_transform_predict(data, degree)
    eval_set = X
    test = log_clf.predict_proba(eval_set)[:, 1]  # probability of the positive class
    one = data.assign(prediction=test)
    prediction_list.append(one)
#     prediction_list.append(pd.Series(test))
t = pd.concat(prediction_list)
# t = raw_test_data.assign(prediction=prediction.values)
best_by_prediction = t.groupby("Name").apply(choose_top, n=5, col="prediction").query("chosen==True")
