In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import os
from datetime import datetime
import seaborn as sns
%matplotlib inline
from sklearn.base import BaseEstimator, TransformerMixin
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values
    
plt.rcParams['figure.figsize'] = (10, 6.18)  # golden-ratio aspect (10/6.18 ≈ 1.618)
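
DataFrameSelector just pulls the named columns out of a DataFrame as a NumPy array, which is what lets the scikit-learn pipelines below consume pandas input. A minimal sketch of its behavior on a made-up frame:

# toy illustration (hypothetical data, not from the runs below)
demo = pd.DataFrame({"a": [1.0, 2.0], "b": [3.0, 4.0], "label": [0, 1]})
DataFrameSelector(["a", "b"]).fit_transform(demo)   # -> array([[1., 3.], [2., 4.]])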

In [9]:
def read_data(name):
#     name="tr872"
    name_list = ["Step" , "Chain" , "Shake" , "Chi" , "Rama", "Excluded", "DSSP", "P_AP", "Water" ,"Burial", "Helix", "AMH_Go", "Frag_Mem", "Vec_FM", "Membrane", "SSB","VTotal"]


    location = f"/Volumes/Wei_backup/oct_2018/01_week/{name}/"
    RMSD = pd.read_table(location+"rmsd-angstrom.xvg", names=["i", "RMSD"], sep="\s+")
    bias = pd.read_table(location+"bias.log", names=["i", "biasQ", "bias"], sep="\s+").drop("i", axis=1)
    awsem = pd.read_table(location+"awsem.log", names=name_list)
    rw = pd.read_table(location+"rwplusScore.txt", names=["i", "Rw"], sep="\s+").drop("i", axis=1)
    location = f"/Volumes/Wei_backup/oct_2018/01_week/{name}/"
    pc = pd.read_table(location+"pcarmsd_scaled.txt", names=["i", "pc", "pc2"], sep="\s+").drop("i", axis=1)
    raw_data = pd.concat([RMSD, rw, bias, awsem, pc], axis=1)
    return raw_data.assign(name=name).reset_index().rename(columns={"index":"folder"})

def choose_top(data, col="RMSD", n=5, ascending=True):
    # mark the n best rows by rank on `col` (dense ranking lets ties share a rank)
    return data.assign(chosen=data[col].rank(ascending=ascending, method='dense') <= n)
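
Note that method='dense' lets ties share a rank, so more than n rows can end up chosen. A quick check on made-up numbers:

# hypothetical values: dense ranks are 1, 1, 2, so n=1 flags two rows
demo = pd.DataFrame({"RMSD": [2.0, 2.0, 5.0]})
choose_top(demo, n=1)["chosen"].tolist()   # -> [True, True, False]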


def read_data_2(name):
#     name = "tr894"
    location = f"/Volumes/Wei_backup/oct_2018/01_week/{name}/"
    rw = pd.read_table(location+"rc_rwplus", names=["pc","rw"], sep="\s+")
    rmsd = pd.read_table(location+"rc_rmsdlowerBound", names=["pc", "rmsd"], sep="\s+")
    awsem = pd.read_table(location+"rc_awsemEne", names=["pc", "awsem"], sep="\s+")
    freeE = pd.read_table(location+"pmf3000"
                          , names=["pc", "f", "remove1", "remove2"], sep="\s+").drop(["remove1", "remove2"], axis=1)
    raw_data = freeE.merge(rw, on="pc").merge(awsem, on="pc").merge(rmsd, on="pc").assign(name=name)
    return raw_data
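
read_data_2 inner-joins the free-energy profile with the RwPlus, AWSEM, and RMSD curves on the shared pc grid, so each target yields one frame of reaction-coordinate observables. Usage is one call per target (assuming the backup volume is mounted):

# columns after the merges: pc, f, rw, awsem, rmsd, name
df = read_data_2("tr872")
df.sort_values("f").head()   # lowest free-energy bins first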

In [ ]:


In [10]:
# name = "tr894"
# location = f"/Users/weilu/Research/server/sep_2018/03_week/{name}/"
# rw = pd.read_table(location+"rc_rwplus", names=["pc","rw"], sep="\s+")
# rmsd = pd.read_table(location+"rc_rmsdlowerBound", names=["pc", "rmsd"], sep="\s+")
# awsem = pd.read_table(location+"rc_awsemEne", names=["pc", "awsem"], sep="\s+")
# freeE = pd.read_table(location+"pmf3000"
#                       , names=["pc", "f", "remove1", "remove2"], sep="\s+").drop(["remove1", "remove2"], axis=1)
# raw_data = freeE.merge(rw, on="pc").merge(awsem, on="pc").merge(rmsd, on="pc").assign(name=name)

In [11]:
# folder_list = ["tr894", "tr882", "tr594", "tr898", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
folder_list = ["tr894", "tr882", "tr594", "tr869", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = [ "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = ["tr862", "tr872", "tr885", "tr866", "tr868" , "tr895", "tr896", "tr870", "tr921", "tr891", "tr948"]
# "tr877","tr884", "tr922"
# "tr869"
data_list = []
for name in folder_list:
    tmp = read_data_2(name)
    data_list.append(tmp)
raw_data_all = pd.concat(data_list)
n = 20
raw_data_all = raw_data_all.reset_index(drop=True).groupby("name").apply(choose_top, n=n, col="rmsd").reset_index(drop=True)


# train_name_list = ["tr872", "tr885", "tr948"]
# train_name_list = ["tr862", "tr872", "tr885", "tr866", "tr868" , "tr895", "tr896", "tr870", "tr921", "tr891", "tr948"]
# train_name_list = ["tr866"]
# train_name_list = ["tr948"]
train_name_list = ["tr894"]
# train_name_list = ["tr885"]
train_name_list = ["tr872"]
# train_name_list = ["tr594"]
# train_name_list = ["tr894", "tr922", "tr882", "tr872"]
# train_name_list = ["tr898", "tr896", "tr894", "tr895"]
train_name_list = ["tr868"]
train_name_list = ["tr882"]
raw_data = raw_data_all.reset_index(drop=True).query(f'name in {train_name_list}')
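
The f-string interpolation works because the list renders as a valid query literal; an equivalent form that avoids string building is pandas' @-variable syntax, sketched here:

# equivalent selection via query's local-variable reference
raw_data = raw_data_all.reset_index(drop=True).query("name in @train_name_list")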


/Users/weilu/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:11: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.
To accept the future behavior, pass 'sort=False'.
To retain the current behavior and silence the warning, pass 'sort=True'.

In [12]:
len(folder_list)


Out[12]:
18

In [13]:
raw_data.shape


Out[13]:
(18, 7)

In [14]:
raw_data_all.columns


Out[14]:
Index(['awsem', 'f', 'name', 'pc', 'rmsd', 'rw', 'chosen'], dtype='object')

In [15]:
raw_data.plot.scatter("rw", "rmsd")


Out[15]:
<matplotlib.axes._subplots.AxesSubplot at 0x1099dae48>

In [16]:
import seaborn as sns
# sns.set(rc={'figure.figsize':(20,30)})
# plt.figure(figsize=(15,8))
fg = sns.FacetGrid(data=raw_data_all.reset_index(), hue='name', height=8, aspect=1.63)
fg.map(plt.scatter, 'index', 'rmsd').add_legend(fontsize=20)


Out[16]:
<seaborn.axisgrid.FacetGrid at 0x1a21093be0>

In [17]:
import seaborn as sns
# sns.set(rc={'figure.figsize':(20,30)})
# plt.figure(figsize=(15,8))
fg = sns.FacetGrid(data=raw_data_all.reset_index(), hue='chosen', height=8, aspect=1.63)
fg.map(plt.scatter, 'index', 'rmsd').add_legend()


Out[17]:
<seaborn.axisgrid.FacetGrid at 0x1a213d46d8>

In [18]:
raw_data.head(3)


Out[18]:
          awsem          f   name      pc      rmsd             rw  chosen
81  -1069.453333  10.311355  tr882  -1.853  0.195000  -11960.460667    True
82  -1066.120000   5.676881  tr882  -1.653  0.201622  -11960.550757    True
83  -1070.160404   3.055764  tr882  -1.453  0.205455  -11985.915212    True

In [19]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
def my_transform(data, label, degree, FEATURES):
    # scale the numeric features, expand them polynomially, and append the
    # label as the last column (so callers can split it back off)
    LABEL = label
    PolynomialDegree = degree

    num_attribs = FEATURES
    cat_attribs = [LABEL]
    num_pipeline = Pipeline([
            ('selector', DataFrameSelector(num_attribs)),
            ('std_scaler', StandardScaler()),
            ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False))
        ])
    cat_pipeline = Pipeline([
            ('selector', DataFrameSelector(cat_attribs))
        ])

    full_pipeline = FeatureUnion(transformer_list=[
            ("num_pipeline", num_pipeline),
            ("cat_pipeline", cat_pipeline),
        ])
    return full_pipeline.fit_transform(data)


# FEATURES = ["eigenvalues", "entropy", "pca"]
# FEATURES = ["eigenvalues", "entropy", "diffRMSD"]
# FEATURES = ["eigenvalues", "entropy"]
FEATURES = ["f",
    'rw',
     'awsem',
#     'RMSD', # test
#      'Burial',
#      'Water',
#      'Rama',
#      'DSSP',
#      'P_AP',
#      'Helix',
#      'Frag_Mem'
               ]
# FEATURES = ["eigenvalues"]
# LABEL = "diffRMSD"
# LABEL = "RMSD"
LABEL = "rmsd"
DEGREE = 1

def pred_from_raw(a):
    data = my_transform(a, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
    test_y = data[:, -1]
    test_set = data[:, :-1]
    predicted_rmsd = clf.predict(test_set)
    return a.assign(predicted_rmsd=predicted_rmsd)
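
my_transform returns the label as the last column, which pred_from_raw splits back off before predicting. On scikit-learn 0.20+ the same wiring could be written with ColumnTransformer; a sketch under that assumption (this notebook itself used FeatureUnion + DataFrameSelector):

# sketch assuming sklearn >= 0.20 is available
from sklearn.compose import ColumnTransformer
num_pipeline = Pipeline([('std_scaler', StandardScaler()),
                         ('poly', PolynomialFeatures(degree=DEGREE, include_bias=False))])
full = ColumnTransformer([("num", num_pipeline, FEATURES),
                          ("label", "passthrough", [LABEL])])
# full.fit_transform(raw_data) gives the same layout: features first, label last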

In [20]:
# data = my_transform(raw_data, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
# data = raw_data.groupby('name').apply(my_transform, label=LABEL, degree=DEGREE, FEATURES=FEATURES)[0]
data = np.concatenate(raw_data.groupby('name').apply(my_transform, 
                                                     label=LABEL, degree=DEGREE, FEATURES=FEATURES).values)
train_y = data[:,-1]
train_set = data[:,:-1]
from sklearn import svm
# clf = svm.SVC(probability=True)
clf = LinearRegression()
clf.fit(train_set, train_y)
y_pred_svm = clf.predict(train_set)

raw_data_all = raw_data_all.reset_index(drop=True).groupby("name").apply(pred_from_raw).reset_index(drop=True)


picked_n = 1
best = raw_data_all.groupby("name").apply(choose_top, col="rmsd",
                                          n=picked_n, ascending=True).reset_index(drop=True).query("chosen==True")
picked = raw_data_all.groupby("name").apply(choose_top, col="predicted_rmsd",
                                            n=picked_n, ascending=True).reset_index(drop=True).query("chosen==True")
# init = raw_data_all.query("i == 0.0")
all_results = pd.concat([best.assign(result='best'), 
                         picked.assign(result='picked')])

picked_keep = picked.copy()
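
As a quick diagnostic (not part of the original run), the training-set fit of the linear model can be scored directly:

# diagnostic sketch: training-set R^2 of the fitted LinearRegression
from sklearn.metrics import r2_score
print("train R^2:", r2_score(train_y, y_pred_svm))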

In [21]:
# sns.set(rc={'figure.figsize':(20,30)})
# plt.figure(figsize=(15,8))
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.scatter, 'name', 'pc').add_legend(fontsize=20)


Out[21]:
<seaborn.axisgrid.FacetGrid at 0x1a219744a8>

In [22]:
def choose_top(data, col="RMSD", n=5, ascending=True):
    # redefined with method='first': ties are broken by order, so exactly n rows are chosen
    return data.assign(chosen=data[col].rank(ascending=ascending, method='first') <= n)


WIDTH = 0.2
def with_in_range(data, width=WIDTH):
    # flag rows whose pc falls within +/- width of the picked pc_center
    return data.assign(inrange=(data["pc"] < (data["pc_center"] + width)) & (data["pc"] > (data["pc_center"] - width)))
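
with_in_range expects a pc_center column (merged in below from the picked structure) and flags rows within ±width of it. A toy check on made-up values:

# hypothetical values: only pc=0.1 lies inside 0.0 +/- 0.2
demo = pd.DataFrame({"pc": [0.1, 0.5], "pc_center": [0.0, 0.0]})
with_in_range(demo)["inrange"].tolist()   # -> [True, False]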

In [23]:
folder_list = ["tr894", "tr882", "tr594", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# "tr898"
# folder_list = ["tr894", "tr882", "tr594", "tr898", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = ["tr894", "tr882", "tr594", "tr869", "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = [ "tr862", "tr877", "tr872", "tr885", "tr866", "tr868", "tr884", "tr895", "tr896", "tr870", "tr921", "tr922", "tr891", "tr948"]
# folder_list = ["tr862", "tr872", "tr885", "tr866", "tr868" , "tr895", "tr896", "tr870", "tr921", "tr891", "tr948"]
# "tr877","tr884", "tr922"
# "tr869"
data_list = []
for name in folder_list:
    tmp = read_data(name)
    data_list.append(tmp)
raw_data_all_2 = pd.concat(data_list)
n = 20
raw_data_all_2 = raw_data_all_2.reset_index(drop=True).groupby("name").apply(choose_top, n=n, col="RMSD").reset_index(drop=True)


raw_data = raw_data_all_2.reset_index(drop=True).query(f'name in {train_name_list}')
a = raw_data_all_2.merge(picked_keep[["pc", "name"]].rename(columns={"pc":"pc_center"}),on="name")
filtered = a.groupby("name").apply(with_in_range).query("inrange == True").reset_index(drop=True)
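
The next cell fails on non-finite input to StandardScaler, so a pre-check like this would localize which feature columns of filtered carry NaNs (diagnostic sketch, not in the original run):

# diagnostic sketch: count missing values in the classifier's feature columns
print(filtered[["biasQ", "Rw", "VTotal"]].isna().sum())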

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
def my_transform(data, label, degree, FEATURES):
    # as above: scale + polynomial-expand the features, append the label last
    LABEL = label
    PolynomialDegree = degree

    num_attribs = FEATURES
    cat_attribs = [LABEL]
    num_pipeline = Pipeline([
            ('selector', DataFrameSelector(num_attribs)),
            ('std_scaler', StandardScaler()),
            ('poly', PolynomialFeatures(degree=PolynomialDegree, include_bias=False))
        ])
    cat_pipeline = Pipeline([
            ('selector', DataFrameSelector(cat_attribs))
        ])

    full_pipeline = FeatureUnion(transformer_list=[
            ("num_pipeline", num_pipeline),
            ("cat_pipeline", cat_pipeline),
        ])
    return full_pipeline.fit_transform(data)


# FEATURES = ["eigenvalues", "entropy", "pca"]
# FEATURES = ["eigenvalues", "entropy", "diffRMSD"]
# FEATURES = ["eigenvalues", "entropy"]
FEATURES = [
    "biasQ",
    'Rw',
     'VTotal',
#     'RMSD', # test
#      'Burial',
#      'Water',
#      'Rama',
#      'DSSP',
#      'P_AP',
#      'Helix',
#      'Frag_Mem'
               ]
# FEATURES = ["eigenvalues"]
# LABEL = "diffRMSD"
# LABEL = "RMSD"
LABEL = "chosen"
DEGREE = 1

def pred_from_raw(a):
    data = my_transform(a, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
    test_y = data[:, -1]
    test_set = data[:, :-1]
    prob = clf.predict_proba(test_set)[:, 1]
    return a.assign(prob=prob)

# data = my_transform(raw_data, label=LABEL, degree=DEGREE, FEATURES=FEATURES)
# data = raw_data.groupby('name').apply(my_transform, label=LABEL, degree=DEGREE, FEATURES=FEATURES)[0]
data = np.concatenate(raw_data.groupby('name').apply(my_transform, 
                                                     label=LABEL, degree=DEGREE, FEATURES=FEATURES).values)
train_y = data[:,-1]
train_set = data[:,:-1]
from sklearn import svm
p = 0.5
# clf = svm.SVC(probability=True)
clf = LogisticRegression(random_state=1142)
clf.fit(train_set, train_y)
y_pred_svm = clf.predict(train_set)


filtered = filtered.reset_index(drop=True).groupby("name").apply(pred_from_raw).reset_index(drop=True)


picked_n = 1
best = raw_data_all_2.groupby("name").apply(choose_top, col="RMSD",
                                            n=picked_n, ascending=True).reset_index(drop=True).query("chosen==True")
picked = filtered.groupby("name").apply(choose_top, col="prob",
                                        n=picked_n, ascending=False).reset_index(drop=True).query("chosen==True")
init = raw_data_all_2.query("i == 0.0")
all_results = pd.concat([best.assign(result='best'), 
                         picked.assign(result='picked'), init.assign(result='init')])
# all_results = pd.concat([best.assign(result='best'), 
#                          picked.assign(result='picked')])
# picked.to_csv("/Users/weilu/Desktop/picked.csv

# sns.set(rc={'figure.figsize':(20,30)})
# plt.figure(figsize=(15,8))
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'name', 'RMSD').add_legend(fontsize=20)
fg.set(ylim=(0, 10))


/Users/weilu/anaconda3/lib/python3.6/site-packages/sklearn/ensemble/weight_boosting.py:29: DeprecationWarning: numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.
  from numpy.core.umath_tests import inner1d
/Users/weilu/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py:475: DataConversionWarning: Data with input dtype object was converted to float64 by StandardScaler.
  warnings.warn(msg, DataConversionWarning)
[... the DataConversionWarning above repeats 16 times in total ...]
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-24-d3dfc9c50ced> in <module>()
---> 77 filtered = filtered.reset_index(drop=True).groupby("name").apply(pred_from_raw).reset_index(drop=True)

<ipython-input-24-d3dfc9c50ced> in pred_from_raw(a)
---> 57     data = my_transform(a, label=LABEL, degree=DEGREE, FEATURES=FEATURES)

<ipython-input-24-d3dfc9c50ced> in my_transform(data, label, degree, FEATURES)
---> 31     return full_pipeline.fit_transform(data)

[... pandas groupby / sklearn pipeline / joblib frames elided ...]

~/anaconda3/lib/python3.6/site-packages/sklearn/utils/validation.py in _assert_all_finite(X)
     43         raise ValueError("Input contains NaN, infinity"
---> 44                          " or a value too large for %r." % X.dtype)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
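
The failure means at least one group handed to StandardScaler contains NaN or infinity. One hedged repair, assuming rows with non-finite feature values can simply be dropped before prediction (a guess at a fix, not the notebook's actual resolution):

# sketch: drop non-finite rows before re-running the per-group prediction
clean = filtered.replace([np.inf, -np.inf], np.nan).dropna(subset=FEATURES)
filtered = clean.reset_index(drop=True).groupby("name").apply(pred_from_raw).reset_index(drop=True)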

In [25]:
fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
fg.map(plt.plot, 'name', 'RMSD').add_legend(fontsize=20)
fg.set(ylim=(0, 10))


---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-25-6281687814d9> in <module>()
      1 fg = sns.FacetGrid(data=all_results.reset_index(), hue='result', height=8, aspect=1.63)
----> 2 fg.map(plt.plot, 'name', 'RMSD').add_legend(fontsize=20)
      3 fg.set(ylim=(0, 10))

[... seaborn axisgrid / pandas indexing frames elided ...]

KeyError: "['RMSD'] not in index"

In [19]:
filtered["name"].unique().shape


Out[19]:
(17,)

In [20]:
picked[["RMSD", "name"]]


Out[20]:
          RMSD   name
969    7.02078  tr594
1848   5.61065  tr862
3276   2.91322  tr866
4497   1.06823  tr868
6098   8.09616  tr870
6481   4.10102  tr872
8201   3.07719  tr877
9164   2.25775  tr882
10670  3.23479  tr884
11696  2.35937  tr885
15047  2.68814  tr891
16215  1.16419  tr894
17311  5.38768  tr895
18249  8.22359  tr896
22625  3.16510  tr921
23681  3.14700  tr922
26050  6.11488  tr948

In [21]:
picked.to_csv("/Users/weilu/Desktop/picked_2.csv")

In [ ]: