Predictive Modeling with pyPCM



In [2]:

    
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import warnings
warnings.simplefilter('ignore', DeprecationWarning)



In [3]:

    
data = pd.read_csv('HD_data.csv')



In [4]:

    
data.head(5)









    Out[4]:






  
    
      
      Activity
      Compound
      SubFPC1
      SubFPC2
      SubFPC3
      SubFPC4
      SubFPC5
      SubFPC6
      SubFPC7
      SubFPC8
      ...
      m10z2p7
      m10z2p8
      m10z3p1
      m10z3p2
      m10z3p3
      m10z3p4
      m10z3p5
      m10z3p6
      m10z3p7
      m10z3p8
    
  
  
    
      0
       Active
       CHEMBL2000089
       0
       9
       0
       0
       0
       0
       0
       0
      ...
      -1.73
      -0.97
       0.57
       0.57
       2.36
      -3.14
      -3.44
      -1.03
       0.09
       4.13
    
    
      1
       Active
            CHEMBL99
       2
       0
       1
       0
       2
       0
       0
       0
      ...
      -1.73
      -0.97
       0.57
       0.57
       2.36
      -3.14
      -3.44
      -1.03
       0.09
       4.13
    
    
      2
       Active
       CHEMBL2047701
       0
       1
       0
       0
       0
       0
       0
       0
      ...
      -1.73
      -0.97
       0.57
       0.57
       2.36
      -3.14
      -3.44
      -1.03
       0.09
       4.13
    
    
      3
       Active
        CHEMBL343448
       5
       2
       2
       0
       1
       0
       0
       0
      ...
      -1.73
      -0.97
       0.57
       0.57
       2.36
      -3.14
      -3.44
      -1.03
       0.09
       4.13
    
    
      4
       Active
        CHEMBL360194
       2
       8
       0
       0
       0
       0
       0
       0
      ...
      -1.73
      -0.97
       0.57
       0.57
       2.36
      -3.14
      -3.44
      -1.03
       0.09
       4.13
    
  

5 rows × 937 columns



In [5]:

    
data.count()









    Out[5]:





Activity    22800
Compound    22800
SubFPC1     22800
SubFPC2     22800
SubFPC3     22800
SubFPC4     22800
SubFPC5     22800
SubFPC6     22800
SubFPC7     22800
SubFPC8     22800
SubFPC9     22800
SubFPC10    22800
SubFPC11    22800
SubFPC12    22800
SubFPC13    22800
...
m10z2p2    22800
m10z2p3    22800
m10z2p4    22800
m10z2p5    22800
m10z2p6    22800
m10z2p7    22800
m10z2p8    22800
m10z3p1    22800
m10z3p2    22800
m10z3p3    22800
m10z3p4    22800
m10z3p5    22800
m10z3p6    22800
m10z3p7    22800
m10z3p8    22800
Length: 937, dtype: int64



In [6]:

    
compound = data.iloc[:, 2:309]



In [7]:

    
compound.count()









    Out[7]:





SubFPC1     22800
SubFPC2     22800
SubFPC3     22800
SubFPC4     22800
SubFPC5     22800
SubFPC6     22800
SubFPC7     22800
SubFPC8     22800
SubFPC9     22800
SubFPC10    22800
SubFPC11    22800
SubFPC12    22800
SubFPC13    22800
SubFPC14    22800
SubFPC15    22800
...
SubFPC293    22800
SubFPC294    22800
SubFPC295    22800
SubFPC296    22800
SubFPC297    22800
SubFPC298    22800
SubFPC299    22800
SubFPC300    22800
SubFPC301    22800
SubFPC302    22800
SubFPC303    22800
SubFPC304    22800
SubFPC305    22800
SubFPC306    22800
SubFPC307    22800
Length: 307, dtype: int64



In [8]:

    
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
pca.fit(compound)
print(pca.explained_variance_)
print(pca.components_)









    



[ 253.06246322   43.74058158]
[[  2.66742429e-02   6.88682835e-02   1.45269733e-02   5.83758974e-03
    1.94859050e-03   1.03105901e-04   3.30872245e-24  -1.09551304e-04
   -3.81705419e-03  -9.28833712e-06   7.63449665e-06   6.94634156e-03
    6.39591534e-04   3.97779072e-03   2.32895930e-03   1.95705000e-03
    5.14850727e-04   2.32251138e-03  -1.46616703e-04  -6.59608162e-04
   -5.67672428e-05   0.00000000e+00   1.98332758e-03   5.23524074e-06
    0.00000000e+00   7.40540385e-04  -1.86045595e-06  -4.90669202e-05
    0.00000000e+00   0.00000000e+00   0.00000000e+00   3.97802394e-04
    1.30169676e-03   0.00000000e+00   5.01722954e-05   1.99907935e-04
   -6.22681892e-07  -6.85278136e-04   6.86217326e-05   1.05219436e-04
    8.45607667e-04  -4.08061920e-06   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   7.17480058e-05
    1.33390681e-03   0.00000000e+00   8.04687583e-06   2.51032412e-05
    4.44793469e-04   3.61188118e-05   2.11198513e-05   5.76763449e-05
   -4.08061920e-06   0.00000000e+00  -4.08388052e-06   1.25557681e-05
    0.00000000e+00  -5.71697136e-04  -5.14610025e-04   0.00000000e+00
   -1.15438815e-05  -1.15438815e-05   0.00000000e+00  -1.04524101e-05
   -1.69477642e-05   0.00000000e+00   0.00000000e+00   4.33011234e-04
    0.00000000e+00   3.55894157e-05   3.30723761e-05   6.16124517e-04
    0.00000000e+00  -6.73139298e-06   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   7.49198215e-04
    1.00315713e-03   8.91352659e-04   0.00000000e+00   1.31092080e-02
    0.00000000e+00   2.01183158e-05   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00  -1.23057510e-04
    0.00000000e+00   0.00000000e+00   1.63048015e-04   7.80365453e-03
    2.93054115e-03   0.00000000e+00   2.49931821e-05  -8.81537650e-06
   -1.82047305e-05   0.00000000e+00   0.00000000e+00   0.00000000e+00
    1.91270658e-04   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   7.55888692e-07
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    1.19529073e-05  -2.23063453e-06   3.81468624e-04   6.94197735e-04
    1.73961024e-05   0.00000000e+00   0.00000000e+00   3.67594155e-06
   -7.25773669e-04   0.00000000e+00   3.27625460e-03   1.47486448e-05
   -2.09715835e-03   0.00000000e+00  -4.04130582e-04   9.99277900e-06
    0.00000000e+00   0.00000000e+00   1.40668687e-03   0.00000000e+00
    0.00000000e+00   0.00000000e+00   7.18655555e-05   0.00000000e+00
   -2.05068301e-07  -1.82053385e-04   1.16817543e-04   0.00000000e+00
    4.28366037e-04   0.00000000e+00  -7.84877974e-05   0.00000000e+00
    1.89886810e-04   0.00000000e+00   1.30497699e-04  -8.97017355e-07
    1.00332194e-04  -7.08699654e-08   0.00000000e+00   0.00000000e+00
   -9.01577090e-06  -4.39095647e-06   0.00000000e+00   0.00000000e+00
    4.60185229e-03   9.33768155e-05  -3.42354075e-03  -2.49512681e-03
   -5.24091831e-04  -5.52379430e-05  -1.43283794e-04   0.00000000e+00
    0.00000000e+00   0.00000000e+00   9.40527616e-04  -2.77481599e-04
   -3.49618900e-03  -1.62946858e-03  -9.58554430e-04  -5.42116599e-03
    0.00000000e+00   0.00000000e+00   0.00000000e+00  -2.37093053e-03
    0.00000000e+00  -9.45404395e-06   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00  -8.42116270e-06
    2.41703185e-04   4.49183588e-04   0.00000000e+00  -2.03787535e-05
   -2.91755891e-05   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    3.99781217e-06   0.00000000e+00   0.00000000e+00  -9.97946325e-06
    0.00000000e+00  -2.87877755e-04   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00  -5.00468041e-05
    0.00000000e+00  -3.55508520e-06   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   1.23763166e-05   0.00000000e+00
    0.00000000e+00   0.00000000e+00   1.23763166e-05   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   3.25110484e-07   0.00000000e+00  -3.41477758e-06
    0.00000000e+00  -3.41477758e-06   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00  -2.55732184e-02   1.13277657e-02   4.25380159e-05
    0.00000000e+00   0.00000000e+00  -1.18853625e-04  -1.18853625e-04
    5.33525736e-04   7.86190338e-04   4.68298281e-04   3.10381980e-06
    7.86190338e-04   7.86190338e-04  -1.06638611e-02  -7.12087416e-04
    0.00000000e+00   0.00000000e+00  -4.50756284e-06   0.00000000e+00
    0.00000000e+00  -1.11241950e-03   9.00637342e-02  -3.54495661e-04
   -2.25346114e-03   1.89896548e-03  -2.25346114e-03   6.79431113e-01
    6.79431113e-01   8.22048424e-02   2.79534754e-03   0.00000000e+00
    0.00000000e+00   0.00000000e+00   2.34079580e-01]
 [  3.50735467e-02   1.29106051e-01   2.63708032e-02   5.90951196e-03
    5.18064632e-03   5.55306855e-04   1.35525272e-20   3.44557430e-04
    1.49563300e-03   3.75191965e-05   5.22055400e-05   8.66285871e-03
    6.67164451e-04   4.71205980e-03   3.28363446e-03   3.62210159e-03
    1.42224178e-03  -1.58866532e-02  -1.85073061e-03  -6.64548485e-03
   -1.98862029e-04   0.00000000e+00  -2.12427079e-03  -7.19922005e-06
    0.00000000e+00  -2.18348368e-05   1.08415020e-05  -1.56869914e-03
    0.00000000e+00   0.00000000e+00   0.00000000e+00  -1.07469248e-03
   -2.32546651e-03   0.00000000e+00  -3.16588490e-05   4.16584916e-04
    2.08883881e-03  -6.59694507e-03   4.47281247e-05  -1.69874285e-04
    1.39685188e-03  -8.53435805e-06   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   1.51866363e-04
    1.75966504e-03   0.00000000e+00   3.27493665e-06   1.49254455e-05
    4.43391889e-04   3.71677361e-04   1.99614440e-04   1.82447765e-04
   -8.53435805e-06   0.00000000e+00  -6.01551036e-06   4.54958488e-06
    0.00000000e+00   1.53332040e-03   1.71625069e-03   0.00000000e+00
    9.58379107e-05   9.58379107e-05   0.00000000e+00   2.33988469e-05
    8.94864611e-05   0.00000000e+00   0.00000000e+00   1.96254996e-04
    0.00000000e+00  -2.25463869e-04   4.12476428e-05   2.65585847e-04
    0.00000000e+00  -2.52123036e-07   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00  -1.28828672e-03
    7.59677068e-04   8.07064259e-04   0.00000000e+00  -1.75694526e-02
    0.00000000e+00   4.59433791e-05   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00  -1.03318816e-02
    0.00000000e+00   0.00000000e+00  -5.20126228e-05  -1.77567363e-02
   -3.50576237e-04   0.00000000e+00  -7.29005673e-04  -8.04326310e-05
   -9.65306422e-05   0.00000000e+00   0.00000000e+00   0.00000000e+00
    1.78400894e-04   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   1.42707784e-04
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
   -1.03346199e-05   5.07324321e-07   1.27666671e-03  -1.15974631e-03
    1.06504779e-05   0.00000000e+00   0.00000000e+00  -3.12069659e-05
    1.23182039e-03   0.00000000e+00  -1.10222499e-02   1.44090563e-04
   -3.75599816e-02   0.00000000e+00   1.46931053e-05   6.07624378e-06
    0.00000000e+00   0.00000000e+00   8.11214954e-03   0.00000000e+00
    0.00000000e+00   0.00000000e+00  -7.59959776e-05   0.00000000e+00
    5.91259626e-06  -1.03282470e-03  -1.15433393e-03   0.00000000e+00
    1.65997216e-04   0.00000000e+00   1.55433190e-03   0.00000000e+00
    3.11847405e-04   0.00000000e+00   2.40378429e-03  -4.85325348e-07
    3.66800082e-04  -5.05923220e-07   0.00000000e+00   0.00000000e+00
    4.13978521e-05  -1.73320726e-06   0.00000000e+00   0.00000000e+00
   -6.96926600e-03  -5.30843209e-04  -3.38884939e-03  -1.00265144e-03
    9.32669683e-06  -1.33214270e-04   2.65349021e-04   0.00000000e+00
    0.00000000e+00   0.00000000e+00  -1.24821098e-02  -1.06705931e-02
   -4.54070325e-02  -5.39238856e-03  -3.54361156e-03  -7.74957354e-02
    0.00000000e+00   0.00000000e+00   0.00000000e+00  -1.13034706e-03
    0.00000000e+00  -6.12024070e-05   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   3.61290992e-06
   -3.46769298e-04  -4.15116695e-03   0.00000000e+00   2.56391634e-04
   -5.06452270e-05   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    5.73567469e-05   0.00000000e+00   0.00000000e+00  -3.48398879e-05
    0.00000000e+00  -1.28860546e-03   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   9.16696725e-04
    0.00000000e+00  -1.62369117e-06   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00  -5.47319215e-06   0.00000000e+00
    0.00000000e+00   0.00000000e+00  -5.47319215e-06   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   2.62437395e-05   0.00000000e+00  -2.44950598e-05
    0.00000000e+00  -2.44950598e-05   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   0.00000000e+00   0.00000000e+00
    0.00000000e+00  -7.01993334e-01  -8.93381672e-02   4.28077566e-05
    0.00000000e+00   0.00000000e+00   6.94222566e-03   6.94222566e-03
    2.18583641e-04   8.76981215e-04   4.56532574e-04   1.21788314e-05
    8.76981215e-04   8.76981215e-04  -8.70478910e-02   4.05136306e-04
    0.00000000e+00   0.00000000e+00   2.94518171e-06   0.00000000e+00
    0.00000000e+00   5.03209554e-05  -2.23948124e-01  -1.00946303e-02
   -2.08715471e-03  -8.00747558e-03  -2.08715471e-03   1.04537705e-01
    1.04537705e-01  -5.98941263e-02   2.69175850e-03   0.00000000e+00
    0.00000000e+00   0.00000000e+00  -6.21132503e-01]]

Explained Variance Plot for Compound



In [9]:

    
plt.title("Explained Variance")
plt.ylabel("Percentage of explained variance")
plt.xlabel("PCA Components")
plt.plot(pca.explained_variance_ratio_);

Cumulated Explained Variance plot for Compound



In [10]:

    
plt.title("Cumulated Explained Variance")
plt.ylabel("Percentage of explained variance")
plt.xlabel("PCA Components")
plt.plot(np.cumsum(pca.explained_variance_ratio_));



In [11]:

    
compound = compound
activity = data.iloc[:, 0]

Data Partition



In [12]:

    
from sklearn.cross_validation import train_test_split
compound_train, compound_test, activity_train, activity_test = train_test_split(
    compound, activity, test_size = 0.20, random_state = 0)
print("train data shape: %r, train target shape: %r"
     % (compound_train.shape, activity_train.shape))
print("test data shape: %r, test target shape: %r"
     % (compound_test.shape, activity_test.shape))









    



train data shape: (18240, 307), train target shape: (18240,)
test data shape: (4560, 307), test target shape: (4560,)

Classification with Logistic Regression



In [13]:

    
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(C=1)
logreg.fit(compound_train, activity_train)









    Out[13]:





LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)



In [14]:

    
activity_predicted = logreg.predict(compound_test)



In [15]:

    
from sklearn.metrics import accuracy_score
accuracy_score(activity_test, activity_predicted)









    Out[15]:





0.9765350877192982

Feature Importance from Logistic Regression



In [16]:

    
compound_descriptors = compound.columns
compound_descriptors









    Out[16]:





Index(['SubFPC1', 'SubFPC2', 'SubFPC3', 'SubFPC4', 'SubFPC5', 'SubFPC6', 'SubFPC7', 'SubFPC8', 'SubFPC9', 'SubFPC10', 'SubFPC11', 'SubFPC12', 'SubFPC13', 'SubFPC14', 'SubFPC15', 'SubFPC16', 'SubFPC17', 'SubFPC18', 'SubFPC19', 'SubFPC20', 'SubFPC21', 'SubFPC22', 'SubFPC23', 'SubFPC24', 'SubFPC25', 'SubFPC26', 'SubFPC27', 'SubFPC28', 'SubFPC29', 'SubFPC30', 'SubFPC31', 'SubFPC32', 'SubFPC33', 'SubFPC34', 'SubFPC35', 'SubFPC36', 'SubFPC37', 'SubFPC38', 'SubFPC39', 'SubFPC40', 'SubFPC41', 'SubFPC42', 'SubFPC43', 'SubFPC44', 'SubFPC45', 'SubFPC46', 'SubFPC47', 'SubFPC48', 'SubFPC49', 'SubFPC50', 'SubFPC51', 'SubFPC52', 'SubFPC53', 'SubFPC54', 'SubFPC55', 'SubFPC56', 'SubFPC57', 'SubFPC58', 'SubFPC59', 'SubFPC60', 'SubFPC61', 'SubFPC62', 'SubFPC63', 'SubFPC64', 'SubFPC65', 'SubFPC66', 'SubFPC67', 'SubFPC68', 'SubFPC69', 'SubFPC70', 'SubFPC71', 'SubFPC72', 'SubFPC73', 'SubFPC74', 'SubFPC75', 'SubFPC76', 'SubFPC77', 'SubFPC78', 'SubFPC79', 'SubFPC80', 'SubFPC81', 'SubFPC82', 'SubFPC83', 'SubFPC84', 'SubFPC85', 'SubFPC86', 'SubFPC87', 'SubFPC88', 'SubFPC89', 'SubFPC90', 'SubFPC91', 'SubFPC92', 'SubFPC93', 'SubFPC94', 'SubFPC95', 'SubFPC96', 'SubFPC97', 'SubFPC98', 'SubFPC99', 'SubFPC100', ...], dtype='object')



In [17]:

    
logreg.coef_









    Out[17]:





array([[  8.01093500e-01,   1.35822836e-01,  -6.98952430e-02,
         -4.82642854e-01,  -1.72416728e+00,  -3.73300332e-01,
          0.00000000e+00,   3.64199065e-01,  -1.06611181e-01,
          3.46809995e-02,   1.50630652e-02,   1.09390589e+00,
         -1.75323876e-01,   2.53032187e-01,   1.01619758e+00,
          2.02943764e+00,   2.81571145e+00,   1.46681479e+00,
          1.20457430e+00,   2.45309490e+00,   2.09710585e+00,
          0.00000000e+00,   1.85716915e-01,  -7.02275390e-01,
          0.00000000e+00,  -2.59493086e-01,   3.14698619e-01,
         -1.77390221e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   1.80832494e+00,   1.42461499e-01,
          0.00000000e+00,  -1.26020906e+00,  -2.96083782e+00,
          1.99965061e+00,   2.37983977e+00,  -1.52899569e+00,
         -3.40705664e-01,   1.37304375e-01,   1.95128624e-03,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   6.03503900e-01,
         -1.79890858e+00,   0.00000000e+00,   3.20718865e-06,
          1.14156006e+00,   8.49621558e-01,  -3.13135214e-01,
         -5.59434979e-01,   3.23087860e-01,   1.95128624e-03,
          0.00000000e+00,   2.44770470e-01,  -9.80936173e-01,
          0.00000000e+00,   1.07121368e+00,   4.19393537e-01,
          0.00000000e+00,   3.38875896e-01,   3.38875896e-01,
          0.00000000e+00,   4.03627008e-02,   1.01132564e-02,
          0.00000000e+00,   0.00000000e+00,  -5.57603681e-01,
          0.00000000e+00,   4.69453944e-02,   3.86955664e-01,
          5.27447651e-01,   0.00000000e+00,   2.06805386e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   6.81328772e-01,
          2.91947071e+00,  -7.93446548e-01,   0.00000000e+00,
         -3.31491399e-01,   0.00000000e+00,  -6.50576110e-01,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,  -1.23687190e+00,
          0.00000000e+00,   0.00000000e+00,   1.63447370e+00,
          1.06299446e+00,   8.41905858e-01,   0.00000000e+00,
          3.07244949e-01,   1.28756663e-03,   5.77031696e-01,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          1.75971456e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   2.99262023e-01,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   1.18985312e-01,   3.34666501e-03,
         -1.85071423e-01,   1.45296500e+00,   1.10067229e-02,
          0.00000000e+00,   0.00000000e+00,   8.78229253e-02,
          2.84664605e+00,   0.00000000e+00,   3.81956515e-01,
          1.44256595e+00,   1.66698461e-01,   0.00000000e+00,
          9.75736943e-01,   1.13564227e-02,   0.00000000e+00,
          0.00000000e+00,   1.90745913e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,  -4.19993286e-01,
          0.00000000e+00,   2.88192072e-02,   8.30019296e-01,
          5.16648119e-01,   0.00000000e+00,  -1.63071738e+00,
          0.00000000e+00,   1.26773171e+00,   0.00000000e+00,
          3.57939551e-02,   0.00000000e+00,   1.63192805e+00,
          6.65187524e-06,   6.00230072e-02,   5.05530046e-02,
          0.00000000e+00,   0.00000000e+00,   1.01256157e-01,
          8.19678617e-04,   0.00000000e+00,   0.00000000e+00,
          2.02509805e+00,   8.87901694e-02,   1.34462102e+00,
          5.70825036e-01,   2.52990990e-01,  -1.20031221e+00,
          1.33598482e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   3.01603336e-01,   1.55424888e-01,
         -9.80653473e-02,   7.07172105e-01,  -2.33437205e-01,
          8.32697777e-01,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   5.05643174e-01,   0.00000000e+00,
         -2.02577134e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          9.37778261e-03,   1.78070542e+00,   1.93477163e+00,
          0.00000000e+00,   7.94268232e-01,   2.49777310e-01,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,  -1.50437445e+00,   0.00000000e+00,
          0.00000000e+00,   1.64810773e-01,   0.00000000e+00,
          9.50491075e-02,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,  -9.69049253e-01,   0.00000000e+00,
         -5.58147946e-01,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,  -5.41958987e-01,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         -5.41958987e-01,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,  -4.13275280e-01,   0.00000000e+00,
         -4.13275280e-01,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         -4.87852813e-01,  -1.30437902e-01,  -1.40171141e+00,
          0.00000000e+00,   0.00000000e+00,   2.30531739e-01,
          2.30531739e-01,   5.37439194e-02,   5.57330223e-01,
          1.75711962e-02,   1.99220344e-03,   5.57330223e-01,
          5.57330223e-01,   8.09317373e-03,  -8.83031817e-01,
          0.00000000e+00,   0.00000000e+00,   8.23069968e-02,
          0.00000000e+00,   0.00000000e+00,  -3.36541745e-01,
         -3.62586689e-03,   3.83556158e-01,  -9.04844042e-02,
          4.74040562e-01,  -9.04844042e-02,  -8.56091510e-02,
         -8.56091510e-02,  -2.84815006e-01,  -2.53663849e-01,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          4.19146766e-01]])



In [18]:

    
x = np.arange(len(compound_descriptors))
plt.bar(x, logreg.coef_.ravel())
plt.xticks(x + 0.5, compound_descriptors, rotation = 30);



In [19]:

    
from sklearn.metrics import confusion_matrix

cm= confusion_matrix(activity_test, activity_predicted)
print(cm)









    



[[  23   93]
 [  14 4430]]



In [20]:

    
def plot_confusion(cm, activity_names = ['Active', 'Inactive'],
                  title = 'Confusion matrix'):
    plt.imshow(cm, interpolation = 'nearest', cmap = plt.cm.Blues)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(activity_names))
    plt.xticks(tick_marks, activity_names, rotation = 60)
    plt.yticks(tick_marks, activity_names)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

plot_confusion(cm)



In [21]:

    
print(cm)









    



[[  23   93]
 [  14 4430]]



In [22]:

    
cm.sum(axis =1)









    Out[22]:





array([ 116, 4444])



In [23]:

    
cm_normalized = cm.astype(np.float64) / cm.sum(axis = 1)[:, np.newaxis]
print(cm_normalized)









    



[[ 0.19827586  0.80172414]
 [ 0.00315032  0.99684968]]



In [24]:

    
plot_confusion(cm_normalized, title = "Normalized confusion matrix")



In [25]:

    
from sklearn.metrics import classification_report

print(classification_report(activity_test, activity_predicted,
                           target_names = ['Inactive', 'Active']))









    



             precision    recall  f1-score   support

   Inactive       0.62      0.20      0.30       116
     Active       0.98      1.00      0.99      4444

avg / total       0.97      0.98      0.97      4560



In [26]:

    
activity_predicted_proba = logreg.predict_proba(compound_test)
activity_predicted_proba[:5]









    Out[26]:





array([[ 0.00113849,  0.99886151],
       [ 0.02121255,  0.97878745],
       [ 0.03190176,  0.96809824],
       [ 0.00221402,  0.99778598],
       [ 0.00181644,  0.99818356]])



In [73]:

    
from sklearn.metrics import roc_curve
from sklearn.metrics import auc


def plot_roc_curve(activity_test, activity_predicted_proba):
    fpr, tpr, thresholds = roc_curve(activity_test, 
                                     activity_predicted_proba[:, 1],
                                     pos_label = 'Active')
    
    roc_auc = auc(fpr, tpr)
    #Plot ROC curve
    plt.plot(fpr, tpr, label = 'Area Under Curve = %0.3f' %roc_auc)
    plt.plot([0, 1], [0, 1], 'k--') 
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate or (1 - Specificity)')
    plt.ylabel('True Positive Rate or (Sensitivity)')
    plt.title('Receiver Operation Characteristic for Histone Acetylase')
    plt.legend(loc = 'lower right')



In [28]:

    
fpr, tpr, thresholds = roc_curve(activity_test, activity_predicted_proba[:, 1],
                                pos_label = 'Active')

ROC Curve for compound alone



In [29]:

    
plot_roc_curve(activity_test, activity_predicted_proba)

Principal Component Analysis for Protein



In [30]:

    
protein = data.iloc[:, 311:937]



In [31]:

    
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
pca.fit(protein)
print(pca.explained_variance_)
print(pca.components_)









    



[ 671.9186243   148.05358522]
[[ -5.37984500e-02   3.05210694e-02  -1.38777878e-17 ...,   2.29204059e-02
    5.92913660e-04   9.84543516e-03]
 [ -1.57921818e-02   5.45986027e-03   2.22044605e-16 ...,  -9.06756612e-03
   -3.03772321e-03  -3.06035965e-04]]

Explained Variance Plot for Protein



In [32]:

    
plt.title("Explained Variance")
plt.ylabel("Percentage of explained variance")
plt.xlabel("PCA Components")
plt.plot(pca.explained_variance_ratio_);

Cumulated Explained Variance plot for Protein



In [33]:

    
plt.title("Cumulated Explained VAriance")
plt.ylabel("Percentage of explained variance")
plt.xlabel("PCA Components")
plt.plot(np.cumsum(pca.explained_variance_ratio_));

Data Partition



In [34]:

    
from sklearn.cross_validation import train_test_split
protein_train, protein_test, activity_train, activity_test = train_test_split(
    protein, activity, test_size = 0.20, random_state = 0)
print("train data shape: %r, trian target shape: %r"
     % (protein_train.shape, activity_train.shape))
print("test data shape: %r, test target shape: %r"
     % (protein_test.shape, activity_test.shape))









    



train data shape: (18240, 626), trian target shape: (18240,)
test data shape: (4560, 626), test target shape: (4560,)

Classification with Logistic Regression



In [35]:

    
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression (C = 1)
logreg.fit(protein_train, activity_train)









    Out[35]:





LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)



In [36]:

    
activity_predicted = logreg.predict(protein_test)



In [37]:

    
from sklearn.metrics import accuracy_score
accuracy_score(activity_test, activity_predicted)









    Out[37]:





0.97456140350877196

Feature Importance from Logistic Regression



In [38]:

    
protein_descriptors = protein.columns
protein_descriptors









    Out[38]:





Index(['m1z1p2', 'm1z1p3', 'm1z1p4', 'm1z1p5', 'm1z1p6', 'm1z1p7', 'm1z1p8', 'm1z1p9', 'm1z1p10', 'm1z1p11', 'm1z1p12', 'm1z1p13', 'm1z1p14', 'm1z1p15', 'm1z1p16', 'm1z1p17', 'm1z1p18', 'm1z1p19', 'm1z1p20', 'm1z1p21', 'm1z1p22', 'm1z1p23', 'm1z1p24', 'm1z1p25', 'm1z1p26', 'm1z1p27', 'm1z1p28', 'm1z1p29', 'm1z2p1', 'm1z2p2', 'm1z2p3', 'm1z2p4', 'm1z2p5', 'm1z2p6', 'm1z2p7', 'm1z2p8', 'm1z2p9', 'm1z2p10', 'm1z2p11', 'm1z2p12', 'm1z2p13', 'm1z2p14', 'm1z2p15', 'm1z2p16', 'm1z2p17', 'm1z2p18', 'm1z2p19', 'm1z2p20', 'm1z2p21', 'm1z2p22', 'm1z2p23', 'm1z2p24', 'm1z2p25', 'm1z2p26', 'm1z2p27', 'm1z2p28', 'm1z2p29', 'm1z3p1', 'm1z3p2', 'm1z3p3', 'm1z3p4', 'm1z3p5', 'm1z3p6', 'm1z3p7', 'm1z3p8', 'm1z3p9', 'm1z3p10', 'm1z3p11', 'm1z3p12', 'm1z3p13', 'm1z3p14', 'm1z3p15', 'm1z3p16', 'm1z3p17', 'm1z3p18', 'm1z3p19', 'm1z3p20', 'm1z3p21', 'm1z3p22', 'm1z3p23', 'm1z3p24', 'm1z3p25', 'm1z3p26', 'm1z3p27', 'm1z3p28', 'm1z3p29', 'm2z1p1', 'm2z1p2', 'm2z1p3', 'm2z1p4', 'm2z1p5', 'm2z1p6', 'm2z1p7', 'm2z1p8', 'm2z1p9', 'm2z1p10', 'm2z1p11', 'm2z1p12', 'm2z1p13', 'm2z1p14', ...], dtype='object')



In [39]:

    
x = np.arange(len(protein_descriptors))
plt.bar(x, logreg.coef_.ravel())
plt.xticks(x + 20, protein_descriptors, rotation = 30);



In [40]:

    
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(activity_test, activity_predicted)
print(cm)









    



[[   0  116]
 [   0 4444]]



In [41]:

    
plot_confusion(cm)



In [42]:

    
cm_normalized = cm.astype(np.float64) / cm.sum(axis = 1)[:, np.newaxis]
print(cm_normalized)









    



[[ 0.  1.]
 [ 0.  1.]]



In [43]:

    
plot_confusion(cm_normalized, title = "Normalized confusion matrix")



In [44]:

    
from sklearn.metrics import classification_report 

print(classification_report(activity_test, activity_predicted,
                           target_names = ['Active', 'Inactive']))









    



             precision    recall  f1-score   support

     Active       0.00      0.00      0.00       116
   Inactive       0.97      1.00      0.99      4444

avg / total       0.95      0.97      0.96      4560







    



C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\sklearn\metrics\metrics.py:1771: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)



In [45]:

    
activity_predicted_proba = logreg.predict_proba(protein_test)
activity_predicted_proba[:5]









    Out[45]:





array([[ 0.02521799,  0.97478201],
       [ 0.02535162,  0.97464838],
       [ 0.02521799,  0.97478201],
       [ 0.02222358,  0.97777642],
       [ 0.02524507,  0.97475493]])



In [46]:

    
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
plot_roc_curve(activity_test, activity_predicted_proba)



In [61]:

    
protein = data.iloc[:, 311:320]
compound = data.iloc[:, 2:10]
print(protein.shape)
print(compound.shape)


def cross_terms_ligand_protein(ligand, header_ligand, protein, header_protein):
    import numpy as np
    R, Cl = np.shape(ligand)
    R, Cp = np.shape(protein)
    cross_terms = Cl * Cp
    Cross_lp = np.zeros((R, cross_terms))
    H_lp = []
    for j in range(Cl):
        for jj in range(Cp):
            H_lp.append(str(header_ligand[j])+'*'+str(header_protein[jj]))
            cross = np.multiply(ligand[:,j], protein[:, jj])
            Cross_lp = np.append(Cross_lp, np.reshape(cross, (ligand.shape[0],1)),
                                axis = 1)
            return np.delete(Cross_lp, 0, axis = 1), H_lp









    



(22800, 9)
(22800, 8)



In [60]:

    
R, Cl = np.shape(ligand)
R
Cross_lp = np.zeros((R, 1))
Cross_lp
Cl









    Out[60]:





8

Compound and Protein Cross-Terms



In [63]:

    
#header_protein, protein = protein.iloc[0,:], np.delete(protein.iloc, 0, axis = 0).astype(np.float)
ligand = np.array(compound)
protein = np.array(protein)
#header_ligand, ligand = ligand[0,:], np.delete(ligand, 0, axis = 0).astype(np.float)
#header_protein, protein = protein[0,:], np.delete(protein, 0, axis = 0).astype(np.float)


cross_terms, header_cross_terms = cross_terms_ligand_protein(ligand, list(header_ligand), protein, list(header_protein))

print(cross_terms.shape)









    



(22800, 72)



In [64]:

    
print(cross_terms.shape)
print(ligand.shape)
activity = np.array(activity)
print(activity.shape)

from sklearn.cross_validation import train_test_split
cross_terms_train, cross_terms_test, activity_train, activity_test = train_test_split(
    cross_terms, activity, test_size = 0.20, random_state = 0)
print("train data shape: %r, train target shape: %r"
     % (cross_terms_train.shape, activity_train.shape))
print("test data shape: %r, test target shape: %r"
     % (cross_terms_test.shape, activity_test.shape))









    



(22800, 72)
(22800, 8)
(22800,)
train data shape: (18240, 72), train target shape: (18240,)
test data shape: (4560, 72), test target shape: (4560,)

Classification with Logistic Regression for cross--terms



In [107]:

    
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression (C = 1)
logreg.fit(cross_terms_train, activity_train)









    Out[107]:





LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, penalty='l2', random_state=None, tol=0.0001)



In [108]:

    
activity_predicted = logreg.predict(cross_terms_test)



In [109]:

    
from sklearn.metrics import accuracy_score
accuracy_score(activity_test, activity_predicted)









    Out[109]:





0.97456140350877196



In [110]:

    
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(activity_test, activity_predicted)



In [111]:

    
print(cm)









    



[[   0  116]
 [   0 4444]]



In [112]:

    
plot_confusion(cm)



In [113]:

    
activity_predicted_proba = logreg.predict_proba(cross_terms_test)
activity_predicted_proba[:5]









    Out[113]:





array([[ 0.01086595,  0.98913405],
       [ 0.02011169,  0.97988831],
       [ 0.01941756,  0.98058244],
       [ 0.03446488,  0.96553512],
       [ 0.00605725,  0.99394275]])



In [74]:

    
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
plot_roc_curve(activity_test, activity_predicted_proba)



In [53]:

    
len(ligand[0])









    Out[53]:





8



In [51]:

    
protein









    Out[51]:





array([[-1.39, -4.44,  3.64, ...,  2.41,  2.41,  2.23],
       [-1.39, -4.44,  3.64, ...,  2.41,  2.41,  2.23],
       [-1.39, -4.44,  3.64, ...,  2.41,  2.41,  2.23],
       ..., 
       [-4.44, -4.44,  3.64, ...,  2.41,  2.18,  2.23],
       [-4.44, -4.44,  3.64, ...,  2.41,  2.18,  2.23],
       [-4.44, -4.44,  3.64, ...,  2.41,  2.18,  2.23]])



In [54]:

    
protein.shape









    Out[54]:





(22799, 9)



In [65]:

    
print(data)









    



       Activity       Compound  SubFPC1  SubFPC2  SubFPC3  SubFPC4  SubFPC5  \
0        Active  CHEMBL2000089        0        9        0        0        0   
1        Active       CHEMBL99        2        0        1        0        2   
2        Active  CHEMBL2047701        0        1        0        0        0   
3        Active   CHEMBL343448        5        2        2        0        1   
4        Active   CHEMBL360194        2        8        0        0        0   
5        Active   CHEMBL217083        0        5        0        0        0   
6        Active  CHEMBL1835659        2        5        1        0        1   
7        Active   CHEMBL327146        0        5        1        0        0   
8        Active  CHEMBL2047699        0        1        0        0        0   
9        Active   CHEMBL116620        0        0        1        0        1   
10       Active  CHEMBL1819274        3        2        0        0        1   
11       Active  CHEMBL1094708        0        6        0        0        0   
12       Active  CHEMBL1819257        0        1        0        0        1   
13       Active   CHEMBL485514        0        4        0        0        1   
14       Active   CHEMBL403323        0        1        0        0        1   
15       Active   CHEMBL394261        1        8        0        0        0   
16       Active   CHEMBL217154        0        5        0        0        0   
17       Active  CHEMBL2047690        0        1        0        0        0   
18       Active   CHEMBL609583        0        0        0        0        0   
19       Active  CHEMBL1801250        0        0        3        0        0   
20       Active   CHEMBL445506        1        6        0        0        0   
21       Active   CHEMBL470421        0        0        0        0        0   
22       Active    CHEMBL95552        0        6        0        0        0   
23       Active  CHEMBL2047704        0        1        0        0        0   
24       Active   CHEMBL474890        0        4        0        0        0   
25       Active  CHEMBL1819261        4        1        0        0        1   
26       Active  CHEMBL1095451        1        1        0        0        1   
27       Active   CHEMBL217004        0        0        0        0        0   
28       Active  CHEMBL2022825        0        5        0        0        0   
29       Active   CHEMBL216249        0        0        0        0        0   
...         ...            ...      ...      ...      ...      ...      ...   
22770  Inactive   ZINC65132170        1        3        0        0        0   
22771  Inactive   ZINC71832196        0        6        1        0        0   
22772  Inactive   ZINC16306616        1        2        0        0        0   
22773  Inactive   ZINC16306614        1        2        0        0        0   
22774  Inactive   ZINC65592903        2        3        0        0        0   
22775  Inactive   ZINC05825033        1        6        0        0        0   
22776  Inactive   ZINC05825031        1        6        0        0        0   
22777  Inactive   ZINC26678013        1        2        0        0        0   
22778  Inactive   ZINC26805798        1        2        0        0        0   
22779  Inactive   ZINC26805791        1        2        0        0        0   
22780  Inactive   ZINC26805804        1        2        0        0        0   
22781  Inactive   ZINC26805811        1        2        0        0        0   
22782  Inactive   ZINC66609339        2        3        0        0        0   
22783  Inactive   ZINC66609341        2        3        0        0        0   
22784  Inactive   ZINC67674680        1        1        1        0        0   
22785  Inactive   ZINC26678118        1        3        0        0        0   
22786  Inactive   ZINC64425971        2        2        0        0        0   
22787  Inactive   ZINC53755642        0        2        0        0        1   
22788  Inactive   ZINC64426182        2        2        0        0        0   
22789  Inactive   ZINC64426066        1        2        0        0        1   
22790  Inactive   ZINC69297719        2        1        1        0        0   
22791  Inactive   ZINC64241400        0        5        0        0        0   
22792  Inactive   ZINC68214819        2        2        0        0        0   
22793  Inactive   ZINC58307377        0        5        0        0        0   
22794  Inactive   ZINC43466358        2        2        0        0        0   
22795  Inactive   ZINC64425853        2        2        0        0        0   
22796  Inactive   ZINC03071133        1        0        0        0        1   
22797  Inactive   ZINC43050326        2        1        0        0        0   
22798  Inactive   ZINC67083247        0        2        0        0        0   
22799  Inactive   ZINC69294238        1        2        0        0        0   

       SubFPC6  SubFPC7  SubFPC8   ...     m10z2p7  m10z2p8  m10z3p1  m10z3p2  \
0            0        0        0   ...       -1.73    -0.97     0.57     0.57   
1            0        0        0   ...       -1.73    -0.97     0.57     0.57   
2            0        0        0   ...       -1.73    -0.97     0.57     0.57   
3            0        0        0   ...       -1.73    -0.97     0.57     0.57   
4            0        0        0   ...       -1.73    -0.97     0.57     0.57   
5            0        0        0   ...       -1.73    -0.97     0.57     0.57   
6            0        0        0   ...       -1.73    -0.97     0.57     0.57   
7            0        0        0   ...       -1.73    -0.97     0.57     0.57   
8            0        0        0   ...       -1.73    -0.97     0.57     0.57   
9            0        0        0   ...       -1.73    -0.97     0.57     0.57   
10           0        0        0   ...       -1.73    -0.97     0.57     0.57   
11           0        0        0   ...       -1.73    -0.97     0.57     0.57   
12           0        0        0   ...       -1.73    -0.97     0.57     0.57   
13           0        0        0   ...       -1.73    -0.97     0.57     0.57   
14           0        0        0   ...       -1.73    -0.97     0.57     0.57   
15           0        0        0   ...       -1.73    -0.97     0.57     0.57   
16           0        0        0   ...       -1.73    -0.97     0.57     0.57   
17           0        0        0   ...       -1.73    -0.97     0.57     0.57   
18           0        0        0   ...       -1.73    -0.97     0.57     0.57   
19           0        0        0   ...       -1.73    -0.97     0.57     0.57   
20           0        0        0   ...       -1.73    -0.97     0.57     0.57   
21           0        0        0   ...       -1.73    -0.97     0.57     0.57   
22           0        0        0   ...       -1.73    -0.97     0.57     0.57   
23           0        0        0   ...       -1.73    -0.97     0.57     0.57   
24           0        0        0   ...       -1.73    -0.97     0.57     0.57   
25           0        0        0   ...       -1.73    -0.97     0.57     0.57   
26           0        0        0   ...       -1.73    -0.97     0.57     0.57   
27           0        0        0   ...       -1.73    -0.97     0.57     0.57   
28           0        0        0   ...       -1.73    -0.97     0.57     0.57   
29           0        0        0   ...       -1.73    -0.97     0.57     0.57   
...        ...      ...      ...   ...         ...      ...      ...      ...   
22770        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22771        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22772        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22773        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22774        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22775        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22776        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22777        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22778        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22779        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22780        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22781        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22782        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22783        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22784        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22785        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22786        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22787        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22788        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22789        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22790        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22791        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22792        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22793        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22794        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22795        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22796        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22797        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22798        0        0        0   ...       -2.53     1.41    -1.03     0.57   
22799        0        0        0   ...       -2.53     1.41    -1.03     0.57   

       m10z3p3  m10z3p4  m10z3p5  m10z3p6  m10z3p7  m10z3p8  
0         2.36    -3.14    -3.44    -1.03     0.09     4.13  
1         2.36    -3.14    -3.44    -1.03     0.09     4.13  
2         2.36    -3.14    -3.44    -1.03     0.09     4.13  
3         2.36    -3.14    -3.44    -1.03     0.09     4.13  
4         2.36    -3.14    -3.44    -1.03     0.09     4.13  
5         2.36    -3.14    -3.44    -1.03     0.09     4.13  
6         2.36    -3.14    -3.44    -1.03     0.09     4.13  
7         2.36    -3.14    -3.44    -1.03     0.09     4.13  
8         2.36    -3.14    -3.44    -1.03     0.09     4.13  
9         2.36    -3.14    -3.44    -1.03     0.09     4.13  
10        2.36    -3.14    -3.44    -1.03     0.09     4.13  
11        2.36    -3.14    -3.44    -1.03     0.09     4.13  
12        2.36    -3.14    -3.44    -1.03     0.09     4.13  
13        2.36    -3.14    -3.44    -1.03     0.09     4.13  
14        2.36    -3.14    -3.44    -1.03     0.09     4.13  
15        2.36    -3.14    -3.44    -1.03     0.09     4.13  
16        2.36    -3.14    -3.44    -1.03     0.09     4.13  
17        2.36    -3.14    -3.44    -1.03     0.09     4.13  
18        2.36    -3.14    -3.44    -1.03     0.09     4.13  
19        2.36    -3.14    -3.44    -1.03     0.09     4.13  
20        2.36    -3.14    -3.44    -1.03     0.09     4.13  
21        2.36    -3.14    -3.44    -1.03     0.09     4.13  
22        2.36    -3.14    -3.44    -1.03     0.09     4.13  
23        2.36    -3.14    -3.44    -1.03     0.09     4.13  
24        2.36    -3.14    -3.44    -1.03     0.09     4.13  
25        2.36    -3.14    -3.44    -1.03     0.09     4.13  
26        2.36    -3.14    -3.44    -1.03     0.09     4.13  
27        2.36    -3.14    -3.44    -1.03     0.09     4.13  
28        2.36    -3.14    -3.44    -1.03     0.09     4.13  
29        2.36    -3.14    -3.44    -1.03     0.09     4.13  
...        ...      ...      ...      ...      ...      ...  
22770     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22771     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22772     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22773     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22774     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22775     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22776     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22777     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22778     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22779     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22780     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22781     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22782     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22783     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22784     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22785     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22786     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22787     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22788     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22789     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22790     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22791     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22792     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22793     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22794     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22795     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22796     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22797     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22798     2.23     0.09     0.30    -1.03    -1.29    -3.14  
22799     2.23     0.09     0.30    -1.03    -1.29    -3.14  

[22800 rows x 937 columns]



In [67]:

    
data.summary









    



---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-67-44419227e2a1> in <module>()
----> 1 data.summary

C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\pandas\core\generic.py in __getattr__(self, name)
   1945                 return self[name]
   1946             raise AttributeError("'%s' object has no attribute '%s'" %
-> 1947                                  (type(self).__name__, name))
   1948 
   1949     def __setattr__(self, name, value):

AttributeError: 'DataFrame' object has no attribute 'summary'



In [68]:

    
data.count()









    Out[68]:





Activity    22800
Compound    22800
SubFPC1     22800
SubFPC2     22800
SubFPC3     22800
SubFPC4     22800
SubFPC5     22800
SubFPC6     22800
SubFPC7     22800
SubFPC8     22800
SubFPC9     22800
SubFPC10    22800
SubFPC11    22800
SubFPC12    22800
SubFPC13    22800
...
m10z2p2    22800
m10z2p3    22800
m10z2p4    22800
m10z2p5    22800
m10z2p6    22800
m10z2p7    22800
m10z2p8    22800
m10z3p1    22800
m10z3p2    22800
m10z3p3    22800
m10z3p4    22800
m10z3p5    22800
m10z3p6    22800
m10z3p7    22800
m10z3p8    22800
Length: 937, dtype: int64



In [68]:

    
data.plot()









    Out[68]:





<matplotlib.axes._subplots.AxesSubplot at 0xf1210b8>






    



Traceback (most recent call last):

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\IPython\kernel\zmq\ipkernel.py", line 181, in do_execute
    shell.run_cell(code, store_history=store_history, silent=silent)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\IPython\core\interactiveshell.py", line 2874, in run_cell
    self.events.trigger('post_execute')

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\IPython\core\events.py", line 74, in trigger
    func(*args, **kwargs)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\IPython\kernel\zmq\pylab\backend_inline.py", line 109, in flush_figures
    return show(True)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\IPython\kernel\zmq\pylab\backend_inline.py", line 32, in show
    display(figure_manager.canvas.figure)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\IPython\core\display.py", line 159, in display
    format_dict, md_dict = format(obj, include=include, exclude=exclude)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\IPython\core\formatters.py", line 179, in format
    data = formatter(obj)

  File "<string>", line 2, in __call__

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\IPython\core\formatters.py", line 224, in catch_format_error
    r = method(self, *args, **kwargs)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\IPython\core\formatters.py", line 335, in __call__
    return printer(obj)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\IPython\core\pylabtools.py", line 207, in <lambda>
    png_formatter.for_type(Figure, lambda fig: print_figure(fig, 'png', **kwargs))

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\IPython\core\pylabtools.py", line 117, in print_figure
    fig.canvas.print_figure(bytes_io, **kw)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\matplotlib\backend_bases.py", line 2211, in print_figure
    **kwargs)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\matplotlib\backends\backend_agg.py", line 521, in print_png
    FigureCanvasAgg.draw(self)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\matplotlib\backends\backend_agg.py", line 469, in draw
    self.figure.draw(self.renderer)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\matplotlib\artist.py", line 59, in draw_wrapper
    draw(artist, renderer, *args, **kwargs)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\matplotlib\figure.py", line 1085, in draw
    func(*args)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\matplotlib\artist.py", line 59, in draw_wrapper
    draw(artist, renderer, *args, **kwargs)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\matplotlib\axes\_base.py", line 2110, in draw
    a.draw(renderer)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\matplotlib\artist.py", line 59, in draw_wrapper
    draw(artist, renderer, *args, **kwargs)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\matplotlib\legend.py", line 450, in draw
    bbox = self._legend_box.get_window_extent(renderer)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\matplotlib\offsetbox.py", line 257, in get_window_extent
    px, py = self.get_offset(w, h, xd, yd, renderer)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\matplotlib\offsetbox.py", line 210, in get_offset
    return self._offset(width, height, xdescent, ydescent, renderer)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\matplotlib\legend.py", line 413, in _findoffset_best
    ox, oy = self._find_best_position(width, height, renderer)

  File "C:\Users\Saw\Anaconda\envs\py33\lib\site-packages\matplotlib\legend.py", line 920, in _find_best_position
    badness += legendBox.count_overlaps(bboxes)

KeyboardInterrupt



In [69]:

    
data.std()









    



ERROR! Session/line number was not unique in database. History logging moved to new session 48






    Out[69]:





SubFPC1     1.312614
SubFPC2     2.125932
SubFPC3     0.652571
SubFPC4     0.264112
SubFPC5     0.310876
SubFPC6     0.076452
SubFPC7     0.000000
SubFPC8     0.099035
SubFPC9     0.593542
SubFPC10    0.014807
SubFPC11    0.011470
SubFPC12    0.479692
SubFPC13    0.223287
SubFPC14    0.312169
SubFPC15    0.177969
...
m10z2p2    0.676983
m10z2p3    0.685472
m10z2p4    1.620639
m10z2p5    2.262299
m10z2p6    1.301798
m10z2p7    1.470836
m10z2p8    0.492020
m10z3p1    0.962919
m10z3p2    1.761124
m10z3p3    0.445035
m10z3p4    1.744491
m10z3p5    1.629621
m10z3p6    0.648999
m10z3p7    0.322479
m10z3p8    1.949039
Length: 934, dtype: float64



In [71]:

    
print(data.std() < 0.1)









    



SubFPC1     False
SubFPC2     False
SubFPC3     False
SubFPC4     False
SubFPC5     False
SubFPC6      True
SubFPC7      True
SubFPC8      True
SubFPC9     False
SubFPC10     True
SubFPC11     True
SubFPC12    False
SubFPC13    False
SubFPC14    False
SubFPC15    False
...
m10z2p2    False
m10z2p3    False
m10z2p4    False
m10z2p5    False
m10z2p6    False
m10z2p7    False
m10z2p8    False
m10z3p1    False
m10z3p2    False
m10z3p3    False
m10z3p4    False
m10z3p5    False
m10z3p6    False
m10z3p7    False
m10z3p8    False
Length: 934, dtype: bool



In [72]:

    
%time
data = pd.read_csv('HD_data.csv')









    



Wall time: 0 ns



In [ ]:

	Activity	Compound	SubFPC1	SubFPC2	SubFPC3	SubFPC5	...	m10z2p7	m10z2p8	m10z3p1	m10z3p2	m10z3p3	m10z3p4	m10z3p5	m10z3p6	m10z3p7	m10z3p8
0	Active	CHEMBL2000089	0	9	0	0	...	-1.73	-0.97	0.57	0.57	2.36	-3.14	-3.44	-1.03	0.09	4.13
1	Active	CHEMBL99	2	0	1	2	...	-1.73	-0.97	0.57	0.57	2.36	-3.14	-3.44	-1.03	0.09	4.13
2	Active	CHEMBL2047701	0	1	0	0	...	-1.73	-0.97	0.57	0.57	2.36	-3.14	-3.44	-1.03	0.09	4.13
3	Active	CHEMBL343448	5	2	2	1	...	-1.73	-0.97	0.57	0.57	2.36	-3.14	-3.44	-1.03	0.09	4.13
4	Active	CHEMBL360194	2	8	0	0	...	-1.73	-0.97	0.57	0.57	2.36	-3.14	-3.44	-1.03	0.09	4.13