notebook.community

Edit and run



In [1]:

    
from __future__ import division

import os
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline



In [30]:

    
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprint



In [3]:

    
txt_dir = "chembl_source"



In [4]:

    
# read all chembl bioactivity records
chembl = pd.read_csv(os.path.join(txt_dir, "inhibitor_2017_06_08.csv"), delimiter="\t")
chembl.shape









    



/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (1,3,6,7,8,9,11,14,16,19,23,27,31,34,35,38,44,48,50,52,53,54,55,56,57,58) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)






    Out[4]:





(1235867, 59)



In [5]:

    
# Remove records has no canonical smiles
m = chembl["CANONICAL_SMILES"].isnull()
chembl = chembl[~m]
chembl.shape









    Out[5]:





(1230260, 59)



In [21]:

    
# save inhibitors' smiles and apfp
smiles = chembl[["CMPD_CHEMBLID", "CANONICAL_SMILES"]].copy()
smiles.drop_duplicates(subset="CMPD_CHEMBLID", inplace=True)
smiles.set_index(keys="CMPD_CHEMBLID", drop=True, inplace=True)
smiles.to_csv(txt_dir + "/inhibitor_smiles.csv")



In [31]:

    
def dict_2_str(d):
  keylist = d.keys()
  keylist.sort()
  kv_list = ["{}: {}".format(k, d[k]) for k in keylist] 
  return ", ".join(kv_list)

apfp_file = open(txt_dir + "/inhibitor_apfp.csv", "w")
for id_, row in smiles.iterrows():
    m = Chem.MolFromSmiles(row.values[0])
    if m is None:
        print id_
        continue
    apfps = GetAtomPairFingerprint(Chem.RemoveHs(m)).GetNonzeroElements()
    apfp_file.write("%s\t{%s}\n" % (id_, dict_2_str(apfps)))
apfp_file.close()









    



CHEMBL1161633
CHEMBL2097021
CHEMBL471869
CHEMBL1161635
CHEMBL181124
CHEMBL1161637
CHEMBL181880
CHEMBL3593577
CHEMBL450200
CMPD_CHEMBLID
CHEMBL450642
CHEMBL2205792
CHEMBL2205793
CHEMBL490121
CHEMBL523281
CHEMBL463327
CHEMBL522826
CHEMBL2205790
CHEMBL495469
CHEMBL2205791
CHEMBL492602
CHEMBL2205788
CHEMBL2205787
CHEMBL452133
CHEMBL2205785
CHEMBL508580
CHEMBL508803
CHEMBL2205789
CHEMBL2205786
CHEMBL493431
CHEMBL2087763
CHEMBL2087764
CHEMBL2179461
CHEMBL2179458
CHEMBL2179464
CHEMBL2179462
CHEMBL2179459
CHEMBL2179463
CHEMBL1083554
CHEMBL2179460
CHEMBL3327018



In [6]:

    
# calculate some molecules's weight
def molwt(x):
    try:
        value = Chem.Descriptors.MolWt(Chem.MolFromSmiles(x))
    except:
        value = np.nan
    return value

m = chembl["MOLWEIGHT"].isnull()
chembl.loc[m, "MOLWEIGHT"] = chembl.loc[m, "CANONICAL_SMILES"].apply(molwt)



In [8]:

    
# remove molecules that has no "MOLWEIGHT"
m = chembl["MOLWEIGHT"].isnull()
chembl = chembl[~m]
chembl.shape









    Out[8]:





(1223639, 59)



In [32]:

    
# pick out inhibitor records
inhibitor = chembl[chembl["STANDARD_TYPE"].isin(["IC50", "Ki", "EC50"])]

# inhibitor records: all IC50, a part of Ki and EC50 with "inhibit" in "DESCRIPTION"
m0 = inhibitor["STANDARD_TYPE"].isin(["IC50"]) 
m1 = inhibitor["STANDARD_TYPE"].isin(["Ki", "EC50"]) 
m2 = inhibitor["DESCRIPTION"].apply(lambda x: "inhibit" in x.lower())
m = m0 | (m1 & m2)

inhibitor = inhibitor[m]
inhibitor.shape









    Out[32]:





(835299, 59)



In [33]:

    
# some records without "STANDARD_VALUE" should be cleared away
m = inhibitor["STANDARD_VALUE"].isnull()
inhibitor = inhibitor[~m]
inhibitor.shape









    Out[33]:





(716442, 59)



In [34]:

    
inhibitor["DATA_VALIDITY_COMMENT"].value_counts()









    Out[34]:





Outside typical range            26411
Potential transcription error      378
Non standard unit for type         370
Manually validated                 163
Name: DATA_VALIDITY_COMMENT, dtype: int64



In [35]:

    
# some records with abnormal data also should be cleared away
#error_comment = ["Outside typical range", "Non standard unit for type", "Potential transcription error"]
error_comment = ["Outside typical range"]
m = inhibitor["DATA_VALIDITY_COMMENT"].isin(error_comment)
inhibitor = inhibitor[~m]
inhibitor.shape









    Out[35]:





(690031, 59)



In [36]:

    
# correct some STANDARD_UNITS
m = inhibitor["STANDARD_UNITS"].isin(["/uM"])
inhibitor.loc[m, "STANDARD_VALUE"] = inhibitor.loc[m, "STANDARD_VALUE"].astype(float).values * 1000
inhibitor.loc[m, "STANDARD_UNITS"] = "nM"

m = inhibitor["STANDARD_UNITS"].isin(["/nM", "ug nM-1", "Ke nM-1"])
inhibitor.loc[m, "STANDARD_UNITS"] = "nM"

m = inhibitor["STANDARD_UNITS"].isin(["ug.mL-1"])
inhibitor.loc[m, "STANDARD_VALUE"] = inhibitor.loc[m, "STANDARD_VALUE"].astype(float) / inhibitor.loc[m, "MOLWEIGHT"].astype(float) * 10**6
inhibitor.loc[m, "STANDARD_UNITS"] = "nM"

m = inhibitor["STANDARD_UNITS"].isin(["nM"])
inhibitor = inhibitor[m]
inhibitor.shape









    Out[36]:





(689725, 59)



In [37]:

    
# remove duplicates
m = inhibitor["POTENTIAL_DUPLICATE"].fillna(0).astype(int) == 0
inhibitor = inhibitor[m]
inhibitor.shape









    Out[37]:





(662788, 59)



In [39]:

    
inhibitor.to_csv(txt_dir + "/inhibitor_clean_2017_06_08.csv", index=False)



In [ ]:



In [ ]:

    
# judge a record's clf label
def is_pos(row):
  r = row["RELATION"]
  v = np.float32(row["STANDARD_VALUE"])
  if r == "<" or r == "<=":
    return 1 if v <= 10000 else np.nan
  elif r == ">" or r == ">=":
    return -1 if v >= 10000 else np.nan
  elif r == "=":
    return 1 if v <= 10000 else -1
  else:
    return np.nan



In [89]:

    
inhibitor["CLF_LABEL"] = inhibitor.apply(is_pos, axis=1)
inhibitor = inhibitor[~inhibitor["CLF_LABEL"].isnull()]
inhibitor.loc[:, "YEAR"] = inhibitor.loc[:, "YEAR"].astype(float)



In [131]:

    
# group
grouped = inhibitor.groupby(by=["TARGET_CHEMBLID", "PREF_NAME", "CMPD_CHEMBLID"], as_index=False)
# judge one molecule's label by the average label
clf_label = grouped[["CLF_LABEL", "YEAR"]].mean()
clf_label









    Out[131]:






  
    
      
      TARGET_CHEMBLID
      PREF_NAME
      CMPD_CHEMBLID
      CLF_LABEL
      YEAR
    
  
  
    
      0
      CHEMBL1075092
      Glycine receptor subunit alpha-3
      CHEMBL1092618
      -1.0
      2010.0
    
    
      1
      CHEMBL1075092
      Glycine receptor subunit alpha-3
      CHEMBL1092619
      -1.0
      2010.0
    
    
      2
      CHEMBL1075092
      Glycine receptor subunit alpha-3
      CHEMBL1093582
      -1.0
      2010.0
    
    
      3
      CHEMBL1075092
      Glycine receptor subunit alpha-3
      CHEMBL1093848
      -1.0
      2010.0
    
    
      4
      CHEMBL1075092
      Glycine receptor subunit alpha-3
      CHEMBL2398350
      -1.0
      2013.0
    
    
      5
      CHEMBL1075092
      Glycine receptor subunit alpha-3
      CHEMBL2398352
      -1.0
      2013.0
    
    
      6
      CHEMBL1075092
      Glycine receptor subunit alpha-3
      CHEMBL464651
      1.0
      2010.0
    
    
      7
      CHEMBL1075097
      Arginase-1
      CHEMBL1099169
      1.0
      2010.0
    
    
      8
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3714817
      1.0
      NaN
    
    
      9
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3714879
      1.0
      NaN
    
    
      10
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3714885
      1.0
      NaN
    
    
      11
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3714909
      1.0
      NaN
    
    
      12
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3714960
      1.0
      NaN
    
    
      13
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3714970
      1.0
      NaN
    
    
      14
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3715004
      1.0
      NaN
    
    
      15
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3715017
      1.0
      NaN
    
    
      16
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3715077
      1.0
      NaN
    
    
      17
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3715155
      1.0
      NaN
    
    
      18
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3715174
      1.0
      NaN
    
    
      19
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3715218
      1.0
      NaN
    
    
      20
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3715359
      1.0
      NaN
    
    
      21
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3715361
      1.0
      NaN
    
    
      22
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3715375
      1.0
      NaN
    
    
      23
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3715394
      1.0
      NaN
    
    
      24
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3715478
      1.0
      NaN
    
    
      25
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3715536
      1.0
      NaN
    
    
      26
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3715558
      1.0
      NaN
    
    
      27
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3715572
      1.0
      NaN
    
    
      28
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3715577
      1.0
      NaN
    
    
      29
      CHEMBL1075101
      G-protein coupled receptor 81
      CHEMBL3715599
      1.0
      NaN
    
    
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      542203
      CHEMBL6175
      Lysine-specific demethylase 4C
      CHEMBL3786862
      1.0
      2016.0
    
    
      542204
      CHEMBL6175
      Lysine-specific demethylase 4C
      CHEMBL3786952
      1.0
      2016.0
    
    
      542205
      CHEMBL6175
      Lysine-specific demethylase 4C
      CHEMBL3786963
      1.0
      2016.0
    
    
      542206
      CHEMBL6175
      Lysine-specific demethylase 4C
      CHEMBL3787020
      1.0
      2016.0
    
    
      542207
      CHEMBL6175
      Lysine-specific demethylase 4C
      CHEMBL3787044
      1.0
      2016.0
    
    
      542208
      CHEMBL6175
      Lysine-specific demethylase 4C
      CHEMBL3787133
      1.0
      2016.0
    
    
      542209
      CHEMBL6175
      Lysine-specific demethylase 4C
      CHEMBL3787193
      1.0
      2016.0
    
    
      542210
      CHEMBL6175
      Lysine-specific demethylase 4C
      CHEMBL3787438
      1.0
      2016.0
    
    
      542211
      CHEMBL6175
      Lysine-specific demethylase 4C
      CHEMBL3787516
      1.0
      2016.0
    
    
      542212
      CHEMBL6175
      Lysine-specific demethylase 4C
      CHEMBL3787534
      1.0
      2016.0
    
    
      542213
      CHEMBL6175
      Lysine-specific demethylase 4C
      CHEMBL3787548
      1.0
      2016.0
    
    
      542214
      CHEMBL6175
      Lysine-specific demethylase 4C
      CHEMBL3787556
      1.0
      2016.0
    
    
      542215
      CHEMBL6175
      Lysine-specific demethylase 4C
      CHEMBL3787664
      1.0
      2016.0
    
    
      542216
      CHEMBL6175
      Lysine-specific demethylase 4C
      CHEMBL3787669
      1.0
      2016.0
    
    
      542217
      CHEMBL6177
      NAD kinase
      CHEMBL233434
      -1.0
      2008.0
    
    
      542218
      CHEMBL6177
      NAD kinase
      CHEMBL538665
      -1.0
      2009.0
    
    
      542219
      CHEMBL6177
      NAD kinase
      CHEMBL560315
      1.0
      2009.0
    
    
      542220
      CHEMBL6177
      NAD kinase
      CHEMBL561654
      -1.0
      2009.0
    
    
      542221
      CHEMBL6177
      NAD kinase
      CHEMBL562056
      1.0
      2009.0
    
    
      542222
      CHEMBL6186
      Serine/threonine-protein kinase Sgk3
      CHEMBL2333365
      -1.0
      2013.0
    
    
      542223
      CHEMBL6186
      Serine/threonine-protein kinase Sgk3
      CHEMBL3092460
      -1.0
      2015.0
    
    
      542224
      CHEMBL6186
      Serine/threonine-protein kinase Sgk3
      CHEMBL3092468
      1.0
      2015.0
    
    
      542225
      CHEMBL6186
      Serine/threonine-protein kinase Sgk3
      CHEMBL3745885
      -1.0
      2016.0
    
    
      542226
      CHEMBL6195
      Ubiquitin carboxyl-terminal hydrolase isozyme L3
      CHEMBL1190585
      -1.0
      2007.0
    
    
      542227
      CHEMBL6195
      Ubiquitin carboxyl-terminal hydrolase isozyme L3
      CHEMBL1241028
      1.0
      2007.0
    
    
      542228
      CHEMBL6195
      Ubiquitin carboxyl-terminal hydrolase isozyme L3
      CHEMBL1241672
      -1.0
      2007.0
    
    
      542229
      CHEMBL6195
      Ubiquitin carboxyl-terminal hydrolase isozyme L3
      CHEMBL1241673
      -1.0
      2007.0
    
    
      542230
      CHEMBL6195
      Ubiquitin carboxyl-terminal hydrolase isozyme L3
      CHEMBL1241765
      -1.0
      2007.0
    
    
      542231
      CHEMBL6195
      Ubiquitin carboxyl-terminal hydrolase isozyme L3
      CHEMBL1241766
      -1.0
      2007.0
    
    
      542232
      CHEMBL6195
      Ubiquitin carboxyl-terminal hydrolase isozyme L3
      CHEMBL590
      -1.0
      2007.0
    
  

542233 rows × 5 columns



In [105]:

    
clf_label.to_csv(txt_dir + "/inhibitor_clf_label.csv")



In [ ]:



In [144]:

    
cancer_approved_target = ["CHEMBL279", "CHEMBL203", "CHEMBL333", "CHEMBL325", "CHEMBL267", "CHEMBL2842"]
cancer_clinical_target = ["CHEMBL340", "CHEMBL4005", "CHEMBL332"]



In [158]:

    
for target in cancer_approved_target + cancer_clinical_target:
    df = clf_label[clf_label["TARGET_CHEMBLID"] == target]
    df.to_csv(txt_dir + "/%s_clf_label.csv" % target, index=False)



In [ ]:

	TARGET_CHEMBLID	PREF_NAME	CMPD_CHEMBLID	CLF_LABEL	YEAR
0	CHEMBL1075092	Glycine receptor subunit alpha-3	CHEMBL1092618	-1.0	2010.0
1	CHEMBL1075092	Glycine receptor subunit alpha-3	CHEMBL1092619	-1.0	2010.0
2	CHEMBL1075092	Glycine receptor subunit alpha-3	CHEMBL1093582	-1.0	2010.0
3	CHEMBL1075092	Glycine receptor subunit alpha-3	CHEMBL1093848	-1.0	2010.0
4	CHEMBL1075092	Glycine receptor subunit alpha-3	CHEMBL2398350	-1.0	2013.0
5	CHEMBL1075092	Glycine receptor subunit alpha-3	CHEMBL2398352	-1.0	2013.0
6	CHEMBL1075092	Glycine receptor subunit alpha-3	CHEMBL464651	1.0	2010.0
7	CHEMBL1075097	Arginase-1	CHEMBL1099169	1.0	2010.0
8	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3714817	1.0	NaN
9	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3714879	1.0	NaN
10	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3714885	1.0	NaN
11	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3714909	1.0	NaN
12	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3714960	1.0	NaN
13	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3714970	1.0	NaN
14	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3715004	1.0	NaN
15	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3715017	1.0	NaN
16	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3715077	1.0	NaN
17	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3715155	1.0	NaN
18	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3715174	1.0	NaN
19	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3715218	1.0	NaN
20	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3715359	1.0	NaN
21	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3715361	1.0	NaN
22	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3715375	1.0	NaN
23	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3715394	1.0	NaN
24	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3715478	1.0	NaN
25	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3715536	1.0	NaN
26	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3715558	1.0	NaN
27	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3715572	1.0	NaN
28	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3715577	1.0	NaN
29	CHEMBL1075101	G-protein coupled receptor 81	CHEMBL3715599	1.0	NaN
...	...	...	...	...	...
542203	CHEMBL6175	Lysine-specific demethylase 4C	CHEMBL3786862	1.0	2016.0
542204	CHEMBL6175	Lysine-specific demethylase 4C	CHEMBL3786952	1.0	2016.0
542205	CHEMBL6175	Lysine-specific demethylase 4C	CHEMBL3786963	1.0	2016.0
542206	CHEMBL6175	Lysine-specific demethylase 4C	CHEMBL3787020	1.0	2016.0
542207	CHEMBL6175	Lysine-specific demethylase 4C	CHEMBL3787044	1.0	2016.0
542208	CHEMBL6175	Lysine-specific demethylase 4C	CHEMBL3787133	1.0	2016.0
542209	CHEMBL6175	Lysine-specific demethylase 4C	CHEMBL3787193	1.0	2016.0
542210	CHEMBL6175	Lysine-specific demethylase 4C	CHEMBL3787438	1.0	2016.0
542211	CHEMBL6175	Lysine-specific demethylase 4C	CHEMBL3787516	1.0	2016.0
542212	CHEMBL6175	Lysine-specific demethylase 4C	CHEMBL3787534	1.0	2016.0
542213	CHEMBL6175	Lysine-specific demethylase 4C	CHEMBL3787548	1.0	2016.0
542214	CHEMBL6175	Lysine-specific demethylase 4C	CHEMBL3787556	1.0	2016.0
542215	CHEMBL6175	Lysine-specific demethylase 4C	CHEMBL3787664	1.0	2016.0
542216	CHEMBL6175	Lysine-specific demethylase 4C	CHEMBL3787669	1.0	2016.0
542217	CHEMBL6177	NAD kinase	CHEMBL233434	-1.0	2008.0
542218	CHEMBL6177	NAD kinase	CHEMBL538665	-1.0	2009.0
542219	CHEMBL6177	NAD kinase	CHEMBL560315	1.0	2009.0
542220	CHEMBL6177	NAD kinase	CHEMBL561654	-1.0	2009.0
542221	CHEMBL6177	NAD kinase	CHEMBL562056	1.0	2009.0
542222	CHEMBL6186	Serine/threonine-protein kinase Sgk3	CHEMBL2333365	-1.0	2013.0
542223	CHEMBL6186	Serine/threonine-protein kinase Sgk3	CHEMBL3092460	-1.0	2015.0
542224	CHEMBL6186	Serine/threonine-protein kinase Sgk3	CHEMBL3092468	1.0	2015.0
542225	CHEMBL6186	Serine/threonine-protein kinase Sgk3	CHEMBL3745885	-1.0	2016.0
542226	CHEMBL6195	Ubiquitin carboxyl-terminal hydrolase isozyme L3	CHEMBL1190585	-1.0	2007.0
542227	CHEMBL6195	Ubiquitin carboxyl-terminal hydrolase isozyme L3	CHEMBL1241028	1.0	2007.0
542228	CHEMBL6195	Ubiquitin carboxyl-terminal hydrolase isozyme L3	CHEMBL1241672	-1.0	2007.0
542229	CHEMBL6195	Ubiquitin carboxyl-terminal hydrolase isozyme L3	CHEMBL1241673	-1.0	2007.0
542230	CHEMBL6195	Ubiquitin carboxyl-terminal hydrolase isozyme L3	CHEMBL1241765	-1.0	2007.0
542231	CHEMBL6195	Ubiquitin carboxyl-terminal hydrolase isozyme L3	CHEMBL1241766	-1.0	2007.0
542232	CHEMBL6195	Ubiquitin carboxyl-terminal hydrolase isozyme L3	CHEMBL590	-1.0	2007.0