In [1]:
from __future__ import division

import os
import numpy as np
import pandas as pd
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline

In [30]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.AtomPairs.Pairs import GetAtomPairFingerprint

In [3]:
txt_dir = "chembl_source"

In [4]:
# read all chembl bioactivity records
chembl = pd.read_csv(os.path.join(txt_dir, "inhibitor_2017_06_08.csv"), delimiter="\t")
chembl.shape


/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py:2717: DtypeWarning: Columns (1,3,6,7,8,9,11,14,16,19,23,27,31,34,35,38,44,48,50,52,53,54,55,56,57,58) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
Out[4]:
(1235867, 59)

In [5]:
# Remove records has no canonical smiles
m = chembl["CANONICAL_SMILES"].isnull()
chembl = chembl[~m]
chembl.shape


Out[5]:
(1230260, 59)

In [21]:
# save inhibitors' smiles and apfp
smiles = chembl[["CMPD_CHEMBLID", "CANONICAL_SMILES"]].copy()
smiles.drop_duplicates(subset="CMPD_CHEMBLID", inplace=True)
smiles.set_index(keys="CMPD_CHEMBLID", drop=True, inplace=True)
smiles.to_csv(txt_dir + "/inhibitor_smiles.csv")

In [31]:
def dict_2_str(d):
  keylist = d.keys()
  keylist.sort()
  kv_list = ["{}: {}".format(k, d[k]) for k in keylist] 
  return ", ".join(kv_list)

apfp_file = open(txt_dir + "/inhibitor_apfp.csv", "w")
for id_, row in smiles.iterrows():
    m = Chem.MolFromSmiles(row.values[0])
    if m is None:
        print id_
        continue
    apfps = GetAtomPairFingerprint(Chem.RemoveHs(m)).GetNonzeroElements()
    apfp_file.write("%s\t{%s}\n" % (id_, dict_2_str(apfps)))
apfp_file.close()


CHEMBL1161633
CHEMBL2097021
CHEMBL471869
CHEMBL1161635
CHEMBL181124
CHEMBL1161637
CHEMBL181880
CHEMBL3593577
CHEMBL450200
CMPD_CHEMBLID
CHEMBL450642
CHEMBL2205792
CHEMBL2205793
CHEMBL490121
CHEMBL523281
CHEMBL463327
CHEMBL522826
CHEMBL2205790
CHEMBL495469
CHEMBL2205791
CHEMBL492602
CHEMBL2205788
CHEMBL2205787
CHEMBL452133
CHEMBL2205785
CHEMBL508580
CHEMBL508803
CHEMBL2205789
CHEMBL2205786
CHEMBL493431
CHEMBL2087763
CHEMBL2087764
CHEMBL2179461
CHEMBL2179458
CHEMBL2179464
CHEMBL2179462
CHEMBL2179459
CHEMBL2179463
CHEMBL1083554
CHEMBL2179460
CHEMBL3327018

In [6]:
# calculate some molecules's weight
def molwt(x):
    try:
        value = Chem.Descriptors.MolWt(Chem.MolFromSmiles(x))
    except:
        value = np.nan
    return value

m = chembl["MOLWEIGHT"].isnull()
chembl.loc[m, "MOLWEIGHT"] = chembl.loc[m, "CANONICAL_SMILES"].apply(molwt)

In [8]:
# remove molecules that has no "MOLWEIGHT"
m = chembl["MOLWEIGHT"].isnull()
chembl = chembl[~m]
chembl.shape


Out[8]:
(1223639, 59)

In [32]:
# pick out inhibitor records
inhibitor = chembl[chembl["STANDARD_TYPE"].isin(["IC50", "Ki", "EC50"])]

# inhibitor records: all IC50, a part of Ki and EC50 with "inhibit" in "DESCRIPTION"
m0 = inhibitor["STANDARD_TYPE"].isin(["IC50"]) 
m1 = inhibitor["STANDARD_TYPE"].isin(["Ki", "EC50"]) 
m2 = inhibitor["DESCRIPTION"].apply(lambda x: "inhibit" in x.lower())
m = m0 | (m1 & m2)

inhibitor = inhibitor[m]
inhibitor.shape


Out[32]:
(835299, 59)

In [33]:
# some records without "STANDARD_VALUE" should be cleared away
m = inhibitor["STANDARD_VALUE"].isnull()
inhibitor = inhibitor[~m]
inhibitor.shape


Out[33]:
(716442, 59)

In [34]:
inhibitor["DATA_VALIDITY_COMMENT"].value_counts()


Out[34]:
Outside typical range            26411
Potential transcription error      378
Non standard unit for type         370
Manually validated                 163
Name: DATA_VALIDITY_COMMENT, dtype: int64

In [35]:
# some records with abnormal data also should be cleared away
#error_comment = ["Outside typical range", "Non standard unit for type", "Potential transcription error"]
error_comment = ["Outside typical range"]
m = inhibitor["DATA_VALIDITY_COMMENT"].isin(error_comment)
inhibitor = inhibitor[~m]
inhibitor.shape


Out[35]:
(690031, 59)

In [36]:
# correct some STANDARD_UNITS
m = inhibitor["STANDARD_UNITS"].isin(["/uM"])
inhibitor.loc[m, "STANDARD_VALUE"] = inhibitor.loc[m, "STANDARD_VALUE"].astype(float).values * 1000
inhibitor.loc[m, "STANDARD_UNITS"] = "nM"

m = inhibitor["STANDARD_UNITS"].isin(["/nM", "ug nM-1", "Ke nM-1"])
inhibitor.loc[m, "STANDARD_UNITS"] = "nM"

m = inhibitor["STANDARD_UNITS"].isin(["ug.mL-1"])
inhibitor.loc[m, "STANDARD_VALUE"] = inhibitor.loc[m, "STANDARD_VALUE"].astype(float) / inhibitor.loc[m, "MOLWEIGHT"].astype(float) * 10**6
inhibitor.loc[m, "STANDARD_UNITS"] = "nM"

m = inhibitor["STANDARD_UNITS"].isin(["nM"])
inhibitor = inhibitor[m]
inhibitor.shape


Out[36]:
(689725, 59)

In [37]:
# remove duplicates
m = inhibitor["POTENTIAL_DUPLICATE"].fillna(0).astype(int) == 0
inhibitor = inhibitor[m]
inhibitor.shape


Out[37]:
(662788, 59)

In [39]:
inhibitor.to_csv(txt_dir + "/inhibitor_clean_2017_06_08.csv", index=False)

In [ ]:


In [ ]:
# judge a record's clf label
def is_pos(row):
  r = row["RELATION"]
  v = np.float32(row["STANDARD_VALUE"])
  if r == "<" or r == "<=":
    return 1 if v <= 10000 else np.nan
  elif r == ">" or r == ">=":
    return -1 if v >= 10000 else np.nan
  elif r == "=":
    return 1 if v <= 10000 else -1
  else:
    return np.nan

In [89]:
inhibitor["CLF_LABEL"] = inhibitor.apply(is_pos, axis=1)
inhibitor = inhibitor[~inhibitor["CLF_LABEL"].isnull()]
inhibitor.loc[:, "YEAR"] = inhibitor.loc[:, "YEAR"].astype(float)

In [131]:
# group
grouped = inhibitor.groupby(by=["TARGET_CHEMBLID", "PREF_NAME", "CMPD_CHEMBLID"], as_index=False)
# judge one molecule's label by the average label
clf_label = grouped[["CLF_LABEL", "YEAR"]].mean()
clf_label


Out[131]:
TARGET_CHEMBLID PREF_NAME CMPD_CHEMBLID CLF_LABEL YEAR
0 CHEMBL1075092 Glycine receptor subunit alpha-3 CHEMBL1092618 -1.0 2010.0
1 CHEMBL1075092 Glycine receptor subunit alpha-3 CHEMBL1092619 -1.0 2010.0
2 CHEMBL1075092 Glycine receptor subunit alpha-3 CHEMBL1093582 -1.0 2010.0
3 CHEMBL1075092 Glycine receptor subunit alpha-3 CHEMBL1093848 -1.0 2010.0
4 CHEMBL1075092 Glycine receptor subunit alpha-3 CHEMBL2398350 -1.0 2013.0
5 CHEMBL1075092 Glycine receptor subunit alpha-3 CHEMBL2398352 -1.0 2013.0
6 CHEMBL1075092 Glycine receptor subunit alpha-3 CHEMBL464651 1.0 2010.0
7 CHEMBL1075097 Arginase-1 CHEMBL1099169 1.0 2010.0
8 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3714817 1.0 NaN
9 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3714879 1.0 NaN
10 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3714885 1.0 NaN
11 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3714909 1.0 NaN
12 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3714960 1.0 NaN
13 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3714970 1.0 NaN
14 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3715004 1.0 NaN
15 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3715017 1.0 NaN
16 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3715077 1.0 NaN
17 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3715155 1.0 NaN
18 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3715174 1.0 NaN
19 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3715218 1.0 NaN
20 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3715359 1.0 NaN
21 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3715361 1.0 NaN
22 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3715375 1.0 NaN
23 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3715394 1.0 NaN
24 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3715478 1.0 NaN
25 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3715536 1.0 NaN
26 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3715558 1.0 NaN
27 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3715572 1.0 NaN
28 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3715577 1.0 NaN
29 CHEMBL1075101 G-protein coupled receptor 81 CHEMBL3715599 1.0 NaN
... ... ... ... ... ...
542203 CHEMBL6175 Lysine-specific demethylase 4C CHEMBL3786862 1.0 2016.0
542204 CHEMBL6175 Lysine-specific demethylase 4C CHEMBL3786952 1.0 2016.0
542205 CHEMBL6175 Lysine-specific demethylase 4C CHEMBL3786963 1.0 2016.0
542206 CHEMBL6175 Lysine-specific demethylase 4C CHEMBL3787020 1.0 2016.0
542207 CHEMBL6175 Lysine-specific demethylase 4C CHEMBL3787044 1.0 2016.0
542208 CHEMBL6175 Lysine-specific demethylase 4C CHEMBL3787133 1.0 2016.0
542209 CHEMBL6175 Lysine-specific demethylase 4C CHEMBL3787193 1.0 2016.0
542210 CHEMBL6175 Lysine-specific demethylase 4C CHEMBL3787438 1.0 2016.0
542211 CHEMBL6175 Lysine-specific demethylase 4C CHEMBL3787516 1.0 2016.0
542212 CHEMBL6175 Lysine-specific demethylase 4C CHEMBL3787534 1.0 2016.0
542213 CHEMBL6175 Lysine-specific demethylase 4C CHEMBL3787548 1.0 2016.0
542214 CHEMBL6175 Lysine-specific demethylase 4C CHEMBL3787556 1.0 2016.0
542215 CHEMBL6175 Lysine-specific demethylase 4C CHEMBL3787664 1.0 2016.0
542216 CHEMBL6175 Lysine-specific demethylase 4C CHEMBL3787669 1.0 2016.0
542217 CHEMBL6177 NAD kinase CHEMBL233434 -1.0 2008.0
542218 CHEMBL6177 NAD kinase CHEMBL538665 -1.0 2009.0
542219 CHEMBL6177 NAD kinase CHEMBL560315 1.0 2009.0
542220 CHEMBL6177 NAD kinase CHEMBL561654 -1.0 2009.0
542221 CHEMBL6177 NAD kinase CHEMBL562056 1.0 2009.0
542222 CHEMBL6186 Serine/threonine-protein kinase Sgk3 CHEMBL2333365 -1.0 2013.0
542223 CHEMBL6186 Serine/threonine-protein kinase Sgk3 CHEMBL3092460 -1.0 2015.0
542224 CHEMBL6186 Serine/threonine-protein kinase Sgk3 CHEMBL3092468 1.0 2015.0
542225 CHEMBL6186 Serine/threonine-protein kinase Sgk3 CHEMBL3745885 -1.0 2016.0
542226 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 CHEMBL1190585 -1.0 2007.0
542227 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 CHEMBL1241028 1.0 2007.0
542228 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 CHEMBL1241672 -1.0 2007.0
542229 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 CHEMBL1241673 -1.0 2007.0
542230 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 CHEMBL1241765 -1.0 2007.0
542231 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 CHEMBL1241766 -1.0 2007.0
542232 CHEMBL6195 Ubiquitin carboxyl-terminal hydrolase isozyme L3 CHEMBL590 -1.0 2007.0

542233 rows × 5 columns


In [105]:
clf_label.to_csv(txt_dir + "/inhibitor_clf_label.csv")

In [ ]:


In [144]:
cancer_approved_target = ["CHEMBL279", "CHEMBL203", "CHEMBL333", "CHEMBL325", "CHEMBL267", "CHEMBL2842"]
cancer_clinical_target = ["CHEMBL340", "CHEMBL4005", "CHEMBL332"]

In [158]:
for target in cancer_approved_target + cancer_clinical_target:
    df = clf_label[clf_label["TARGET_CHEMBLID"] == target]
    df.to_csv(txt_dir + "/%s_clf_label.csv" % target, index=False)

In [ ]: