In [1]:
%reload_ext autoreload
%autoreload 2
import warnings
# Silence every Python warning for the rest of the session so the per-plate
# processing logs stay readable. NOTE(review): this also hides deprecation
# warnings from pandas/rdkit — consider narrowing the filter.
warnings.filterwarnings('ignore')

import os.path as op
from collections import Counter

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# from tabulate import tabulate

from rdkit.Chem import AllChem as Chem
from rdkit.Chem import Draw
# IPythonConsole is not referenced by name in the visible cells; presumably it
# is imported for its notebook-rendering side effect — TODO confirm.
from rdkit.Chem.Draw import IPythonConsole
# Font face and size assigned to RDKit's drawing options for atom labels.
Draw.DrawingOptions.atomLabelFontFace = "DejaVu Sans"
Draw.DrawingOptions.atomLabelFontSize = 18

# Project-local helpers: notebook utilities (progress bar) and RDKit tools.
from misc_tools import nb_tools as nbt  # , html_templates as html, apl_tools as apt
from rdkit_ipynb_tools import tools  # , bokeh_tools as bt, pipeline as p, clustering as cl

# Cell Painting processing pipeline (`cpp`) and its companion tools (`cpt`).
from cellpainting import processing as cpp, tools as cpt

import ipywidgets as ipyw
from IPython.core.display import HTML, display, clear_output  #, Javascript, display_png, clear_output, display

# Absolute path to a gzipped TSV export. It is not used by any of the cells
# visible in this notebook; presumably consumed by the imported project
# modules or by other notebooks — verify before removing.
COMAS = "/home/pahl/comas/share/export_data_b64.tsv.gz"


> interactive IPython session.
Loading BokehJS ...
misc_tools.apl_tools                          (commit: fb5de1f ( 2017-06-23 13:16:39 ))
rdkit_ipynb_tools.tools                       (commit: 2321c7a ( 2017-07-10 22:15:00 ))
- no local installation of JSME found, using web version.
> loaded Nim extension.
cellpainting.processing                       (commit: ed8df03 ( 2017-08-07 15:34:08 ))

Prepare References

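Load the reference plates, compute their activity profiles, and generate the references file (`references_act_prof.tsv`); the reference entries in the datastore are updated as well.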


In [2]:
# --- Configuration -----------------------------------------------------------
# Root directory with one sub-directory per "<plate>-<idx>" measurement.
REF_DIR = "/home/pahl/comas/projects/painting/references"
PLATE_NAMES = ["S0195", "S0198", "S0203"]  # "S0195", "S0198", "S0203"
# Date string per plate; it becomes the prefix of the "Plate" column below.
DATES = {"S0195": "170523", "S0198": "170516", "S0203": "170512"}

# Columns retained for the final references table (used in a later cell).
keep = [
    "Compound_Id", "Container_Id", "Producer", "Conc_uM",
    "Activity", "Rel_Cell_Count", "Pure_Flag", "Toxic",
    'Trivial_Name', 'Known_Act', 'Act_Profile',
    "Metadata_Well", "Plate", 'Smiles',
]
# Column subset without the annotation fields; not referenced elsewhere in
# the visible cells.
data_keep = [
    "Compound_Id", "Container_Id", "Producer", "Conc_uM",
    "Pure_Flag", "Activity", "Rel_Cell_Count", "Toxic",
    'Act_Profile', "Metadata_Well", "Plate", 'Smiles',
]

# --- Per-plate processing ----------------------------------------------------
# Every plate name is processed with the indices 1..4, in plate-major order.
plate_jobs = [(plate, idx) for plate in PLATE_NAMES for idx in range(1, 5)]
num_steps = len(plate_jobs)

ds_list = []
pb = nbt.ProgressbarJS()
for step, (plate, idx) in enumerate(plate_jobs, start=1):
    pb.update(100 * step / num_steps)
    path = op.join(REF_DIR, "{}-{}".format(plate, idx))
    print("\nProcessing plate {}-{} ...".format(plate, idx))

    # Each pipeline step returns the dataset that feeds the next one; the
    # call order matches the order of the log lines printed per plate.
    ds_plate = (
        cpp.load(op.join(path, "Results.tsv"))
        .group_on_well()
        .remove_skipped_echo_direct_transfer(op.join(path, "*_print.xml"))
        .well_type_from_position()
        .flag_toxic()
        .activity_profile()
        .join_layout_1536(plate, idx)
    )

    # Tag every row with "<date>-<plate>-<idx>" so rows stay traceable after
    # the per-plate frames are concatenated.
    ds_plate.data["Plate"] = "{}-{}-{}".format(DATES[plate], plate, idx)
    ds_list.append(ds_plate.data)
pb.done()


cellpainting.processing                       (commit: 7e77cd2 ( 2017-08-05 22:51:25 ))
 
 ETA: 
Processing plate S0195-1 ...
* load dataset:          ( 3456 | 1937)
* group on well:         (  384 | 1937)
* remove skipped:        (  384 | 1937)    (  0 skipped ())
* well type from pos:    (  384 | 1938)
* flag toxic:            (  384 | 1940)    ( 18 flagged)
* activity profile:      (  384 |    9)
- loading resource:                        (LAYOUTS)
* join layout 1536:      (  352 |   13)
Processing plate S0195-2 ...
* load dataset:          ( 3456 | 1937)
* group on well:         (  384 | 1937)
* remove skipped:        (  384 | 1937)    (  0 skipped ())
* well type from pos:    (  384 | 1938)
* flag toxic:            (  384 | 1940)    (  9 flagged)
* activity profile:      (  384 |    9)
* join layout 1536:      (  352 |   13)
Processing plate S0195-3 ...
* load dataset:          ( 3456 | 1937)
* group on well:         (  384 | 1937)
* remove skipped:        (  384 | 1937)    (  0 skipped ())
* well type from pos:    (  384 | 1938)
* flag toxic:            (  384 | 1940)    ( 12 flagged)
* activity profile:      (  384 |    9)
* join layout 1536:      (  352 |   13)
Processing plate S0195-4 ...
* load dataset:          ( 3456 | 1937)
* group on well:         (  384 | 1937)
* remove skipped:        (  384 | 1937)    (  0 skipped ())
* well type from pos:    (  384 | 1938)
* flag toxic:            (  384 | 1940)    ( 14 flagged)
* activity profile:      (  384 |    9)
* join layout 1536:      (  352 |   13)
Processing plate S0198-1 ...
* load dataset:          ( 3456 | 1937)
* group on well:         (  384 | 1937)
* remove skipped:        (  383 | 1937)    (  1 skipped (A01))
* well type from pos:    (  383 | 1938)
* flag toxic:            (  383 | 1940)    (  2 flagged)
* activity profile:      (  383 |    9)
* join layout 1536:      (  351 |   13)
Processing plate S0198-2 ...
* load dataset:          ( 3456 | 1937)
* group on well:         (  384 | 1937)
* remove skipped:        (  383 | 1937)    (  1 skipped (A01))
* well type from pos:    (  383 | 1938)
* flag toxic:            (  383 | 1940)    ( 13 flagged)
* activity profile:      (  383 |    9)
* join layout 1536:      (  351 |   13)
Processing plate S0198-3 ...
* load dataset:          ( 3456 | 1937)
* group on well:         (  384 | 1937)
* remove skipped:        (  384 | 1937)    (  0 skipped ())
* well type from pos:    (  384 | 1938)
* flag toxic:            (  384 | 1940)    (  7 flagged)
* activity profile:      (  384 |    9)
* join layout 1536:      (  352 |   13)
Processing plate S0198-4 ...
* load dataset:          ( 3456 | 1937)
* group on well:         (  384 | 1937)
* remove skipped:        (  384 | 1937)    (  0 skipped ())
* well type from pos:    (  384 | 1938)
* flag toxic:            (  384 | 1940)    (  8 flagged)
* activity profile:      (  384 |    9)
* join layout 1536:      (  352 |   13)
Processing plate S0203-1 ...
* load dataset:          ( 3456 | 1937)
* group on well:         (  384 | 1937)
* remove skipped:        (  383 | 1937)    (  1 skipped (P01))
* well type from pos:    (  383 | 1938)
* flag toxic:            (  383 | 1940)    ( 10 flagged)
* activity profile:      (  383 |    9)
* join layout 1536:      (  351 |   13)
Processing plate S0203-2 ...
* load dataset:          ( 3456 | 1937)
* group on well:         (  384 | 1937)
* remove skipped:        (  384 | 1937)    (  0 skipped ())
* well type from pos:    (  384 | 1938)
* flag toxic:            (  384 | 1940)    ( 26 flagged)
* activity profile:      (  384 |    9)
* join layout 1536:      (  352 |   13)
Processing plate S0203-3 ...
* load dataset:          ( 3456 | 1937)
* group on well:         (  384 | 1937)
* remove skipped:        (  383 | 1937)    (  1 skipped (K05))
* well type from pos:    (  383 | 1938)
* flag toxic:            (  383 | 1940)    ( 13 flagged)
* activity profile:      (  383 |    9)
* join layout 1536:      (  351 |   13)
Processing plate S0203-4 ...
* load dataset:          ( 3456 | 1937)
* group on well:         (  384 | 1937)
* remove skipped:        (  384 | 1937)    (  0 skipped ())
* well type from pos:    (  384 | 1938)
* flag toxic:            (  384 | 1940)    ( 10 flagged)
* activity profile:      (  384 |    9)
* join layout 1536:      (  352 |   13)

In [3]:
# Minimum "Activity" value a measurement needs in order to be kept as a
# reference profile.
MIN_ACTIVITY = 2.5

# *** ds_all <- concat(ds_list) ***
# Combine the per-plate frames into one DataSet and persist it as a pickle.
# `ds_list` is intentionally left alive: deleting it would make this cell
# fail with a NameError when it is re-run without re-executing the (slow)
# plate-loading cell, and the frames are small.
ds_all = cpp.DataSet()
ds_all.data = pd.concat(ds_list)
ds_all.print_log("concat data")
ds_all.write_pkl("170630_references.pkl")
# Alternative entry point: reload the pickled data instead of re-processing.
# ds_all = cpp.load_pkl("170630_references.pkl")


# *** ds_profile <- ds_all ***
ds_profile = ds_all.join_smiles()


# *** ds_ref <- ds_profile ***
# remove_impure() / remove_toxic() return a pair; only the first element is
# used here. The second one is bound to a throw-away name other than `_`,
# because assigning to `_` in IPython shadows the last-output variable.
ds_ref, _discarded = ds_profile.remove_impure()
ds_ref, _discarded = ds_ref.remove_toxic()
ds_ref = ds_ref[ds_ref["Activity"] >= MIN_ACTIVITY]
ds_ref = ds_ref.join_annotations()
# Called for its side effect; the return value is not used. The log shows it
# writing the similar-references resource.
ds_ref.update_similar_refs(mode="ref")
ds_ref = ds_ref[keep]
ds_ref.write_csv("references_act_prof.tsv")


# *** update DATASTORE ***
# The datastore is updated from ds_profile, i.e. before the impure / toxic /
# low-activity rows were filtered out.
ds_profile.update_datastore(mode="ref")


* concat data:           ( 4220 |   14)
- loading resource:                        (SMILES)
* join smiles:           ( 3921 |   16)
* remove impure:         ( 3577 |   16)    (344 removed)
* remove toxic:          ( 3460 |   16)    (117 removed)
* subset:                ( 2476 |   16)
- loading resource:                        (ANNOTATIONS)
* join annotations:      ( 2476 |   18)
- loading resource:                        (REFERENCES)
- loading resource:                        (SIM_REFS)
* write sim_refs         ( 1966 |  --  )
* update similar:        ( 2476 |   18)
* subset:                ( 2476 |   14)
- loading resource:                        (DATASTORE)
* write datastore:       ( 5328 |   13)
* update datastore:      ( 3921 |   16)

In [ ]: