In [1]:
import pymongo
In [2]:
from pymongo import MongoClient
In [3]:
client = MongoClient()
In [4]:
db = client["mightyscreen"]
db
Out[4]:
Database(MongoClient('localhost', 27017), u'mightyscreen')
In [5]:
db.collection_names()
Out[5]:
[u'system.indexes',
u'library',
u'compound',
u'proj_data_1',
u'fs.chunks',
u'fs.files',
u'file_document',
u'view',
u'proj_data_2',
u'proj_data_3',
u'proj_data_4',
u'proj_data_5']
In [6]:
d = db['proj_data_3']
In [7]:
d.find_one()
Out[7]:
{u'_id': ObjectId('52fab0ccbe22a455e18255ea'),
u'comment': [],
u'create_by': 1,
u'create_date': datetime.datetime(2014, 2, 13, 0, 10, 8, 346000),
u'hit': 0,
u'library': u'ICCB_HMS',
u'plate': u'1922',
u'platewell': u'1922A01',
u'readout': {u'FI': [65875594.0, 63045433.0],
u'FP': [146.4, 159.6],
u'P_channel': [28866991.0, 27533920.0],
u'S_channel': [8141612.0, 7977593.0]},
u'score': {},
u'well': u'A01',
u'welltype': u'E'}
In [8]:
l = db["compound"]
In [9]:
l.find_one()
Out[9]:
{u'_id': ObjectId('52e7d0dabe22a46180f3aee6'),
u'canonical_smiles': u'O=C(c1ccccc1I)NCCOCc1ccccc1\n',
u'chemical_name': u'0',
u'facility_reagent_id': u'ICCB-00111290',
u'formula': u'C16H16INO2',
u'fp2': u'00000200c024000000060001000003008000000100880000c0002000000004000040000c12000000080286200001004000000000000001000200000010000000020800080000140802010000000088424000800000201000000008400000030002001000000000000040c0000000009080010600001801000000480000000008',
u'fp3': u'000000040001b030',
u'fp4': u'0000800000000000008000000000000a000005400000200000000000000000004002000000002840000000000000000000000000000000000000000000000000',
u'inchi': u'InChI=1/C16H16INO2/c17-15-9-5-4-8-14(15)16(19)18-10-11-20-12-13-6-2-1-3-7-13/h1-9H,10-12H2,(H,18,19)',
u'inchikey': u'SXMBTNXWVVBMOR-UHFFFAOYSA-N\n',
u'library_name': u'ICCB_HMS',
u'logp': 3.63,
u'molecular_weight': 381.21,
u'plate': u'803',
u'plate_well': u'803B15',
u'pubchem_cid': u'2222845',
u'sdf': u'Structure37473\n OpenBabel12091311333D\nStructure written by MMmdl.\n 20 21 0 0 1 0 0 0 0 0999 V2000\n 10.9148 2.0997 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 9.6895 2.7776 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 12.1396 2.7776 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 9.7187 4.1998 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0\n 8.4649 2.0997 0.0000 N 0 0 0 0 0 0 0 0 0 0 0 0\n 12.1688 4.1998 0.0000 I 0 0 0 0 0 0 0 0 0 0 0 0\n 10.9148 0.6780 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 2.4133 2.8359 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 4.8194 2.7776 0.0000 O 0 0 0 0 0 0 0 0 0 0 0 0\n 13.3645 2.0997 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 7.2692 2.7776 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 3.6382 2.0997 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 6.0444 2.0997 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 2.4133 4.2581 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1.2102 2.1580 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 12.1032 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 13.3645 0.6780 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 0.0000 2.8359 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1.2102 4.9360 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 0.0000 4.2581 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0\n 1 2 1 0 0 0 0\n 1 3 2 0 0 0 0\n 1 7 1 0 0 0 0\n 2 4 2 0 0 0 0\n 2 5 1 0 0 0 0\n 3 6 1 0 0 0 0\n 3 10 1 0 0 0 0\n 5 11 1 0 0 0 0\n 7 16 2 0 0 0 0\n 8 12 1 0 0 0 0\n 8 14 1 0 0 0 0\n 8 15 2 0 0 0 0\n 9 13 1 0 0 0 0\n 9 12 1 0 0 0 0\n 10 17 2 0 0 0 0\n 11 13 1 0 0 0 0\n 14 19 2 0 0 0 0\n 15 18 1 0 0 0 0\n 16 17 1 0 0 0 0\n 18 20 2 0 0 0 0\n 19 20 1 0 0 0 0\nM END\n> <Library>\nChemBridge Microformats\n\n> <Plate>\n803\n\n> <Well>\nB15\n\n> <Plate_Well>\n803B15\n\n> <Facility_Reagent_ID>\nICCB-00111290\n\n> <PubChem_CID>\n2222845\n\n> <Smiles>\nc1(ccccc1I)C(NCCOCc2ccccc2)=O\n\n> <InChI>\nInChI=1/C16H16INO2/c17-15-9-5-4-8-14(15)16(19)18-10-11-20-12-13-6-2-1-3-7-13/h1-9H,10-12H2,(H,18,19)\n\n> <Molecular_Weight>\n381.20821\n\n> <Formula>\nC16H16INO2\n\n> <Fp2>\n00000200c024000000060001000003008000000100880000c0002000000004000040000c12000000080286200001004000000000000001000200000010000000020800080000140802010000000088424000800000201000000008400000030002001000000000000040c0000000009080010600001801000000480000000008\n\n> <Fp3>\n000000040001b030\n\n> <Fp4>\n0000800000000000008000000000000a000005400000200000000000000000004002000000002840000000000000000000000000000000000000000000000000\n\n> <TPSA>\n38.33\n\n> <logP>\n3.6287\n\n> <Canonical_Smiles>\nO=C(c1ccccc1I)NCCOCc1ccccc1\n\n\n> <InChiKey>\nSXMBTNXWVVBMOR-UHFFFAOYSA-N\n\n\n$$$$\n',
u'sub_library_name': u'ChemBridge Microformats',
u'svg': u'<?xml version="1.0"?>\n<svg version="1.1" id="topsvg"\nxmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink"\nxmlns:cml="http://www.xml-cml.org/schema" x="0" y="0" width="200px" height="200px" viewBox="0 0 100 100">\n<title>OBDepict</title>\n<rect x="0" y="0" width="100" height="100" fill="white"/>\n<text text-anchor="middle" font-size="6" fill ="black" font-family="sans-serif"\nx="50" y="98" >Structure37473</text>\n<g transform="translate(0,0)">\n<svg width="100" height="100" x="0" y="0" viewBox="0 0 461.395 220.863"\nfont-family="sans-serif" stroke="rgb(0,0,0)" stroke-width="2" stroke-linecap="round">\n<line x1="351.5" y1="120.9" x2="316.5" y2="101.6" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="313.5" y1="101.5" x2="314.1" y2="73.9" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="319.5" y1="101.7" x2="320.1" y2="74.1" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="316.5" y1="101.6" x2="292.9" y2="114.6" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="386.4" y1="101.6" x2="387.0" y2="74.0" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="270.3" y1="114.5" x2="247.4" y2="101.6" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="108.9" y1="99.9" x2="143.8" y2="120.9" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="188.9" y1="107.9" x2="212.5" y2="120.9" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="166.3" y1="108.1" x2="143.8" y2="120.9" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="247.4" y1="101.6" x2="212.5" y2="120.9" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="351.5" y1="120.9" x2="351.5" y2="161.5" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="351.5" y1="161.5" x2="385.4" y2="180.9" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="360.3" y1="158.2" x2="383.8" y2="171.6" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="385.4" y1="180.9" x2="421.4" y2="161.5" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="421.4" y1="161.5" x2="421.4" y2="120.9" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="414.2" y1="155.5" x2="414.2" y2="126.9" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="421.4" y1="120.9" x2="386.4" y2="101.6" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="386.4" y1="101.6" x2="351.5" y2="120.9" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="384.7" y1="110.8" x2="360.2" y2="124.3" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="108.9" y1="99.9" x2="74.5" y2="119.3" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="100.1" y1="96.6" x2="76.2" y2="110.1" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="74.5" y1="119.3" x2="40.0" y2="99.9" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="40.0" y1="99.9" x2="40.0" y2="59.3" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="47.2" y1="93.9" x2="47.2" y2="65.3" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="40.0" y1="59.3" x2="74.5" y2="40.0" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="74.5" y1="40.0" x2="108.9" y2="59.3" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="76.2" y1="49.2" x2="100.1" y2="62.7" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<line x1="108.9" y1="59.3" x2="108.9" y2="99.9" stroke="rgb(0,0,0)" stroke-width="2.0"/>\n<text x="311.351784" y="69.009640" fill="rgb(255,12,12)" stroke="rgb(255,12,12)" stroke-width="1" font-size="16" >O</text>\n<text x="275.570901" y="128.942190" fill="rgb(12,12,255)" stroke="rgb(12,12,255)" stroke-width="1" font-size="16" >N</text>\n<text x="275.570901" y="144.942190" fill="rgb(12,12,255)" stroke="rgb(12,12,255)" stroke-width="1" font-size="16" >H</text>\n<text x="381.272618" y="69.009640" fill="rgb(147,0,147)" stroke="rgb(147,0,147)" stroke-width="1" font-size="16" >I</text>\n<text x="171.535801" y="109.596313" fill="rgb(255,12,12)" stroke="rgb(255,12,12)" stroke-width="1" font-size="16" >O</text>\n</svg>\n</g>\n</svg>\n\n',
u'tpsa': 38.33,
u'well': u'B15'}
In [10]:
import pandas as pd
import numpy as np
In [11]:
d1 = pd.DataFrame(list(d.find()))
In [12]:
d1.shape
Out[12]:
(768, 13)
In [13]:
d1.columns
Out[13]:
Index([u'_id', u'comment', u'compound', u'create_by', u'create_date', u'hit', u'library', u'plate', u'platewell', u'readout', u'score', u'well', u'welltype'], dtype='object')
In [14]:
l.find_one({"plate_well":"1922A03"})['fp2']
Out[14]:
u'040200008004000000060000010000000000001000000000c000020000002000004000042000000008028700400208000001000000000000030000001000000000000010201001400000000102018804400088000400000400000860140000000000100008000000000000000000000080010600000001000000081000008002'
In [15]:
d1['fp2'] = d1['platewell'].map(lambda x: "" if not l.find_one({"plate_well":x}) else l.find_one({"plate_well":x})['fp2'])
In [16]:
d1.head()['fp2']
Out[16]:
0
1
2 0402000080040000000600000100000000000010000000...
3 0000060000004000000600030c00000000000000000000...
4 0001000000000000000620000000000800000004000000...
Name: fp2, dtype: object
In [17]:
for i in "fp2 fp3 fp4 logp molecular_weight inchikey svg formula sub_library_name chemical_name".split():
d1[i] = d1['platewell'].map(lambda x: "" if not l.find_one({"plate_well":x}) else l.find_one({"plate_well":x})[i])
In [22]:
d1.head()["fp2 fp3 fp4 logp molecular_weight inchikey svg formula sub_library_name chemical_name fpA fpB fiA fiB".split()]
Out[22]:
fp2
fp3
fp4
logp
molecular_weight
inchikey
svg
formula
sub_library_name
chemical_name
fpA
fpB
fiA
fiB
0
146.4
159.6
65875594
63045433
1
169.6
184.2
55839081
54349162
2
0402000080040000000600000100000000000010000000...
000000070009f000
0000000000000000008000000000000600000540080004...
2.57
200.58
GFGSZUNNBQXGMK-UHFFFAOYSA-N\n
<?xml version="1.0"?>\n<svg version="1.1" id="...
C7H5ClN2O3
NINDS Custom Collection 2
Aklomide
114.5
132.0
73239803
68441645
3
0000060000004000000600030c00000000000000000000...
1c0000040001b000
0240180300000080008000000000001200000000000000...
3.24
467.64
FTLDJPRFCGDUFH-UHFFFAOYSA-N\n
<?xml version="1.0"?>\n<svg version="1.1" id="...
C28H41N3O3
NINDS Custom Collection 2
Oxethazaine
176.5
192.0
52843942
51216050
4
0001000000000000000620000000000800000004000000...
000000000021b000
080000000000000000000000000000000020000000d000...
1.31
100.14
RAIPHJJURHTUIC-UHFFFAOYSA-N\n
<?xml version="1.0"?>\n<svg version="1.1" id="...
C3H4N2S
NINDS Custom Collection 2
Aminothiazole
150.2
158.9
69225335
69672408
5 rows × 14 columns
In [20]:
d1['fpA'] = d1['readout'].map(lambda x: x['FP'][0])
d1['fpB'] = d1['readout'].map(lambda x: x['FP'][1])
d1['fiA'] = d1['readout'].map(lambda x: x['FI'][0])
d1['fiB'] = d1['readout'].map(lambda x: x['FI'][1])
In [23]:
d1.to_csv("demo_data.csv")
Content source: longfei-wang/cs171-pr-visualscreen
Similar notebooks: