In [1]:
import pandas as pd

In [29]:
# Load the compound table and report its size before and after filtering.
# NOTE(review): unpickling can execute arbitrary code embedded in the file;
# only load this pickle from a trusted location.
df = pd.read_pickle('/project/projectdirs/openmsi/projects/ben_run_pactolus/unique_compounds_updated_pubchem_info.pkl')
print(df.shape)
# Keep only rows whose new_common_name contains at least one ASCII letter or
# digit. na=False turns a missing name into False in the mask (the row is
# dropped) instead of NaN, which boolean indexing refuses to accept.
# .copy() gives an independent frame, so assigning columns to `df` afterwards
# does not act on a slice of the unfiltered table.
df = df[df.new_common_name.str.contains('[A-Za-z0-9]+', na=False)].copy()
print(df.shape)


(158493, 22)
(126437, 22)

In [30]:
# Spot-check: show the row(s) whose metatlas_inchi_key equals this InChIKey.
df.loc[df['metatlas_inchi_key'] == 'UHDGCWIWMRVCDJ-XVFCMESISA-N']


Out[30]:
Definition charge chebi_id common_name flat_inchi flat_inchikey formula hmdb_id img_abc_id kegg_id ... metatlas_inchi metatlas_inchi_key num_radicals pubchem_compound_id source_database synonyms monoisotopic_mw new_synonyms new_common_name new_pubchem_compound_id
121057 A pyrimidine nucleoside in which cytosine is a... 0.0 CHEBI:17562 Cytidine///cytidine InChI=1S/C9H13N3O5/c10-5-1-2-12(9(16)11-5)8-7(... UHDGCWIWMRVCDJ-UHFFFAOYSA-N C9H13N3O5 HMDB00089 C00475 ... InChI=1S/C9H13N3O5/c10-5-1-2-12(9(16)11-5)8-7(... UHDGCWIWMRVCDJ-XVFCMESISA-N 0.0 6175 chebi///metacyc///hmdb 1-(b-D-Ribofuranosyl)-2-oxo-4-amino-1,2-dihydr... 243.085520516 cytidine///65-46-3///Cytosine riboside///1-bet... cytidine 6175

1 rows × 22 columns


In [16]:
def shorten_synonyms(row, max_synonyms=50):
    """Return the row's ``new_synonyms`` limited to the first ``max_synonyms`` entries.

    ``new_synonyms`` is a single string whose entries are separated by
    ``'///'``.

    Parameters
    ----------
    row : object exposing a ``new_synonyms`` attribute
        For example a DataFrame row handed in by ``df.apply(..., axis=1)``.
    max_synonyms : int, optional
        Number of leading synonyms to keep. Defaults to 50.

    Returns
    -------
    The ``'///'``-joined string of the first ``max_synonyms`` synonyms. A value
    without a ``split`` method (e.g. NaN or None for a missing entry) is
    returned unchanged rather than raising ``AttributeError``.
    """
    synonyms = row.new_synonyms
    # Missing values are not strings and cannot be split; pass them through.
    if not hasattr(synonyms, 'split'):
        return synonyms
    # Splitting and re-joining on the same separator reproduces the input, so
    # lists at or under the limit need no separate branch.
    return '///'.join(synonyms.split('///')[:max_synonyms])
# Replace the new_synonyms column with the result of running shorten_synonyms on every row.
df['new_synonyms'] = df.apply(shorten_synonyms, axis=1)

In [17]:
import time
# Build one update record per DataFrame row. `inchi_key` is the field the
# update loop passes to compounds.update() as its key column.
list_of_updates = []
for row_index, row in df.iterrows():
    # The same PubChem compound id feeds both the id field and the URL.
    cid = row.new_pubchem_compound_id
    list_of_updates.append({
        'inchi_key': row.metatlas_inchi_key,
        'name': row.new_common_name,
        'synonyms': row.new_synonyms,
        'pubchem_compound_id': unicode('%d' % cid),
        'pubchem_url': unicode('http://pubchem.ncbi.nlm.nih.gov/compound/%d' % cid),
    })

In [ ]:
import sys
# Put the metatlas site-packages directory first on the import path so the
# import below resolves against it.
sys.path.insert(0, '/global/project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages')
from metatlas import metatlas_objects as metob

db = metob.database
compounds = db['compounds']

# Send every prepared record to compounds.update(), matching on inchi_key, and
# collect whatever each call returns.
affected = []
t0 = time.time()
for n_done, d in enumerate(list_of_updates, 1):
    affected.append(compounds.update(d, ['inchi_key']))
    # Progress report every 1000 records: results so far, then elapsed seconds.
    if n_done % 1000 == 0:
        print(len(affected))
        print(time.time() - t0)


1000
724.255877972
2000
1451.11270308
3000
2174.93267918
4000
2898.36356497

In [26]:
# Number of update results collected so far; the update loop appends one per processed record.
len(affected)


Out[26]:
72979

In [28]:
# Total number of update records built from the DataFrame rows.
len(list_of_updates)


Out[28]:
126437

In [ ]:
# Show the record most recently taken from list_of_updates by the update loop.
d

In [27]:
# NOTE(review): presumably a sanity check that the 72979 results already in
# `affected` plus the records still pending add up to the 126437 total.
# 53458 is typed by hand and is not derived anywhere in the visible code — confirm.
72979+53458


Out[27]:
126437

In [ ]: