In [1]:
import pandas as pd
In [29]:
# Load the table of unique compounds with refreshed PubChem information.
# NOTE(review): unpickling can execute arbitrary code; only load this file
# from a trusted location.
df = pd.read_pickle('/project/projectdirs/openmsi/projects/ben_run_pactolus/unique_compounds_updated_pubchem_info.pkl')
print(df.shape)

# Keep only rows whose common name contains at least one alphanumeric
# character. na=False makes a missing name count as "no match"; without it
# the mask would contain NaN and the boolean indexing below would raise.
has_usable_name = df.new_common_name.str.contains('[A-Za-z0-9]+', na=False)

# .copy() gives an independent frame, so the later assignment to
# df.new_synonyms does not write into a slice of the unfiltered table.
df = df[has_usable_name].copy()
print(df.shape)
In [30]:
# Show every row whose InChIKey column equals this specific key.
df.loc[df['metatlas_inchi_key'] == 'UHDGCWIWMRVCDJ-XVFCMESISA-N']
Out[30]:
In [16]:
def shorten_synonyms(row, max_synonyms=50, separator='///'):
    """Return the row's synonym string truncated to at most ``max_synonyms`` entries.

    Parameters
    ----------
    row : object with a ``new_synonyms`` attribute (e.g. a DataFrame row)
        ``new_synonyms`` is expected to be a ``separator``-delimited string.
    max_synonyms : int, optional
        Maximum number of synonyms to keep (default 50).
    separator : str, optional
        Delimiter between synonyms (default ``'///'``).

    Returns
    -------
    The original string when it holds ``max_synonyms`` entries or fewer,
    otherwise the first ``max_synonyms`` entries re-joined with ``separator``.
    A value that is not string-like (e.g. ``None`` or NaN for a missing
    entry) is returned unchanged instead of raising.
    """
    synonyms = row.new_synonyms

    # Missing values (None / float NaN) have no .split(); pass them through
    # untouched so a single empty cell does not abort a whole df.apply().
    if not hasattr(synonyms, 'split'):
        return synonyms

    synonym_list = synonyms.split(separator)
    if len(synonym_list) > max_synonyms:
        return separator.join(synonym_list[:max_synonyms])
    return synonyms
# Run shorten_synonyms once per row (axis=1) and overwrite the synonyms
# column with the results.
trimmed_synonyms = df.apply(shorten_synonyms, axis=1)
df['new_synonyms'] = trimmed_synonyms
In [17]:
import time
# Build one update record per row of df. Each record carries the InChIKey
# plus the refreshed name, synonyms, PubChem compound id (as text) and the
# PubChem URL derived from that id.
list_of_updates = []
row_values = zip(df['metatlas_inchi_key'],
                 df['new_common_name'],
                 df['new_synonyms'],
                 df['new_pubchem_compound_id'])
for inchi_key, common_name, synonyms, compound_id in row_values:
    list_of_updates.append({
        'inchi_key': inchi_key,
        'name': common_name,
        'synonyms': synonyms,
        # '%d' renders the id as an integer string even if it is stored as a float.
        'pubchem_compound_id': unicode('%d' % compound_id),
        'pubchem_url': unicode('http://pubchem.ncbi.nlm.nih.gov/compound/%d' % compound_id),
    })
# for i in range(100):
# compounds.update(list_of_updates[i],['inchi_key'])
# print time.time() - t0
In [ ]:
import sys
sys.path.insert(0,'/global/project/projectdirs/metatlas/anaconda/lib/python2.7/site-packages' )
from metatlas import metatlas_objects as metob
# Push every prepared record into the 'compounds' table, using 'inchi_key'
# as the key passed to update(), and keep whatever each update() call returns.
db = metob.database
compounds = db['compounds']

report_every = 1000  # print progress after each block of this many updates
affected = []
t0 = time.time()
for n_done, d in enumerate(list_of_updates, 1):
    affected.append(compounds.update(d, ['inchi_key']))
    if n_done % report_every == 0:
        # Progress report: updates issued so far, then elapsed seconds.
        print(len(affected))
        print(time.time() - t0)
In [26]:
# Number of compounds.update() results collected by the update loop.
len(affected)
Out[26]:
In [28]:
# Number of update records built from df (one per row).
len(list_of_updates)
Out[28]:
In [ ]:
# Inspect the update record currently bound to `d`, the loop variable left
# over from the database-update loop.
d
In [27]:
# Scratch arithmetic. NOTE(review): presumably adds two record counts from
# separate update runs — confirm what these numbers refer to.
72979+53458
Out[27]:
In [ ]: