In [1]:
%matplotlib notebook
from metadatastore.mds import MDS
from databroker import Broker
from databroker.core import register_builtin_handlers
from filestore.fs import FileStore as FS
from pymongo import MongoClient
import datetime
import numpy as np
import os
import pytz
import matplotlib.pyplot as plt
utc=pytz.UTC
# MongoDB connection settings for the beamline metadata store.
_mds_config = {'host': 'xf03id-ca1',
'port': 27017,
'database': 'datastore',
'timezone': 'US/Eastern'}
mds = MDS(_mds_config, auth=False)
# Filestore lives in the same MongoDB instance, separate database.
_fs_config = {'host': 'xf03id-ca1',
'port': 27017,
'database': 'filestore'}
db = Broker(mds, FS(_fs_config))
# NOTE(review): this rebinds `mds` with a second instance (without
# auth=False) after `db` was already built with the first one -- confirm
# the duplicate construction is intentional.
mds = MDS(_mds_config)
fs = FS(_fs_config)
# Raw pymongo client: needed for direct datum queries/updates that the
# filestore API does not expose (see cells below).
conn = MongoClient(host=_fs_config['host'],
port=_fs_config['port'])
fsdb = conn.filestore
In [49]:
# 1. Copy database and perform the following on copied database
# 2. Fetch uids of resources with datums that have no resource info
# 3. Looking at the resources, get the file timestamps
# 4. Comparing consecutive resource timestamps and verifying against file timestamps, update datums' "resource" field
# 5. Verify and perform the same operation on production db
In [2]:
#Not possible via filestore, so resorting to pymongo for this limited purpose
# All datum documents whose 'resource' field is missing/None -- the records
# this notebook is trying to repair.
incomplete_datums = list(fsdb.datum.find({'resource': None}))
In [83]:
# First and last run-start documents of the affected scan range.
patient_zero = next(mds.find_run_starts(scan_id=27768))
patient_last = next(mds.find_run_starts(scan_id=27917))
# NOTE(review): the upper bound is 27917+1 == 27918, one past patient_last --
# confirm whether including the extra scan is intentional.
problem_starts = list(mds.find_run_starts(scan_id={'$gte': 27768, '$lte':27917+1}))
rstart_times = [r['time'] for r in problem_starts]
In [84]:
# Collect resources whose ObjectId creation time falls inside the window
# spanned by the problem run-start times, together with the mtime of each
# resource's backing file.
start = np.min(rstart_times)
end = np.max(rstart_times)
start_utc = utc.localize(datetime.datetime.utcfromtimestamp(start))
end_utc = utc.localize(datetime.datetime.utcfromtimestamp(end))
rlist = []   # matching resource documents
flist = []   # mtime of each resource's file (epoch seconds)
tlist = []   # ObjectId generation times (UTC-aware datetimes)
for res in fsdb.resource.find():
    created = res['_id'].generation_time
    if start_utc <= created <= end_utc:
        rlist.append(res)
        flist.append(os.path.getmtime(res['root'] + '/' + res['resource_path']))
        tlist.append(created)
In [5]:
flist[0]
Out[5]:
In [38]:
rlist[184]
Out[38]:
In [51]:
# Start/stop times of scan 27768, both raw (epoch seconds) and as UTC-aware
# datetimes for comparison against the resources' ObjectId generation times.
hdr = db[27768]
t1 = hdr.start.time
t2 = hdr.stop.time
t1_utc = utc.localize(datetime.datetime.utcfromtimestamp(t1))
t2_utc = utc.localize(datetime.datetime.utcfromtimestamp(t2))
In [52]:
hdr.start
Out[52]:
In [65]:
# Number of subscans recorded for this run -- update_datum below expects one
# XSP3 HDF5 file per subscan.
sub = hdr.start.subscan_dims
len(sub)
Out[65]:
In [70]:
# Resource creation times vs. the scan window: in-scan resources should fall
# between the two horizontal lines.
fig, ax = plt.subplots()
ax.plot(tlist, '.-')
ax.axhline(t1_utc)
ax.axhline(t2_utc)
Out[70]:
In [8]:
# Print the data keys whose values are stored externally (filestore
# references) rather than inline in the event documents.
for descriptor in hdr.descriptors:
    for key, key_info in descriptor['data_keys'].items():
        if 'external' in key_info:
            print(key)
In [53]:
e = list(db.get_events(hdr, stream_name='primary', fill=False))
In [54]:
len(e)
Out[54]:
In [55]:
e[120*136].data
Out[55]:
In [66]:
# Fetch the raw datum document for one xspress3 channel-2 datum id.
# NOTE(review): index 120*136-1 is presumably the last point of the first
# subscan -- confirm against subscan_dims.
dat = next(fsdb.datum.find({'datum_id': e[120*136-1].data['xspress3_ch2']}))
#dat = next(fsdb.datum.find({'datum_id': e[0].data['merlin1']}))
In [81]:
# before correction
# inspect the datum document before its 'resource' field is patched
dat
#120*136-1
Out[81]:
In [15]:
dat['resource'] = rlist[113]['uid']
In [80]:
def update_datum(scanid, resource_list):
    """Re-attach the missing 'resource' field of xspress3 datums for one scan.

    Selects the XSP3 resources from ``resource_list`` whose file mtime falls
    inside the scan's (start, stop) window, then walks the scan's primary
    events in acquisition order and assigns each xspress3 datum the uid of
    the matching HDF5 file resource.  The actual database write
    (``update_many``) is left commented out so the mapping can be dry-run
    and verified before touching the production database.

    Parameters
    ----------
    scanid : int
        scan_id used to look the header up in the databroker (``db``).
    resource_list : list of dict
        Candidate filestore resource documents; each must carry 'root',
        'resource_path', 'spec' and 'uid'.

    Raises
    ------
    RuntimeError
        If the number of matched XSP3 HDF5 files differs from the number of
        subscans recorded in the run start document.
    """
    print(scanid)
    hdr = db[scanid]
    subscan = hdr.start.subscan_dims
    elist = list(db.get_events(hdr, fill=False, stream_name='primary'))
    t1 = hdr.start.time
    t2 = hdr.stop.time
    # file-modification time of every candidate resource file
    timelist = [os.path.getmtime(r['root'] + '/' + r['resource_path'])
                for r in resource_list]
    # indices of XSP3 resources whose file was written during this scan
    id_found = []
    for i, t in enumerate(timelist):
        if t1 < t < t2:
            if resource_list[i]['spec'] == 'XSP3':
                id_found.append(i)
                print('time:', i, t)
    print('number of h5 file: ', len(id_found))
    if len(id_found) != len(subscan):
        # was: raise('size is inconsistent!') -- raising a plain string is a
        # TypeError in Python 3; raise a real exception instead.
        raise RuntimeError('size is inconsistent!')
    # Walk events in order; a new HDF5 file starts whenever channel 1 sees
    # frame 0, so file_id advances through id_found one subscan at a time.
    file_id = -1
    for e in elist:
        for detid in range(3):
            detname = 'xspress3_ch' + str(detid + 1)
            dat = next(fsdb.datum.find({'datum_id': e.data[detname]}))
            if detid == 0 and dat['datum_kwargs']['frame'] == 0:
                file_id += 1
            dat['resource'] = resource_list[id_found[file_id]]['uid']
            # turn this on later when updating is needed
            #fsdb.datum.update_many(filter={'datum_id': e.data[detname]},
            #                       update={'$set': dat},
            #                       upsert=False)
In [86]:
# Scans known to contain datums with a missing 'resource' field.
runlist = [27768, 27822, 27825, 27835, 27840, 27845, 27849, 27851, 27858, 27860, 27873, 27883, 27888, 27892, 27912, 27917]
#large_list = [27768, 27845, 27849, 27860]
# NOTE(review): runlist[-1:] processes only the last scan (27917); iterate
# over runlist itself to process them all.
for num in runlist[-1:]:
update_datum(num, rlist)
In [ ]:
In [47]:
# File mtimes vs. the scan's raw start/stop times (epoch seconds).
plt.close('all')
fig, ax = plt.subplots()
ax.plot(flist, '.')
ax.axhline(t1)
ax.axhline(t2)
Out[47]:
In [5]:
# mtimes of the first and last in-window resource files
file_start_time = os.path.getmtime(rlist[0]['root'] + '/' + rlist[0]['resource_path'])
file_end_time = os.path.getmtime(rlist[-1]['root'] + '/' + rlist[-1]['resource_path'])
In [ ]:
# NOTE(review): scratch cell -- 'problem_resources', 'missing', 's_time' and
# 'p_time' are not defined anywhere in this notebook; this cell cannot run
# on a fresh kernel as written.
for i in range(len(problem_resources)):
primary = missing[i]
try:
secondary = missing[i+1]
except IndexError:
break
# 'diff' is computed but never used, and the find() cursor is discarded --
# presumably the intent was to find datums between consecutive resources.
diff = s_time - p_time
fsdb.datum.find({'time': {'$gte': p_time, '$lt': s_time }})
In [ ]:
a = next(fsdb.resource.find())['_id']
In [ ]:
a.generation_time
In [ ]: