In [1]:
import pymongo
import os,csv,sys
import openslide
import dsa_mongo_common_functions as dsa
import cdsa_loader_helper_functions as cdsa_helpers
import pprint
from os.path import join as oj
client = pymongo.MongoClient('localhost',27017)
dsa_load_errors_db = client['DSA_LoadErrors']
In [2]:
dsa_slide_db = client['PanCanDSA_Slide_Data'] ### These need to be configured for the specific project
## This is specific to a given file system and/or directory structure
slide_root = '/bigdata/PanCan_Images/' ## Base path for slides
feature_file_root = '/bigdata/PanCan_FeatureData/'
feature_db = client['PanCan_BoundsOnly_V2'] ### I'm going to make one for each cancer type
### To generalize this, the directory organization needs to be described; the most common layout is PATIENT/STAIN_TYPE subdirectories (see the sketch in the next cell)
subj_dir_list = [x for x in os.listdir(slide_root) if os.path.isdir(oj(slide_root, x))]
print len(subj_dir_list), "potential patient directories were identified"
print subj_dir_list
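In [ ]:
### A hedged sketch (not verified against this archive) of the PATIENT/STAIN_TYPE layout mentioned
### above: the path components under the slide root can be pulled apart like this.
### parse_slide_path is a hypothetical helper, not part of dsa or cdsa_helpers.
def parse_slide_path(slide_w_path, root=slide_root):
    """Return (patient_dir, stain_dir) for a slide stored as root/PATIENT[/STAIN_TYPE]/slide.svs"""
    rel_parts = slide_w_path.replace(root, '').strip('/').split('/')
    patient_dir = rel_parts[0]
    stain_dir = rel_parts[1] if len(rel_parts) > 2 else None  ## only present when a STAIN_TYPE level exists
    return patient_dir, stain_dir

## Made-up example path, for illustration only
print parse_slide_path(slide_root + 'SamplePatient/HE/slide001.svs')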
In [ ]:
def find_rawslide_lists( slide_root_path ):
    """Walks the given root image path and returns all NDPI and SVS slide files found under it"""
    slide_files = []
    slide_root_path = slide_root_path.rstrip('/')
    print slide_root_path
    for dpath, dnames, fnames in os.walk( slide_root_path, followlinks=True):
        for fname in fnames:
            if fname.endswith('.ndpi') or fname.endswith('.svs'):
                slide_files.append(oj(dpath, fname))
    print len(slide_files), "SVS or NDPI files were located"
    return slide_files
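In [ ]:
### Quick sanity check of the walker above on a single patient directory; whichever directory
### happens to be first in subj_dir_list is used, so the output is just illustrative
if subj_dir_list:
    sample_slides = find_rawslide_lists( oj(slide_root, subj_dir_list[0]) )
    print sample_slides[:3]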
In [ ]:
print subj_dir_list
print dsa_slide_db['RawSlideData'].count()
In [ ]:
### This populates a collection WITHIN the current database with raw information about each slide
## On reflection, I should key on the file hash rather than the filename, since I may want to change the filenames (see the unique-index sketch after this cell)
for sd in subj_dir_list:
    print sd, "is being processed",
    curr_svs_slide_list = find_rawslide_lists( os.path.join(slide_root, sd) )
    slides_processed = newly_processed = dup_slide = rescanned_slides = 0
    for sld in curr_svs_slide_list:
        slide_name = os.path.basename(sld)
        qry = dsa_slide_db['RawSlideData'].find_one( {'slide_name': slide_name} )
        #print qry
        if not qry:
            fs = os.path.getsize(sld)
            #md5Checksum = dsa.md5sum(sld)
            ## NOTE: the file-type argument is hardcoded to 'ndpi' here even though .svs files are also passed through
            (openslide_could_open, width, height, filesize, orig_resolution, slide_name, md5, sld_properties) = cdsa_helpers.openslide_test_file_mongo( sld, 'ndpi', client)
            if openslide_could_open:
                prep_type = 'Unknown'
                slide_metadata = { 'slide_w_path': sld, 'slide_name': slide_name, 'file_size': fs, 'width': width, 'height': height,
                                   'orig_resolution': orig_resolution, 'sld_properties': cdsa_helpers.clean_openslide_keys( sld_properties ), 'slide_md5': md5, 'prep_type': prep_type
                                 }
                dsa_slide_db['RawSlideData'].insert_one(slide_metadata)
                newly_processed += 1
            else:
                print "UNABLE TO OPEN FILE??", sld
                ### Need to flag/load this into the error database (dsa_load_errors_db)
        else:
            fs = os.path.getsize(sld)
            ## Double check whether the file size matches; a mismatch suggests the slide was rescanned
            # if qry['file_size'] != fs:
            #     #print "File size mismatch??", fs, qry['file_size'], qry['slide_w_path'], sld
            #     dsa_load_errors_db['rescanned_slides'].insert_one( {'loaded_slide': qry['slide_w_path'], 'rescanned_slide': sld} )
            #     rescanned_slides += 1
            # else:
            #     dup_slide += 1
        slides_processed += 1
    output = "Total Processed: %d Newly Processed: %d Dup Slides or Already Loaded: %d RESCANNED Slides %d" % (slides_processed, newly_processed, dup_slide, rescanned_slides)
    dsa.LinePrinter(output)
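In [ ]:
### Following the note above about keying on the file hash rather than the filename: a unique index
### on slide_md5 makes MongoDB reject re-inserts of the same physical file even after a rename.
### A minimal sketch; it assumes slide_md5 is populated and unique in the existing documents,
### otherwise create_index will fail with a duplicate-key error.
dsa_slide_db['RawSlideData'].create_index('slide_md5', unique=True)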
In [ ]:
def getFeatureInfo( slide_name, group_name ):
    """Queries the feature database and determines whether any segmentations are loaded for the slide"""
    print "received %s and %s" % ( slide_name, group_name )
    ## Feature collections follow the convention Features.V1.[TumorType].[SlideName]
    coll_name = "Features.V1.%s.%s" % ( group_name, slide_name )
    if coll_name in feature_db.collection_names():
        return coll_name, feature_db[coll_name].count()
    return None, 0

# print dsa_slide_db['RawSlideData'].count()
# dsa_slide_db['RawSlideData'].delete_many({})
# print dsa_slide_db['RawSlideData'].count()
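In [ ]:
### Example call for a slide that has a feature collection (the GBM slide name is taken from the
### count cell further down in this notebook)
print getFeatureInfo('TCGA-02-0001-01Z-00-DX1', 'GBM')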
In [3]:
all_feature_colls = feature_db.collection_names()
print all_feature_colls[5]  ## peek at one of the feature collection names
In [ ]:
### I now want to reformat all of this data to make it more useful for DSA ...
In [29]:
all_slides = dsa_slide_db['RawSlideData'].find()
## Since this collection is easy to recreate, empty it first and rebuild from scratch
dsa_slide_db['PanCanDSA_Slide_Data'].delete_many({})
all_feature_colls = feature_db.collection_names()
for s in all_slides:
    #pt_id = s['slide_w_path'].split('/')[-1]
    slideGroup = s['slide_w_path'].split('/')[-2]
    pt_id = s['slide_w_path'].split('/')[-1].split('.')[0]
    # stain_type = s['slide_w_path'].split('/')[-2]
    stain_type = 'UNK'
    slide_dict = s.copy()
    slide_dict.pop('_id', None)
    slide_dict['pt_id'] = pt_id
    slide_dict['stain_type'] = stain_type
    ### Obfuscate the global file path so everything is relative to some base path for the archive
    slide_dict['thumbnail_image'] = '/thumbnail/' + s['slide_w_path'].replace(slide_root, '')
    slide_dict['slide_w_path'] = '/DZIMS/' + s['slide_w_path'].replace(slide_root, '') + '.dzi'
    # slide_dict['slide_w_path'] = '/DZIMS' + s['slide_w_path'] + '.dzi'
    slide_dict['slideGroup'] = slideGroup
    slide_dict['HasPathReport'] = True
    slide_dict['PathReportURL'] = "TBD"
    slide_dict['TumorType'] = slideGroup
    slide_name_noext = os.path.splitext(s['slide_name'])[0]  ## strips .svs or .ndpi
    slide_dict['slide_name_noext'] = slide_name_noext
    slide_dict['slide_nouid'] = pt_id
    ## Feature collections should be named Features.V1.[TumorType].[SlideName] (without the trailing extension)
    foundFeatureDB = False
    FeatureColl = "Features.V1.%s.%s" % ( slideGroup, slide_name_noext )
    if FeatureColl in all_feature_colls:
        foundFeatureDB = True
    else:
        FeatureColl = "Features.V1.%s.%s" % ( slideGroup, pt_id )
        if FeatureColl in all_feature_colls:
            foundFeatureDB = True
    if foundFeatureDB:
        FeatObjs = feature_db[FeatureColl].count()
        slide_dict['FeatureDB_CollName'] = FeatureColl
        slide_dict['HasAnnotations'] = FeatObjs > 0
        slide_dict['FeatObjs'] = FeatObjs
    dsa_slide_db['PanCanDSA_Slide_Data'].insert_one(slide_dict)
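In [ ]:
### Spot-check one reformatted document to confirm the derived fields (pt_id, slideGroup,
### thumbnail_image, FeatureDB_CollName, ...) landed as expected
pprint.pprint(dsa_slide_db['PanCanDSA_Slide_Data'].find_one())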
In [32]:
coll_list = dsa_slide_db['PanCanDSA_Slide_Data'].distinct('slideGroup')
dsa_slide_db['PanCanDSA_Slide_Data'].create_index('slideGroup')
print coll_list
In [33]:
#all_slides = dsa_slide_db['RawSlideData'].find()
all_colls = feature_db.collection_names()
## Index the spatial coordinate fields on every feature collection
for a in all_colls:
    if 'Features' in a:
        feature_db[a].create_index('X')
        feature_db[a].create_index('Y')
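In [ ]:
### With X and Y indexed, a viewport query against a feature collection reduces to a range filter.
### A sketch only: it assumes each feature document carries its bounds' corner coordinates in X/Y
### (only the field names come from the index cell above; features_in_viewport is a hypothetical helper).
def features_in_viewport(coll_name, x_min, x_max, y_min, y_max):
    return feature_db[coll_name].find({'X': {'$gte': x_min, '$lte': x_max},
                                       'Y': {'$gte': y_min, '$lte': y_max}})

print features_in_viewport('Features.V1.GBM.TCGA-02-0001-01Z-00-DX1', 0, 5000, 0, 5000).count()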
In [ ]:
print dsa_slide_db['PanCanDSA_Slide_Data'].find_one({'slide_name': {"$regex": "TCGA-3B-A9I1-01Z-00-DX1"}})
#{ <field>: { $regex: /pattern/, $options: '<options>' } }
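In [ ]:
### Note: an unanchored $regex scans the whole collection; anchoring the pattern with '^' lets
### MongoDB use an index on slide_name if one exists. $options: 'i' shown for case-insensitive matching.
print dsa_slide_db['PanCanDSA_Slide_Data'].find_one({'slide_name': {'$regex': '^TCGA-3B-A9I1', '$options': 'i'}})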
In [23]:
print dsa_slide_db['PanCanDSA_Slide_Data'].count()
print feature_db['Features.V1.GBM.TCGA-02-0001-01Z-00-DX1'].count()
#feature_db['Features.V1.LGG.TCGA-CS-4941-01Z-00-DX1.86D516B5-C648-4249-8C6A-7F9A6A56CB4F'].count()
In [ ]:
print dsa_slide_db['PanCanDSA_Slide_Data'].distinct('pt_id')
In [ ]:
cur = dsa_slide_db['PanCanDSA_Slide_Data'].find({'pt_id': 'TCGA-02-0001-01Z-00-DX1'})
for c in cur:
    print c
In [ ]:
## Going to create a cleaned-up and/or reformatted collection for the DSA viewer (see the sketch in the next cell)
keys_of_interest = ['width', 'height']
for s in dsa_slide_db['PanCanDSA_Slide_Data'].find():
    print s['slide_w_path']
    print s.keys()
    break  ## just inspect the first document rather than sys.exit()-ing out of the kernel
###
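In [ ]:
### A minimal sketch of the cleaned-up collection idea above: copy only the keys the DSA viewer
### needs into a new collection. The key list and the DSA_Viewer_Slide_Data collection name are
### assumptions for illustration, not an established schema.
viewer_keys = ['slide_name', 'slide_w_path', 'thumbnail_image', 'width', 'height', 'pt_id', 'slideGroup']
dsa_slide_db['DSA_Viewer_Slide_Data'].delete_many({})
for s in dsa_slide_db['PanCanDSA_Slide_Data'].find():
    trimmed = dict((k, s[k]) for k in viewer_keys if k in s)
    dsa_slide_db['DSA_Viewer_Slide_Data'].insert_one(trimmed)
print dsa_slide_db['DSA_Viewer_Slide_Data'].count()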