notebook.community

Edit and run



In [1]:

    
import pymongo
import os,csv,sys
import openslide
import dsa_mongo_common_functions as dsa
import cdsa_loader_helper_functions as cdsa_helpers
import pprint
from os.path import join as oj
client = pymongo.MongoClient('localhost',27017)
dsa_load_errors_db = client['DSA_LoadErrors']



In [ ]:



In [2]:

    
dsa_slide_db = client['PanCanDSA_Slide_Data']  ### These need to be configured for the specific project
## This is specific to a given fle system and or structurme
slide_root = '/bigdata/PanCan_Images/'  ##Base Path for Slides
feature_file_root = '/bigdata/PanCan_FeatureData/'

feature_db = client['PanCan_BoundsOnly_V2'] ### Im going to make one for each cancer type..

### To generalize this, need to describe organization, most common will be  PATIENT/STAIN_TYPE as subdirectories
TLD_Dirs = [x for x in os.listdir(slide_root) if os.path.isdir(oj(slide_root,x))]
print len(TLD_Dirs),"Potential Patient directories were identified"
print subj_dir_list









    



5 Potential Patient directories were identified
['GBM', 'LGG', 'LUAD', 'SARC', 'SKCM']



In [ ]:

    
def find_rawslide_lists( slide_root_path ):
        """project_name is passed along with the potentially more than one root image path for ndpi files"""
        slide_files = []

        slide_root_path  = slide_root_path.rstrip('/')
        print slide_root_path
        for dpath, dnames, fnames in os.walk( slide_root_path, followlinks=True):
                
                for file in fnames:
                    if '.ndpi' in file or '.svs' in file:
                                slide_files.append(dpath +'/'+file)
        print len(slide_files),"SVS or NDPI files were located"
        return slide_files



In [ ]:

    
print subj_dir_list
print dsa_slide_db['RawSlideData'].count()



In [ ]:

    
### So this creates a document WITHIN the current database to store raw information about the slides
## Now that I think about this, I really should not store the filename but the File Hash as I.. want to change the filenames

for sd in subj_dir_list:
    print sd,"is being processed",
    curr_svs_slide_list = find_rawslide_lists(  os.path.join(slide_root,sd)  )
    slides_processed = newly_processed = dup_slide = rescanned_slides =  0
    for sld in curr_svs_slide_list:

        slide_name = os.path.basename(sld)
        qry = dsa_slide_db['RawSlideData'].find_one( {'slide_name':slide_name})
        #print qry
        if not qry:
            fs = os.path.getsize(sld)
            #md5Checksum = dsa.md5sum(sld)
            (openslide_could_open, width, height, filesize, orig_resolution, slide_name,md5, sld_properties) = cdsa_helpers.openslide_test_file_mongo( sld, 'ndpi', client)
            if openslide_could_open:
                prep_type = 'Unknown'
                slide_metadata = { 'slide_w_path': sld, 'slide_name': slide_name, 'file_size':fs, 'width':width, 'height':height,
                                 'orig_resolution': orig_resolution, 'sld_properties': cdsa_helpers.clean_openslide_keys ( sld_properties), 'slide_md5': md5, 'prep_type': prep_type
                                 }
                dsa_slide_db['RawSlideData'].insert_one(slide_metadata)
                newly_processed +=1 
            else:
                print "UNABLE TO OPEN FILE??",sld
                ###Need to flag/load this in to an error database

        else:
            fs = os.path.getsize(sld)
            ## Double check if file size matches
#             if qry['file_size'] != fs:
#                 #print "File size mismatch??",fs,qry['file_size'],qry['slide_w_path'],sld
#                 load_errors_db['rescanned_slides'].insert_one( {'loaded_slide': qry['slide_w_path'], 'rescanned_slide': sld}             )
#                 rescanned_slides +=1 
#             else:
#                 dup_slide +=1

    
        slides_processed +=1        
        output = "Total Processed: %d  Newly Processed: %d Dup Slides or Already Loaded: %d  RESCANNED Slides %d" % (slides_processed, newly_processed, dup_slide, rescanned_slides )
        dsa.LinePrinter(output)



In [ ]:

    
def getFeatureInfo( slide_name, group_name):
    """Queries the feature Database and determine if there are any segmentations loaded"""
    print "received %s and %s" % ( slide_name, group_name)
# print dsa_slide_db['RawSlideData'].count()
# dsa_slide_db['RawSlideData'].delete_many({})
# print dsa_slide_db['RawSlideData'].count()



In [3]:

    
all_feature_colls = feature_db.collection_names()
print all_feature_colls[5]









    



Features.V1.SARC.TCGA-DX-A6Z2-01A-01-TSA



In [ ]:

    
### I now want to reformat all of this data to make it more useful for DSA ...



In [29]:

    
all_slides = dsa_slide_db['RawSlideData'].find()

## Since this is pretty easy to recreate, I'm going to empty the current database

dsa_slide_db['PanCanDSA_Slide_Data'].delete_many({})


all_feature_colls = feature_db.collection_names()

for s in all_slides:
    slide_dict = {}
    #pt_id = s['slide_w_path'].split('/')[-1]
    slideGroup = s['slide_w_path'].split('/')[-2]
    pt_id = s['slide_w_path'].split('/')[-1].split('.')[0]
   # stain_type = s['slide_w_path'].split('/')[-2]
    stain_type = 'UNK'
    slide_dict = s.copy()
    slide_dict.pop('_id',None)
    
    slide_dict['pt_id'] = pt_id
    slide_dict['stain_type'] = stain_type
    ### Obfuscating the global file path so everything is relative to some base path for the archive/
    slide_dict['thumbnail_image'] = '/thumbnail/' + s['slide_w_path'].replace(slide_root,'')
    slide_dict['slide_w_path'] = '/DZIMS/' + s['slide_w_path'].replace(slide_root,'')+'.dzi'
#    slide_dict['slide_w_path'] = '/DZIMS' + s['slide_w_path']+'.dzi'
    slide_dict['slideGroup'] = slideGroup
    
    slide_dict['HasPathReport'] = True
    slide_dict['PathReportURL'] = "TBD"
  
    
    slide_dict['TumorType'] = slideGroup
    
    
    slide_name_noext = s['slide_name'].replace('.svs','')
    slide_dict['slide_name_noext'] = slide_name_noext
    slide_dict['slide_nouid'] = pt_id
    
    
    foundFeatureDB = False
    
    FeatureColl = "Features.V1.%s.%s" % ( slideGroup, slide_name_noext)
    
    if FeatureColl in all_feature_colls:
        foundFeatureDB = True
    else:
        FeatureColl = "Features.V1.%s.%s" % ( slideGroup, pt_id)
        if FeatureColl in all_feature_colls:
            foundFeatureDB = True

    if foundFeatureDB:
        
        FeatObjs = feature_db[FeatureColl].count()
        ##Feature database should be  Features.V1.[TumorType].SlideName (without the trailing crap)
        slide_dict['FeatureDB_CollName'] =FeatureColl
        if FeatObjs > 0:
            slide_dict['HasAnnotations'] = True
        else:
            slide_dict['HasAnnotations'] = False

        slide_dict['FeatObjs'] = FeatObjs

        dsa_slide_db['PanCanDSA_Slide_Data'].insert_one(slide_dict)



In [32]:

    
coll_list = dsa_slide_db['PanCanDSA_Slide_Data'].distinct('slideGroup')
dsa_slide_db['PanCanDSA_Slide_Data'].create_index('slideGroup')
print coll_list









    



[u'GBM', u'LGG', u'LUAD', u'SARC', u'SKCM']



In [33]:

    
#all_slides = dsa_slide_db['RawSlideData'].find()
all_colls =  feature_db.collection_names()

for a in all_colls:
    if 'Features' in a:
        feature_db[a].create_index('X')
        feature_db[a].create_index('Y')









    



---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-33-9e2bc2b5fc4d> in <module>()
      5     if 'Features' in a:
      6         feature_db[a].create_index('X')
----> 7         feature_db[a].create_index('Y')

/home/dgutman/Envs/ADRCFlask/local/lib/python2.7/site-packages/pymongo/collection.pyc in create_index(self, keys, **kwargs)
   1378         keys = helpers._index_list(keys)
   1379         name = kwargs.setdefault("name", helpers._gen_index_name(keys))
-> 1380         self.__create_index(keys, kwargs)
   1381         return name
   1382 

/home/dgutman/Envs/ADRCFlask/local/lib/python2.7/site-packages/pymongo/collection.pyc in __create_index(self, keys, index_options)
   1288             try:
   1289                 self._command(
-> 1290                     sock_info, cmd, read_preference=ReadPreference.PRIMARY)
   1291             except OperationFailure as exc:
   1292                 if exc.code in common.COMMAND_NOT_FOUND_CODES:

/home/dgutman/Envs/ADRCFlask/local/lib/python2.7/site-packages/pymongo/collection.pyc in _command(self, sock_info, command, slave_ok, read_preference, codec_options, check, allowable_errors, read_concern)
    203                                  check,
    204                                  allowable_errors,
--> 205                                  read_concern=read_concern)
    206 
    207     def __create(self, options):

/home/dgutman/Envs/ADRCFlask/local/lib/python2.7/site-packages/pymongo/pool.pyc in command(self, dbname, spec, slave_ok, read_preference, codec_options, check, allowable_errors, check_keys, read_concern)
    216         # Catch socket.error, KeyboardInterrupt, etc. and close ourselves.
    217         except BaseException as error:
--> 218             self._raise_connection_failure(error)
    219 
    220     def send_message(self, message, max_doc_size):

/home/dgutman/Envs/ADRCFlask/local/lib/python2.7/site-packages/pymongo/pool.pyc in _raise_connection_failure(self, error)
    344             _raise_connection_failure(self.address, error)
    345         else:
--> 346             raise error
    347 
    348     def __eq__(self, other):

KeyboardInterrupt:



In [ ]:

    
print dsa_slide_db['PanCanDSA_Slide_Data'].find_one({'slide_name': {"$regex": "TCGA-3B-A9I1-01Z-00-DX1"}  }
                                             
                                             )


#{ <field>: { $regex: /pattern/, $options: '<options>' } }



In [23]:

    
dsa_slide_db['PanCanDSA_Slide_Data'].count()

print feature_db['Features.V1.GBM.TCGA-02-0001-01Z-00-DX1'].count()
#feature_db['Features.V1.LGG.TCGA-CS-4941-01Z-00-DX1.86D516B5-C648-4249-8C6A-7F9A6A56CB4F'].count()



In [ ]:

    
print dsa_slide_db['DSA_Slide_Data'].distinct('pt_id')



In [ ]:

    
cur= dsa_slide_db['DSA_Slide_Data'].find({'pt_id':'ADRC50-10'})
for c in cur:
    print c



In [ ]:

    
## Going to create a cleanup and /or reformatted collection for the DSA Viewer
for s in dsa_slide_db['ADRC'].find():
    keys_of_interest = ['width','height']
    print s['slide_w_path']
    print s.keys()
    sys.exit()
###