import pymongo
import os,csv,sys
import openslide
import dsa_mongo_common_functions as dsa
import cdsa_loader_helper_functions as cdsa_helpers
import pprint
from os.path import join as oj
# Single MongoDB connection shared by every database handle in this script.
client = pymongo.MongoClient(host='localhost', port=27017)
# Handle to the 'DSA_LoadErrors' database.
dsa_load_errors_db = client.get_database('DSA_LoadErrors')

# In [2]:
# --- Project-specific configuration -------------------------------------
# These need to be configured for the specific project.
dsa_slide_db = client['PanCanDSA_Slide_Data']
# These paths are specific to a given file system and/or directory structure.
slide_root = '/bigdata/PanCan_Images/'  # Base path for slides
feature_file_root = '/bigdata/PanCan_FeatureData/'

feature_db = client['PanCan_BoundsOnly_V2']  # I'm going to make one for each cancer type.

# To generalize this, the directory organization needs to be described; the
# most common layout will be PATIENT/STAIN_TYPE as subdirectories.
TLD_Dirs = [x for x in os.listdir(slide_root) if os.path.isdir(oj(slide_root, x))]
# The rest of the script iterates over `subj_dir_list`, which was never
# assigned anywhere; it refers to the top-level directories found above.
subj_dir_list = TLD_Dirs
print("%d Potential Patient directories were identified" % len(TLD_Dirs))
print(subj_dir_list)

# 5 Potential Patient directories were identified
# ['GBM', 'LGG', 'LUAD', 'SARC', 'SKCM']

def find_rawslide_lists(slide_root_path):
    """Recursively collect the whole-slide image files below a directory.

    The tree rooted at ``slide_root_path`` is walked (symbolic links to
    directories are followed) and every file whose name ends in ``.ndpi``
    or ``.svs`` (case-insensitive) is returned. Derived files that merely
    contain one of those extensions in their name, such as ``x.svs.dzi``,
    are ignored.

    Args:
        slide_root_path: Directory to search; trailing slashes are ignored.

    Returns:
        A list of file paths, in the order ``os.walk`` yields them. The
        list is empty when the directory does not exist or holds no slides.
    """
    slide_extensions = ('.ndpi', '.svs')

    slide_root_path = slide_root_path.rstrip('/')
    print(slide_root_path)

    slide_files = []
    for dpath, _dnames, fnames in os.walk(slide_root_path, followlinks=True):
        for fname in fnames:
            # Match on the real extension, not on a substring of the name.
            if fname.lower().endswith(slide_extensions):
                slide_files.append(os.path.join(dpath, fname))

    print("%d SVS or NDPI files were located" % len(slide_files))
    return slide_files

# Sanity check before ingesting: the directories that will be scanned and
# the number of documents currently in the RawSlideData collection.
print(subj_dir_list)
print(dsa_slide_db['RawSlideData'].count())

### This creates a document WITHIN the current database to store raw information about each slide.
## On reflection, the file hash rather than the filename should be stored, since filenames may change.

for sd in subj_dir_list:
    # No newline here, so the scan output that follows continues on the same line.
    sys.stdout.write("%s is being processed " % sd)
    curr_svs_slide_list = find_rawslide_lists(os.path.join(slide_root, sd))
    slides_processed = newly_processed = dup_slide = rescanned_slides = 0

    for sld in curr_svs_slide_list:
        slide_name = os.path.basename(sld)
        qry = dsa_slide_db['RawSlideData'].find_one({'slide_name': slide_name})

        if not qry:
            # Slide has not been loaded yet: probe it and record its metadata.
            fs = os.path.getsize(sld)
            #md5Checksum = dsa.md5sum(sld)
            (openslide_could_open, width, height, filesize, orig_resolution,
             slide_name, md5, sld_properties) = cdsa_helpers.openslide_test_file_mongo(sld, 'ndpi', client)

            if openslide_could_open:
                prep_type = 'Unknown'
                slide_metadata = {
                    'slide_w_path': sld,
                    'slide_name': slide_name,
                    'file_size': fs,
                    'width': width,
                    'height': height,
                    'orig_resolution': orig_resolution,
                    'sld_properties': cdsa_helpers.clean_openslide_keys(sld_properties),
                    'slide_md5': md5,
                    'prep_type': prep_type,
                }
                # NOTE(review): the statement that stored the metadata was
                # missing; RawSlideData is the collection queried above for
                # already-loaded slides, so it is presumed to be the target.
                dsa_slide_db['RawSlideData'].insert_one(slide_metadata)
                newly_processed += 1
            else:
                print("UNABLE TO OPEN FILE?? %s" % sld)
                ### Need to flag/load this in to an error database
        else:
            # A slide with this name is already loaded.
            dup_slide += 1
            ## Double check if file size matches (currently disabled).
            ## NOTE(review): `load_errors_db` below is not defined anywhere
            ## visible; `dsa_load_errors_db` is presumably what was meant.
#             fs = os.path.getsize(sld)
#             if qry['file_size'] != fs:
#                 #print "File size mismatch??",fs,qry['file_size'],qry['slide_w_path'],sld
#                 load_errors_db['rescanned_slides'].insert_one( {'loaded_slide': qry['slide_w_path'], 'rescanned_slide': sld}             )
#                 rescanned_slides +=1
#             else:
#                 dup_slide +=1

        slides_processed += 1

    # Report the totals once per top-level directory.
    output = "Total Processed: %d  Newly Processed: %d Dup Slides or Already Loaded: %d  RESCANNED Slides %d" % (slides_processed, newly_processed, dup_slide, rescanned_slides )
    print(output)

def getFeatureInfo(slide_name, group_name):
    """Report the slide and group a feature lookup was requested for.

    This is a stub: it is meant to query the feature database for
    segmentations loaded for the slide, but at present it only prints the
    two arguments it received.

    Args:
        slide_name: Name of the slide to look up.
        group_name: Name of the group the slide belongs to.

    Returns:
        None.
    """
    print("received %s and %s" % (slide_name, group_name))
# print dsa_slide_db['RawSlideData'].count()
# dsa_slide_db['RawSlideData'].delete_many({})
# print dsa_slide_db['RawSlideData'].count()

# In [3]:
# List the collections of the feature database and peek at one of them.
all_feature_colls = feature_db.collection_names()
print(all_feature_colls[5])


### I now want to reformat all of this data to make it more useful for DSA ...

# In [29]:
all_slides = dsa_slide_db['RawSlideData'].find()

## Since this is pretty easy to recreate, I'm going to empty the current database
## NOTE(review): no statement that empties anything is present here. The
## reformatted documents are upserted below instead, so re-running is safe,
## but documents for slides that no longer exist are not removed.

all_feature_colls = feature_db.collection_names()
# Set copy for constant-time membership tests inside the loop.
known_feature_colls = set(all_feature_colls)

for s in all_slides:
    # Paths look like <slide_root>/<slideGroup>/.../<pt_id>.<rest>.<ext>
    path_parts = s['slide_w_path'].split('/')
    slideGroup = path_parts[-2]
    pt_id = path_parts[-1].split('.')[0]
    # stain_type = s['slide_w_path'].split('/')[-2]
    stain_type = 'UNK'

    slide_dict = s.copy()
    slide_dict['pt_id'] = pt_id
    slide_dict['stain_type'] = stain_type
    ### Obfuscating the global file path so everything is relative to some base path for the archive/
    relative_path = s['slide_w_path'].replace(slide_root, '')
    slide_dict['thumbnail_image'] = '/thumbnail/' + relative_path
    slide_dict['slide_w_path'] = '/DZIMS/' + relative_path + '.dzi'
#    slide_dict['slide_w_path'] = '/DZIMS' + s['slide_w_path']+'.dzi'
    slide_dict['slideGroup'] = slideGroup
    slide_dict['HasPathReport'] = True
    slide_dict['PathReportURL'] = "TBD"
    slide_dict['TumorType'] = slideGroup
    slide_name_noext = s['slide_name'].replace('.svs', '')
    slide_dict['slide_name_noext'] = slide_name_noext
    slide_dict['slide_nouid'] = pt_id

    ## Feature collections are named Features.V1.[TumorType].SlideName
    ## (without the trailing parts). Try the extension-less slide name first
    ## and fall back to the bare patient id only if that is not found, so a
    ## successful match is never overwritten by the fallback name.
    foundFeatureDB = False
    for candidate in (slide_name_noext, pt_id):
        FeatureColl = "Features.V1.%s.%s" % (slideGroup, candidate)
        if FeatureColl in known_feature_colls:
            foundFeatureDB = True
            break

    if foundFeatureDB:
        FeatObjs = feature_db[FeatureColl].count()
        slide_dict['FeatureDB_CollName'] = FeatureColl
        # True only when the collection actually holds feature objects.
        slide_dict['HasAnnotations'] = FeatObjs > 0
        slide_dict['FeatObjs'] = FeatObjs

    # NOTE(review): the statement that stored slide_dict was missing. The
    # queries that follow read 'slideGroup' from 'PanCanDSA_Slide_Data', so
    # that collection is presumed to be the target. Confirm before running.
    dsa_slide_db['PanCanDSA_Slide_Data'].replace_one(
        {'_id': slide_dict['_id']}, slide_dict, upsert=True)


# In [32]:
# Distinct slide groups present in the reformatted slide collection.
coll_list = dsa_slide_db['PanCanDSA_Slide_Data'].distinct('slideGroup')
print(coll_list)

# [u'GBM', u'LGG', u'LUAD', u'SARC', u'SKCM']

# In [33]:
#all_slides = dsa_slide_db['RawSlideData'].find()
all_colls = feature_db.collection_names()

for a in all_colls:
    if 'Features' in a:
        # NOTE(review): the body of this branch is missing from the source,
        # so what was done with each feature collection is unknown.
        pass

# Look up one slide by a regular-expression match on its name.
print(dsa_slide_db['PanCanDSA_Slide_Data'].find_one({'slide_name': {"$regex": "TCGA-3B-A9I1-01Z-00-DX1"}}))

#{ <field>: { $regex: /pattern/, $options: '<options>' } }

# In [23]:

# Number of feature objects stored for a single GBM slide.
print(feature_db['Features.V1.GBM.TCGA-02-0001-01Z-00-DX1'].count())


# Distinct patient ids in DSA_Slide_Data, then every document for one of them.
print(dsa_slide_db['DSA_Slide_Data'].distinct('pt_id'))

cur = dsa_slide_db['DSA_Slide_Data'].find({'pt_id': 'ADRC50-10'})
for c in cur:
    print(c)

## Going to create a cleaned-up and/or reformatted collection for the DSA Viewer
keys_of_interest = ['width', 'height']  # loop-invariant, so built once
for s in dsa_slide_db['ADRC'].find():
    print(s['slide_w_path'])
    # list(...) keeps the printed form a plain list on Python 2 and 3 alike.
    print(list(s.keys()))