In [1]:
# example python script to read in an xml file, assign fields, and upload to an elasticsearch endpoint
# Rich Stoner, 2013
In [20]:
# Standard library + notebook display helpers.
# (fix: `os` was imported twice in the original cell)
import os
import csv
import re
import textwrap

from IPython.core.display import HTML
In [21]:
#<item id='47541' ><slide_id><![CDATA[47541]]></slide_id><pyramid_id><![CDATA[]]></pyramid_id><image_width><![CDATA[96152]]></image_width><image_height><![CDATA[71948]]></image_height><pyramid_filename><![CDATA[TCGA-B5-A0JX-01Z-00-DX1]]></pyramid_filename><pyramid_folder><![CDATA[]]></pyramid_folder><full_file_path><![CDATA[/bigdata/RAW_SLIDE_LINKS/CDSA-LOCAL/UCEC_Diagnostic/nationwidechildrens.org_UCEC.diagnostic_images.Level_1.125.2.0/TCGA-B5-A0JX-01Z-00-DX1.D12E7005-E87A-4D42-B042-6280EA7F0D47.svs]]></full_file_path><has_annotation><![CDATA[]]></has_annotation><thumbnail><![CDATA[http://node15.cci.emory.edu/cdsa-git-dev/cgi-bin/iipsrv.fcgi?FIF=/bigdata2/PYRAMIDS/CDSA/UCEC_Diagnostic/nationwidechildrens.org_UCEC.diagnostic_images.Level_1.125.2.0/TCGA-B5-A0JX-01Z-00-DX1.D12E7005-E87A-4D42-B042-6280EA7F0D47.svs.dzi.tif&WID=200&CVT=jpeg]]></thumbnail><slide_url><![CDATA[http://node15.cci.emory.edu/cdsa-git-dev/cgi-bin/iipsrv.fcgi?DeepZoom=/bigdata2/PYRAMIDS/CDSA/UCEC_Diagnostic/nationwidechildrens.org_UCEC.diagnostic_images.Level_1.125.2.0/TCGA-B5-A0JX-01Z-00-DX1.D12E7005-E87A-4D42-B042-6280EA7F0D47.svs.dzi.tif.dzi]]></slide_url><annotation><![CDATA[1]]></annotation></item>
In [22]:
# --- configuration -------------------------------------------------------
# Input XML dump plus the Elasticsearch cluster we index it into.
xml_file_name = 'xml_cdsa_output.xml'        # CDSA slide metadata export
elastic_endpoint = 'http://192.241.156.224'  # ES host (port appended later)
elastic_port = 9200                          # default ES HTTP port
elastic_index = 'cdsa'                       # target index name
In [23]:
# Parse the XML with BeautifulSoup 3 (the pre-bs4 package name).
# fix: close the file handle deterministically instead of leaking it --
# the original passed an anonymous open() straight to the parser.
from BeautifulSoup import BeautifulSoup

with open(xml_file_name, 'r') as xml_file:
    soup = BeautifulSoup(xml_file.read())
In [24]:
#example parsing
itemList = soup.findAll('item')
print len(itemList)
a = itemList[0]
d = {}
d['id'] = str(a.attrs[0][1])
d['full_file_path'] = a.full_file_path.string
d['image_width'] = int(a.image_width.string)
d['image_height'] = int(a.image_height.string)
d['pyramid_filename'] = a.pyramid_filename.string
d['thumbnail'] = a.thumbnail.string.replace('cdsa-git-dev/', '')
d['slide_url'] = a.slide_url.string.replace('cdsa-git-dev/', '')
d['patient'] = a.full_file_path.string.split('/')[4].split('_')[0]
d['zoomify_url'] = d['slide_url'].replace('cdsa-git-dev/', '').replace('DeepZoom', 'Zoomify')[:-4]+'/ImageProperties.xml'
import pprint
pprint.pprint(d)
pprint.pprint(a)
#http://node15.cci.emory.edu/cdsa-git-dev/cgi-bin/iipsrv.fcgi?Zoomify=/bigdata2/PYRAMIDS/CDSA/UCEC_Diagnostic/nationwidechildrens.org_UCEC.diagnostic_images.Level_1.125.2.0/TCGA-B5-A0JX-01Z-00-DX1.D12E7005-E87A-4D42-B042-6280EA7F0D47.svs.dzi.tif/ImageProperties.xml
In [25]:
from elasticutils import get_es, S
from pyelasticsearch.exceptions import ElasticHttpNotFoundError
URL = '%s:%s' % ( elastic_endpoint, elastic_port)
print URL
INDEX = elastic_index
DOCTYPE = 'deepzoom_image'
In [26]:
# This creates a pyelasticsearch ElasticSearch object which we can use
# to do all our indexing.
# Network side effect: opens a client against URL defined in the cell above.
es = get_es(urls=[URL])
In [31]:
# First, delete the index.
try:
es.delete_index(INDEX)
print 'Index deleted'
except ElasticHttpNotFoundError:
print 'Index doesnt exist'
# Getting this here means the index doesn't exist, so there's
# nothing to delete.
pass
In [32]:
# Explicit field mappings for the deepzoom_image doctype.  The URL/path
# fields are 'not_analyzed' so they are stored verbatim (exact match only,
# no tokenization).
mapping = {
    DOCTYPE: {
        'properties': {
            # NOTE(review): docs are built with str(id) while the mapping
            # declares integer -- ES coerces numeric strings, but confirm.
            'id': {'type': 'integer'},
            'full_file_path': {'type': 'string', 'index': 'not_analyzed'},
            'image_width': {'type': 'integer'},
            'image_height': {'type': 'integer'},
            'pyramid_filename': {'type': 'string'},
            'patient': {'type': 'string'},
            'thumbnail': {'type': 'string', 'index': 'not_analyzed'},
            'slide_url': {'type': 'string', 'index': 'not_analyzed'},
            'zoomify_url': {'type': 'string', 'index': 'not_analyzed'},
        },
    },
}
In [33]:
# Create the index with the mapping above baked in at creation time.
es.create_index(INDEX, settings={'mappings': mapping})
Out[33]:
In [30]:
# insert a single value
# Smoke test: index the one example doc `d` built earlier; id_field tells
# pyelasticsearch to use d['id'] as the document _id.
es.bulk_index(INDEX, DOCTYPE, [d], id_field='id')
Out[30]:
In [34]:
all_items = []
for n,a in enumerate(itemList):
d = {}
d['id'] = str(a.attrs[0][1])
d['full_file_path'] = a.full_file_path.string
d['image_width'] = int(a.image_width.string)
d['image_height'] = int(a.image_height.string)
d['pyramid_filename'] = a.pyramid_filename.string
d['thumbnail'] = a.thumbnail.string.replace('cdsa-git-dev/', '')
d['slide_url'] = a.slide_url.string.replace('cdsa-git-dev/', '')
d['patient'] = a.full_file_path.string.split('/')[4].split('_')[0]
d['zoomify_url'] = d['slide_url'].replace('cdsa-git-dev/', '').replace('DeepZoom', 'Zoomify')[:-4]+'/ImageProperties.xml'
all_items.append(d)
print len(all_items)
In [35]:
# fix: a single huge bulk request times out on large collections (the old
# code worked around it by truncating at 5000 docs, silently dropping the
# rest).  Send fixed-size batches instead so every item gets indexed.
BATCH_SIZE = 500
for start in range(0, len(all_items), BATCH_SIZE):
    es.bulk_index(INDEX, DOCTYPE, all_items[start:start + BATCH_SIZE], id_field='id')
In [73]:
Out[73]:
In [ ]: