In [1]:
# example python script to read in an xml file, assign fields, and upload to an elasticsearch endpoint

# Rich Stoner, 2013

In [20]:
import os, textwrap
import csv
import re
from IPython.core.display import HTML
import os

In [21]:
#<item id='47541' ><slide_id><![CDATA[47541]]></slide_id><pyramid_id><![CDATA[]]></pyramid_id><image_width><![CDATA[96152]]></image_width><image_height><![CDATA[71948]]></image_height><pyramid_filename><![CDATA[TCGA-B5-A0JX-01Z-00-DX1]]></pyramid_filename><pyramid_folder><![CDATA[]]></pyramid_folder><full_file_path><![CDATA[/bigdata/RAW_SLIDE_LINKS/CDSA-LOCAL/UCEC_Diagnostic/nationwidechildrens.org_UCEC.diagnostic_images.Level_1.125.2.0/TCGA-B5-A0JX-01Z-00-DX1.D12E7005-E87A-4D42-B042-6280EA7F0D47.svs]]></full_file_path><has_annotation><![CDATA[]]></has_annotation><thumbnail><![CDATA[http://node15.cci.emory.edu/cdsa-git-dev/cgi-bin/iipsrv.fcgi?FIF=/bigdata2/PYRAMIDS/CDSA/UCEC_Diagnostic/nationwidechildrens.org_UCEC.diagnostic_images.Level_1.125.2.0/TCGA-B5-A0JX-01Z-00-DX1.D12E7005-E87A-4D42-B042-6280EA7F0D47.svs.dzi.tif&WID=200&CVT=jpeg]]></thumbnail><slide_url><![CDATA[http://node15.cci.emory.edu/cdsa-git-dev/cgi-bin/iipsrv.fcgi?DeepZoom=/bigdata2/PYRAMIDS/CDSA/UCEC_Diagnostic/nationwidechildrens.org_UCEC.diagnostic_images.Level_1.125.2.0/TCGA-B5-A0JX-01Z-00-DX1.D12E7005-E87A-4D42-B042-6280EA7F0D47.svs.dzi.tif.dzi]]></slide_url><annotation><![CDATA[1]]></annotation></item>

In [22]:
xml_file_name = 'xml_cdsa_output.xml'
elastic_endpoint = 'http://192.241.156.224'
elastic_port = 9200
elastic_index = 'cdsa'

In [23]:
from BeautifulSoup import BeautifulSoup

soup = BeautifulSoup(open(xml_file_name, 'r'))


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-23-f1b54432574b> in <module>()
      1 from BeautifulSoup import BeautifulSoup
      2 
----> 3 soup = BeautifulSoup(open(xml_file_name, 'r'))

/Library/Python/2.7/site-packages/BeautifulSoup.pyc in __init__(self, *args, **kwargs)
   1520             kwargs['smartQuotesTo'] = self.HTML_ENTITIES
   1521         kwargs['isHTML'] = True
-> 1522         BeautifulStoneSoup.__init__(self, *args, **kwargs)
   1523 
   1524     SELF_CLOSING_TAGS = buildTagMap(None,

/Library/Python/2.7/site-packages/BeautifulSoup.pyc in __init__(self, markup, parseOnlyThese, fromEncoding, markupMassage, smartQuotesTo, convertEntities, selfClosingTags, isHTML)
   1145         self.markupMassage = markupMassage
   1146         try:
-> 1147             self._feed(isHTML=isHTML)
   1148         except StopParsing:
   1149             pass

/Library/Python/2.7/site-packages/BeautifulSoup.pyc in _feed(self, inDocumentEncoding, isHTML)
   1187         self.reset()
   1188 
-> 1189         SGMLParser.feed(self, markup)
   1190         # Close out any unfinished strings and close all the open tags.
   1191         self.endData()

/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/sgmllib.pyc in feed(self, data)
    102 
    103         self.rawdata = self.rawdata + data
--> 104         self.goahead(0)
    105 
    106     def close(self):

/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/sgmllib.pyc in goahead(self, end)
    136                         i = i+1
    137                         continue
--> 138                     k = self.parse_starttag(i)
    139                     if k < 0: break
    140                     i = k

/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/sgmllib.pyc in parse_starttag(self, i)
    294             j = j+1
    295         self.__starttag_text = rawdata[start_pos:j]
--> 296         self.finish_starttag(tag, attrs)
    297         return j
    298 

/System/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/sgmllib.pyc in finish_starttag(self, tag, attrs)
    336                 method = getattr(self, 'do_' + tag)
    337             except AttributeError:
--> 338                 self.unknown_starttag(tag, attrs)
    339                 return -1
    340             else:

/Library/Python/2.7/site-packages/BeautifulSoup.pyc in unknown_starttag(self, name, attrs, selfClosing)
   1339 
   1340         if not self.isSelfClosingTag(name) and not selfClosing:
-> 1341             self._smartPop(name)
   1342 
   1343         if self.parseOnlyThese and len(self.tagStack) <= 1 \

/Library/Python/2.7/site-packages/BeautifulSoup.pyc in _smartPop(self, name)
   1306         inclusive = True
   1307         for i in range(len(self.tagStack)-1, 0, -1):
-> 1308             p = self.tagStack[i]
   1309             if (not p or p.name == name) and not isNestable:
   1310                 #Non-nestable tags get popped to the top or to their

KeyboardInterrupt: 

In [24]:
#example parsing

itemList = soup.findAll('item')

print len(itemList)

a = itemList[0]
d = {}

d['id'] = str(a.attrs[0][1])
d['full_file_path'] = a.full_file_path.string
d['image_width'] = int(a.image_width.string)
d['image_height'] = int(a.image_height.string)
d['pyramid_filename'] = a.pyramid_filename.string
d['thumbnail'] = a.thumbnail.string.replace('cdsa-git-dev/', '')
d['slide_url'] = a.slide_url.string.replace('cdsa-git-dev/', '')
d['patient'] = a.full_file_path.string.split('/')[4].split('_')[0]
d['zoomify_url'] = d['slide_url'].replace('cdsa-git-dev/', '').replace('DeepZoom', 'Zoomify')[:-4]+'/ImageProperties.xml'

import pprint
pprint.pprint(d)

pprint.pprint(a)
#http://node15.cci.emory.edu/cdsa-git-dev/cgi-bin/iipsrv.fcgi?Zoomify=/bigdata2/PYRAMIDS/CDSA/UCEC_Diagnostic/nationwidechildrens.org_UCEC.diagnostic_images.Level_1.125.2.0/TCGA-B5-A0JX-01Z-00-DX1.D12E7005-E87A-4D42-B042-6280EA7F0D47.svs.dzi.tif/ImageProperties.xml


23883
{'full_file_path': u'/bigdata/RAW_SLIDE_LINKS/CDSA-LOCAL/UCEC_Diagnostic/nationwidechildrens.org_UCEC.diagnostic_images.Level_1.125.2.0/TCGA-B5-A0JX-01Z-00-DX1.D12E7005-E87A-4D42-B042-6280EA7F0D47.svs',
 'id': '47541',
 'image_height': 71948,
 'image_width': 96152,
 'patient': u'UCEC',
 'pyramid_filename': u'TCGA-B5-A0JX-01Z-00-DX1',
 'slide_url': u'http://node15.cci.emory.edu/cgi-bin/iipsrv.fcgi?DeepZoom=/bigdata2/PYRAMIDS/CDSA/UCEC_Diagnostic/nationwidechildrens.org_UCEC.diagnostic_images.Level_1.125.2.0/TCGA-B5-A0JX-01Z-00-DX1.D12E7005-E87A-4D42-B042-6280EA7F0D47.svs.dzi.tif.dzi',
 'thumbnail': u'http://node15.cci.emory.edu/cgi-bin/iipsrv.fcgi?FIF=/bigdata2/PYRAMIDS/CDSA/UCEC_Diagnostic/nationwidechildrens.org_UCEC.diagnostic_images.Level_1.125.2.0/TCGA-B5-A0JX-01Z-00-DX1.D12E7005-E87A-4D42-B042-6280EA7F0D47.svs.dzi.tif&WID=200&CVT=jpeg',
 'zoomify_url': u'http://node15.cci.emory.edu/cgi-bin/iipsrv.fcgi?Zoomify=/bigdata2/PYRAMIDS/CDSA/UCEC_Diagnostic/nationwidechildrens.org_UCEC.diagnostic_images.Level_1.125.2.0/TCGA-B5-A0JX-01Z-00-DX1.D12E7005-E87A-4D42-B042-6280EA7F0D47.svs.dzi.tif/ImageProperties.xml'}
<item id="47541"><slide_id><![CDATA[47541]]></slide_id><pyramid_id><![CDATA[ ]]></pyramid_id><image_width><![CDATA[96152]]></image_width><image_height><![CDATA[71948]]></image_height><pyramid_filename><![CDATA[TCGA-B5-A0JX-01Z-00-DX1]]></pyramid_filename><pyramid_folder><![CDATA[ ]]></pyramid_folder><full_file_path><![CDATA[/bigdata/RAW_SLIDE_LINKS/CDSA-LOCAL/UCEC_Diagnostic/nationwidechildrens.org_UCEC.diagnostic_images.Level_1.125.2.0/TCGA-B5-A0JX-01Z-00-DX1.D12E7005-E87A-4D42-B042-6280EA7F0D47.svs]]></full_file_path><has_annotation><![CDATA[ ]]></has_annotation><thumbnail><![CDATA[http://node15.cci.emory.edu/cdsa-git-dev/cgi-bin/iipsrv.fcgi?FIF=/bigdata2/PYRAMIDS/CDSA/UCEC_Diagnostic/nationwidechildrens.org_UCEC.diagnostic_images.Level_1.125.2.0/TCGA-B5-A0JX-01Z-00-DX1.D12E7005-E87A-4D42-B042-6280EA7F0D47.svs.dzi.tif&amp;WID=200&amp;CVT=jpeg]]></thumbnail><slide_url><![CDATA[http://node15.cci.emory.edu/cdsa-git-dev/cgi-bin/iipsrv.fcgi?DeepZoom=/bigdata2/PYRAMIDS/CDSA/UCEC_Diagnostic/nationwidechildrens.org_UCEC.diagnostic_images.Level_1.125.2.0/TCGA-B5-A0JX-01Z-00-DX1.D12E7005-E87A-4D42-B042-6280EA7F0D47.svs.dzi.tif.dzi]]></slide_url><annotation><![CDATA[1]]></annotation></item>

In [25]:
from elasticutils import get_es, S
from pyelasticsearch.exceptions import ElasticHttpNotFoundError

URL = '%s:%s' % ( elastic_endpoint, elastic_port)
print URL
INDEX = elastic_index
DOCTYPE = 'deepzoom_image'


http://192.241.156.224:9200

In [26]:
# This creates a pyelasticsearch ElasticSearch object which we can use
# to do all our indexing.
es = get_es(urls=[URL])

In [31]:
# First, delete the index.
try:
    es.delete_index(INDEX)
    print 'Index deleted'
except ElasticHttpNotFoundError:
    print 'Index doesnt exist'
    # Getting this here means the index doesn't exist, so there's
    # nothing to delete.
    pass


Index deleted

In [32]:
mapping = {
    DOCTYPE: {
        'properties': {
            'id': {'type': 'integer'},
            'full_file_path': {'type': 'string', 'index': 'not_analyzed'},
            'image_width': {'type': 'integer'},
            'image_height': {'type': 'integer'},            
            'pyramid_filename': {'type': 'string'},
            'patient' : { 'type' : 'string' },
            'thumbnail': {'type' : 'string', 'index': 'not_analyzed'},
            'slide_url': {'type' : 'string', 'index': 'not_analyzed'},
            'zoomify_url': {'type' : 'string', 'index': 'not_analyzed'}
            }
        }
    }

In [33]:
es.create_index(INDEX, settings={'mappings': mapping})


Out[33]:
{u'acknowledged': True}

In [30]:
# insert a single value
es.bulk_index(INDEX, DOCTYPE, [d], id_field='id')


Out[30]:
{u'errors': False,
 u'items': [{u'index': {u'_id': u'47541',
    u'_index': u'cdsa',
    u'_type': u'deepzoom_image',
    u'_version': 1,
    u'status': 201}}],
 u'took': 317}

In [34]:
all_items = []

for n,a in enumerate(itemList):
    d = {}
    d['id'] = str(a.attrs[0][1])
    d['full_file_path'] = a.full_file_path.string
    d['image_width'] = int(a.image_width.string)
    d['image_height'] = int(a.image_height.string)
    d['pyramid_filename'] = a.pyramid_filename.string
    d['thumbnail'] = a.thumbnail.string.replace('cdsa-git-dev/', '')
    d['slide_url'] = a.slide_url.string.replace('cdsa-git-dev/', '')
    d['patient'] = a.full_file_path.string.split('/')[4].split('_')[0]
    d['zoomify_url'] = d['slide_url'].replace('cdsa-git-dev/', '').replace('DeepZoom', 'Zoomify')[:-4]+'/ImageProperties.xml'
 
    all_items.append(d)
    
print len(all_items)


23883

In [35]:
# this will timeout on large files ( =) ) 
es.bulk_index(INDEX, DOCTYPE, all_items[0:5000], id_field='id')


---------------------------------------------------------------------------
Timeout                                   Traceback (most recent call last)
<ipython-input-35-1f37ff23514c> in <module>()
      1 # this will timeout on large files ( =) )
----> 2 es.bulk_index(INDEX, DOCTYPE, all_items[0:5000], id_field='id')

/Library/Python/2.7/site-packages/pyelasticsearch/client.pyc in decorate(*args, **kwargs)
     94                 elif k in convertible_args:
     95                     query_params[k] = kwargs.pop(k)
---> 96             return func(*args, query_params=query_params, **kwargs)
     97         return decorate
     98     return decorator

/Library/Python/2.7/site-packages/pyelasticsearch/client.pyc in bulk_index(self, index, doc_type, docs, id_field, parent_field, query_params)
    386                                  body,
    387                                  encode_body=False,
--> 388                                  query_params=query_params)
    389 
    390     @es_kwargs('routing', 'parent', 'replication', 'consistency', 'refresh')

/Library/Python/2.7/site-packages/pyelasticsearch/client.pyc in send_request(self, method, path_components, body, query_params, encode_body)
    236                     url,
    237                     timeout=self.timeout,
--> 238                     **({'data': request_body} if body else {}))
    239             except (ConnectionError, Timeout):
    240                 self.servers.mark_dead(server_url)

/Library/Python/2.7/site-packages/requests/sessions.pyc in post(self, url, data, **kwargs)
    423         """
    424 
--> 425         return self.request('POST', url, data=data, **kwargs)
    426 
    427     def put(self, url, data=None, **kwargs):

/Library/Python/2.7/site-packages/requests/sessions.pyc in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert)
    381             'allow_redirects': allow_redirects,
    382         }
--> 383         resp = self.send(prep, **send_kwargs)
    384 
    385         return resp

/Library/Python/2.7/site-packages/requests/sessions.pyc in send(self, request, **kwargs)
    484         start = datetime.utcnow()
    485         # Send the request
--> 486         r = adapter.send(request, **kwargs)
    487         # Total elapsed time of the request (approximately)
    488         r.elapsed = datetime.utcnow() - start

/Library/Python/2.7/site-packages/requests/adapters.pyc in send(self, request, stream, timeout, verify, cert, proxies)
    385                 raise SSLError(e)
    386             elif isinstance(e, TimeoutError):
--> 387                 raise Timeout(e)
    388             else:
    389                 raise

Timeout: HTTPConnectionPool(host='192.241.156.224', port=9200): Read timed out. (read timeout=5)

In [73]:



Out[73]:
{u'acknowledged': True, u'ok': True}

In [ ]: