In [ ]:
import happybase as hb
import elasticsearch as es
import json
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
In [ ]:
# Path of the JSON configuration file read into `conf` further down.
conf_fn = '../conf/global_var_remotehbase_release.json'
# Row key looked up in the HBase table named by conf['HBI_table_sha1infos'].
sha1 = "FDDADAD0CBB16245FB1BAB9CAFB8E87994CBA7A1"
In [ ]:
# Parse the JSON configuration; the context manager closes the file handle
# as soon as parsing is done instead of leaving it to the garbage collector.
with open(conf_fn, 'rt') as conf_file:
    conf = json.load(conf_file)
In [ ]:
def get_all_cdr_ids(sha1, conf):
    """Return the 'info:all_cdr_ids' cell stored in HBase for `sha1`.

    Args:
        sha1: row key to look up.
        conf: configuration dict; 'HBI_host' gives the HBase host and
            'HBI_table_sha1infos' the table name.

    Returns:
        The raw value of the 'info:all_cdr_ids' column of that row.
        (Presumably a comma-separated string of ids -- the Elasticsearch
        lookup in this notebook splits it on ','.)

    Raises:
        KeyError: if the row has no 'info:all_cdr_ids' column.
    """
    conn = hb.Connection(conf['HBI_host'])
    try:
        sha1_table = conn.table(conf['HBI_table_sha1infos'])
        row = sha1_table.row(sha1)
        return row['info:all_cdr_ids']
    finally:
        # Always release the connection, even when the lookup fails.
        conn.close()
In [ ]:
def get_s3urls_cdr_ids(cdr_ids, conf):
    """Look up the 'obj_stored_url' field of each document id in `cdr_ids`.

    Args:
        cdr_ids: comma-separated string of Elasticsearch document ids.
        conf: configuration dict; reads 'ist_els_user', 'ist_els_pass',
            'ist_els_instance', 'ist_els_index' and 'ist_els_doc_type'.

    Returns:
        A list of (document id, obj_stored_url field value) tuples, one per
        hit returned by Elasticsearch.

    Raises:
        KeyError: if a hit carries no 'fields' / 'obj_stored_url' entry.
    """
    from elasticsearch import Elasticsearch

    id_values = cdr_ids.split(',')
    # Build the query as a structure rather than by string concatenation so
    # that ids containing quotes or backslashes are escaped correctly.
    query = {
        "fields": ["obj_stored_url"],
        # Elasticsearch returns only 10 hits unless a size is given; ask for
        # as many hits as there are ids so that none is silently dropped.
        "size": len(id_values),
        "query": {"ids": {"values": id_values}},
    }

    es_url = ('https://' + conf['ist_els_user'] + ':' + conf['ist_els_pass']
              + '@' + conf['ist_els_instance'])
    # Named es_client so that it does not shadow the module alias `es`.
    es_client = Elasticsearch(es_url)
    response = es_client.search(index=conf['ist_els_index'],
                                doc_type=conf['ist_els_doc_type'],
                                body=query)

    # The matching documents live under hits -> hits.
    return [(hit['_id'], hit['fields']['obj_stored_url'])
            for hit in response['hits']['hits']]
In [ ]:
def _split_s3_url(s3_url):
    """Split a path-style S3 URL into (bucket, key).

    Raises ValueError when 's3.amazonaws.com' is not one of the
    '/'-separated components of the URL.
    """
    parts = s3_url.split("/")
    # list.index raises ValueError when the host component is absent.
    bucket_pos = parts.index('s3.amazonaws.com') + 1
    return parts[bucket_pos], '/'.join(parts[bucket_pos + 1:])


def dl_image(s3_url):
    """Download an image from S3 and return it as a PIL image.

    Args:
        s3_url: URL of the form '.../s3.amazonaws.com/<bucket>/<key>'.

    Returns:
        The PIL image, or None when the URL is not an S3 URL or when the
        download/decoding fails (a message is printed in both cases).
    """
    failure_msg = "Could not download image from {}. Error was: {}"
    try:
        bucket, key = _split_s3_url(s3_url)
    except ValueError:
        print("{} is not a s3 url.".format(s3_url))
        return None
    except Exception as err:
        print(failure_msg.format(s3_url, err))
        return None

    # Third-party imports are deferred until a download is actually needed.
    import io
    import boto3
    from PIL import Image

    try:
        # Download into memory with boto3, then decode the bytes with PIL.
        buf = io.BytesIO()
        boto3.client('s3').download_fileobj(bucket, key, buf)
        # The download leaves the cursor at the end of the buffer.
        buf.seek(0)
        return Image.open(buf)
    except Exception as err:
        print(failure_msg.format(s3_url, err))
        return None
In [ ]:
def disp_image_list(image_list, sha1):
    """Download every image referenced in `image_list` and display them.

    Args:
        image_list: iterable of (id, url_field) pairs; the first element of
            each url_field is converted to str and downloaded.
        sha1: accepted but not used here.
    """
    from IPython.display import display

    # Failed downloads come back as None and are passed to display as-is.
    downloaded = [dl_image(str(url_field[0])) for _unused_id, url_field in image_list]
    display(*downloaded)
In [ ]:
# Fetch the 'info:all_cdr_ids' value stored in HBase for the chosen sha1.
cdr_ids = get_all_cdr_ids(sha1, conf)
#print cdr_ids
In [ ]:
# Query Elasticsearch for the stored URL of each of those ids.
image_list = get_s3urls_cdr_ids(cdr_ids, conf)
In [ ]:
# Download the images and render them in the notebook output.
disp_image_list(image_list, sha1)