The goal of this notebook is to retrieve the images associated with the set of ads provided for CP1 during the MEMEX Winter QPR 2017.
Requirements:
- An input JSON file of ads in which every ad has an `_id` attribute, e.g. test_adjusted_unlabeled.json, train_adjusted.json
- jq and parallel, e.g.:
  sudo apt-get install parallel
In [18]:
# Run parameters: edit this cell to choose which set of ads is processed.
# Directory holding the input ads file (some outputs are written here as well).
json_dir = "../data"
## train (to use: uncomment these two lines and comment out the "test" pair below)
#input_file = "train_adjusted.json"
#prefix = "train"
## test
input_file = "test_adjusted_unlabeled.json"
# Prefix prepended to every output file name, so that runs on different
# ad sets do not overwrite each other's results.
prefix = "test"
In [19]:
import os
# File names derived from the run prefix.
output_file = prefix + "_adjusted_images.json"
sha1_file = prefix + "_adjusted_images_sha1.csv"

# Input: the ads file to process.
ADS = os.path.join(json_dir, input_file)

# Outputs. Each path is named first; the lists below are built from the names.
AD_IDS = prefix + '_ad_ids.txt'                                      # unique ad ids extracted from ADS
IMAGE_DOCUMENTS = os.path.join(json_dir, output_file)                # stdout of the child-document retrieval
IMAGE_URL_TO_VALID_SHA1 = os.path.join(json_dir, sha1_file)          # presumably produced by a later step -- TODO confirm
IMAGE_DOCUMENTS_JOBLOG = prefix + '_image_documents_retrieval.joblog'  # parallel job log of the retrieval step
IMAGE_URLS = prefix + '_image_urls.txt'                              # unique image URLs
IMAGE_JOBLOG = prefix + '_image_retrieval.joblog'                    # parallel job log of the download step
IMAGE_URL_SHA1 = prefix + '_image_url_sha1.csv'                      # stdout of the image download step

# Declared inputs and outputs of this notebook.
# NOTE(review): presumably read by an external dependency-tracking tool -- confirm.
# The order of __dest__ is kept exactly as before in case anything indexes into it.
__depends__ = [ADS]
__dest__ = [
    AD_IDS,
    IMAGE_DOCUMENTS,
    IMAGE_URL_TO_VALID_SHA1,
    IMAGE_DOCUMENTS_JOBLOG,
    IMAGE_URLS,
    IMAGE_JOBLOG,
    IMAGE_URL_SHA1,
]
In [20]:
import csv
In [21]:
# Extract the `_id` field of every ad in $ADS (-r: raw strings, without JSON quotes)
# and write the sorted, de-duplicated ids to $AD_IDS.
!jq -r '._id' $ADS | sort --unique > $AD_IDS
In [22]:
# find documents from ES that are children of the ad ids (meaning, images)
# GNU parallel reads the ad ids from $AD_IDS (--arg-file) and passes them as
# arguments to ../scripts/get_es_child_documents.py:
#   --max-args 50 : at most 50 ids per script invocation
#   --jobs 10     : at most 10 invocations running at the same time
#   --retries 3   : failed invocations are retried
#   --joblog      : per-invocation log written to $IMAGE_DOCUMENTS_JOBLOG
# The stdout of all invocations is redirected into the single file $IMAGE_DOCUMENTS.
# NOTE(review): the script is not visible here; presumably it prints one JSON
# document per image -- confirm.
!parallel --joblog $IMAGE_DOCUMENTS_JOBLOG \
--retries 3 \
--arg-file $AD_IDS \
--max-args 50 \
--jobs 10 \
python ../scripts/get_es_child_documents.py > $IMAGE_DOCUMENTS
In [23]:
#parse error: Invalid numeric literal at line 113991, column 18
#due to timeout?
!jq -r '.obj_stored_url' $IMAGE_DOCUMENTS | sort --unique > $IMAGE_URLS
num_image_documents = !wc -l $IMAGE_DOCUMENTS
print num_image_documents
num_image_urls = !wc -l $IMAGE_URLS
print num_image_urls
In [24]:
# download images
# GNU parallel reads the image URLs from $IMAGE_URLS (--arg-file) and passes them
# as arguments to ../scripts/image_download.py:
#   --max-args 1 : exactly one URL per script invocation
#   --jobs 20    : at most 20 invocations running at the same time
#   --retries 3  : failed invocations are retried
#   --joblog     : per-invocation log written to $IMAGE_JOBLOG
# The stdout of all invocations is redirected into the single file $IMAGE_URL_SHA1.
# NOTE(review): the script is not visible here; judging by the output file name
# it presumably prints a URL/SHA1 CSV row per image -- confirm.
!parallel --joblog $IMAGE_JOBLOG \
--retries 3 \
--arg-file $IMAGE_URLS \
--max-args 1 \
--jobs 20 \
python ../scripts/image_download.py > $IMAGE_URL_SHA1
In [ ]: