Import Dependencies


In [5]:
# Environment at time of execution
%load_ext watermark
%pylab inline
%watermark -a "Anthony Abercrombie" -d -t -v -p numpy,pandas,matplotlib -g


The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
Anthony Abercrombie 2017-01-28 13:19:02 

CPython 3.5.2
IPython 5.1.0

numpy 1.11.2
pandas 0.19.2+0.g825876c.dirty
matplotlib 1.5.1
Git hash: a95b6218ad551104f5918a31702c6ed9b0d316e3

In [16]:
from __future__ import print_function

import os

import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import dotenv

In [97]:
import os
import sys
import dotenv
import subprocess
import glob
from tqdm import tqdm

#File path to get to the project root
PROJ_ROOT = os.path.join(os.path.pardir, os.pardir)
# add local python functions
sys.path.append(os.path.join(PROJ_ROOT, "src"))

#Load AWS keys as environment variables
dotenv_path = os.path.join(PROJ_ROOT, '.env')
dotenv.load_dotenv(dotenv_path)

AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")

from __future__ import print_function
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the "autoreload" extension
%load_ext autoreload
# always reload modules marked with "%aimport"
%autoreload 1


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Download zipfiles from Kaggle

In order to download data from Kaggle, you normally need to login to Kaggle and accept terms and conditions. I've done this and exported my cookies into a file kaggle_cookies.txt that you can use to bypass the process. This will allow you to download the data using wget.

I'm showing the code here in markdown to prevent repeat downloads.

#sample_submission.csv --- 14.89 kb
!wget -x -c --load-cookies=../../src/data/kaggle_cookies.txt -P ../../data/raw -nH --cut-dirs=5 https://www.kaggle.com/c/dstl-satellite-imagery-feature-detection/download/sample_submission.csv.zip

#grid_sizes.csv --- 2.17 kb
!wget -x -c --load-cookies=../../src/data/kaggle_cookies.txt -P ../../data/raw -nH --cut-dirs=5 https://www.kaggle.com/c/dstl-satellite-imagery-feature-detection/download/grid_sizes.csv.zip

#sixteen band --- 7.30 kb
!wget -x -c --load-cookies=../../src/data/kaggle_cookies.txt -P ../../data/raw -nH --cut-dirs=5 https://www.kaggle.com/c/dstl-satellite-imagery-feature-detection/download/sixteen_band.zip

#three_band --- 12.87 kb
!wget -x -c --load-cookies=../../src/data/kaggle_cookies.txt -P ../../data/raw -nH --cut-dirs=5 https://www.kaggle.com/c/dstl-satellite-imagery-feature-detection/download/three_band.zip

#train_geojson_v3 --- 14.22 mb
!wget -x -c --load-cookies=../../src/data/kaggle_cookies.txt -P ../../data/raw -nH --cut-dirs=5 https://www.kaggle.com/c/dstl-satellite-imagery-feature-detection/download/train_geojson_v3.zip

#train_wkt_v4.csv --- 11.08 mb
!wget -x -c --load-cookies=../../src/data/kaggle_cookies.txt -P ../../data/raw -nH --cut-dirs=5 https://www.kaggle.com/c/dstl-satellite-imagery-feature-detection/download/train_wkt_v4.csv.zip

Uploading files to S3


In [60]:
#Export AWS Credentials to connect with S3
export_aws_creds_cmd = 'export AWS_SECRET_ACCESS_KEY={} \nexport AWS_ACCESS_KEY_ID={}'.format(AWS_SECRET_ACCESS_KEY,AWS_ACCESS_KEY)
#Execute command in the shell
subprocess.call(export_aws_creds_cmd, shell = True)


Out[60]:
0

In [93]:
#Name of the s3 bucket to dump files in
s3_bucket_name = 'dstl-satellite-imagery'

In [94]:
#Files we want to upload
files_to_upload = glob.glob('{}/data/raw/*'.format(PROJ_ROOT))
print(files_to_upload)


['../../data/raw/grid_sizes.csv.zip', '../../data/raw/sample_submission.csv.zip', '../../data/raw/sixteen_band.zip', '../../data/raw/three_band.zip', '../../data/raw/train_geojson_v3.zip', '../../data/raw/train_wkt_v4.csv.zip']

In [108]:
#Upload files to s3 with s3_multipart_upload.py
for f in tqdm(files_to_upload):
    upload_cmd = 's3put -b {} --multipart {}'.format(s3_bucket_name, f)
    subprocess.call(upload_cmd, shell = True)


100%|██████████| 6/6 [00:34<00:00,  4.26s/it]

In [ ]: