In [5]:
# Environment at time of execution
%load_ext watermark
%pylab inline
%watermark -a "Anthony Abercrombie" -d -t -v -p numpy,pandas,matplotlib -g
In [16]:
from __future__ import print_function
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dotenv
In [97]:
import os
import sys
import dotenv
import subprocess
import glob
from tqdm import tqdm
#File path to get to the project root
PROJ_ROOT = os.path.join(os.path.pardir, os.pardir)
# add local python functions
sys.path.append(os.path.join(PROJ_ROOT, "src"))
#Load AWS keys as environment variables
dotenv_path = os.path.join(PROJ_ROOT, '.env')
dotenv.load_dotenv(dotenv_path)
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY")
AWS_SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY")
from __future__ import print_function
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the "autoreload" extension
%load_ext autoreload
# always reload modules marked with "%aimport"
%autoreload 1
In order to download data from Kaggle, you normally need to login to Kaggle and accept terms and conditions. I've done this and exported my cookies into a file kaggle_cookies.txt that you can use to bypass the process. This will allow you to download the data using wget.
I'm showing the code here in markdown to prevent repeat downloads.
#sample_submission.csv --- 14.89 kb
!wget -x -c --load-cookies=../../src/data/kaggle_cookies.txt -P ../../data/raw -nH --cut-dirs=5 https://www.kaggle.com/c/dstl-satellite-imagery-feature-detection/download/sample_submission.csv.zip
#grid_sizes.csv --- 2.17 kb
!wget -x -c --load-cookies=../../src/data/kaggle_cookies.txt -P ../../data/raw -nH --cut-dirs=5 https://www.kaggle.com/c/dstl-satellite-imagery-feature-detection/download/grid_sizes.csv.zip
#sixteen_band.zip --- 7.30 GB
!wget -x -c --load-cookies=../../src/data/kaggle_cookies.txt -P ../../data/raw -nH --cut-dirs=5 https://www.kaggle.com/c/dstl-satellite-imagery-feature-detection/download/sixteen_band.zip
#three_band.zip --- 12.87 GB
!wget -x -c --load-cookies=../../src/data/kaggle_cookies.txt -P ../../data/raw -nH --cut-dirs=5 https://www.kaggle.com/c/dstl-satellite-imagery-feature-detection/download/three_band.zip
#train_geojson_v3 --- 14.22 mb
!wget -x -c --load-cookies=../../src/data/kaggle_cookies.txt -P ../../data/raw -nH --cut-dirs=5 https://www.kaggle.com/c/dstl-satellite-imagery-feature-detection/download/train_geojson_v3.zip
#train_wkt_v4.csv --- 11.08 mb
!wget -x -c --load-cookies=../../src/data/kaggle_cookies.txt -P ../../data/raw -nH --cut-dirs=5 https://www.kaggle.com/c/dstl-satellite-imagery-feature-detection/download/train_wkt_v4.csv.zip
In [60]:
# Make AWS credentials visible to child processes (e.g. the `s3put` calls
# below).
# BUG FIX: the original ran `subprocess.call('export ...', shell=True)`,
# which sets the variables only inside that short-lived child shell — they
# never reach this notebook process or any later subprocess. Assigning into
# os.environ mutates THIS process's environment, which every subsequently
# spawned child inherits. It also avoids interpolating secrets into a
# shell command string.
# NOTE(review): falls back to '' when a key is absent from .env — confirm
# that failing fast would not be preferable here.
os.environ['AWS_ACCESS_KEY_ID'] = AWS_ACCESS_KEY or ''
os.environ['AWS_SECRET_ACCESS_KEY'] = AWS_SECRET_ACCESS_KEY or ''
Out[60]:
In [93]:
#Name of the s3 bucket to dump files in (must already exist / be creatable
#with the AWS credentials loaded above)
s3_bucket_name = 'dstl-satellite-imagery'
In [94]:
# Collect every file under data/raw for upload to S3
raw_data_pattern = '{}/data/raw/*'.format(PROJ_ROOT)
files_to_upload = glob.glob(raw_data_pattern)
print(files_to_upload)
In [108]:
#Upload files to s3 with s3_multipart_upload.py
# Upload each raw data file to S3 via boto's `s3put` CLI.
for f in tqdm(files_to_upload):
    # Pass arguments as a list instead of a shell=True string: filenames
    # containing spaces or shell metacharacters are then handled safely
    # (the original interpolated `f` into a shell command unquoted).
    upload_cmd = ['s3put', '-b', s3_bucket_name, '--multipart', f]
    return_code = subprocess.call(upload_cmd)
    # Surface failures instead of silently discarding the exit status
    if return_code != 0:
        print('upload failed (exit {}): {}'.format(return_code, f))
In [ ]: