This script downloads all of the stage 1 raw data for the Kaggle Data Science Bowl 2017 (https://www.kaggle.com/c/data-science-bowl-2017).
The raw data is pulled from the competition's data page: https://www.kaggle.com/c/data-science-bowl-2017/data
In [1]:
from urllib import request
import zipfile, io
from pathlib import Path
import os
import re
from pyunpack import Archive
In [2]:
def extract_files(url, orig_dir=os.getcwd()):
    """Download the archive at *url* into ``data/raw/`` and unpack it.

    The working directory is moved two levels up (``../..``) for the duration
    of the call, so the relative ``data/raw/`` path is resolved from there.
    It is switched to *orig_dir* again before returning, even when the
    download or the extraction fails.

    Args:
        url: Address of the archive. The local file name is the last
            ``name.ext`` token found in it (e.g. ``stage1.7z``).
        orig_dir: Directory to change to once the work is done. NOTE: the
            default is evaluated once, when the function is defined, so it is
            the working directory at definition time, not at call time.

    Raises:
        ValueError: If no ``name.ext`` token can be found in *url*.
    """
    # Derive the file name before touching the working directory so that a
    # malformed URL leaves the process state untouched.
    match = re.search(r'(\w+)(\.\w+)+(?!.*(\w+)(\.\w+)+)', url)
    if match is None:
        raise ValueError(
            'cannot derive an archive file name from URL: {!r}'.format(url)
        )
    file_name = 'data/raw/' + match.group(0)

    os.chdir('../..')
    try:
        request.urlretrieve(url, filename=file_name)
        # '' is handed to pyunpack as the extraction directory; presumably it
        # resolves to the current working directory -- TODO confirm.
        Archive(file_name).extractall('')
    finally:
        # Always undo the chdir; otherwise a failed download would leave every
        # later call starting from the wrong directory.
        os.chdir(orig_dir)
#, filename='data/raw/' + re.findall('(\w+)(\.\w+)+(?!.*(\w+)(\.\w+)+)', url)[0][0]
In [3]:
# Read the list of download URLs from .dataurl, one entry per line.
# Surrounding whitespace is trimmed and blank lines are skipped so that an
# empty string is never handed on as a URL.
with open('.dataurl') as file:
    urls = [
        line.strip()
        for line in file.read().splitlines()
        if line.strip()
    ]
In [5]:
# Fetch and unpack each listed archive in turn; the first failure aborts
# the remaining downloads because exceptions are not caught here.
for url in urls:
    extract_files(url=url)