Raw Data Download

This scripts downloads all of the stage 1 raw data for the Kaggle Data Science Bowl 2017 (https://www.kaggle.com/c/data-science-bowl-2017)

We are pulling the raw data from the data page https://www.kaggle.com/c/data-science-bowl-2017/data


In [1]:
from urllib import request
import zipfile, io
from pathlib import Path
import os
import re
from pyunpack import Archive

Defining function to download and extract all raw data


In [2]:
def extract_files(url, orig_dir=os.getcwd()):
    os.chdir('../..')
    file_name = 'data/raw/' + re.search('(\w+)(\.\w+)+(?!.*(\w+)(\.\w+)+)', url).group(0)
    request.urlretrieve(url, filename=file_name)
    Archive(file_name).extractall('')
    os.chdir(orig_dir)
    #, filename='data/raw/' + re.findall('(\w+)(\.\w+)+(?!.*(\w+)(\.\w+)+)', url)[0][0]

Grabbing all the stored raw data urls


In [3]:
with open('.dataurl') as file:
    urls = file.read().splitlines()

In [5]:
for url in urls:
    extract_files(url)


---------------------------------------------------------------------------
FileNotFoundError                         Traceback (most recent call last)
<ipython-input-5-3e5ea1611f80> in <module>()
      1 for url in urls:
----> 2     extract_files(url)

<ipython-input-2-1a43494c1f1f> in extract_files(url, orig_dir)
      2     os.chdir('../..')
      3     file_name = 'data/raw/' + re.search('(\w+)(\.\w+)+(?!.*(\w+)(\.\w+)+)', url).group(0)
----> 4     request.urlretrieve(url, filename=file_name)
      5     Archive(file_name).extractall('')
      6     os.chdir(orig_dir)

C:\Users\burksa\AppData\Local\Continuum\Anaconda3\lib\urllib\request.py in urlretrieve(url, filename, reporthook, data)
    196         # Handle temporary file setup.
    197         if filename:
--> 198             tfp = open(filename, 'wb')
    199         else:
    200             tfp = tempfile.NamedTemporaryFile(delete=False)

FileNotFoundError: [Errno 2] No such file or directory: 'data/raw/stage1_labels.csv.zip'