In [2]:
# Install the Apache Beam SDK with its "gcp" extra, pinned to version 2.13.0,
# into the environment of the running kernel.
%pip install apache-beam[gcp]==2.13.0


Collecting apache-beam[gcp]==2.13.0
  Downloading https://files.pythonhosted.org/packages/0c/ae/909f79572265fea57a556d84e5aae04ce2fe45c7435b25bb176f52ddd174/apache_beam-2.13.0-cp37-cp37m-macosx_10_6_intel.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl (4.0MB)
    100% |████████████████████████████████| 4.0MB 1.8MB/s ta 0:00:01
Collecting crcmod<2.0,>=1.7 (from apache-beam[gcp]==2.13.0)
  Using cached https://files.pythonhosted.org/packages/6b/b0/e595ce2a2527e169c3bcd6c33d2473c1918e0b7f6826a043ca1245dd4e5b/crcmod-1.7.tar.gz
Collecting oauth2client<4,>=2.0.1 (from apache-beam[gcp]==2.13.0)
  Using cached https://files.pythonhosted.org/packages/c0/7b/bc893e35d6ca46a72faa4b9eaac25c687ce60e1fbe978993fe2de1b0ff0d/oauth2client-3.0.0.tar.gz
Collecting pytz>=2018.3 (from apache-beam[gcp]==2.13.0)
  Using cached https://files.pythonhosted.org/packages/3d/73/fe30c2daaaa0713420d0382b16fbb761409f532c56bdcc514bf7b6262bb6/pytz-2019.1-py2.py3-none-any.whl
Collecting mock<3.0.0,>=1.0.1 (from apache-beam[gcp]==2.13.0)
  Using cached https://files.pythonhosted.org/packages/e6/35/f187bdf23be87092bd0f1200d43d23076cee4d0dec109f195173fd3ebc79/mock-2.0.0-py2.py3-none-any.whl
Collecting pyarrow<0.14.0,>=0.11.1; python_version >= "3.0" or platform_system != "Windows" (from apache-beam[gcp]==2.13.0)
  Downloading https://files.pythonhosted.org/packages/6b/34/18d38a53f393d1d84c75f0cf90bcd2aaebd40c5aa9a8ca06c80c0e679f75/pyarrow-0.13.0-cp37-cp37m-macosx_10_6_intel.whl (29.1MB)
    100% |████████████████████████████████| 29.2MB 766kB/s ta 0:00:011
Requirement already satisfied: protobuf<4,>=3.5.0.post1 in /usr/local/lib/python3.7/site-packages (from apache-beam[gcp]==2.13.0) (3.8.0)
Collecting fastavro<0.22,>=0.21.4 (from apache-beam[gcp]==2.13.0)
  Downloading https://files.pythonhosted.org/packages/7b/ed/addc70938c732420dc2808e48e6965091c2944bcd999f6606f92ce084209/fastavro-0.21.24-cp37-cp37m-macosx_10_13_x86_64.whl (379kB)
    100% |████████████████████████████████| 389kB 6.1MB/s ta 0:00:01
Collecting pydot<1.3,>=1.2.0 (from apache-beam[gcp]==2.13.0)
  Using cached https://files.pythonhosted.org/packages/c3/f1/e61d6dfe6c1768ed2529761a68f70939e2569da043e9f15a8d84bf56cadf/pydot-1.2.4.tar.gz
Collecting avro-python3<2.0.0,>=1.8.1; python_version >= "3.0" (from apache-beam[gcp]==2.13.0)
  Downloading https://files.pythonhosted.org/packages/d1/55/4c2e6fecf06cbaa68e0abaf12e1e965969872ed16da3674e6245cab0d5e2/avro-python3-1.9.0.tar.gz
Requirement already satisfied: grpcio<2,>=1.8 in /usr/local/lib/python3.7/site-packages (from apache-beam[gcp]==2.13.0) (1.21.1)
Collecting pyyaml<4.0.0,>=3.12 (from apache-beam[gcp]==2.13.0)
  Using cached https://files.pythonhosted.org/packages/9e/a3/1d13970c3f36777c583f136c136f804d70f500168edc1edea6daa7200769/PyYAML-3.13.tar.gz
Requirement already satisfied: future<1.0.0,>=0.16.0 in /usr/local/lib/python3.7/site-packages (from apache-beam[gcp]==2.13.0) (0.17.1)
Collecting dill<0.2.10,>=0.2.9 (from apache-beam[gcp]==2.13.0)
  Using cached https://files.pythonhosted.org/packages/fe/42/bfe2e0857bc284cbe6a011d93f2a9ad58a22cb894461b199ae72cfef0f29/dill-0.2.9.tar.gz
Collecting hdfs<3.0.0,>=2.1.0 (from apache-beam[gcp]==2.13.0)
  Downloading https://files.pythonhosted.org/packages/52/be/6b06a82b1ae57cf4d73445fccf37a32bd4c94c8b7630a50281686c9a670a/hdfs-2.5.7.tar.gz
Collecting httplib2<=0.12.0,>=0.8 (from apache-beam[gcp]==2.13.0)
  Downloading https://files.pythonhosted.org/packages/ce/ed/803905d670b52fa0edfdd135337e545b4496c2ab3a222f1449b7256eb99f/httplib2-0.12.0.tar.gz (218kB)
    100% |████████████████████████████████| 225kB 5.1MB/s ta 0:00:01
Collecting google-cloud-pubsub<0.40.0,>=0.39.0; extra == "gcp" (from apache-beam[gcp]==2.13.0)
  Downloading https://files.pythonhosted.org/packages/c0/9a/4455b1c1450e9b912855b58ca6eee7a27ff1e9b52e4d98c243d93256f469/google_cloud_pubsub-0.39.1-py2.py3-none-any.whl (99kB)
    100% |████████████████████████████████| 102kB 8.1MB/s ta 0:00:01
Collecting cachetools<4,>=3.1.0; extra == "gcp" (from apache-beam[gcp]==2.13.0)
  Downloading https://files.pythonhosted.org/packages/2f/a6/30b0a0bef12283e83e58c1d6e7b5aabc7acfc4110df81a4471655d33e704/cachetools-3.1.1-py2.py3-none-any.whl
Collecting google-cloud-datastore<1.8.0,>=1.7.1; extra == "gcp" (from apache-beam[gcp]==2.13.0)
  Downloading https://files.pythonhosted.org/packages/d0/aa/29cbcf8cf7d08ce2d55b9dce858f7c632b434cb6451bed17cb4275804217/google_cloud_datastore-1.7.4-py2.py3-none-any.whl (82kB)
    100% |████████████████████████████████| 92kB 269kB/s ta 0:00:01
Collecting google-apitools<0.5.29,>=0.5.28; extra == "gcp" (from apache-beam[gcp]==2.13.0)
  Downloading https://files.pythonhosted.org/packages/7f/32/df3e36fd705a00092f1ffa9f41ce1df8dcb594ae313d239b87861a41fc2e/google-apitools-0.5.28.tar.gz (172kB)
    100% |████████████████████████████████| 174kB 1.5MB/s ta 0:00:01
Collecting google-cloud-bigtable<0.33.0,>=0.31.1; extra == "gcp" (from apache-beam[gcp]==2.13.0)
  Downloading https://files.pythonhosted.org/packages/08/77/b468e209dbb0a6f614e6781f06a4894299a4c6167c2c525cc086caa7c075/google_cloud_bigtable-0.32.2-py2.py3-none-any.whl (156kB)
    100% |████████████████████████████████| 163kB 6.9MB/s ta 0:00:01
Collecting google-cloud-bigquery<1.7.0,>=1.6.0; extra == "gcp" (from apache-beam[gcp]==2.13.0)
  Using cached https://files.pythonhosted.org/packages/b7/1b/2b95f2fefddbbece38110712c225bfb5649206f4056445653bd5ca4dc86d/google_cloud_bigquery-1.6.1-py2.py3-none-any.whl
Collecting google-cloud-core<0.30.0,>=0.28.1; extra == "gcp" (from apache-beam[gcp]==2.13.0)
  Using cached https://files.pythonhosted.org/packages/0c/f2/3c225e7a69cb27d283b68bff867722bd066bc1858611180197f711815ea5/google_cloud_core-0.29.1-py2.py3-none-any.whl
Collecting pyasn1>=0.1.7 (from oauth2client<4,>=2.0.1->apache-beam[gcp]==2.13.0)
  Using cached https://files.pythonhosted.org/packages/7b/7c/c9386b82a25115cccf1903441bba3cbadcfae7b678a20167347fa8ded34c/pyasn1-0.4.5-py2.py3-none-any.whl
Collecting pyasn1-modules>=0.0.5 (from oauth2client<4,>=2.0.1->apache-beam[gcp]==2.13.0)
  Downloading https://files.pythonhosted.org/packages/91/f0/b03e00ce9fddf4827c42df1c3ce10c74eadebfb706231e8d6d1c356a4062/pyasn1_modules-0.2.5-py2.py3-none-any.whl (74kB)
    100% |████████████████████████████████| 81kB 5.0MB/s ta 0:00:01
Collecting rsa>=3.1.4 (from oauth2client<4,>=2.0.1->apache-beam[gcp]==2.13.0)
  Using cached https://files.pythonhosted.org/packages/02/e5/38518af393f7c214357079ce67a317307936896e961e35450b70fad2a9cf/rsa-4.0-py2.py3-none-any.whl
Requirement already satisfied: six>=1.6.1 in /usr/local/lib/python3.7/site-packages (from oauth2client<4,>=2.0.1->apache-beam[gcp]==2.13.0) (1.12.0)
Collecting pbr>=0.11 (from mock<3.0.0,>=1.0.1->apache-beam[gcp]==2.13.0)
  Downloading https://files.pythonhosted.org/packages/7a/db/6e2fcd67cb1c7c98f54f7f19e925f8d9b23cffb84dab45528a759215fca5/pbr-5.3.1-py2.py3-none-any.whl (108kB)
    100% |████████████████████████████████| 112kB 7.2MB/s ta 0:00:01
Requirement already satisfied: numpy>=1.14 in /usr/local/lib/python3.7/site-packages (from pyarrow<0.14.0,>=0.11.1; python_version >= "3.0" or platform_system != "Windows"->apache-beam[gcp]==2.13.0) (1.16.4)
Requirement already satisfied: setuptools in /usr/local/lib/python3.7/site-packages (from protobuf<4,>=3.5.0.post1->apache-beam[gcp]==2.13.0) (40.8.0)
Requirement already satisfied: pyparsing>=2.1.4 in /usr/local/Cellar/matplotlib/2.2.3/libexec/lib/python3.7/site-packages (from pydot<1.3,>=1.2.0->apache-beam[gcp]==2.13.0) (2.1.10)
Collecting docopt (from hdfs<3.0.0,>=2.1.0->apache-beam[gcp]==2.13.0)
Requirement already satisfied: requests>=2.7.0 in /usr/local/lib/python3.7/site-packages (from hdfs<3.0.0,>=2.1.0->apache-beam[gcp]==2.13.0) (2.22.0)
Collecting google-api-core[grpc]<2.0.0dev,>=1.6.0 (from google-cloud-pubsub<0.40.0,>=0.39.0; extra == "gcp"->apache-beam[gcp]==2.13.0)
  Downloading https://files.pythonhosted.org/packages/a2/78/bbd685dda48a291b4cc81568ed3e1a89af7a61958dc88a3d52a819b1919d/google_api_core-1.13.0-py2.py3-none-any.whl (68kB)
    100% |████████████████████████████████| 71kB 5.8MB/s ta 0:00:011
Collecting grpc-google-iam-v1<0.12dev,>=0.11.4 (from google-cloud-pubsub<0.40.0,>=0.39.0; extra == "gcp"->apache-beam[gcp]==2.13.0)
  Using cached https://files.pythonhosted.org/packages/9b/28/f26f67381cb23e81271b8d66c00a846ad9d25a909ae1ae1df8222fad2744/grpc-google-iam-v1-0.11.4.tar.gz
Collecting fasteners>=0.14 (from google-apitools<0.5.29,>=0.5.28; extra == "gcp"->apache-beam[gcp]==2.13.0)
  Downloading https://files.pythonhosted.org/packages/18/bd/55eb2d6397b9c0e263af9d091ebdb756b15756029b3cededf6461481bc63/fasteners-0.15-py2.py3-none-any.whl
Collecting google-resumable-media>=0.2.1 (from google-cloud-bigquery<1.7.0,>=1.6.0; extra == "gcp"->apache-beam[gcp]==2.13.0)
  Using cached https://files.pythonhosted.org/packages/e2/5d/4bc5c28c252a62efe69ed1a1561da92bd5af8eca0cdcdf8e60354fae9b29/google_resumable_media-0.3.2-py2.py3-none-any.whl
Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/site-packages (from requests>=2.7.0->hdfs<3.0.0,>=2.1.0->apache-beam[gcp]==2.13.0) (1.24.3)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.7/site-packages (from requests>=2.7.0->hdfs<3.0.0,>=2.1.0->apache-beam[gcp]==2.13.0) (3.0.4)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/site-packages (from requests>=2.7.0->hdfs<3.0.0,>=2.1.0->apache-beam[gcp]==2.13.0) (2019.3.9)
Requirement already satisfied: idna<2.9,>=2.5 in /usr/local/lib/python3.7/site-packages (from requests>=2.7.0->hdfs<3.0.0,>=2.1.0->apache-beam[gcp]==2.13.0) (2.8)
Requirement already satisfied: googleapis-common-protos!=1.5.4,<2.0dev,>=1.5.3 in /usr/local/lib/python3.7/site-packages (from google-api-core[grpc]<2.0.0dev,>=1.6.0->google-cloud-pubsub<0.40.0,>=0.39.0; extra == "gcp"->apache-beam[gcp]==2.13.0) (1.6.0)
Collecting google-auth<2.0dev,>=0.4.0 (from google-api-core[grpc]<2.0.0dev,>=1.6.0->google-cloud-pubsub<0.40.0,>=0.39.0; extra == "gcp"->apache-beam[gcp]==2.13.0)
  Using cached https://files.pythonhosted.org/packages/c5/9b/ed0516cc1f7609fb0217e3057ff4f0f9f3e3ce79a369c6af4a6c5ca25664/google_auth-1.6.3-py2.py3-none-any.whl
Collecting monotonic>=0.1 (from fasteners>=0.14->google-apitools<0.5.29,>=0.5.28; extra == "gcp"->apache-beam[gcp]==2.13.0)
  Using cached https://files.pythonhosted.org/packages/ac/aa/063eca6a416f397bd99552c534c6d11d57f58f2e94c14780f3bbf818c4cf/monotonic-1.5-py2.py3-none-any.whl
Building wheels for collected packages: crcmod, oauth2client, pydot, avro-python3, pyyaml, dill, hdfs, httplib2, google-apitools, grpc-google-iam-v1
  Building wheel for crcmod (setup.py) ... done
  Stored in directory: /Users/byeon/Library/Caches/pip/wheels/50/24/4d/4580ca4a299f1ad6fd63443e6e584cb21e9a07988e4aa8daac
  Building wheel for oauth2client (setup.py) ... done
  Stored in directory: /Users/byeon/Library/Caches/pip/wheels/48/f7/87/b932f09c6335dbcf45d916937105a372ab14f353a9ca431d7d
  Building wheel for pydot (setup.py) ... done
  Stored in directory: /Users/byeon/Library/Caches/pip/wheels/6a/a5/14/25541ebcdeaf97a37b6d05c7ff15f5bd20f5e91b99d313e5b4
  Building wheel for avro-python3 (setup.py) ... done
  Stored in directory: /Users/byeon/Library/Caches/pip/wheels/b1/3c/45/8a3fff4129d91ff533c3e0db9cfbbb70dcb353584e22989574
  Building wheel for pyyaml (setup.py) ... done
  Stored in directory: /Users/byeon/Library/Caches/pip/wheels/ad/da/0c/74eb680767247273e2cf2723482cb9c924fe70af57c334513f
  Building wheel for dill (setup.py) ... done
  Stored in directory: /Users/byeon/Library/Caches/pip/wheels/5b/d7/0f/e58eae695403de585269f4e4a94e0cd6ca60ec0c202936fa4a
  Building wheel for hdfs (setup.py) ... done
  Stored in directory: /Users/byeon/Library/Caches/pip/wheels/84/a0/a6/87833803b10980f631ce2a1118e6f08fa78e29d70fab182098
  Building wheel for httplib2 (setup.py) ... done
  Stored in directory: /Users/byeon/Library/Caches/pip/wheels/6d/41/4b/2b369d6e2b7eaebcdd423516d3fb659c7658c16a2be8fd04ec
  Building wheel for google-apitools (setup.py) ... done
  Stored in directory: /Users/byeon/Library/Caches/pip/wheels/d6/c2/92/837e8a4d649a209dff85b38d7fbb576b4b480738be70865f29
  Building wheel for grpc-google-iam-v1 (setup.py) ... done
  Stored in directory: /Users/byeon/Library/Caches/pip/wheels/b6/c6/31/c20321a5a3fde456fc375b7c2814135e6e98bc0d74c40239d9
Successfully built crcmod oauth2client pydot avro-python3 pyyaml dill hdfs httplib2 google-apitools grpc-google-iam-v1
Installing collected packages: crcmod, httplib2, pyasn1, pyasn1-modules, rsa, oauth2client, pytz, pbr, mock, pyarrow, fastavro, pydot, avro-python3, pyyaml, dill, docopt, hdfs, cachetools, google-auth, google-api-core, grpc-google-iam-v1, google-cloud-pubsub, google-cloud-core, google-cloud-datastore, monotonic, fasteners, google-apitools, google-cloud-bigtable, google-resumable-media, google-cloud-bigquery, apache-beam
  Found existing installation: pytz 2016.10
    Uninstalling pytz-2016.10:
      Successfully uninstalled pytz-2016.10
  Found existing installation: PyYAML 5.1.1
    Uninstalling PyYAML-5.1.1:
      Successfully uninstalled PyYAML-5.1.1
Successfully installed apache-beam-2.13.0 avro-python3-1.9.0 cachetools-3.1.1 crcmod-1.7 dill-0.2.9 docopt-0.6.2 fastavro-0.21.24 fasteners-0.15 google-api-core-1.13.0 google-apitools-0.5.28 google-auth-1.6.3 google-cloud-bigquery-1.6.1 google-cloud-bigtable-0.32.2 google-cloud-core-0.29.1 google-cloud-datastore-1.7.4 google-cloud-pubsub-0.39.1 google-resumable-media-0.3.2 grpc-google-iam-v1-0.11.4 hdfs-2.5.7 httplib2-0.12.0 mock-2.0.0 monotonic-1.5 oauth2client-3.0.0 pbr-5.3.1 pyarrow-0.13.0 pyasn1-0.4.5 pyasn1-modules-0.2.5 pydot-1.2.4 pytz-2019.1 pyyaml-3.13 rsa-4.0
Note: you may need to restart the kernel to use updated packages.

In [3]:
# Import Beam under the alias used by the pipeline code in this notebook and
# print the SDK version that was actually loaded.
import apache_beam as beam
print(beam.__version__)


/usr/local/lib/python3.7/site-packages/apache_beam/__init__.py:84: UserWarning: Running the Apache Beam SDK on Python 3 is not yet fully supported. You may encounter buggy behavior or missing features.
  'Running the Apache Beam SDK on Python 3 is not yet fully supported. '
2.13.0

In [4]:
# Cloud Storage bucket name; preprocess() writes its non-test output under
# gs://<BUCKET>/babyweight/preproc/.
BUCKET = 'cloud-training-demos-ml'
# Project id passed to the pipeline options as 'project'.
PROJECT = 'cloud-training-demos'
# Region passed to the pipeline options as 'region' and to `gsutil mb -l`.
REGION = 'us-central1'

In [5]:
import os

# Publish the notebook settings as environment variables so the %%bash cells
# can refer to them as ${BUCKET}, ${PROJECT} and ${REGION}.
os.environ.update(BUCKET=BUCKET, PROJECT=PROJECT, REGION=REGION)

In [7]:
%%bash
# Create the bucket in ${REGION} unless the output of `gsutil ls` already
# contains gs://${BUCKET}/.
if ! gsutil ls | grep -q gs://${BUCKET}/; then
  gsutil mb -l ${REGION} gs://${BUCKET}
fi

In [8]:
# SQL used for the interactive preview: selects the birth-weight label and four
# input columns from the natality table for years after 2000, plus `hashmonth`,
# the absolute FARM_FINGERPRINT of the concatenated year and month strings.
query = """
SELECT
  weight_pounds,
  is_male,
  mother_age,
  plurality,
  gestation_weeks,
  ABS(FARM_FINGERPRINT(CONCAT(CAST(YEAR AS STRING), CAST(month AS STRING)))) AS hashmonth
FROM
  publicdata.samples.natality
WHERE year > 2000
"""

In [9]:
from google.cloud import bigquery
# Run the query capped at 100 rows, load the result into a DataFrame and
# display its first rows as the cell output.
df = bigquery.Client().query(query + " LIMIT 100").to_dataframe()
df.head()


Out[9]:
weight_pounds is_male mother_age plurality gestation_weeks hashmonth
0 7.063611 True 32 1 37.0 7108882242435606404
1 4.687028 True 30 3 33.0 7170969733900686954
2 7.561856 True 20 1 39.0 6392072535155213407
3 7.561856 True 31 1 37.0 2126480030009879160
4 7.312733 True 32 1 40.0 3408502330831153141

gsutil -m cp -r gs://cloud-training-demos/babyweight/preproc gs://your-bucket/


In [ ]:
import datetime, os

def to_csv(rowdict):
    """Turn one input row into two CSV lines, each ending in a hash key.

    The first line simulates a pregnancy without an ultrasound: ``is_male``
    becomes ``'Unknown'`` and ``plurality`` is reduced to ``'Single(1)'`` or
    ``'Multiple(2+)'``. The second line keeps ``is_male`` as given and replaces
    the numeric ``plurality`` with its descriptive label.

    Args:
        rowdict: Mapping of column name to value. ``'plurality'`` must be
            present and is used as a 1-based index into the label list; any
            other CSV column that is absent is written as ``'None'``.

    Yields:
        str: ``'<comma-joined columns>,<sha224 hex digest of those columns>'``,
        first for the no-ultrasound variant, then for the ultrasound variant.
    """
    import copy
    import hashlib

    column_order = ['weight_pounds', 'is_male', 'mother_age', 'plurality', 'gestation_weeks']
    plurality_labels = ['Single(1)', 'Twins(2)', 'Triplets(3)', 'Quadruplets(4)', 'Quintuplets(5)']

    # Work on independent copies so the caller's row is never modified.
    blind_row = copy.deepcopy(rowdict)
    scanned_row = copy.deepcopy(rowdict)

    # Without an ultrasound the sex is unknown and only single-vs-multiple
    # can be told apart.
    blind_row['is_male'] = 'Unknown'
    blind_row['plurality'] = 'Multiple(2+)' if rowdict['plurality'] > 1 else 'Single(1)'

    # With an ultrasound the exact count is known; store it as a string label.
    scanned_row['plurality'] = plurality_labels[rowdict['plurality'] - 1]

    for variant in (blind_row, scanned_row):
        fields = []
        for column in column_order:
            fields.append(str(variant[column]) if column in variant else 'None')
        line = ','.join(fields)
        # The digest of the column text serves as the record key.
        digest = hashlib.sha224(line.encode('utf-8')).hexdigest()
        yield '{},{}'.format(line, digest)

def preprocess(in_test_mode):
    """Build and launch the Beam pipeline that writes train/eval CSV shards.

    For each of the 'train' and 'eval' steps the pipeline reads rows from
    BigQuery, expands every row into CSV lines with ``to_csv`` and writes them
    as text files named ``<step>.csv-...`` under the output directory. Rows
    are assigned to a step by ``MOD(ABS(hashmonth), 4)``: values 0-2 go to
    'train', value 3 goes to 'eval'.

    Args:
        in_test_mode: When truthy, run locally with the DirectRunner, limit the
            base query to 100 rows, write to ``./preproc`` (recreated from
            scratch) and block until the job finishes. Otherwise submit the job
            with the DataflowRunner, writing to
            ``gs://<BUCKET>/babyweight/preproc/``, and return without waiting.

    Relies on the notebook-level names ``beam``, ``datetime``, ``to_csv``,
    ``BUCKET``, ``PROJECT`` and ``REGION``.
    """
    import shutil, os, subprocess
    job_name = 'preprocess-babyweight-features' + '-' + datetime.datetime.now().strftime('%y%m%d-%H%M%S')

    if in_test_mode:
        print('Launching local job ... hang on')
        OUTPUT_DIR = './preproc'
        # Start from an empty local output directory.
        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
        os.makedirs(OUTPUT_DIR)
    else:
        print('Launching Dataflow job {} ... hang on'.format(job_name))
        OUTPUT_DIR = 'gs://{0}/babyweight/preproc/'.format(BUCKET)
        try:
            # Pass an argv list so the path is never re-split on whitespace.
            subprocess.check_call(['gsutil', '-m', 'rm', '-r', OUTPUT_DIR])
        except (subprocess.CalledProcessError, OSError):
            # Best-effort cleanup: a non-zero exit from gsutil or a failure to
            # start it must not stop the job from launching. Unlike a bare
            # `except`, this still lets KeyboardInterrupt/SystemExit through.
            pass

    options = {
      'staging_location': os.path.join(OUTPUT_DIR, 'tmp', 'staging'),
      'temp_location': os.path.join(OUTPUT_DIR, 'tmp'),
      'job_name': job_name,
      'project': PROJECT,
      'region': REGION,
      'teardown_policy': 'TEARDOWN_ALWAYS',
      'no_save_main_session': True,
      'max_num_workers': 6
    }
    opts = beam.pipeline.PipelineOptions(flags = [], **options)
    if in_test_mode:
        RUNNER = 'DirectRunner'
    else:
        RUNNER = 'DataflowRunner'
    # The pipeline must be created for BOTH runners. Previously this line sat
    # inside the `else` branch, so test mode failed on an unbound `p`.
    p = beam.Pipeline(RUNNER, options = opts)
    query = """
    SELECT
      weight_pounds,
      is_male,
      mother_age,
      plurality,
      gestation_weeks,
      ABS(FARM_FINGERPRINT(CONCAT(CAST(YEAR AS STRING), CAST(month AS STRING)))) AS hashmonth
    FROM
      publicdata.samples.natality
    WHERE year > 2000
    AND weight_pounds > 0
    AND mother_age > 0
    AND plurality > 0
    AND gestation_weeks > 0
    AND month > 0
        """

    if in_test_mode:
        # Keep the local run small.
        query = query + ' LIMIT 100'

    for step in ['train', 'eval']:
        # Split on the month hash so all rows of one year/month land in the
        # same dataset.
        if step == 'train':
            selquery = 'SELECT * FROM ({}) WHERE MOD(ABS(hashmonth),4) < 3'.format(query)
        else:
            selquery = 'SELECT * FROM ({}) WHERE MOD(ABS(hashmonth),4) = 3'.format(query)

        (p
         | '{}_read'.format(step) >> beam.io.Read(beam.io.BigQuerySource(query = selquery, use_standard_sql = True))
         | '{}_csv'.format(step) >> beam.FlatMap(to_csv)
         | '{}_out'.format(step) >> beam.io.Write(beam.io.WriteToText(os.path.join(OUTPUT_DIR, '{}.csv'.format(step))))
        )

    job = p.run()
    if in_test_mode:
        # Only the local run is awaited; the Dataflow job keeps running after
        # this function returns.
        job.wait_until_finish()
        print("Done!")
    
# Launch the full run: in_test_mode=False selects the DataflowRunner branch of
# preprocess(), which writes under gs://<BUCKET>/babyweight/preproc/.
preprocess(in_test_mode = False)

In [ ]:
%%bash
# List the objects under the preprocessing output prefix whose names contain
# "-00000".
gsutil ls gs://${BUCKET}/babyweight/preproc/*-00000*