In [0]:
# change these to try this notebook out
BUCKET = 'gcp-learn-209814.appspot.com'
PROJECT = 'gcp-learn-209814'
REGION = 'ASIA-NORTHEAST1'

In [0]:
import os
os.environ['BUCKET'] = BUCKET
os.environ['PROJECT'] = PROJECT
os.environ['REGION'] = REGION

In [14]:
! gcloud config set project $PROJECT
! gcloud config set compute/region $REGION


Updated property [core/project].
Updated property [compute/region].

In [15]:
!pip3 install datalab


Collecting datalab
  Downloading https://files.pythonhosted.org/packages/34/20/3b4963045b1f5ebb79253a2b22225530faecf563f293a19acbbbbc097ddf/datalab-1.1.4.tar.gz (1.3MB)
    100% |████████████████████████████████| 1.3MB 5.9MB/s 
Collecting configparser>=3.5.0 (from datalab)
  Downloading https://files.pythonhosted.org/packages/7c/69/c2ce7e91c89dc073eb1aa74c0621c3eefbffe8216b3f9af9d3885265c01c/configparser-3.5.0.tar.gz
Collecting mock>=2.0.0 (from datalab)
  Downloading https://files.pythonhosted.org/packages/e6/35/f187bdf23be87092bd0f1200d43d23076cee4d0dec109f195173fd3ebc79/mock-2.0.0-py2.py3-none-any.whl (56kB)
    100% |████████████████████████████████| 61kB 11.5MB/s 
Requirement already satisfied: future>=0.16.0 in /usr/local/lib/python3.6/dist-packages (from datalab) (0.16.0)
Collecting google-cloud>=0.30.0 (from datalab)
  Downloading https://files.pythonhosted.org/packages/ba/b1/7c54d1950e7808df06642274e677dbcedba57f75307adf2e5ad8d39e5e0e/google_cloud-0.34.0-py2.py3-none-any.whl
Requirement already satisfied: google-api-python-client>=1.6.2 in /usr/local/lib/python3.6/dist-packages (from datalab) (1.6.7)
Requirement already satisfied: seaborn>=0.7.0 in /usr/local/lib/python3.6/dist-packages (from datalab) (0.7.1)
Requirement already satisfied: plotly>=1.12.5 in /usr/local/lib/python3.6/dist-packages (from datalab) (1.12.12)
Requirement already satisfied: httplib2>=0.10.3 in /usr/local/lib/python3.6/dist-packages (from datalab) (0.11.3)
Requirement already satisfied: oauth2client>=2.2.0 in /usr/local/lib/python3.6/dist-packages (from datalab) (4.1.2)
Requirement already satisfied: pandas>=0.22.0 in /usr/local/lib/python3.6/dist-packages (from datalab) (0.22.0)
Requirement already satisfied: google_auth_httplib2>=0.0.2 in /usr/local/lib/python3.6/dist-packages (from datalab) (0.0.3)
Collecting pandas-profiling>=1.0.0a2 (from datalab)
  Downloading https://files.pythonhosted.org/packages/a7/7c/84f15ee705793a3cdd43bc65e6166d65d36f743b815ea517b02582989533/pandas_profiling-1.4.1-py2.py3-none-any.whl
Requirement already satisfied: python-dateutil>=2.5.0 in /usr/local/lib/python3.6/dist-packages (from datalab) (2.5.3)
Requirement already satisfied: pytz>=2015.4 in /usr/local/lib/python3.6/dist-packages (from datalab) (2018.5)
Requirement already satisfied: pyyaml>=3.11 in /usr/local/lib/python3.6/dist-packages (from datalab) (3.13)
Requirement already satisfied: requests>=2.9.1 in /usr/local/lib/python3.6/dist-packages (from datalab) (2.18.4)
Requirement already satisfied: scikit-image>=0.13.0 in /usr/local/lib/python3.6/dist-packages (from datalab) (0.13.1)
Requirement already satisfied: scikit-learn>=0.18.2 in /usr/local/lib/python3.6/dist-packages (from datalab) (0.19.2)
Requirement already satisfied: ipykernel>=4.5.2 in /usr/local/lib/python3.6/dist-packages (from datalab) (4.6.1)
Requirement already satisfied: psutil>=4.3.0 in /usr/local/lib/python3.6/dist-packages (from datalab) (5.4.7)
Requirement already satisfied: jsonschema>=2.6.0 in /usr/local/lib/python3.6/dist-packages (from datalab) (2.6.0)
Collecting six==1.10.0 (from datalab)
  Downloading https://files.pythonhosted.org/packages/c8/0a/b6723e1bc4c516cb687841499455a8505b44607ab535be01091c0f24f079/six-1.10.0-py2.py3-none-any.whl
Requirement already satisfied: urllib3>=1.22 in /usr/local/lib/python3.6/dist-packages (from datalab) (1.22)
Collecting pbr>=0.11 (from mock>=2.0.0->datalab)
  Downloading https://files.pythonhosted.org/packages/69/1c/98cba002ed975a91a0294863d9c774cc0ebe38e05bbb65e83314550b1677/pbr-4.2.0-py2.py3-none-any.whl (100kB)
    100% |████████████████████████████████| 102kB 8.4MB/s 
Requirement already satisfied: uritemplate<4dev,>=3.0.0 in /usr/local/lib/python3.6/dist-packages (from google-api-python-client>=1.6.2->datalab) (3.0.0)
Requirement already satisfied: rsa>=3.1.4 in /usr/local/lib/python3.6/dist-packages (from oauth2client>=2.2.0->datalab) (3.4.2)
Requirement already satisfied: pyasn1-modules>=0.0.5 in /usr/local/lib/python3.6/dist-packages (from oauth2client>=2.2.0->datalab) (0.2.2)
Requirement already satisfied: pyasn1>=0.1.7 in /usr/local/lib/python3.6/dist-packages (from oauth2client>=2.2.0->datalab) (0.4.4)
Requirement already satisfied: numpy>=1.9.0 in /usr/local/lib/python3.6/dist-packages (from pandas>=0.22.0->datalab) (1.14.5)
Requirement already satisfied: google-auth in /usr/local/lib/python3.6/dist-packages (from google_auth_httplib2>=0.0.2->datalab) (1.4.2)
Requirement already satisfied: jinja2>=2.8 in /usr/local/lib/python3.6/dist-packages (from pandas-profiling>=1.0.0a2->datalab) (2.10)
Requirement already satisfied: matplotlib>=1.4 in /usr/local/lib/python3.6/dist-packages (from pandas-profiling>=1.0.0a2->datalab) (2.1.2)
Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests>=2.9.1->datalab) (3.0.4)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests>=2.9.1->datalab) (2018.8.13)
Requirement already satisfied: idna<2.7,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests>=2.9.1->datalab) (2.6)
Requirement already satisfied: pillow>=2.1.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image>=0.13.0->datalab) (4.0.0)
Requirement already satisfied: networkx>=1.8 in /usr/local/lib/python3.6/dist-packages (from scikit-image>=0.13.0->datalab) (2.1)
Requirement already satisfied: scipy>=0.17.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image>=0.13.0->datalab) (0.19.1)
Requirement already satisfied: PyWavelets>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from scikit-image>=0.13.0->datalab) (0.5.2)
Requirement already satisfied: tornado>=4.0 in /usr/local/lib/python3.6/dist-packages (from ipykernel>=4.5.2->datalab) (4.5.3)
Requirement already satisfied: jupyter-client in /usr/local/lib/python3.6/dist-packages (from ipykernel>=4.5.2->datalab) (5.2.3)
Requirement already satisfied: ipython>=4.0.0 in /usr/local/lib/python3.6/dist-packages (from ipykernel>=4.5.2->datalab) (5.5.0)
Requirement already satisfied: traitlets>=4.1.0 in /usr/local/lib/python3.6/dist-packages (from ipykernel>=4.5.2->datalab) (4.3.2)
Requirement already satisfied: cachetools>=2.0.0 in /usr/local/lib/python3.6/dist-packages (from google-auth->google_auth_httplib2>=0.0.2->datalab) (2.1.0)
Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.6/dist-packages (from jinja2>=2.8->pandas-profiling>=1.0.0a2->datalab) (1.0)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=1.4->pandas-profiling>=1.0.0a2->datalab) (0.10.0)
Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.6/dist-packages (from matplotlib>=1.4->pandas-profiling>=1.0.0a2->datalab) (2.2.0)
Requirement already satisfied: olefile in /usr/local/lib/python3.6/dist-packages (from pillow>=2.1.0->scikit-image>=0.13.0->datalab) (0.45.1)
Requirement already satisfied: decorator>=4.1.0 in /usr/local/lib/python3.6/dist-packages (from networkx>=1.8->scikit-image>=0.13.0->datalab) (4.3.0)
Requirement already satisfied: pyzmq>=13 in /usr/local/lib/python3.6/dist-packages (from jupyter-client->ipykernel>=4.5.2->datalab) (16.0.4)
Requirement already satisfied: jupyter-core in /usr/local/lib/python3.6/dist-packages (from jupyter-client->ipykernel>=4.5.2->datalab) (4.4.0)
Requirement already satisfied: simplegeneric>0.8 in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0->ipykernel>=4.5.2->datalab) (0.8.1)
Requirement already satisfied: pexpect; sys_platform != "win32" in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0->ipykernel>=4.5.2->datalab) (4.6.0)
Requirement already satisfied: pickleshare in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0->ipykernel>=4.5.2->datalab) (0.7.4)
Requirement already satisfied: pygments in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0->ipykernel>=4.5.2->datalab) (2.1.3)
Requirement already satisfied: prompt-toolkit<2.0.0,>=1.0.4 in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0->ipykernel>=4.5.2->datalab) (1.0.15)
Requirement already satisfied: setuptools>=18.5 in /usr/local/lib/python3.6/dist-packages (from ipython>=4.0.0->ipykernel>=4.5.2->datalab) (39.1.0)
Requirement already satisfied: ipython-genutils in /usr/local/lib/python3.6/dist-packages (from traitlets>=4.1.0->ipykernel>=4.5.2->datalab) (0.2.0)
Requirement already satisfied: ptyprocess>=0.5 in /usr/local/lib/python3.6/dist-packages (from pexpect; sys_platform != "win32"->ipython>=4.0.0->ipykernel>=4.5.2->datalab) (0.6.0)
Requirement already satisfied: wcwidth in /usr/local/lib/python3.6/dist-packages (from prompt-toolkit<2.0.0,>=1.0.4->ipython>=4.0.0->ipykernel>=4.5.2->datalab) (0.1.7)
Building wheels for collected packages: datalab, configparser
  Running setup.py bdist_wheel for datalab ... - \ | / - done
  Stored in directory: /content/.cache/pip/wheels/a0/49/91/b042b0fc2f6f1b418c0d4adc9aee43af3e1b6784367c6700df
  Running setup.py bdist_wheel for configparser ... - done
  Stored in directory: /content/.cache/pip/wheels/a3/61/79/424ef897a2f3b14684a7de5d89e8600b460b89663e6ce9d17c
Successfully built datalab configparser
Installing collected packages: configparser, six, pbr, mock, google-cloud, pandas-profiling, datalab
  Found existing installation: six 1.11.0
    Uninstalling six-1.11.0:
      Successfully uninstalled six-1.11.0
Successfully installed configparser-3.5.0 datalab-1.1.4 google-cloud-0.34.0 mock-2.0.0 pandas-profiling-1.4.1 pbr-4.2.0 six-1.10.0

In [0]:
from google.colab import auth
auth.authenticate_user()

In [0]:
query = """
#standardsql
WITH bicycle_rentals AS (
  SELECT
    COUNT(starttime) as num_trips,
    EXTRACT(DATE from starttime) as trip_date,
    MAX(EXTRACT(DAYOFWEEK from starttime)) as day_of_week,
    start_station_id
  FROM `bigquery-public-data.new_york.citibike_trips`
  GROUP BY trip_date, start_station_id
),

rainy_days AS
(
SELECT
  date,
  (MAX(prcp) > 5) AS rainy
FROM (
  SELECT
    wx.date AS date,
    IF (wx.element = 'PRCP', wx.value/10, NULL) AS prcp
  FROM
    `bigquery-public-data.ghcn_d.ghcnd_2016` AS wx
  WHERE
    wx.id = 'USW00094728'
)
GROUP BY
  date
)

SELECT
  num_trips,
  day_of_week,
  start_station_id,
  rainy
FROM bicycle_rentals AS bk
JOIN rainy_days AS wx
ON wx.date = bk.trip_date
"""
import google.datalab.bigquery as bq
df = bq.Query(query).execute().result().to_dataframe()

In [19]:
# shuffle the dataframe to make it easier to split into train/eval later
df = df.sample(frac=1.0)
df.head()


Out[19]:
num_trips day_of_week start_station_id rainy
74907 13 5 3082 False
35134 3 2 3058 True
8916 33 1 367 False
89727 13 5 2022 False
28807 13 2 406 False

In [21]:
df.dtypes


Out[21]:
num_trips           int64
day_of_week         int64
start_station_id    int64
rainy                bool
dtype: object

In [22]:
import numpy as np
df = df.astype({'num_trips': np.float32, 'day_of_week': np.int32, 'start_station_id': np.int32, 'rainy': str})
df.dtypes


Out[22]:
num_trips           float32
day_of_week           int32
start_station_id      int32
rainy                object
dtype: object

In [0]:
df['num_trips'] = df['num_trips'] / 1000.0

In [24]:
num_train = (int) (len(df) * 0.8)
train_df = df.iloc[:num_train]
eval_df  = df.iloc[num_train:]
print("Split into {} training examples and {} evaluation examples".format(len(train_df), len(eval_df)))


Split into 104148 training examples and 26037 evaluation examples

In [26]:
train_df.head()


Out[26]:
num_trips day_of_week start_station_id rainy
74907 0.013 5 3082 False
35134 0.003 2 3058 True
8916 0.033 1 367 False
89727 0.013 5 2022 False
28807 0.013 2 406 False

In [27]:
import tensorflow as tf
import pandas as pd

def make_input_fn(indf, num_epochs):
  return tf.estimator.inputs.pandas_input_fn(
    indf,
    indf['num_trips'],
    num_epochs=num_epochs,
    shuffle=True)

def serving_input_fn():
    feature_placeholders = {
      'day_of_week': tf.placeholder(tf.int32, [None]),
      'start_station_id': tf.placeholder(tf.int32, [None]),
      'rainy': tf.placeholder(tf.string, [None])
    }
    features = {
        key: tf.expand_dims(tensor, -1)
        for key, tensor in feature_placeholders.items()
    }
    return tf.estimator.export.ServingInputReceiver(features, feature_placeholders)
  
def train_and_evaluate(output_dir, nsteps):
  station_embed = tf.feature_column.embedding_column(
      tf.feature_column.categorical_column_with_hash_bucket('start_station_id', 5000, tf.int32), 2)
  feature_cols = [
    tf.feature_column.categorical_column_with_identity('day_of_week', num_buckets = 8),
    station_embed,
    tf.feature_column.categorical_column_with_vocabulary_list('rainy', ['false', 'true'])
  ]
  estimator = tf.estimator.LinearRegressor(
                       model_dir = output_dir,
                       feature_columns = feature_cols)
  train_spec=tf.estimator.TrainSpec(
                       input_fn = make_input_fn(train_df, None),
                       max_steps = nsteps)
  exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)
  eval_spec=tf.estimator.EvalSpec(
                       input_fn = make_input_fn(eval_df, 1),
                       steps = None,
                       start_delay_secs = 1, # start evaluating after N seconds
                       throttle_secs = 10,  # evaluate every N seconds
                       exporters = exporter)
  tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
  
import shutil
OUTDIR='./model_trained'
shutil.rmtree(OUTDIR, ignore_errors=True)
train_and_evaluate(OUTDIR, 10)


INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': './model_trained', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f2c7091f390>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after every checkpoint. Checkpoint frequency is determined based on RunConfig arguments: save_checkpoints_steps None or save_checkpoints_secs 600.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into ./model_trained/model.ckpt.
INFO:tensorflow:loss = 1.4860361, step = 1
INFO:tensorflow:Saving checkpoints for 10 into ./model_trained/model.ckpt.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-08-16-03:25:24
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./model_trained/model.ckpt-10
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-08-16-03:25:24
INFO:tensorflow:Saving dict for global step 10: average_loss = 0.0057319324, global_step = 10, label/mean = 0.078333765, loss = 0.73158, prediction/mean = 0.08281881
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 10: ./model_trained/model.ckpt-10
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Signatures INCLUDED in export for Classify: None
INFO:tensorflow:Signatures INCLUDED in export for Regress: None
INFO:tensorflow:Signatures INCLUDED in export for Predict: ['predict']
INFO:tensorflow:Signatures INCLUDED in export for Train: None
INFO:tensorflow:Signatures INCLUDED in export for Eval: None
INFO:tensorflow:Signatures EXCLUDED from export because they cannot be be served via TensorFlow Serving APIs:
INFO:tensorflow:'serving_default' : Regression input must be a single string Tensor; got {'day_of_week': <tf.Tensor 'Placeholder:0' shape=(?,) dtype=int32>, 'start_station_id': <tf.Tensor 'Placeholder_1:0' shape=(?,) dtype=int32>, 'rainy': <tf.Tensor 'Placeholder_2:0' shape=(?,) dtype=string>}
INFO:tensorflow:'regression' : Regression input must be a single string Tensor; got {'day_of_week': <tf.Tensor 'Placeholder:0' shape=(?,) dtype=int32>, 'start_station_id': <tf.Tensor 'Placeholder_1:0' shape=(?,) dtype=int32>, 'rainy': <tf.Tensor 'Placeholder_2:0' shape=(?,) dtype=string>}
WARNING:tensorflow:Export includes no default signature!
INFO:tensorflow:Restoring parameters from ./model_trained/model.ckpt-10
INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: ./model_trained/export/exporter/temp-b'1534389925'/saved_model.pb
INFO:tensorflow:Loss for final step: 0.8201783.

In [30]:
%%writefile test.json
{"day_of_week": 3, "start_station_id": 384, "rainy": "false"}
{"day_of_week": 4, "start_station_id": 384, "rainy": "true"}


Writing test.json

In [32]:
!ls


adc.json  datalab  model_trained  sample_data  test.json

In [33]:
%%bash
EXPORTDIR=./model_trained/export/exporter/
MODELDIR=$(ls $EXPORTDIR | tail -1)
gcloud ml-engine local predict --model-dir=${EXPORTDIR}/${MODELDIR} --json-instances=./test.json


ERROR: (gcloud.ml-engine.local.predict) RuntimeError: Bad magic number in .pyc file


In [0]: