In [ ]:
#!/usr/bin/python
"""Create a Google BigQuery linear regression input table.
In the code below, the following actions are taken:
* A new dataset is created "natality_regression."
* A new table "regression_input" is created to hold the inputs for our linear
regression.
* A query is run against the public dataset,
bigquery-public-data.samples.natality, selecting only the data of interest
to the regression, the output of which is stored in the "regression_input"
table.
* The output table is moved over the wire to the user's default project via
the built-in BigQuery Connector for Spark that bridges BigQuery and Cloud
Dataproc.
"""
from google.cloud import bigquery
from google.cloud.bigquery import SchemaField
from google.cloud.bigquery import table
# Create a new Google BigQuery client using Google Cloud Platform project
# defaults.
bigquery_client = bigquery.Client()
# The name for the new dataset.
dataset_id = 'natality_regression'
# Prepare a reference to the new dataset.
dataset_ref = bigquery_client.dataset(dataset_id)
dataset = bigquery.Dataset(dataset_ref)
# Create the new BigQuery dataset.
dataset = bigquery_client.create_dataset(dataset)
# In the new BigQuery dataset, create a new table.
table_ref = dataset.table('regression_input')
# The table needs a schema before it can be created and accept data.
# Create an ordered list of the columns using SchemaField objects.
SCHEMA = []
SCHEMA.append(SchemaField('weight_pounds', 'float'))
SCHEMA.append(SchemaField('mother_age', 'integer'))
SCHEMA.append(SchemaField('father_age', 'integer'))
SCHEMA.append(SchemaField('gestation_weeks', 'integer'))
SCHEMA.append(SchemaField('weight_gain_pounds', 'integer'))
SCHEMA.append(SchemaField('apgar_5min', 'integer'))
# Assign the schema to the table and create the table in BigQuery.
table = bigquery.Table(table_ref, schema=SCHEMA)
table = bigquery_client.create_table(table)
# Set up a query in Standard SQL.
# The query selects the fields of interest.
QUERY = """
SELECT weight_pounds, mother_age, father_age, gestation_weeks,
weight_gain_pounds, apgar_5min
FROM `bigquery-public-data.samples.natality`
WHERE weight_pounds is not null
and mother_age is not null and father_age is not null
and gestation_weeks is not null
and weight_gain_pounds is not null
and apgar_5min is not null
limit
"""
# Configure the query job.
job_config = bigquery.QueryJobConfig()
# Set the output table to the table created above.
dest_dataset_ref = bigquery_client.dataset('natality_regression')
dest_table_ref = dest_dataset_ref.table('regression_input')
job_config.destination = dest_table_ref
# Allow the results table to be overwritten.
job_config.write_disposition = 'WRITE_TRUNCATE'
# Use Standard SQL.
job_config.use_legacy_sql = False
# Run the query.
query_job = bigquery_client.query(QUERY, job_config=job_config)