In [ ]:
#!pip install google-cloud-bigquery
%pip install -q xmltodict apache-beam[gcp]
In [ ]:
%%writefile orders.xml
<?xml version="1.0" encoding="utf-8"?>
<Root xmlns="http://www.adventure-works.com">
<Orders>
<Order>
<CustomerID>GREAL</CustomerID>
<EmployeeID>6</EmployeeID>
<OrderDate>1997-05-06T00:00:00</OrderDate>
<RequiredDate>1997-05-20T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1997-05-09T00:00:00">
<ShipVia>2</ShipVia>
<Freight>3.35</Freight>
<ShipName>Great Lakes Food Market</ShipName>
<ShipAddress>2732 Baker Blvd.</ShipAddress>
<ShipCity>Eugene</ShipCity>
<ShipRegion>OR</ShipRegion>
<ShipPostalCode>97403</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>GREAL</CustomerID>
<EmployeeID>8</EmployeeID>
<OrderDate>1997-07-04T00:00:00</OrderDate>
<RequiredDate>1997-08-01T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1997-07-14T00:00:00">
<ShipVia>2</ShipVia>
<Freight>4.42</Freight>
<ShipName>Great Lakes Food Market</ShipName>
<ShipAddress>2732 Baker Blvd.</ShipAddress>
<ShipCity>Eugene</ShipCity>
<ShipRegion>OR</ShipRegion>
<ShipPostalCode>97403</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>GREAL</CustomerID>
<EmployeeID>1</EmployeeID>
<OrderDate>1997-07-31T00:00:00</OrderDate>
<RequiredDate>1997-08-28T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1997-08-05T00:00:00">
<ShipVia>2</ShipVia>
<Freight>116.53</Freight>
<ShipName>Great Lakes Food Market</ShipName>
<ShipAddress>2732 Baker Blvd.</ShipAddress>
<ShipCity>Eugene</ShipCity>
<ShipRegion>OR</ShipRegion>
<ShipPostalCode>97403</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>GREAL</CustomerID>
<EmployeeID>4</EmployeeID>
<OrderDate>1997-07-31T00:00:00</OrderDate>
<RequiredDate>1997-08-28T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1997-08-04T00:00:00">
<ShipVia>2</ShipVia>
<Freight>18.53</Freight>
<ShipName>Great Lakes Food Market</ShipName>
<ShipAddress>2732 Baker Blvd.</ShipAddress>
<ShipCity>Eugene</ShipCity>
<ShipRegion>OR</ShipRegion>
<ShipPostalCode>97403</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>GREAL</CustomerID>
<EmployeeID>6</EmployeeID>
<OrderDate>1997-09-04T00:00:00</OrderDate>
<RequiredDate>1997-10-02T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1997-09-10T00:00:00">
<ShipVia>1</ShipVia>
<Freight>57.15</Freight>
<ShipName>Great Lakes Food Market</ShipName>
<ShipAddress>2732 Baker Blvd.</ShipAddress>
<ShipCity>Eugene</ShipCity>
<ShipRegion>OR</ShipRegion>
<ShipPostalCode>97403</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>GREAL</CustomerID>
<EmployeeID>3</EmployeeID>
<OrderDate>1997-09-25T00:00:00</OrderDate>
<RequiredDate>1997-10-23T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1997-09-30T00:00:00">
<ShipVia>3</ShipVia>
<Freight>76.13</Freight>
<ShipName>Great Lakes Food Market</ShipName>
<ShipAddress>2732 Baker Blvd.</ShipAddress>
<ShipCity>Eugene</ShipCity>
<ShipRegion>OR</ShipRegion>
<ShipPostalCode>97403</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>GREAL</CustomerID>
<EmployeeID>4</EmployeeID>
<OrderDate>1998-01-06T00:00:00</OrderDate>
<RequiredDate>1998-02-03T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1998-02-04T00:00:00">
<ShipVia>2</ShipVia>
<Freight>719.78</Freight>
<ShipName>Great Lakes Food Market</ShipName>
<ShipAddress>2732 Baker Blvd.</ShipAddress>
<ShipCity>Eugene</ShipCity>
<ShipRegion>OR</ShipRegion>
<ShipPostalCode>97403</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>GREAL</CustomerID>
<EmployeeID>3</EmployeeID>
<OrderDate>1998-03-09T00:00:00</OrderDate>
<RequiredDate>1998-04-06T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1998-03-18T00:00:00">
<ShipVia>2</ShipVia>
<Freight>33.68</Freight>
<ShipName>Great Lakes Food Market</ShipName>
<ShipAddress>2732 Baker Blvd.</ShipAddress>
<ShipCity>Eugene</ShipCity>
<ShipRegion>OR</ShipRegion>
<ShipPostalCode>97403</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>GREAL</CustomerID>
<EmployeeID>3</EmployeeID>
<OrderDate>1998-04-07T00:00:00</OrderDate>
<RequiredDate>1998-05-05T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1998-04-15T00:00:00">
<ShipVia>2</ShipVia>
<Freight>25.19</Freight>
<ShipName>Great Lakes Food Market</ShipName>
<ShipAddress>2732 Baker Blvd.</ShipAddress>
<ShipCity>Eugene</ShipCity>
<ShipRegion>OR</ShipRegion>
<ShipPostalCode>97403</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>GREAL</CustomerID>
<EmployeeID>4</EmployeeID>
<OrderDate>1998-04-22T00:00:00</OrderDate>
<RequiredDate>1998-05-20T00:00:00</RequiredDate>
<ShipInfo>
<ShipVia>3</ShipVia>
<Freight>18.84</Freight>
<ShipName>Great Lakes Food Market</ShipName>
<ShipAddress>2732 Baker Blvd.</ShipAddress>
<ShipCity>Eugene</ShipCity>
<ShipRegion>OR</ShipRegion>
<ShipPostalCode>97403</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>GREAL</CustomerID>
<EmployeeID>4</EmployeeID>
<OrderDate>1998-04-30T00:00:00</OrderDate>
<RequiredDate>1998-06-11T00:00:00</RequiredDate>
<ShipInfo>
<ShipVia>3</ShipVia>
<Freight>14.01</Freight>
<ShipName>Great Lakes Food Market</ShipName>
<ShipAddress>2732 Baker Blvd.</ShipAddress>
<ShipCity>Eugene</ShipCity>
<ShipRegion>OR</ShipRegion>
<ShipPostalCode>97403</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>HUNGC</CustomerID>
<EmployeeID>3</EmployeeID>
<OrderDate>1996-12-06T00:00:00</OrderDate>
<RequiredDate>1997-01-03T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1996-12-09T00:00:00">
<ShipVia>2</ShipVia>
<Freight>20.12</Freight>
<ShipName>Hungry Coyote Import Store</ShipName>
<ShipAddress>City Center Plaza 516 Main St.</ShipAddress>
<ShipCity>Elgin</ShipCity>
<ShipRegion>OR</ShipRegion>
<ShipPostalCode>97827</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>HUNGC</CustomerID>
<EmployeeID>1</EmployeeID>
<OrderDate>1996-12-25T00:00:00</OrderDate>
<RequiredDate>1997-01-22T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1997-01-03T00:00:00">
<ShipVia>3</ShipVia>
<Freight>30.34</Freight>
<ShipName>Hungry Coyote Import Store</ShipName>
<ShipAddress>City Center Plaza 516 Main St.</ShipAddress>
<ShipCity>Elgin</ShipCity>
<ShipRegion>OR</ShipRegion>
<ShipPostalCode>97827</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>HUNGC</CustomerID>
<EmployeeID>3</EmployeeID>
<OrderDate>1997-01-15T00:00:00</OrderDate>
<RequiredDate>1997-02-12T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1997-01-24T00:00:00">
<ShipVia>1</ShipVia>
<Freight>0.2</Freight>
<ShipName>Hungry Coyote Import Store</ShipName>
<ShipAddress>City Center Plaza 516 Main St.</ShipAddress>
<ShipCity>Elgin</ShipCity>
<ShipRegion>OR</ShipRegion>
<ShipPostalCode>97827</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>HUNGC</CustomerID>
<EmployeeID>4</EmployeeID>
<OrderDate>1997-07-16T00:00:00</OrderDate>
<RequiredDate>1997-08-13T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1997-07-21T00:00:00">
<ShipVia>1</ShipVia>
<Freight>45.13</Freight>
<ShipName>Hungry Coyote Import Store</ShipName>
<ShipAddress>City Center Plaza 516 Main St.</ShipAddress>
<ShipCity>Elgin</ShipCity>
<ShipRegion>OR</ShipRegion>
<ShipPostalCode>97827</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>HUNGC</CustomerID>
<EmployeeID>8</EmployeeID>
<OrderDate>1997-09-08T00:00:00</OrderDate>
<RequiredDate>1997-10-06T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1997-10-15T00:00:00">
<ShipVia>1</ShipVia>
<Freight>111.29</Freight>
<ShipName>Hungry Coyote Import Store</ShipName>
<ShipAddress>City Center Plaza 516 Main St.</ShipAddress>
<ShipCity>Elgin</ShipCity>
<ShipRegion>OR</ShipRegion>
<ShipPostalCode>97827</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>LAZYK</CustomerID>
<EmployeeID>1</EmployeeID>
<OrderDate>1997-03-21T00:00:00</OrderDate>
<RequiredDate>1997-04-18T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1997-04-10T00:00:00">
<ShipVia>3</ShipVia>
<Freight>7.48</Freight>
<ShipName>Lazy K Kountry Store</ShipName>
<ShipAddress>12 Orchestra Terrace</ShipAddress>
<ShipCity>Walla Walla</ShipCity>
<ShipRegion>WA</ShipRegion>
<ShipPostalCode>99362</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>LAZYK</CustomerID>
<EmployeeID>8</EmployeeID>
<OrderDate>1997-05-22T00:00:00</OrderDate>
<RequiredDate>1997-06-19T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1997-06-26T00:00:00">
<ShipVia>2</ShipVia>
<Freight>11.92</Freight>
<ShipName>Lazy K Kountry Store</ShipName>
<ShipAddress>12 Orchestra Terrace</ShipAddress>
<ShipCity>Walla Walla</ShipCity>
<ShipRegion>WA</ShipRegion>
<ShipPostalCode>99362</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>LETSS</CustomerID>
<EmployeeID>1</EmployeeID>
<OrderDate>1997-06-25T00:00:00</OrderDate>
<RequiredDate>1997-07-23T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1997-07-04T00:00:00">
<ShipVia>2</ShipVia>
<Freight>13.73</Freight>
<ShipName>Let's Stop N Shop</ShipName>
<ShipAddress>87 Polk St. Suite 5</ShipAddress>
<ShipCity>San Francisco</ShipCity>
<ShipRegion>CA</ShipRegion>
<ShipPostalCode>94117</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>LETSS</CustomerID>
<EmployeeID>8</EmployeeID>
<OrderDate>1997-10-27T00:00:00</OrderDate>
<RequiredDate>1997-11-24T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1997-11-05T00:00:00">
<ShipVia>2</ShipVia>
<Freight>51.44</Freight>
<ShipName>Let's Stop N Shop</ShipName>
<ShipAddress>87 Polk St. Suite 5</ShipAddress>
<ShipCity>San Francisco</ShipCity>
<ShipRegion>CA</ShipRegion>
<ShipPostalCode>94117</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>LETSS</CustomerID>
<EmployeeID>6</EmployeeID>
<OrderDate>1997-11-10T00:00:00</OrderDate>
<RequiredDate>1997-12-08T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1997-11-21T00:00:00">
<ShipVia>2</ShipVia>
<Freight>45.97</Freight>
<ShipName>Let's Stop N Shop</ShipName>
<ShipAddress>87 Polk St. Suite 5</ShipAddress>
<ShipCity>San Francisco</ShipCity>
<ShipRegion>CA</ShipRegion>
<ShipPostalCode>94117</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
<Order>
<CustomerID>LETSS</CustomerID>
<EmployeeID>4</EmployeeID>
<OrderDate>1998-02-12T00:00:00</OrderDate>
<RequiredDate>1998-03-12T00:00:00</RequiredDate>
<ShipInfo ShippedDate="1998-02-13T00:00:00">
<ShipVia>2</ShipVia>
<Freight>90.97</Freight>
<ShipName>Let's Stop N Shop</ShipName>
<ShipAddress>87 Polk St. Suite 5</ShipAddress>
<ShipCity>San Francisco</ShipCity>
<ShipRegion>CA</ShipRegion>
<ShipPostalCode>94117</ShipPostalCode>
<ShipCountry>USA</ShipCountry>
</ShipInfo>
</Order>
</Orders>
</Root>
In [ ]:
%%writefile xmlload.py
import argparse
import logging
import apache_beam as beam
def parse_into_dict(xmlfile):
import xmltodict
with open(xmlfile) as ifp:
doc = xmltodict.parse(ifp.read())
return doc
In [ ]:
doc = parse_into_dict('orders.xml')
doc['Root']['Orders']['Order'][0] # first order
In [ ]:
!bq rm -f advdata.fromxml
In [ ]:
%%writefile -a xmlload.py
table_schema = {
'fields': [
{'name' : 'CustomerID', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name' : 'EmployeeID', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name' : 'OrderDate', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name' : 'RequiredDate', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name' : 'ShipInfo', 'type': 'RECORD', 'mode': 'NULLABLE', 'fields': [
{'name' : 'ShipVia', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name' : 'Freight', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name' : 'ShipName', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name' : 'ShipAddress', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name' : 'ShipCity', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name' : 'ShipRegion', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name' : 'ShipPostalCode', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name' : 'ShipCountry', 'type': 'STRING', 'mode': 'NULLABLE'},
{'name' : 'ShippedDate', 'type': 'STRING', 'mode': 'NULLABLE'},
]},
]
}
In [ ]:
%%writefile -a xmlload.py
# The @ symbol is not allowed as a column name in BigQuery
def cleanup(x):
import copy
y = copy.deepcopy(x)
if '@ShippedDate' in x['ShipInfo']: # optional attribute
y['ShipInfo']['ShippedDate'] = x['ShipInfo']['@ShippedDate']
del y['ShipInfo']['@ShippedDate']
#print(y)
return y
def get_orders(doc):
for order in doc['Root']['Orders']['Order']:
yield cleanup(order)
def run(argv=None):
parser = argparse.ArgumentParser()
parser.add_argument(
'--output',
required=True,
help=(
'Specify text file orders.txt or BigQuery table project:dataset.table '))
known_args, pipeline_args = parser.parse_known_args(argv)
with beam.Pipeline(argv=pipeline_args) as p:
orders = (p
| 'files' >> beam.Create(['orders.xml'])
| 'parse' >> beam.Map(lambda filename: parse_into_dict(filename))
| 'orders' >> beam.FlatMap(lambda doc: get_orders(doc)))
if '.txt' in known_args.output:
orders | 'totxt' >> beam.io.WriteToText(known_args.output)
else:
orders | 'tobq' >> beam.io.WriteToBigQuery(known_args.output,
schema=table_schema,
write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND, #WRITE_TRUNCATE
create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
if __name__ == '__main__':
logging.getLogger().setLevel(logging.INFO)
run()
In [ ]:
#!cat xmlload.py
!python3 xmlload.py --project ai-analytics-solutions --output ai-analytics-solutions:advdata.fromxml
In [1]:
%%bigquery
SELECT * EXCEPT(ShipInfo), ShipInfo.*
FROM advdata.fromxml
WHERE CustomerID = 'LETSS'
Out[1]:
Copyright 2019 Google Inc. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License