In [1]:
# Start the Spark session (the config option is the placeholder from the Spark docs)
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Calculate Distances") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
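A quick sanity check (a sketch, not part of the original run) confirms the session came up and which Spark version it is running:

In [ ]:
# Sketch: confirm the session is live and report its version and app name
print(spark.version)
print(spark.sparkContext.appName)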
In [27]:
# Inspired by Boto3 documentation
import boto3
s3 = boto3.resource('s3')
# List every bucket in the account and every key inside it
for bucket in s3.buckets.all():
    print(bucket.name)
    for obj in bucket.objects.all():
        print(obj.key)
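Walking every bucket and every key can be slow on a large account. A narrower sketch, assuming the big-data-carbon-tax bucket and bing_results/ prefix used later in this notebook, pages through just that prefix with the lower-level client:

In [ ]:
# Sketch: page through a single prefix instead of walking every bucket
import boto3

client = boto3.client('s3')
paginator = client.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket='big-data-carbon-tax', Prefix='bing_results/'):
    for obj in page.get('Contents', []):
        print(obj['Key'])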
In [3]:
# Load the precomputed distance table
dfDist = spark.read.parquet("../processeddata/distances.parquet")
dfDist.printSchema()
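printSchema shows only column names and types; a peek at a few rows (a sketch) confirms the distances loaded as expected:

In [ ]:
# Sketch: preview a few rows of the distance table
dfDist.show(5)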
In [30]:
# Keep only the pairs with dist below 60, ordered by the id columns
dfNearby = dfDist.where(dfDist.dist < 60).orderBy('z_id', 'm_id')
print(dfNearby.count())
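To see how the cutoff shapes the sample, a sketch like the following tabulates matches per z_id (column names as in the schema above):

In [ ]:
# Sketch: number of nearby matches per z_id under the dist < 60 cutoff
dfNearby.groupBy('z_id').count().orderBy('count', ascending=False).show(5)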
In [35]:
# https://realpython.com/blog/python/api-integration-in-python/
# http://docs.python-requests.org/en/master/user/quickstart/
import os
import boto3
def upload_file(row):
    # Upload the Bing result and its metadata file for one (z_id, m_id) pair
    s3 = boto3.client('s3')
    local_path = '../rawdata/bing_results/'
    remote_path = 'bing_results/'
    fn_root = str(row['z_id']) + '_' + str(row['m_id'])
    fn_local = local_path + fn_root + '.json'
    fn_local_meta = local_path + fn_root + '_meta.json'
    fn_remote = remote_path + fn_root + '.json'
    fn_remote_meta = remote_path + fn_root + '_meta.json'
    if os.path.isfile(fn_local):
        with open(fn_local, "rb") as f:
            s3.upload_fileobj(f, "big-data-carbon-tax", fn_remote)
        with open(fn_local_meta, "rb") as f:
            s3.upload_fileobj(f, "big-data-carbon-tax", fn_remote_meta)
        print('Sent', fn_local)
    return

# foreach returns None, so there is nothing to capture
dfNearby.foreach(upload_file)
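Note that foreach ships upload_file to the Spark executors, so the local files and AWS credentials must be visible there; on a single-machine setup that holds automatically. A driver-side sketch that iterates the rows locally instead:

In [ ]:
# Sketch: driver-side alternative to foreach, so files and credentials
# only need to exist on this machine
for row in dfNearby.toLocalIterator():
    upload_file(row)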
In [34]:
import boto3
s3 = boto3.client('s3')
with open('../processeddata/driv_dist.parquet.tar.gz', "rb") as f:
    s3.upload_fileobj(f, "big-data-carbon-tax", 'bing_results/driv_dist.parquet.tar.gz')
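To verify the archive landed, or to fetch it on another machine, the matching download call is a one-liner (a sketch; the local filename is arbitrary):

In [ ]:
# Sketch: fetch the archive back from S3
import boto3

s3 = boto3.client('s3')
s3.download_file('big-data-carbon-tax', 'bing_results/driv_dist.parquet.tar.gz',
                 'driv_dist.parquet.tar.gz')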
In [1]:
import boto3
s3 = boto3.client('s3')
with open('../rawdata/ACS_15_5YR_DP03b.csv', "rb") as f:
    s3.upload_fileobj(f, "big-data-carbon-tax", 'bing_results/ACS_15_5YR_DP03b.csv')
In [2]:
import boto3
s3 = boto3.client('s3')
with open('../rawdata/ACS_15_5YR_S1903b.csv', "rb") as f:
    s3.upload_fileobj(f, "big-data-carbon-tax", 'bing_results/ACS_15_5YR_S1903b.csv')
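The last two cells repeat the same pattern; a small loop (a sketch) would keep any further ACS uploads in one place:

In [ ]:
# Sketch: the same upload pattern, looped over the ACS files
import boto3

s3 = boto3.client('s3')
for fn in ['ACS_15_5YR_DP03b.csv', 'ACS_15_5YR_S1903b.csv']:
    with open('../rawdata/' + fn, "rb") as f:
        s3.upload_fileobj(f, "big-data-carbon-tax", 'bing_results/' + fn)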