The goal here is to parse the locally synced DarkSky data so it can be used for 15-minute based big data analytics in Tableau. In this case the data will be stored in Google BigQuery, so extra processing is needed on the returned JSON data that is not stored locally (see Basic_setup.ipynb).
Since DarkSky only provides hourly data, each hourly record needs to be spread over four 15-minute intervals to allow for 15-minute based analytics.
In [ ]:
import datetime as dt
import time
from tqdm import tqdm
import json
import os
In [ ]:
# Directory holding the raw, locally synced DarkSky JSON files.
data_dir = 'local_data'
file_list = os.listdir(data_dir)
In [ ]:
def f_t_c(fahrenheit):
    """Convert a temperature from Fahrenheit to Celsius.

    Returns the Celsius value rounded to 3 decimal places.
    """
    celsius = ((fahrenheit - 32) * 5.0) / 9.0
    return round(celsius, 3)
In [ ]:
# --- Parse, flatten and quarter-hour-expand every synced DarkSky file. ---
#
# For each file in local_data/ this script:
#   1. reads the raw DarkSky JSON,
#   2. flattens the day's daily summary into 'Daily*' keys (temperatures
#      converted Fahrenheit -> Celsius, unix timestamps rendered as local
#      datetime strings),
#   3. expands every hourly observation into four 15-minute records,
#   4. writes newline-delimited JSON to parsed_data/parsed_<file> and to a
#      combined file under combined/ that is rotated every 10000 records.

# Daily fields copied verbatim (output key -> DarkSky key).
DAILY_COPY_FIELDS = {
    'DailyPrecipitationType': 'precipType',
    'DailyCloudCover': 'cloudCover',
    'DailyDewpoint': 'dewPoint',
    'DailySummary': 'summary',
    'DailyWindbearing': 'windBearing',
    'DailyMoonphase': 'moonPhase',
    'DailyVisibility': 'visibility',
    'DailyWindspeed': 'windSpeed',
    'DailyPressure': 'pressure',
    'DailyHumidity': 'humidity',
}
# Daily temperatures, converted Fahrenheit -> Celsius via f_t_c().
DAILY_TEMP_FIELDS = {
    'DailyTemperatureMin': 'temperatureMin',
    'DailyTemperatureMax': 'temperatureMax',
    'DailyApparentTemperatureMin': 'apparentTemperatureMin',
    'DailyApparentTemperatureMax': 'apparentTemperatureMax',
}
# Daily unix timestamps, rendered as local datetime strings.
DAILY_TIME_FIELDS = {
    'DailyTemperatureMinTime': 'temperatureMinTime',
    'DailyTemperatureMaxTime': 'temperatureMaxTime',
    'DailyApparentTemperatureMinTime': 'apparentTemperatureMinTime',
    'DailyApparentTemperatureMaxTime': 'apparentTemperatureMaxTime',
    'DailySunriseTime': 'sunriseTime',
    'DailySunsetTime': 'sunsetTime',
}
# Hourly fields copied verbatim (output key -> DarkSky key).
HOURLY_COPY_FIELDS = {
    'Dewpoint': 'dewPoint',
    'PrecipitationType': 'precipType',
    'Visibility': 'visibility',
    'Summary': 'summary',
    # NOTE(review): unlike 'temperature', apparentTemperature is NOT
    # converted to Celsius in the original code -- preserved as-is here;
    # confirm whether that is intentional.
    'ApparentTemperature': 'apparentTemperature',
    'Pressure': 'pressure',
    'WindSpeed': 'windSpeed',
    'CloudCover': 'cloudCover',
    'WindBearing': 'windBearing',
    'Humidity': 'humidity',
}

output_long = open('combined/output.json', 'w')
count = 0  # total number of 15-minute records written across all files
for current_file in tqdm(file_list):
    # Read and close the source file (the original leaked the handle).
    with open('local_data/' + current_file) as src:
        content = json.load(src)

    # DAILY: flatten the single daily summary once per input file.
    daily = content.get('daily').get('data')[0]
    formatted = {out: daily.get(key) for out, key in DAILY_COPY_FIELDS.items()}
    for out, key in DAILY_TEMP_FIELDS.items():
        formatted[out] = f_t_c(daily.get(key))
    for out, key in DAILY_TIME_FIELDS.items():
        formatted[out] = str(dt.datetime.fromtimestamp(daily.get(key)))

    # HOURLY: expand each hourly observation into four 15-minute records.
    with open('parsed_data/parsed_' + current_file, 'w') as output:
        for hour_obs in content.get('hourly').get('data'):
            hour_data = {out: hour_obs.get(key)
                         for out, key in HOURLY_COPY_FIELDS.items()}
            hour_data['Temperature'] = f_t_c(hour_obs.get('temperature'))
            base_time = dt.datetime.fromtimestamp(hour_obs.get('time'))
            for i in range(4):
                # Each quarter repeats the hourly + daily values and only
                # shifts the timestamp by 15 minutes.
                quarter = {'Datetime': str(base_time + dt.timedelta(minutes=i * 15))}
                quarter.update(hour_data)
                quarter.update(formatted)
                line = json.dumps(quarter) + "\n"
                output.write(line)
                output_long.write(line)
                count += 1
                if count % 10000 == 0:
                    # Rotate the combined output every 10000 records.
                    # '//' keeps the suffix an int: the original 'count/10000'
                    # breaks on Python 3 (float -> 'output2.0.json').
                    output_long.close()
                    output_long = open('combined/output' + str(count // 10000 + 1) + '.json', 'w')
output_long.close()
print(count)  # valid on both Python 2 and 3 (original 'print count' was py2-only)
In [ ]: