Local data parsing

The goal here is to parse the locally synced DarkSky data to be able to use it in 15-minute based big data analytics using Tableau. In this case the data will be stored in Google BigQuery so extra processing is needed on the returned JSON data that is nog locally stored (see Basic_setup.ipynb)
Since DarkSky only provisions hourly data, this needs to be spread to allow for 15-minute based analytics.


In [ ]:
import datetime as dt
import time
from tqdm import tqdm
import json 
import os

In [ ]:
file_list = os.listdir('local_data')

In [ ]:
def f_t_c(fahrenheit):
    return round((((fahrenheit - 32) * 5.0) / 9.0), 3)

Parse daily & hourly data

Will parse the daily data in the desired format


In [ ]:
output_long = open('combined/output.json', 'w')
count = 0
for current_file in tqdm(file_list):
    content = json.loads(open('local_data/' + current_file).read())
    output = open('parsed_data/parsed_' + current_file, 'w')
    formatted = {}
    
    #DAILY
    formatted['DailyPrecipitationType'] = content.get('daily').get('data')[0].get('precipType')
    formatted['DailyCloudCover'] = content.get('daily').get('data')[0].get('cloudCover')
    formatted['DailyTemperatureMin'] = f_t_c(content.get('daily').get('data')[0].get('temperatureMin'))
    formatted['DailyTemperatureMax'] = f_t_c(content.get('daily').get('data')[0].get('temperatureMax'))
    formatted['DailyTemperatureMinTime'] = str(dt.datetime.fromtimestamp(content.get('daily').get('data')[0].get('temperatureMinTime')))
    formatted['DailyTemperatureMaxTime'] = str(dt.datetime.fromtimestamp(content.get('daily').get('data')[0].get('temperatureMaxTime')))
    formatted['DailyApparentTemperatureMin'] = f_t_c(content.get('daily').get('data')[0].get('apparentTemperatureMin'))
    formatted['DailyApparentTemperatureMax'] = f_t_c(content.get('daily').get('data')[0].get('apparentTemperatureMax'))
    formatted['DailyApparentTemperatureMinTime'] = str(dt.datetime.fromtimestamp(content.get('daily').get('data')[0].get('apparentTemperatureMinTime')))
    formatted['DailyApparentTemperatureMaxTime'] = str(dt.datetime.fromtimestamp(content.get('daily').get('data')[0].get('apparentTemperatureMaxTime')))
    formatted['DailyDewpoint'] = content.get('daily').get('data')[0].get('dewPoint')
    formatted['DailySummary'] = content.get('daily').get('data')[0].get('summary')
    formatted['DailyWindbearing'] = content.get('daily').get('data')[0].get('windBearing')
    formatted['DailyMoonphase'] = content.get('daily').get('data')[0].get('moonPhase')
    formatted['DailyVisibility'] = content.get('daily').get('data')[0].get('visibility')
    formatted['DailySunriseTime'] = str(dt.datetime.fromtimestamp(content.get('daily').get('data')[0].get('sunriseTime')))
    formatted['DailySunsetTime'] = str(dt.datetime.fromtimestamp(content.get('daily').get('data')[0].get('sunsetTime')))
    formatted['DailyWindspeed'] = content.get('daily').get('data')[0].get('windSpeed')
    formatted['DailyPressure'] = content.get('daily').get('data')[0].get('pressure')
    formatted['DailyHumidity'] = content.get('daily').get('data')[0].get('humidity')
    
    #HOURLY
    for hour in range(len(content.get('hourly').get('data'))):
        hour_data = {}
        hour_data['Temperature'] = f_t_c(content.get('hourly').get('data')[hour].get('temperature'))
        hour_data['Dewpoint'] = content.get('hourly').get('data')[hour].get('dewPoint')
        hour_data['PrecipitationType'] = content.get('hourly').get('data')[hour].get('precipType')
        hour_data['Visibility'] = content.get('hourly').get('data')[hour].get('visibility')
        hour_data['Summary'] = content.get('hourly').get('data')[hour].get('summary')
        hour_data['ApparentTemperature'] = content.get('hourly').get('data')[hour].get('apparentTemperature')
        hour_data['Pressure'] = content.get('hourly').get('data')[hour].get('pressure')
        hour_data['WindSpeed'] = content.get('hourly').get('data')[hour].get('windSpeed')
        hour_data['CloudCover'] = content.get('hourly').get('data')[hour].get('cloudCover')
        hour_data['WindBearing'] = content.get('hourly').get('data')[hour].get('windBearing')
        hour_data['Humidity'] = content.get('hourly').get('data')[hour].get('humidity')
        
        base_time = dt.datetime.fromtimestamp(content.get('hourly').get('data')[hour].get('time'))
        for i in range(4):
            quarter = {}
            quarter['Datetime'] = str(base_time + dt.timedelta(minutes=i*15))
            quarter.update(hour_data)
            quarter.update(formatted)
            output.write(json.dumps(quarter) + "\n")
            output_long.write(json.dumps(quarter) + "\n")
            count += 1
            if count % 10000 == 0:
                output_long.close()
                output_long = open('combined/output' + str(count/10000 + 1) +'.json', 'w')
        
    output.close() 
output_long.close()
print count

In [ ]: