In [1]:
import numpy as np
import pandas as pd

In [133]:
# Build a lookup of every forecast that targets a given (hour, wind farm).
#
# Keys look like '<valid time as YYYYMMDDHH>_<wf_id>'. Each value is a list
# holding one [u, v, ws, wd] NumPy vector per forecast row that lands on that
# hour, so the entries can be averaged later with np.average(..., axis=0).
wf_base = '../data/windforecasts_wf'
features = ['u', 'v', 'ws', 'wd']
wf_forecast_dict = {}
for wf_id in range(1, 8):
    wf_file = wf_base + str(wf_id) + '.csv'
    wf_data = pd.read_csv(wf_file)

    # NOTE(review): assumes `date` is the forecast issue time encoded as the
    # integer YYYYMMDDHH and `hors` is the lead time in hours -- confirm
    # against the CSVs.
    # Plain integer addition (date + hors) produces impossible timestamps as
    # soon as the lead time crosses midnight (e.g. 2009070100 + 30 ->
    # 2009070130), so those forecasts could never be matched by date. Real
    # datetime arithmetic rolls over into the next day/month/year correctly.
    issued_at = pd.to_datetime(
        wf_data['date'].astype('int64').astype(str), format='%Y%m%d%H'
    )
    valid_at = issued_at + pd.to_timedelta(wf_data['hors'].astype('int64'), unit='h')
    keys = valid_at.dt.strftime('%Y%m%d%H') + '_' + str(wf_id)

    # Keys were computed in one vectorised pass; only the grouping is a loop.
    for key, feature_row in zip(keys, wf_data[features].to_numpy()):
        wf_forecast_dict.setdefault(key, []).append(feature_row)

In [148]:
# Reshape the wide training table (one `wp<k>` column per wind farm) into a
# long table with one row per (date, wind farm) pair.
train = pd.read_csv('../data/train.csv')
farm_numbers = list(range(1, 8))
n_obs = len(train)

# Farm blocks are laid end to end: every wp1 value, then every wp2 value, ...
train_virtual = np.concatenate([train['wp' + str(k)].values for k in farm_numbers])
# Farm id repeated once per observation, in the same block order as above.
wf_ids = np.repeat(farm_numbers, n_obs)
# The full date column repeated once per farm so it lines up with each block.
wf_dates = np.tile(train['date'].values, 7)

# Stacking the three 1-D arrays as columns yields a single common dtype for
# the whole table, exactly as a vstack followed by a transpose would.
virtual_data = pd.DataFrame(
    np.column_stack((wf_dates, wf_ids, train_virtual)),
    columns=['date', 'wf_id', 'energy'],
)

In [136]:
# For every (date, wind farm) row, average all forecasts stored under the
# matching '<date>_<wf_id>' key; rows with no forecast get a vector of 4 NaNs.
avg_forecast = []
for date_value, farm_value in zip(virtual_data['date'], virtual_data['wf_id']):
    key = '{}_{}'.format(int(date_value), int(farm_value))
    matching_forecasts = wf_forecast_dict.get(key)
    if matching_forecasts is None:
        # No forecast recorded for this hour/farm: keep the row, mark it missing.
        avg_forecast.append(np.full(4, np.nan))
        continue
    # Element-wise mean across all forecast vectors stored for this key.
    avg_forecast.append(np.average(matching_forecasts, axis=0))

In [159]:
# Turn the per-row averages into a frame whose columns are the feature names,
# attach it to the long training table by row index, and persist the result.
avg_forecast = pd.DataFrame(avg_forecast, columns=features)
virtual_data_aggregate = (
    virtual_data
    .join(avg_forecast)
    # date and wf_id are identifiers, so store them as integers.
    .astype({'date': int, 'wf_id': int})
)
virtual_data_aggregate.to_csv('../data/virtual_aggregate_data.csv')

In [ ]: