In [5]:
from HourlyPowerConsumptions import HourlyPowerConsumptions

import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from visualizations import plot_regression

In [1]:
def evaluate(regressor, X_train, y_train, X_test, y_test):
    """
    Given a regressor, it fits the model with X_train and y_train
    and then predicts for X_test. Prints the Variance score. Best possible score is 1.0, lower values are worse.
    :param regressor: the regressor
    :param X_train:
    :param y_train:
    :param X_test:
    :param y_test:
    :return: None
    """

    regressor.fit(X_train, y_train)

    y_pred = regressor.predict(X_test)

    print('Coefficients: \n', regressor.coef_)
    # The mean squared error on the test set
    print("Mean squared error: %.2f"
          % np.mean((y_pred - y_test) ** 2))
    # Explained variance score: 1 is perfect prediction
    print('Variance score: %.2f' % regressor.score(X_test, y_test))

    return y_pred

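As a quick smoke test, evaluate can be exercised on toy data before touching the real consumption series (a hedged sketch; the toy arrays below are illustrative and not part of the original analysis):

In [ ]:
# Hypothetical smoke test: recover known coefficients from noisy synthetic data
rng = np.random.RandomState(0)
X_toy = rng.rand(100, 2)
y_toy = 3 * X_toy[:, 0] - 2 * X_toy[:, 1] + rng.normal(scale=0.01, size=100)
_ = evaluate(LinearRegression(), X_toy[:80], y_toy[:80], X_toy[80:], y_toy[80:])
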
In [2]:
def plot_curves(x, y_pred, y_test, x_label, y_label, legend):
    """
    It plots to curves, the truth and the predicted
    :param x: 1 dim array for x
    :param y_pred: 1 dim array for predicted curve
    :param y_test: 1 dim array for truth curve
    :param x_label: x label
    :param y_label: y label
    :param legend: legend (expected of size 2)
    :return:
    """
    if not isinstance(y_pred, np.ndarray):
        y_pred = np.asarray(y_pred)

    if not isinstance(y_test, np.ndarray):
        y_test = np.asarray(y_test)

    if not isinstance(x, np.ndarray):
        x = np.asarray(x)

    # Plot outputs
    plt.plot(x, y_pred)
    plt.plot(x, y_test)

    plt.legend(legend, loc='upper left')
    plt.axis([x.min() - 0.2, x.max() + 0.2, y_test.min() - 0.2, y_test.max() + 0.2])
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    plt.show()

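A hypothetical demo of plot_curves on synthetic curves (illustrative only; the 0.1 offset just separates the two lines):

In [ ]:
# Plot a noiseless "truth" sine and a slightly shifted "prediction"
x_demo = np.linspace(0, 2 * np.pi, 50)
plot_curves(x_demo, np.sin(x_demo) + 0.1, np.sin(x_demo), 'x', 'sin(x)', ['predicted', 'truth'])
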
In [6]:
dir_path = "/Users/zoraida/Desktop/TEFCON/all-country-data/hourly"
pattern = "/Hourly_201*month*.xls"
year = 2013 # year to predict
country = 'ES' # country to predict

In [7]:
pc = HourlyPowerConsumptions(dir_path, pattern, skiprows=9, maxcolumns=26, hourchange='3B:00:00')

In [9]:
df = pc.historical_daily_aggregates(country, year, num_years=3)
df = df[df.date != '2012-02-29']  # drop the leap day so each training year has 365 days

In [10]:
df.head()


Out[10]:
        date  weekday  month  year  Consumption
0 2010-01-01        4      1  2010       539737
1 2010-01-02        5      1  2010       620238
2 2010-01-03        6      1  2010       617602
3 2010-01-04        0      1  2010       736761
4 2010-01-05        1      1  2010       719938

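The weekday column appears to follow the pandas dayofweek convention (Monday = 0): 2010-01-01 was a Friday and is coded 4, and 2010-01-04, a Monday, is coded 0.
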
In [11]:
y_train = df[df.year.isin(range(year - 3, year))].Consumption.values
y_test = df[df.year == year].Consumption.values

X_train = df[df.year.isin(range(year - 3, year))][['month', 'year', 'weekday']].values
X_test = df[df.year == year][['month', 'year', 'weekday']].values

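A quick sanity check that range(year - 3, year) selects exactly the three calendar years before the target (a sketch for verification only):

In [ ]:
# Expect [2010, 2011, 2012] for year = 2013
sorted(df[df.year.isin(range(year - 3, year))].year.unique())
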
In [12]:
y_train.shape


Out[12]:
(1095,)

In [13]:
y_test.shape


Out[13]:
(365,)

In [14]:
X_train.shape


Out[14]:
(1095, 3)

In [15]:
X_test.shape


Out[15]:
(365, 3)

In [16]:
type(X_train)


Out[16]:
numpy.ndarray

In [17]:
type(y_train)


Out[17]:
numpy.ndarray

In [18]:
# One-hot encode month (column 0) and weekday (column 2); year passes through unchanged.
# Note: categorical_features was removed in scikit-learn 0.22 (see the sketch below).
vec = OneHotEncoder(sparse=False, categorical_features=[0, 2])
X_train_T = vec.fit_transform(X_train).astype(int)
X_test_T = vec.transform(X_test).astype(int)

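The categorical_features argument used above was removed in scikit-learn 0.22. On recent releases the same 20-column layout (encoded columns first, passthrough year last) can be reproduced with a ColumnTransformer; a minimal sketch, assuming scikit-learn >= 1.2 (older versions take sparse=False instead of sparse_output=False):

In [ ]:
from sklearn.compose import ColumnTransformer

# One-hot encode month (column 0) and weekday (column 2); pass year through unchanged.
# Transformed columns come first and the passthrough column last, matching the old layout.
ct = ColumnTransformer(
    [('onehot', OneHotEncoder(sparse_output=False), [0, 2])],
    remainder='passthrough')
X_train_T = ct.fit_transform(X_train).astype(int)
X_test_T = ct.transform(X_test).astype(int)
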
In [20]:
X_train_T.shape


Out[20]:
(1095, 20)

In [21]:
X_train_T[0]


Out[21]:
array([   1,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    1,    0,    0, 2010])

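Reading the encoded row: the 20 columns decompose as 12 month dummies, then 7 weekday dummies, then the raw year. For 2010-01-01 the January dummy (position 0) and the Friday dummy (position 16, i.e. weekday 4) are set, and the last entry is the year 2010.
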
In [22]:
regressor = LinearRegression()

In [23]:
# Predicts the daily consumption for the selected country (ES) over the target year
y_pred = evaluate(regressor, X_train_T, y_train, X_test_T, y_test)


Coefficients: 
 [ -2.86899146e+17  -2.86899146e+17  -2.86899146e+17  -2.86899146e+17
   -2.86899146e+17  -2.86899146e+17  -2.86899146e+17  -2.86899146e+17
   -2.86899146e+17  -2.86899146e+17  -2.86899146e+17  -2.86899146e+17
   -2.12943336e+18  -2.12943336e+18  -2.12943336e+18  -2.12943336e+18
   -2.12943336e+18  -2.12943336e+18  -2.12943336e+18  -1.21534017e+04]
Mean squared error: 1431924351.71
Variance score: 0.67

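The enormous coefficients are a symptom of the dummy-variable trap: the 12 month dummies always sum to 1, as do the 7 weekday dummies, so the design matrix is collinear with the intercept and the least-squares coefficients blow up even though the predictions themselves (variance score 0.67) remain usable. One hedged remedy, sketched below for a recent scikit-learn (not the notebook's original run), is to drop one dummy level per category:

In [ ]:
from sklearn.compose import ColumnTransformer

# Dropping the first level of each category removes the collinearity with the intercept
ct_drop = ColumnTransformer(
    [('onehot', OneHotEncoder(drop='first', sparse_output=False), [0, 2])],
    remainder='passthrough')
y_pred_drop = evaluate(LinearRegression(), ct_drop.fit_transform(X_train), y_train,
                       ct_drop.transform(X_test), y_test)
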
In [ ]: