In [5]:
from HourlyPowerConsumptions import HourlyPowerConsumptions

import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from visualizations import plot_regression

In [1]:
def evaluate(regressor, X_train, y_train, X_test, y_test):
    """
    Given a regressor, it fits the model with X_train and y_train
    and then predicts for X_test. Prints the Variance score. Best possible score is 1.0, lower values are worse.
    :param regressor: the regressor
    :param X_train:
    :param y_train:
    :param X_test:
    :param y_test:
    :return: None
    """

    regressor.fit(X_train, y_train)

    y_pred = regressor.predict(X_test)

    print('Coefficients: \n', regressor.coef_)
    # The mean squared error on the test set
    print("Mean squared error: %.2f"
          % np.mean((y_pred - y_test) ** 2))
    # Explained variance score: 1 is perfect prediction
    print('Variance score: %.2f' % regressor.score(X_test, y_test))

    return y_pred

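As a quick smoke test, evaluate can be exercised on toy data before touching the real consumption series (a hedged sketch; the toy arrays below are illustrative and not part of the original analysis):

In [ ]:
# Hypothetical smoke test: recover known coefficients from noisy synthetic data
rng = np.random.RandomState(0)
X_toy = rng.rand(100, 2)
y_toy = 3 * X_toy[:, 0] - 2 * X_toy[:, 1] + rng.normal(scale=0.01, size=100)
_ = evaluate(LinearRegression(), X_toy[:80], y_toy[:80], X_toy[80:], y_toy[80:])
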
In [2]:
def plot_curves(x, y_pred, y_test, x_label, y_label, legend):
    """
    It plots to curves, the truth and the predicted
    :param x: 1 dim array for x
    :param y_pred: 1 dim array for predicted curve
    :param y_test: 1 dim array for truth curve
    :param x_label: x label
    :param y_label: y label
    :param legend: legend (expected of size 2)
    :return:
    """
    if not isinstance(y_pred, np.ndarray):
        y_pred = np.asarray(y_pred)

    if not isinstance(y_test, np.ndarray):
        y_test = np.asarray(y_test)

    if not isinstance(x, np.ndarray):
        x = np.asarray(x)

    # Plot outputs
    plt.plot(x, y_pred)
    plt.plot(x, y_test)

    plt.legend(legend, loc='upper left')
    plt.axis([x.min() - 0.2, x.max() + 0.2, y_test.min() - 0.2, y_test.max() + 0.2])
    plt.xlabel(x_label)
    plt.ylabel(y_label)

    plt.show()

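A hypothetical demo of plot_curves on synthetic curves (illustrative only; the 0.1 offset just separates the two lines):

In [ ]:
# Plot a noiseless "truth" sine and a slightly shifted "prediction"
x_demo = np.linspace(0, 2 * np.pi, 50)
plot_curves(x_demo, np.sin(x_demo) + 0.1, np.sin(x_demo), 'x', 'sin(x)', ['predicted', 'truth'])
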
In [6]:
dir_path = "/Users/zoraida/Desktop/TEFCON/all-country-data/hourly"
pattern = "/Hourly_201*month*.xls"
year = 2013 # year to predict
country = 'ES' # country to predict

In [7]:
pc = HourlyPowerConsumptions(dir_path, pattern, skiprows=9, maxcolumns=26, hourchange='3B:00:00')

In [9]:
df = pc.historical_daily_aggregates(country, year, num_years=3)
df = df[df.date != '2012-02-29']  # drop the leap day so each training year has 365 days

In [10]:
df.head()


Out[10]:
        date  weekday  month  year  Consumption
0 2010-01-01        4      1  2010       539737
1 2010-01-02        5      1  2010       620238
2 2010-01-03        6      1  2010       617602
3 2010-01-04        0      1  2010       736761
4 2010-01-05        1      1  2010       719938

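The weekday column appears to follow the pandas dayofweek convention (Monday = 0): 2010-01-01 was a Friday and is coded 4, and 2010-01-04, a Monday, is coded 0.
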
In [11]:
y_train = df[df.year.isin(range(year - 3, year))].Consumption.values
y_test = df[df.year == year].Consumption.values

X_train = df[df.year.isin(range(year - 3, year))][['month', 'year', 'weekday']].values
X_test = df[df.year == year][['month', 'year', 'weekday']].values

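A quick sanity check that range(year - 3, year) selects exactly the three calendar years before the target (a sketch for verification only):

In [ ]:
# Expect [2010, 2011, 2012] for year = 2013
sorted(df[df.year.isin(range(year - 3, year))].year.unique())
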
In [12]:
y_train.shape


Out[12]:
(1095,)

In [13]:
y_test.shape


Out[13]:
(365,)

In [14]:
X_train.shape


Out[14]:
(1095, 3)

In [15]:
X_test.shape


Out[15]:
(365, 3)

In [16]:
type(X_train)


Out[16]:
numpy.ndarray

In [17]:
type(y_train)


Out[17]:
numpy.ndarray

In [18]:
# One-hot encode month (column 0) and weekday (column 2); year passes through unchanged.
# Note: categorical_features was removed in scikit-learn 0.22 (see the sketch below).
vec = OneHotEncoder(sparse=False, categorical_features=[0, 2])
X_train_T = vec.fit_transform(X_train).astype(int)
X_test_T = vec.transform(X_test).astype(int)

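The categorical_features argument used above was removed in scikit-learn 0.22. On recent releases the same 20-column layout (encoded columns first, passthrough year last) can be reproduced with a ColumnTransformer; a minimal sketch, assuming scikit-learn >= 1.2 (older versions take sparse=False instead of sparse_output=False):

In [ ]:
from sklearn.compose import ColumnTransformer

# One-hot encode month (column 0) and weekday (column 2); pass year through unchanged.
# Transformed columns come first and the passthrough column last, matching the old layout.
ct = ColumnTransformer(
    [('onehot', OneHotEncoder(sparse_output=False), [0, 2])],
    remainder='passthrough')
X_train_T = ct.fit_transform(X_train).astype(int)
X_test_T = ct.transform(X_test).astype(int)
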
In [20]:
X_train_T.shape


Out[20]:
(1095, 20)

In [21]:
X_train_T[0]


Out[21]:
array([   1,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    1,    0,    0, 2010])

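Reading the encoded row: the 20 columns decompose as 12 month dummies, then 7 weekday dummies, then the raw year. For 2010-01-01 the January dummy (position 0) and the Friday dummy (position 16, i.e. weekday 4) are set, and the last entry is the year 2010.
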
In [22]:
regressor = LinearRegression()

In [23]:
# Predicts the daily consumption for the selected country (ES) over the target year
y_pred = evaluate(regressor, X_train_T, y_train, X_test_T, y_test)


Coefficients: 
 [ -2.86899146e+17  -2.86899146e+17  -2.86899146e+17  -2.86899146e+17
   -2.86899146e+17  -2.86899146e+17  -2.86899146e+17  -2.86899146e+17
   -2.86899146e+17  -2.86899146e+17  -2.86899146e+17  -2.86899146e+17
   -2.12943336e+18  -2.12943336e+18  -2.12943336e+18  -2.12943336e+18
   -2.12943336e+18  -2.12943336e+18  -2.12943336e+18  -1.21534017e+04]
Mean squared error: 1431924351.71
Variance score: 0.67

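The enormous coefficients are a symptom of the dummy-variable trap: the 12 month dummies always sum to 1, as do the 7 weekday dummies, so the design matrix is collinear with the intercept and the least-squares coefficients blow up even though the predictions themselves (variance score 0.67) remain usable. One hedged remedy, sketched below for a recent scikit-learn (not the notebook's original run), is to drop one dummy level per category:

In [ ]:
from sklearn.compose import ColumnTransformer

# Dropping the first level of each category removes the collinearity with the intercept
ct_drop = ColumnTransformer(
    [('onehot', OneHotEncoder(drop='first', sparse_output=False), [0, 2])],
    remainder='passthrough')
y_pred_drop = evaluate(LinearRegression(), ct_drop.fit_transform(X_train), y_train,
                       ct_drop.transform(X_test), y_test)
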
In [ ]: