In [19]:
import pandas as pd
# Read the two CSV inputs used by the rest of the notebook.
df = pd.read_csv('data/ncent_ml_data.csv')

# From the flat table, discard the columns that later cells do not use.
df1 = pd.read_csv('flatData.csv').drop(
    columns=['tempc', 'load_prev', 'day', 'hour']
)

# Preview the first rows of df.
df.head()


Out[19]:
m0 m1 m2 m3 m4 m5 m6 m7 m8 m9 ... h18 h19 h20 h21 h22 h23 years_n temp_n load_prev_n load
0 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 -0.5 -0.336085 -0.092815 10336.304899
1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 -0.5 -0.336085 -0.100468 10178.052738
2 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 -0.5 -0.347319 -0.104199 10100.887710
3 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 -0.5 -0.356680 -0.105134 10081.565109
4 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 -0.5 -0.356680 -0.099783 10192.218670

5 rows × 47 columns


In [18]:
# Preview the first five rows of df1.
df1.head(n=5)


Out[18]:
load dates month year
0 10336.304899 2002-01-01 00:00:00 1 2002
1 10178.052738 2002-01-01 01:00:00 1 2002
2 10100.887710 2002-01-01 02:00:00 1 2002
3 10081.565109 2002-01-01 03:00:00 1 2002
4 10192.218670 2002-01-01 04:00:00 1 2002

In [33]:
# Build a '<month>_<year>' key (e.g. '1_2002') so rows can be grouped by
# calendar month.
df1['monthyear'] = df1['month'].astype(str) + '_' + df1['year'].astype(str)

# True on every row whose load equals the maximum load within its month/year
# group. The string alias 'max' is used instead of the builtin `max`:
# newer pandas versions deprecate passing the builtin callable to transform.
idx = df1.groupby('monthyear')['load'].transform('max') == df1['load']

# Store the flag as 0/1 on both frames. A later cell displays df1 with an
# `isPeak` column, so it has to be created here for a fresh
# Restart & Run All to reproduce that output.
df1['isPeak'] = idx.astype(int)
# NOTE(review): this assignment aligns on the index, so it assumes df and df1
# hold the same rows in the same order -- TODO confirm.
df['isPeak'] = idx.astype(int)
df.head()


Out[33]:
m0 m1 m2 m3 m4 m5 m6 m7 m8 m9 ... h19 h20 h21 h22 h23 years_n temp_n load_prev_n load isPeak
0 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 -0.5 -0.336085 -0.092815 10336.304899 0
1 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 -0.5 -0.336085 -0.100468 10178.052738 0
2 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 -0.5 -0.347319 -0.104199 10100.887710 0
3 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 -0.5 -0.356680 -0.105134 10081.565109 0
4 0 1 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 -0.5 -0.356680 -0.099783 10192.218670 0

5 rows × 48 columns


In [35]:
# Disabled train/test split, kept for reference: use every column except
# 'load' as features and 'load' as the target, train on all rows but the last
# 8760, and test on the last 8760 rows.
# NOTE(review): 8760 = 365 * 24, presumably one year of hourly rows -- TODO confirm.
# X = df.drop('load', axis=1)
# y = df['load']

# X_train, y_train = X[:-8760], y[:-8760]
# X_test, y_test = X[-8760:], y[-8760:]

In [39]:
# Preview the first five rows of df1 again.
df1.head(n=5)


Out[39]:
load dates month year monthyear isPeak
0 10336.304899 2002-01-01 00:00:00 1 2002 1_2002 0
1 10178.052738 2002-01-01 01:00:00 1 2002 1_2002 0
2 10100.887710 2002-01-01 02:00:00 1 2002 1_2002 0
3 10081.565109 2002-01-01 03:00:00 1 2002 1_2002 0
4 10192.218670 2002-01-01 04:00:00 1 2002 1_2002 0

In [73]:
# 99th-percentile load for each value of the 'month' column; the resulting
# Series is indexed by month.
m = df1['load'].groupby(df1['month']).quantile(q=0.99)

# Show the threshold for month 1.
m.loc[1]


Out[73]:
19572.431368378457

In [75]:



# Flag rows whose load exceeds the 99th-percentile load of their month.
# `m` is indexed by month, so Series.map looks up each row's month in `m` and
# returns the thresholds carrying df1's own row index, which makes the
# comparison with df1['load'] well defined.
# The previous form, m[df1['month']], produced a Series labelled by month
# number instead of by row and failed with
# "ValueError: Can only compare identically-labeled Series objects".
df1['threshold'] = df1['load'] > df1['month'].map(m)

In [ ]: