In [1]:
%matplotlib inline

In [2]:
import requests
from pandas.io.json import json_normalize
import pandas as pd
import numpy as np

In [3]:
# Identifier of the GIOS sensor whose measurements are downloaded.
sensorId = 4727
# A timeout keeps the notebook from hanging forever on an unresponsive server,
# and raise_for_status() turns an HTTP error response into an immediate, clear
# exception instead of a confusing JSON/KeyError failure in a later cell.
r = requests.get('http://api.gios.gov.pl/pjp-api/rest/data/getData/' + str(sensorId), timeout=30)
r.raise_for_status()

In [4]:
# Flatten the JSON response. The "values" cell of the resulting one-row frame
# holds the list of reading dicts, each with a 'date' and a 'value' key.
concentration = json_normalize(r.json())
readings = concentration["values"].values.item()
if len(readings) < 2:
    # The newest row is dropped below, so at least two readings are needed
    # for anything to remain.
    raise ValueError("Expected at least two readings in the API response, got %d" % len(readings))

# Build both columns in a single pass over the readings (extracted once).
concentrationFrame = pd.DataFrame(
    {
        "dates": [d[u'date'] for d in readings],
        "values": [d[u'value'] for d in readings],
    },
    columns=["dates", "values"],
)

# Reverse the row order (the displayed result runs from the oldest date to the
# newest). The explicit copy makes the column assignments below operate on an
# independent frame rather than on a slice of the original.
concentrationFrame = concentrationFrame.iloc[::-1].copy()

# Replace the date strings with a regular hourly range anchored at the first
# row's date. NOTE(review): this assumes the readings are exactly one hour
# apart with no gaps - confirm against the API.
rng = pd.date_range(start=concentrationFrame["dates"].iloc[0], periods=len(concentrationFrame), freq='H')
concentrationFrame["dates"] = rng

# Remove the last line; copying avoids SettingWithCopyWarning when the
# relativeTime column is added to the shortened frame.
concentrationFrame = concentrationFrame.iloc[:-1].copy()

# Hours elapsed since the first remaining timestamp; used as the model feature.
firstDatetime = concentrationFrame["dates"].iloc[0]
concentrationFrame["relativeTime"] = (concentrationFrame["dates"] - firstDatetime) / np.timedelta64(1, 'h')

In [5]:
# Show the prepared frame: hourly timestamps, measured values and hours elapsed since the first row.
concentrationFrame


Out[5]:
dates values relativeTime
60 2017-04-21 01:00:00 19.14730 0.0
59 2017-04-21 02:00:00 1.65206 1.0
58 2017-04-21 03:00:00 12.53800 2.0
57 2017-04-21 04:00:00 1.00000 3.0
56 2017-04-21 05:00:00 20.45770 4.0
55 2017-04-21 06:00:00 22.90880 5.0
54 2017-04-21 07:00:00 13.48710 6.0
53 2017-04-21 08:00:00 32.13200 7.0
52 2017-04-21 09:00:00 17.73870 8.0
51 2017-04-21 10:00:00 12.90010 9.0
50 2017-04-21 11:00:00 4.74283 10.0
49 2017-04-21 12:00:00 10.70450 11.0
48 2017-04-21 13:00:00 1.00000 12.0
47 2017-04-21 14:00:00 2.95744 13.0
46 2017-04-21 15:00:00 27.01130 14.0
45 2017-04-21 16:00:00 22.96910 15.0
44 2017-04-21 17:00:00 21.15800 16.0
43 2017-04-21 18:00:00 1.00000 17.0
42 2017-04-21 19:00:00 5.51700 18.0
41 2017-04-21 20:00:00 4.58017 19.0
40 2017-04-21 21:00:00 6.83833 20.0
39 2017-04-21 22:00:00 1.00000 21.0
38 2017-04-21 23:00:00 1.00000 22.0
37 2017-04-22 00:00:00 1.00000 23.0
36 2017-04-22 01:00:00 6.34769 24.0
35 2017-04-22 02:00:00 1.90883 25.0
34 2017-04-22 03:00:00 12.55590 26.0
33 2017-04-22 04:00:00 16.61140 27.0
32 2017-04-22 05:00:00 18.76860 28.0
31 2017-04-22 06:00:00 18.08410 29.0
30 2017-04-22 07:00:00 13.14770 30.0
29 2017-04-22 08:00:00 16.37040 31.0
28 2017-04-22 09:00:00 7.80183 32.0
27 2017-04-22 10:00:00 4.57194 33.0
26 2017-04-22 11:00:00 8.06875 34.0
25 2017-04-22 12:00:00 5.24153 35.0
24 2017-04-22 13:00:00 6.09681 36.0
23 2017-04-22 14:00:00 20.31990 37.0
22 2017-04-22 15:00:00 14.82320 38.0
21 2017-04-22 16:00:00 9.87542 39.0
20 2017-04-22 17:00:00 2.72983 40.0
19 2017-04-22 18:00:00 3.98144 41.0
18 2017-04-22 19:00:00 6.63619 42.0
17 2017-04-22 20:00:00 1.95631 43.0
16 2017-04-22 21:00:00 16.38690 44.0
15 2017-04-22 22:00:00 18.20030 45.0
14 2017-04-22 23:00:00 15.53640 46.0
13 2017-04-23 00:00:00 17.38690 47.0
12 2017-04-23 01:00:00 22.25380 48.0
11 2017-04-23 02:00:00 9.44294 49.0
10 2017-04-23 03:00:00 10.27420 50.0
9 2017-04-23 04:00:00 14.60180 51.0
8 2017-04-23 05:00:00 1.00000 52.0
7 2017-04-23 06:00:00 6.04558 53.0
6 2017-04-23 07:00:00 14.71620 54.0
5 2017-04-23 08:00:00 34.49780 55.0
4 2017-04-23 09:00:00 5.20961 56.0
3 2017-04-23 10:00:00 1.00000 57.0
2 2017-04-23 11:00:00 5.86842 58.0
1 2017-04-23 12:00:00 9.66539 59.0

In [6]:
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split

# Fixed seed so that both the shuffled train/test split and TPOT's stochastic
# pipeline search give the same result on every Restart & Run All.
RANDOM_STATE = 42

# Single feature (hours since the first measurement) as an (n_samples, 1)
# array; double-bracket selection keeps it 2-D without a manual reshape.
features = concentrationFrame[["relativeTime"]].values
target = concentrationFrame["values"].values

X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                    train_size=0.75, test_size=0.25,
                                                    random_state=RANDOM_STATE)

# Evolutionary search over scikit-learn pipelines: 5 generations of 50 candidates.
tpot = TPOTRegressor(generations=5, population_size=50, verbosity=2, random_state=RANDOM_STATE)
tpot.fit(X_train, y_train)
# Score of the best pipeline on the held-out quarter of the data.
print(tpot.score(X_test, y_test))
# Write the winning pipeline out as a standalone Python script.
tpot.export('tpot_concentration_pipeline.py')


Version 0.7.1 of tpot is outdated. Version 0.7.2 was released 2 days ago.
Optimization Progress:  33%|███▎      | 100/300 [02:35<02:49,  1.18pipeline/s]
Generation 1 - Current best internal CV score: 59.2572857379
Optimization Progress:  50%|█████     | 150/300 [03:49<03:28,  1.39s/pipeline]
Generation 2 - Current best internal CV score: 59.2178981236
Optimization Progress:  67%|██████▋   | 200/300 [05:49<01:54,  1.14s/pipeline]
Generation 3 - Current best internal CV score: 59.2178981236
Optimization Progress:  83%|████████▎ | 250/300 [08:42<01:29,  1.78s/pipeline]
Generation 4 - Current best internal CV score: 56.3100117396

Generation 5 - Current best internal CV score: 56.3100117396

Best pipeline: RandomForestRegressor(RBFSampler(input_matrix, RBFSampler__gamma=0.75), RandomForestRegressor__bootstrap=DEFAULT, RandomForestRegressor__max_features=0.05, RandomForestRegressor__min_samples_leaf=DEFAULT, RandomForestRegressor__min_samples_split=3, RandomForestRegressor__n_estimators=100)
82.4052705079

In [7]:
# Forecast horizon: the 24 hourly timestamps following the last measurement.
# 25 periods are generated starting at the last measured timestamp and the
# first one (the measurement itself) is sliced off. This yields the same
# timestamps as closed='right' without relying on that keyword, which newer
# pandas versions no longer accept.
lastDatetime = concentrationFrame["dates"].iloc[-1]
ftr = pd.date_range(start=lastDatetime, periods=25, freq='H')[1:]

predctionFrame = pd.DataFrame({"dates": ftr})
# Same feature as used for training: hours since the first measurement.
predctionFrame["relativeTime"] = (predctionFrame["dates"] - firstDatetime) / np.timedelta64(1, 'h')
# Predict all rows in one call; the estimator expects a 2-D
# (n_samples, n_features) array rather than a bare scalar per row.
predctionFrame["predictedValues"] = tpot.predict(predctionFrame[["relativeTime"]].values)

# Stack measurements and forecast into one date-indexed frame; each part
# carries NaN in the column that only the other part defines.
wholeFrame = pd.concat([concentrationFrame, predctionFrame]).set_index("dates")

In [8]:
# Plot measured and predicted values on a shared time axis. A title and axis
# labels are set so the figure can be read on its own.
ax = wholeFrame[["predictedValues", "values"]].plot(figsize=(15,5), grid=True)
ax.set_title("Measured and predicted concentration")
ax.set_xlabel("Date")
ax.set_ylabel("Concentration")
ax


Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x53c1050>

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: