In [285]:
import numpy as np
import pandas as pd
# import tensorflow as tf
from matplotlib import pyplot as plt
import  datetime
import os
import gzip
%matplotlib inline

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler

In [58]:
waze = pd.read_csv("../waze (4).csv", sep=";")
print(waze['subtype'].unique())


['ROAD_CLOSED_EVENT' 'HAZARD_ON_ROAD_CONSTRUCTION'
 'HAZARD_ON_SHOULDER_CAR_STOPPED' nan 'POLICE_VISIBLE' 'ACCIDENT_MINOR'
 'HAZARD_ON_ROAD_CAR_STOPPED' 'HAZARD_ON_ROAD_LANE_CLOSED'
 'ROAD_CLOSED_HAZARD' 'POLICE_HIDING' 'HAZARD_ON_ROAD_OBJECT'
 'HAZARD_ON_SHOULDER_ANIMALS' 'ACCIDENT_MAJOR' 'HAZARD_ON_SHOULDER'
 'HAZARD_ON_ROAD_POT_HOLE' 'HAZARD_WEATHER'
 'HAZARD_ON_SHOULDER_MISSING_SIGN' 'HAZARD_ON_ROAD_ROAD_KILL'
 'HAZARD_ON_ROAD' 'ROAD_CLOSED_CONSTRUCTION' 'HAZARD_ON_ROAD_ICE']

In [67]:
waze.columns


Out[67]:
Index(['WID', 'area', 'publicatedOn', 'updatedOn', 'closedOn', 'country',
       'nThumbsUp', 'city', 'reportRating', 'reliability', 'type', 'uuid',
       'speed', 'reportMood', 'subtype', 'street', 'id', 'additionalInfo',
       'nearBy', 'roadType', 'wazeid', 'nComments', 'reportBy', 'inscale',
       'comments', 'isThumbsUp', 'isJamUnifiedAlert', 'confidence', 'nImages',
       'magvar', 'showFacebookPic', 'wazeData', 'reportDescription', 'x', 'y',
       'pubMillis', 'provider', 'providerId', 'imageUrl', 'imageId',
       'reportByActiveGroupName', 'reportByActiveGroupIcon', 'missing',
       'hour'],
      dtype='object')

In [69]:
w_g = waze.groupby(['x', 'y'])['area'].count()

In [70]:
plt.plot(waze['x'], waze['y'], ".")
plt.show()



In [62]:
waze['reportDescription'].unique()


Out[62]:
array(['Storing',
       'Calandbrug over Calandkanaal. Open sinds: 00:09 uur. Gemiddeld open: 15 minuten.',
       'Calandbrug over Calandkanaal. Open sinds: 00:25 uur. Gemiddeld open: 15 minuten.',
       ..., 'Voorwerp op de weg: A4 bij hmp 71,4',
       'Calandbrug over Calandkanaal. Open sinds: 23:41 uur. Gemiddeld open: 16 minuten.',
       'Roparun'], dtype=object)

In [63]:
waze['publicatedOn'] = pd.to_datetime(waze['publicatedOn'], format="%Y%m%d%H%M%S")

waze["hour"] = waze.publicatedOn.dt.hour

plt.hist(waze.loc[waze['subtype'] == 'ACCIDENT_MAJOR']["hour"], bins=23)
waze.columns = ['WID', 'area', 'publicatedOn', 'updatedOn', 'closedOn', 'country',
       'nThumbsUp', 'city', 'reportRating', 'reliability', 'type', 'uuid',
       'speed', 'reportMood', 'subtype', 'street', 'id', 'additionalInfo',
       'nearBy', 'roadType', 'wazeid', 'nComments', 'reportBy', 'inscale',
       'comments', 'isThumbsUp', 'isJamUnifiedAlert', 'confidence', 'nImages',
       'magvar', 'showFacebookPic', 'wazeData', 'reportDescription',
       'x', 'y', 'pubMillis', 'provider', 'providerId',
       'imageUrl', 'imageId', 'reportByActiveGroupName',
       'reportByActiveGroupIcon', 'missing', 'hour']



In [50]:
incidents = pd.read_excel("../Incidents_WPH_Car.xlsx")

In [51]:
incidents.head()


Out[51]:
datum gemelde locatie werkelijke locatie tijdstip melding tijdstip aankomst
0 2017-05-22 16 L 20.5 16 L 20.68 06:37:16 06:57:27
1 2017-05-22 15 R 60.8 29 L 9.56 s 06:49:51 06:49:26
2 2017-05-22 15 R 53.3 a 15 R 53.27 a 07:18:40 07:30:07
3 2017-05-22 20 L 29.6 20 L 29.69 s 07:23:40 07:36:28
4 2017-05-22 20 L 34.3 20 L 34.25 v 07:24:07 07:29:15

In [23]:
flitsmeister = pd.read_csv("../flitsmeister week 21 22/2017-05-22_17.csv")

In [24]:
flitsmeister.head()


Out[24]:
RowID SessionID Latitude Longitude Bearing Speed created_at geohash SessionCount
0 456393953 FC847645-41FA-4C24-AB47-134042D9C1FB 51.864525 4.526764 284 46 Mon May 22 2017 14:58:03 GMT+0200 (CEST) u15pp18 NaN
1 456393954 FC847645-41FA-4C24-AB47-134042D9C1FB 51.864556 4.526617 286 39 Mon May 22 2017 14:58:03 GMT+0200 (CEST) u15pp18 NaN
2 456393955 FC847645-41FA-4C24-AB47-134042D9C1FB 51.864584 4.526479 287 37 Mon May 22 2017 14:58:03 GMT+0200 (CEST) u15pp18 NaN
3 456393956 FC847645-41FA-4C24-AB47-134042D9C1FB 51.864581 4.526385 285 28 Mon May 22 2017 14:58:03 GMT+0200 (CEST) u15pp18 NaN
4 456393957 FC847645-41FA-4C24-AB47-134042D9C1FB 51.864601 4.526271 286 28 Mon May 22 2017 14:58:03 GMT+0200 (CEST) u15pncx NaN

In [26]:
flitsmeister.shape


Out[26]:
(2094754, 9)

In [28]:
flitsmeister["SessionID"].nunique()


Out[28]:
4617

In [31]:
plt.hist(flitsmeister["Speed"], bins=35)


Out[31]:
(array([  1.83640000e+05,   1.19281000e+05,   1.14670000e+05,
          1.01321000e+05,   9.55420000e+04,   1.05778000e+05,
          8.79620000e+04,   8.17410000e+04,   7.74440000e+04,
          8.09580000e+04,   1.20480000e+05,   1.31225000e+05,
          1.50358000e+05,   1.89395000e+05,   1.88347000e+05,
          1.43809000e+05,   7.21450000e+04,   3.29810000e+04,
          1.24630000e+04,   3.30700000e+03,   8.80000000e+02,
          3.12000000e+02,   1.44000000e+02,   6.20000000e+01,
          3.70000000e+01,   2.10000000e+01,   6.40000000e+01,
          9.00000000e+01,   1.06000000e+02,   4.80000000e+01,
          6.90000000e+01,   5.30000000e+01,   2.00000000e+01,
          0.00000000e+00,   1.00000000e+00]),
 array([   0. ,    7.2,   14.4,   21.6,   28.8,   36. ,   43.2,   50.4,
          57.6,   64.8,   72. ,   79.2,   86.4,   93.6,  100.8,  108. ,
         115.2,  122.4,  129.6,  136.8,  144. ,  151.2,  158.4,  165.6,
         172.8,  180. ,  187.2,  194.4,  201.6,  208.8,  216. ,  223.2,
         230.4,  237.6,  244.8,  252. ]),
 <a list of 35 Patch objects>)

In [41]:
flitsmeister['time'] = pd.to_datetime(flitsmeister['created_at'])

In [64]:
flitsmeister[np.array(flitsmeister['time'] <= datetime.datetime(2017, 5, 22, 17, 8, 3))].head()


Out[64]:
RowID SessionID Latitude Longitude Bearing Speed created_at geohash SessionCount time
0 456393953 FC847645-41FA-4C24-AB47-134042D9C1FB 51.864525 4.526764 284 46 Mon May 22 2017 14:58:03 GMT+0200 (CEST) u15pp18 NaN 2017-05-22 16:58:03
1 456393954 FC847645-41FA-4C24-AB47-134042D9C1FB 51.864556 4.526617 286 39 Mon May 22 2017 14:58:03 GMT+0200 (CEST) u15pp18 NaN 2017-05-22 16:58:03
2 456393955 FC847645-41FA-4C24-AB47-134042D9C1FB 51.864584 4.526479 287 37 Mon May 22 2017 14:58:03 GMT+0200 (CEST) u15pp18 NaN 2017-05-22 16:58:03
3 456393956 FC847645-41FA-4C24-AB47-134042D9C1FB 51.864581 4.526385 285 28 Mon May 22 2017 14:58:03 GMT+0200 (CEST) u15pp18 NaN 2017-05-22 16:58:03
4 456393957 FC847645-41FA-4C24-AB47-134042D9C1FB 51.864601 4.526271 286 28 Mon May 22 2017 14:58:03 GMT+0200 (CEST) u15pncx NaN 2017-05-22 16:58:03

In [29]:
incident = pd.read_csv("../incident_with_xy.csv")
incident.columns = ['Unnamed: 0', 'V1', 'date', 'report_time', 'gemelde.locatie', 'y', 'x']
incident['publicatedOn'] = pd.to_datetime(incident['date'] + " " +incident['report_time'])

In [33]:
plt.plot(incident["y"], incident["x"], ".")
plt.show()



In [71]:
plt.plot(waze['x'], waze['y'], ".")
plt.show()



In [35]:
waze['subtype'].unique()


Out[35]:
array(['ROAD_CLOSED_EVENT', 'HAZARD_ON_ROAD_CONSTRUCTION',
       'HAZARD_ON_SHOULDER_CAR_STOPPED', nan, 'POLICE_VISIBLE',
       'ACCIDENT_MINOR', 'HAZARD_ON_ROAD_CAR_STOPPED',
       'HAZARD_ON_ROAD_LANE_CLOSED', 'ROAD_CLOSED_HAZARD', 'POLICE_HIDING',
       'HAZARD_ON_ROAD_OBJECT', 'HAZARD_ON_SHOULDER_ANIMALS',
       'ACCIDENT_MAJOR', 'HAZARD_ON_SHOULDER', 'HAZARD_ON_ROAD_POT_HOLE',
       'HAZARD_WEATHER', 'HAZARD_ON_SHOULDER_MISSING_SIGN',
       'HAZARD_ON_ROAD_ROAD_KILL', 'HAZARD_ON_ROAD',
       'ROAD_CLOSED_CONSTRUCTION', 'HAZARD_ON_ROAD_ICE'], dtype=object)

In [44]:
waze['publicatedOn'].min()


Out[44]:
Timestamp('2017-05-21 00:08:55')

In [320]:
incident.head()


Out[320]:
Unnamed: 0 V1 date report_time gemelde.locatie y x publicated_on publicatedOn
0 1 0 2017-05-23 09:30:12.999000 27/37,9 51.855369 4.954711 2017-05-23 09:30:12.999 2017-05-23 09:30:12.999
1 2 1 2017-05-24 07:49:54.999000 15/95,8 51.846133 4.950630 2017-05-24 07:49:54.999 2017-05-24 07:49:54.999
2 3 2 2017-05-24 10:08:08.999000 15/102,5 51.839749 5.047208 2017-05-24 10:08:08.999 2017-05-24 10:08:08.999
3 4 3 2017-05-26 17:44:37 15/65,3 51.866261 4.584767 2017-05-26 17:44:37.000 2017-05-26 17:44:37.000
4 5 4 2017-05-26 20:26:48.999000 15/79,1 51.837240 4.721092 2017-05-26 20:26:48.999 2017-05-26 20:26:48.999

In [51]:
pd.to_datetime(incident['date']).max()


Out[51]:
Timestamp('2017-06-04 00:00:00')

In [72]:
print(waze["x"].min(), waze["y"].max())


4.22436306271 51.955889

In [74]:
print(waze["x"].max(), waze["y"].min())


4.576501 51.862621

In [75]:
diff_x = waze["x"].max() - waze["x"].min()
diff_y = waze["y"].max() - waze["y"].min()

In [76]:
diff_x /= 10
diff_y /= 10

In [77]:
x_co = [waze["x"].min()+diff_x*i for i in range(11)]

In [78]:
y_co = [waze["y"].max()-diff_y*i for i in range(11)]

In [79]:
counter = 1
dict_coord = {}
for y in range(1,10):
    for x in range(1, 10):
        dict_coord[x, y] = counter
        counter +=1

In [80]:
def grider(row):
    for i, x in enumerate(x_co[:-1]):
        if x <= row['x'] <= x_co[i+1]:
            x_op = i

    for i, y in enumerate(y_co[:-1]):
        if y >= row['y'] >= y_co[i+1]:
            y_op = i

    try:
        num = dict_coord[(x_op, y_op)]
    except:
        num = None
    return num

In [331]:
data_train = pd.DataFrame()
for index, row in waze.iterrows():
    if grider(row) != None:
            data_train = data_train.append(pd.DataFrame([1, grider(row), row['publicatedOn']]).T)

In [334]:
data_train.to_csv("../feature_for_grid.csv")

In [333]:
data_train.head()


Out[333]:
0 1 2
0 1 52 2017-05-21 00:08:55
0 1 22 2017-05-21 00:42:35
0 1 69 2017-05-21 01:25:09
0 1 78 2017-05-21 01:25:47
0 1 23 2017-05-21 04:16:59

In [208]:
temp = pd.read_csv("../flitsmeister week 21 22/2017-05-22_00.csv")

In [ ]:
#func = {'Speed':['mean', 'max', "std", "min"], 'SessionID':['nunique']}
features = pd.DataFrame()
for file in os.listdir("../flitsmeister week 21 22/"):
    if file.endswith(".gz"):
        with gzip.open("../flitsmeister week 21 22/" + file, "r") as f:
            try:
                temp = pd.read_csv(f)
                temp.columns = ['RowID', 'SessionID', 'y', 'x', 'Bearing', 'Speed',
           'created_at', 'geohash', 'SessionCount']
                #temp["grid"] = temp.apply(lambda x: grider(x), axis=1)
                #to_concat = temp.groupby(["grid"])["Speed"].agg(func)
                #to_concat = pd.DataFrame(to_concat.reset_index().as_matrix(), columns=['grid', "speed_mean", "speed_max", "speed_std", "speed_min", "count" ])
                func = {'Speed':['mean', 'max', "std", "min"], 'y':['mean'], "x":['mean']}
                temp_agg = temp.groupby(["SessionID"]).agg(func)
                temp_agg = pd.DataFrame(temp_agg.as_matrix(), columns=['speed_mean', 'speed_max', "speed_std", "speed_min",
                                                           "y", "x"])
                temp_agg["grid"] = -1
                for ix, row in temp_agg.iterrows():
                    temp_agg.loc[ix, "grid"] = grider(row)
                temp_agg["date"] = file.split(".")[0]
                print(file.split(".")[0])
                features = features.append(temp_agg)
                print("Finished with " + file)
            except:
                continue

In [367]:
#features.to_csv("../features.csv")

In [5]:
features = pd.read_csv("../features.csv")
features.head(10)


Out[5]:
Unnamed: 0 speed_mean speed_max speed_std speed_min y x grid date
0 0 107.674931 146.0 27.909145 66.0 51.923162 4.504515 25.0 2017-05-22_00
1 1 139.439560 178.0 17.145918 115.0 51.876314 4.572905 72.0 2017-05-22_00
2 2 119.498054 141.0 20.024423 19.0 51.885456 4.564509 63.0 2017-05-22_00
3 3 101.987520 118.0 9.905010 0.0 51.883272 4.425730 59.0 2017-05-22_00
4 4 63.180233 82.0 18.516851 17.0 51.937475 4.431121 5.0 2017-05-22_00
5 5 121.197107 134.0 12.097694 82.0 51.878232 4.466523 69.0 2017-05-22_00
6 6 75.811111 83.0 3.934561 66.0 51.934747 4.449704 15.0 2017-05-22_00
7 7 137.302885 151.0 9.648889 108.0 51.913590 4.543788 36.0 2017-05-22_00
8 8 75.790123 87.0 3.700650 67.0 51.939301 4.429959 5.0 2017-05-22_00
9 9 114.485591 203.0 26.753035 59.0 51.921418 4.508817 26.0 2017-05-22_00

In [7]:
times = pd.to_datetime(features["date"], format="%Y-%m-%d_%H")

In [18]:
times_hour = times.dt.strftime("%Y-%m-%d %H")

In [19]:
features["time"] = times_hour

In [126]:
features.head()


Out[126]:
Unnamed: 0 speed_mean speed_max speed_std speed_min y x grid date time
0 0 107.674931 146.0 27.909145 66.0 51.923162 4.504515 25.0 2017-05-22_00 2017-05-22 00
1 1 139.439560 178.0 17.145918 115.0 51.876314 4.572905 72.0 2017-05-22_00 2017-05-22 00
2 2 119.498054 141.0 20.024423 19.0 51.885456 4.564509 63.0 2017-05-22_00 2017-05-22 00
3 3 101.987520 118.0 9.905010 0.0 51.883272 4.425730 59.0 2017-05-22_00 2017-05-22 00
4 4 63.180233 82.0 18.516851 17.0 51.937475 4.431121 5.0 2017-05-22_00 2017-05-22 00

In [134]:
features.groupby(["time", "grid"]).agg(func).reset_index().head()


Out[134]:
time grid speed_mean speed_max speed_std speed_min date
mean mean mean mean count
0 2017-05-22 00 3.0 98.000000 145.000000 62.707940 0.000000 1
1 2017-05-22 00 5.0 74.076514 93.416667 9.276480 52.833333 12
2 2017-05-22 00 6.0 76.518359 97.571429 15.152162 36.428571 7
3 2017-05-22 00 7.0 80.315252 117.916667 21.320476 31.166667 12
4 2017-05-22 00 8.0 93.493739 110.285714 14.092003 46.857143 7

In [137]:
func = {'speed_mean':['mean'], 'speed_max':['mean'], 'speed_std':['mean'], 'speed_min':['mean'], 'date':['count']}
features = pd.DataFrame(features.groupby(["time", "grid"]).agg(func).reset_index().as_matrix(), columns = ["time", 
                                                                                               "grid", 
                                                                                               "speed_mean",
                                                                                               "speed_max",
                                                                                               "speed_std", 
                                                                                               "speed_min",
                                                                                               "traf_flow"])

In [32]:
weather = pd.read_csv("../weather.csv", encoding="latin-1")
weather["time"] = pd.to_datetime(weather["Time"])

In [33]:
weather["time"] = weather["time"].dt.strftime("%Y-%m-%d %H")

In [37]:
del weather["Time"]

In [42]:
weather = weather.groupby(["time"]).first().reset_index()

In [44]:
merged_feat_weat = pd.merge(weather, features)

In [45]:
merged_feat_weat.head()


Out[45]:
time Temp Weather Wind Humidity Barometer Visibility Unnamed: 0 speed_mean speed_max speed_std speed_min y x grid date
0 2017-05-22 00 12 Cool. 9 km/h 88% 1022 mbar NaN 0 107.674931 146.0 27.909145 66.0 51.923162 4.504515 25.0 2017-05-22_00
1 2017-05-22 00 12 Cool. 9 km/h 88% 1022 mbar NaN 1 139.439560 178.0 17.145918 115.0 51.876314 4.572905 72.0 2017-05-22_00
2 2017-05-22 00 12 Cool. 9 km/h 88% 1022 mbar NaN 2 119.498054 141.0 20.024423 19.0 51.885456 4.564509 63.0 2017-05-22_00
3 2017-05-22 00 12 Cool. 9 km/h 88% 1022 mbar NaN 3 101.987520 118.0 9.905010 0.0 51.883272 4.425730 59.0 2017-05-22_00
4 2017-05-22 00 12 Cool. 9 km/h 88% 1022 mbar NaN 4 63.180233 82.0 18.516851 17.0 51.937475 4.431121 5.0 2017-05-22_00

In [46]:
incidents_xy = pd.read_csv("../incident_with_xy.csv")

In [49]:
incidents_xy['time'] = incidents_xy["date"] + " " + incidents_xy["report_time"]

In [54]:
incidents_xy['time'] = pd.to_datetime(incidents_xy["time"]).dt.strftime("%Y-%m-%d %H")

In [88]:
incidents_xy.columns = ['Unnamed: 0', 'V1', 'date', 'report_time', 'gemelde.locatie', 'y', 'x',
       'time', 'grid']

In [89]:
incidents_xy["grid"] = -1
for ix, row in incidents_xy.iterrows():
    incidents_xy.loc[ix, "grid"] = grider(row)

In [90]:
incidents_xy.head()


Out[90]:
Unnamed: 0 V1 date report_time gemelde.locatie y x time grid
0 1 0 2017-05-23 09:30:12.999000 27/37,9 51.855369 4.954711 2017-05-23 09 NaN
1 2 1 2017-05-24 07:49:54.999000 15/95,8 51.846133 4.950630 2017-05-24 07 NaN
2 3 2 2017-05-24 10:08:08.999000 15/102,5 51.839749 5.047208 2017-05-24 10 NaN
3 4 3 2017-05-26 17:44:37 15/65,3 51.866261 4.584767 2017-05-26 17 NaN
4 5 4 2017-05-26 20:26:48.999000 15/79,1 51.837240 4.721092 2017-05-26 20 NaN

In [93]:
incidents_xy = incidents_xy.dropna()

In [97]:
incidents_xy.head()


Out[97]:
Unnamed: 0 V1 date report_time gemelde.locatie y x time grid
9 10 9 2017-06-01 15:02:48 15/50,2 51.873703 4.381252 2017-06-01 15 67.0
15 16 15 2017-06-02 22:28:28 13/18,0 51.940589 4.427824 2017-06-02 22 5.0
17 18 0 2017-05-22 06:37:16 16/L/20,5 51.907173 4.540709 2017-05-22 06 44.0
18 19 1 2017-05-22 06:49:51 15/R/60,8 51.863643 4.531110 2017-05-22 06 80.0
20 21 3 2017-05-22 07:23:40 20/L/29,6 51.934234 4.446092 2017-05-22 07 15.0

In [99]:
incidents_small = incidents_xy[["time", "grid"]]
incidents_small["output"] = 1


C:\Users\Zoltan\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  

In [114]:
incidents_small.head()


Out[114]:
time grid output
9 2017-06-01 15 67.0 1
15 2017-06-02 22 5.0 1
17 2017-05-22 06 44.0 1
18 2017-05-22 06 80.0 1
20 2017-05-22 07 15.0 1

In [181]:
data_all_merged = pd.merge(features, incidents_small, on=['time', "grid"], how="outer")

In [182]:
data_all_merged["output"] = data_all_merged["output"].fillna(value=0)

In [183]:
data_all_merged = pd.merge(data_all_merged, weather)

In [184]:
data_all_merged.head()


Out[184]:
time grid speed_mean speed_max speed_std speed_min traf_flow output Temp Weather Wind Humidity Barometer Visibility
0 2017-05-22 00 3.0 98 145 62.7079 0 1 0.0 12 Cool. 9 km/h 88% 1022 mbar NaN
1 2017-05-22 00 5.0 74.0765 93.4167 9.27648 52.8333 12 0.0 12 Cool. 9 km/h 88% 1022 mbar NaN
2 2017-05-22 00 6.0 76.5184 97.5714 15.1522 36.4286 7 0.0 12 Cool. 9 km/h 88% 1022 mbar NaN
3 2017-05-22 00 7.0 80.3153 117.917 21.3205 31.1667 12 0.0 12 Cool. 9 km/h 88% 1022 mbar NaN
4 2017-05-22 00 8.0 93.4937 110.286 14.092 46.8571 7 0.0 12 Cool. 9 km/h 88% 1022 mbar NaN

In [185]:
data_all_merged["Visibility"] = data_all_merged["Visibility"].fillna("2 km")
data_all_merged["Visibility"].value_counts(dropna=False)


Out[185]:
2 km    16139
9 km      457
7 km      218
6 km      217
0 km      131
1 km      127
8 km       62
5 km       60
3 km       47
Name: Visibility, dtype: int64

In [186]:
data_all_merged["Wind"] = data_all_merged["Wind"].str.replace("km/h", "")
data_all_merged["Humidity"] = data_all_merged["Humidity"].str.replace("%", "")
data_all_merged["Visibility"] = data_all_merged["Visibility"].str.replace("km", "")

In [187]:
data_all_merged["hour"] = pd.to_datetime(data_all_merged["time"]).dt.hour
data_all_merged["day"] = pd.to_datetime(data_all_merged["time"]).dt.weekday

In [189]:
times = pd.to_datetime(data_all_merged["time"])

In [190]:
data_all_merged["times"] = times

In [203]:
data_all_merged.sort_values(["grid", "times"], inplace=True)

In [204]:
data_all_merged["output_shifted"] = data_all_merged.groupby(["grid", "times" ])["output"].sum().shift(1).reset_index()["output"]

In [207]:
data_all_merged["output_shifted"].describe()


Out[207]:
count    17443.000000
mean         0.015135
std          0.128945
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          3.000000
Name: output_shifted, dtype: float64

In [210]:
data_all_merged["output_shifted"].loc[np.array(data_all_merged["output_shifted"] >= 1)] = 1


C:\Users\Zoltan\Anaconda3\lib\site-packages\pandas\core\indexing.py:179: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)

In [315]:
data_all_merged.head()


Out[315]:
time grid speed_mean speed_max speed_std speed_min traf_flow output Temp Weather Wind Humidity Barometer Visibility hour day times output_shifted
44 2017-05-22 01 3.0 108.021 130 19.1822 64 1 0.0 11 Cool. 7 94 1022 mbar 2 1 0 2017-05-22 01:00:00 0.0
179 2017-05-22 05 3.0 105.15 124 14.6055 64 1 0.0 9 Cool. 6 94 1020 mbar 2 5 0 2017-05-22 05:00:00 0.0
228 2017-05-22 06 3.0 101.368 118.125 14.2641 71.125 8 0.0 9 Passing clouds. 6 100 1020 mbar 2 6 0 2017-05-22 06:00:00 0.0
287 2017-05-22 07 3.0 93.9023 115.4 15.4173 55.7 10 0.0 13 Cool. 2 82 1020 mbar 2 7 0 2017-05-22 07:00:00 0.0
348 2017-05-22 08 3.0 88.8137 117.2 19.8105 45.9 10 0.0 15 Cool. 9 68 1020 mbar 2 8 0 2017-05-22 08:00:00 0.0

In [364]:
#dummies_train = pd.concat([pd.get_dummies(train_feat_cat[col]) for col in train_feat_cat.columns], axis=1, keys=train_feat_cat.columns).as_matrix()
#dummies_test  = pd.concat([pd.get_dummies(test_feat_cat[col]) for col in test_feat_cat.columns], axis=1, keys=train_feat_cat.columns).as_matrix()
dummies = pd.concat([pd.get_dummies(feat_cat[col]) for col in feat_cat.columns], axis=1, keys=feat_cat.columns).as_matrix()

In [367]:
data_all_merged = data_all_merged[data_all_merged.Wind.str.contains("No") == False]
data_all_merged = data_all_merged.dropna()

In [370]:
features_to_model_numeric = ["Temp", "speed_mean", "speed_max", "speed_std",
                             "speed_min", "traf_flow", "Wind", "Humidity", "Visibility" ]
features_to_model_categoric = ["grid", "hour", "day", "Weather"]
features_to_model_label = ["Weather"]

In [371]:
feat_cat = data_all_merged[features_to_model_categoric]
feat_num = data_all_merged[features_to_model_numeric]
train_feat_num = train_data[features_to_model_numeric]
train_feat_cat = train_data[features_to_model_categoric]
train_feat_lab = train_data[features_to_model_label]
y_train = train_data.output_shifted

test_feat_num = test_data[features_to_model_numeric]
test_feat_cat = test_data[features_to_model_categoric]
test_feat_lab = test_data[features_to_model_label]
y_test = test_data.output_shifted

In [400]:
rob_scale = RobustScaler()
feat_num = rob_scale.fit_transform(feat_num)

In [401]:
all_data = pd.concat([times, pd.DataFrame(feat_num), pd.DataFrame(dummies), pd.DataFrame(data_all_merged['output_shifted'])], ignore_index=True, axis=1)

In [402]:
all_data.dropna(inplace=True)

In [403]:
all_data.head()


Out[403]:
0 1 2 3 4 5 6 7 8 9 ... 126 127 128 129 130 131 132 133 134 135
1 2017-05-22 -1.666667 0.482282 0.168314 -0.481384 0.759373 -0.333333 -0.5 1.04 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 2017-05-22 -1.666667 0.313752 -0.248041 -0.518721 0.974311 -0.121212 -0.5 1.28 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
3 2017-05-22 -1.000000 -0.018871 -0.441159 -0.392590 0.508988 -0.060606 -0.9 0.56 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 2017-05-22 -0.666667 -0.245602 -0.313595 0.087910 0.213352 -0.060606 -0.2 0.00 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 2017-05-22 -0.166667 0.041523 0.117693 0.610843 0.061225 -0.151515 -0.4 -0.32 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 136 columns


In [414]:
train_data = all_data.loc[~(pd.to_datetime(all_data[0]) >= "2017-06-02")]
test_data = all_data.loc[pd.to_datetime(all_data[0]) <= "2017-06-02"]

In [415]:
x_train = train_data.iloc[:, 1:-1]
x_test = test_data.iloc[:, 1:-1]
y_train = train_data.iloc[:, -1]
y_test = test_data.iloc[:, -1]

In [416]:
print(x_train.shape)
print(y_train.shape)


(14618, 134)
(14618,)

In [417]:
rf = RandomForestClassifier()

In [418]:
rf.fit(X=x_train, y = y_train)


Out[418]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [419]:
rf.score(X=x_test, y = y_test)


Out[419]:
0.99679552737437782

In [423]:
y_test.sum() / y_test.shape[0]


Out[423]:
0.011727006204404445

In [425]:
rf.predict_proba(X=x_test)


Out[425]:
array([[ 1. ,  0. ],
       [ 0.9,  0.1],
       [ 1. ,  0. ],
       ..., 
       [ 1. ,  0. ],
       [ 1. ,  0. ],
       [ 1. ,  0. ]])

In [422]:
np.where(y == 1)


Out[422]:
(array([  111,   138,   235,   251,   264,   311,   324,   336,   359,
          380,   510,   540,   570,   571,   598,   639,   645,   885,
          923,   954,  1153,  1304,  1309,  1317,  1666,  1699,  1745,
         1807,  1868,  1973,  2072,  2186,  2218,  2239,  2252,  2276,
         2319,  2324,  2327,  2345,  2573,  2589,  2620,  2653,  2754,
         2927,  3042,  3061,  3284,  3301,  3329,  3416,  3477,  3649,
         3665,  3810,  3839,  3853,  4055,  4070,  4155,  4183,  4236,
         4275,  4282,  4305,  4316,  4541,  4701,  4869,  4907,  4992,
         5100,  5215,  5336,  5435,  5463,  5576,  5586,  5588,  5635,
         5710,  5766,  5886,  5891,  5944,  6043,  6045,  6076,  6096,
         6124,  6157,  6248,  6259,  6372,  6405,  6445,  6473,  6492,
         6542,  6648,  6719,  6785,  6883,  6919,  6931,  6964,  7045,
         7144,  7304,  7341,  7382,  7400,  7610,  7635,  7659,  7673,
         7677,  7704,  7736,  7741,  7777,  7899,  7901,  7905,  7918,
         8188,  8196,  8205,  8334,  8414,  8546,  8613,  8634,  8636,
         8702,  8753,  8878,  8880,  8901,  8907,  8910,  8982,  9030,
         9115,  9162,  9192,  9201,  9248,  9452,  9460,  9512,  9610,
         9621,  9670,  9732,  9769,  9895,  9899,  9918,  9944,  9959,
         9987, 10002, 10232, 10281, 10320, 10399, 10467, 10971, 10985,
        11027, 11183, 11211, 11277, 11281, 11591, 11996, 12097, 12130,
        12156, 12166, 12415, 12485, 12576, 12604, 12614, 12726, 12759,
        12810, 12861, 12907, 12966, 13049, 13068, 13086, 13103, 13222,
        13243, 13294, 13512, 13561, 13569, 13604, 13812, 13833, 13894,
        13923, 14023, 14133, 14160, 14171, 14341, 14378, 14389, 14424,
        14624, 14658, 14670, 14697, 14925, 15070, 15174, 15235, 15345,
        15513, 15577, 15737, 15875, 15890, 16026, 16072, 16153, 16155,
        16258, 16383, 16472, 16480, 16529, 16564, 16673, 16729, 16901,
        17004, 17010, 17302, 17309], dtype=int64),)

In [429]:
x_train


Out[429]:
1 2 3 4 5 6 7 8 9 10 ... 125 126 127 128 129 130 131 132 133 134
1 -1.666667 0.482282 0.168314 -0.481384 0.759373 -0.333333 -0.5 1.04 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 -1.666667 0.313752 -0.248041 -0.518721 0.974311 -0.121212 -0.5 1.28 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
3 -1.000000 -0.018871 -0.441159 -0.392590 0.508988 -0.060606 -0.9 0.56 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 -0.666667 -0.245602 -0.313595 0.087910 0.213352 -0.060606 -0.2 0.00 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
5 -0.166667 0.041523 0.117693 0.610843 0.061225 -0.151515 -0.4 -0.32 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
6 0.166667 1.339987 0.664396 -1.348249 1.966047 -0.333333 -0.2 -0.60 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
7 0.333333 0.873521 0.646679 -1.039961 1.641753 -0.242424 0.2 -0.88 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
8 0.500000 0.191321 -1.071892 -1.657638 1.558794 -0.303030 0.6 -0.84 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
9 0.500000 0.344550 0.640773 -0.878810 1.151542 -0.272727 0.4 -1.08 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
10 0.833333 0.225793 0.008859 -0.712070 1.023333 -0.242424 0.2 -1.16 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
11 0.833333 0.139226 -0.693924 -0.869712 0.869984 -0.181818 0.2 -1.16 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
12 0.833333 0.338001 -0.162408 -0.601957 0.856577 -0.090909 0.2 -1.16 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
13 0.833333 0.226974 -0.049745 -0.370417 0.722244 0.030303 0.2 -1.28 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14 0.666667 -0.187041 -0.434072 -0.712001 1.005017 0.060606 -0.2 -0.84 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
15 0.833333 0.685817 0.699830 -0.511093 1.040930 -0.181818 -0.5 -1.16 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
16 0.333333 0.250703 -0.115162 -0.469498 0.953302 -0.151515 -0.4 -0.72 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
17 0.166667 1.161532 1.160478 -0.515405 1.513544 -0.303030 -0.5 -0.48 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
18 -0.333333 2.436924 3.782626 -0.524577 2.569384 -0.333333 -0.7 0.20 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
19 -0.333333 0.266766 -0.256900 0.114260 -0.296467 -0.333333 -0.5 -0.16 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
20 -0.833333 3.680763 4.633053 -1.663490 3.957059 -0.333333 -0.7 0.56 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
21 -1.000000 0.651751 -0.186031 -0.476741 0.668872 -0.333333 -0.2 0.56 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
22 -1.000000 0.027192 -0.670301 -0.548392 0.246536 -0.181818 -0.2 0.56 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
23 -0.666667 0.271654 -0.140932 -0.519964 0.707266 -0.030303 0.0 0.56 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
24 -0.333333 -0.278698 -0.239182 0.004930 0.476558 0.121212 0.0 0.20 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
25 -0.333333 -1.033741 -0.752982 0.783758 -0.458904 0.030303 0.2 0.36 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
26 -0.166667 0.321321 0.522658 0.081901 0.397370 -0.272727 0.2 0.00 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
27 -0.166667 0.075694 -0.280523 -0.296385 0.397370 -0.272727 0.9 0.20 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
28 -0.333333 -0.016141 -0.356116 -0.248716 0.566305 -0.060606 0.6 0.20 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
29 -0.166667 0.230678 -0.631492 -0.994696 0.948993 -0.151515 0.6 0.00 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
30 0.000000 0.247990 -0.115162 -0.532081 0.319798 -0.151515 0.8 -0.32 0.0 1.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
14713 -1.000000 1.235104 1.433829 -0.559588 1.263590 -0.151515 -0.2 0.56 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
14714 -1.000000 0.348043 0.071550 -0.741557 0.995486 1.212121 -0.2 0.56 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14715 -0.666667 0.060236 -0.141272 -0.382105 0.641563 2.515152 0.0 0.56 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14716 -0.333333 -0.639377 -0.347546 0.542771 -0.230755 3.545455 0.0 0.20 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14717 -0.333333 0.124107 0.006061 -0.500616 0.834790 1.939394 0.2 0.36 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14718 -0.166667 0.431231 -0.142259 -1.048929 1.155091 0.666667 0.2 0.00 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14719 -0.166667 0.452053 -0.149639 -1.095060 1.267317 0.757576 0.9 0.20 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14720 -0.333333 0.345246 -0.033390 -0.767642 1.023333 1.212121 0.6 0.20 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14721 -0.166667 0.446474 -0.075790 -0.892068 1.130760 1.000000 0.6 0.00 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14722 0.000000 0.380425 -0.024668 -0.912402 1.087495 1.606061 0.8 -0.32 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
14723 -0.166667 0.351441 -0.045455 -0.934052 1.129782 1.484848 0.8 -0.16 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14724 -0.166667 -0.243337 -0.003662 0.047228 0.197465 1.909091 0.9 -0.32 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14725 -0.166667 -1.247287 -0.480317 1.413601 -0.725450 3.212121 0.9 -0.32 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14726 -0.166667 -0.050909 -0.141919 -0.076959 0.323800 2.606061 0.8 -0.32 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14727 -0.666667 0.636859 0.215989 -0.991920 1.317734 1.303030 0.0 0.36 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
14728 -0.666667 0.852902 0.706273 -0.914066 1.364081 0.303030 -0.2 0.36 0.0 0.0 ... 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14729 -0.833333 0.604413 0.052824 -1.027006 1.138134 0.454545 -0.7 0.56 0.0 0.0 ... 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0
14730 -0.833333 0.386250 0.001266 -0.782044 0.886504 0.060606 -0.9 0.56 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
14731 -0.833333 0.921996 1.295773 -0.002221 0.816964 -0.030303 -0.9 0.56 0.0 0.0 ... 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14732 -1.000000 0.522820 0.451789 -0.270501 0.744289 -0.242424 -0.9 0.80 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
14733 -1.166667 1.271312 0.770699 -1.390146 2.056547 -0.303030 -0.4 0.80 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
14734 -1.166667 -0.195572 -0.398637 0.155663 0.518038 -0.333333 0.0 1.04 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
14735 -1.000000 0.407472 -0.682113 -1.391854 1.573878 -0.303030 -0.2 1.04 5.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14736 -1.000000 0.981943 0.855742 -0.629457 0.877023 -0.060606 -0.4 1.04 4.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14737 -1.166667 0.474675 0.294728 -0.772700 1.027613 0.757576 -0.4 1.28 4.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14738 -0.666667 0.429494 -0.020183 -1.036811 1.234889 2.575758 -0.2 1.04 7.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14739 -0.500000 0.482351 0.124810 -0.933299 1.223225 2.696970 -0.4 0.60 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14740 -0.333333 0.382463 0.054742 -0.837344 1.002254 2.000000 -0.4 0.36 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14741 -0.333333 0.401085 -0.111787 -1.030423 1.263590 0.909091 0.0 0.36 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
14742 -0.166667 0.589347 0.387363 -0.715347 1.129602 0.969697 0.2 0.20 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

14618 rows × 134 columns


In [430]:
all_data.columns


Out[430]:
RangeIndex(start=0, stop=136, step=1)

In [ ]: