In [285]:
import numpy as np
import pandas as pd
# import tensorflow as tf
from matplotlib import pyplot as plt
import datetime
import os
import gzip
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler
In [58]:
waze = pd.read_csv("../waze (4).csv", sep=";")
print(waze['subtype'].unique())
In [67]:
waze.columns
Out[67]:
In [69]:
w_g = waze.groupby(['x', 'y'])['area'].count()
In [70]:
plt.plot(waze['x'], waze['y'], ".")
plt.show()
In [62]:
waze['reportDescription'].unique()
Out[62]:
In [63]:
waze['publicatedOn'] = pd.to_datetime(waze['publicatedOn'], format="%Y%m%d%H%M%S")
waze["hour"] = waze.publicatedOn.dt.hour
plt.hist(waze.loc[waze['subtype'] == 'ACCIDENT_MAJOR']["hour"], bins=range(25))  # one bin per hour (0-23)
waze.columns = ['WID', 'area', 'publicatedOn', 'updatedOn', 'closedOn', 'country',
'nThumbsUp', 'city', 'reportRating', 'reliability', 'type', 'uuid',
'speed', 'reportMood', 'subtype', 'street', 'id', 'additionalInfo',
'nearBy', 'roadType', 'wazeid', 'nComments', 'reportBy', 'inscale',
'comments', 'isThumbsUp', 'isJamUnifiedAlert', 'confidence', 'nImages',
'magvar', 'showFacebookPic', 'wazeData', 'reportDescription',
'x', 'y', 'pubMillis', 'provider', 'providerId',
'imageUrl', 'imageId', 'reportByActiveGroupName',
'reportByActiveGroupIcon', 'missing', 'hour']
In [50]:
incidents = pd.read_excel("../Incidents_WPH_Car.xlsx")
In [51]:
incidents.head()
Out[51]:
In [23]:
flitsmeister = pd.read_csv("../flitsmeister week 21 22/2017-05-22_17.csv")
In [24]:
flitsmeister.head()
Out[24]:
In [26]:
flitsmeister.shape
Out[26]:
In [28]:
flitsmeister["SessionID"].nunique()
Out[28]:
In [31]:
plt.hist(flitsmeister["Speed"], bins=35)
Out[31]:
In [41]:
flitsmeister['time'] = pd.to_datetime(flitsmeister['created_at'])
In [64]:
flitsmeister[flitsmeister['time'] <= datetime.datetime(2017, 5, 22, 17, 8, 3)].head()
Out[64]:
In [29]:
incident = pd.read_csv("../incident_with_xy.csv")
incident.columns = ['Unnamed: 0', 'V1', 'date', 'report_time', 'gemelde.locatie', 'y', 'x']
incident['publicatedOn'] = pd.to_datetime(incident['date'] + " " +incident['report_time'])
In [33]:
plt.plot(incident["y"], incident["x"], ".")
plt.show()
In [71]:
plt.plot(waze['x'], waze['y'], ".")
plt.show()
In [35]:
waze['subtype'].unique()
Out[35]:
In [44]:
waze['publicatedOn'].min()
Out[44]:
In [320]:
incident.head()
Out[320]:
In [51]:
pd.to_datetime(incident['date']).max()
Out[51]:
In [72]:
print(waze["x"].min(), waze["y"].max())
In [74]:
print(waze["x"].max(), waze["y"].min())
In [75]:
diff_x = waze["x"].max() - waze["x"].min()
diff_y = waze["y"].max() - waze["y"].min()
In [76]:
diff_x /= 10
diff_y /= 10
In [77]:
# 11 equally spaced x edges (west to east), i.e. 10 grid columns
x_co = [waze["x"].min()+diff_x*i for i in range(11)]
In [78]:
# 11 equally spaced y edges (north to south), i.e. 10 grid rows
y_co = [waze["y"].max()-diff_y*i for i in range(11)]
In [79]:
# Number the inner 9x9 cells 1..81; cells with x- or y-index 0 have no entry,
# so grider() returns None for them (and for points outside the bounding box).
counter = 1
dict_coord = {}
for y in range(1, 10):
    for x in range(1, 10):
        dict_coord[x, y] = counter
        counter += 1
In [80]:
def grider(row):
    """Return the grid-cell number for (row['x'], row['y']), or None if the
    point falls outside the grid or in a cell without a dict_coord entry."""
    x_op = y_op = None
    for i, x in enumerate(x_co[:-1]):
        if x <= row['x'] <= x_co[i+1]:
            x_op = i
    for i, y in enumerate(y_co[:-1]):
        if y >= row['y'] >= y_co[i+1]:
            y_op = i
    return dict_coord.get((x_op, y_op))
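A vectorized sketch of the same lookup (illustrative, not used by the pipeline below): it reuses x_co, y_co and dict_coord from above, though edge-inclusion at exact cell boundaries may differ slightly from the loop version.
In [ ]:
def grider_vectorized(df):
    # x_co is ascending, y_co is descending; searchsorted locates the cell index
    x_idx = np.searchsorted(np.asarray(x_co), df["x"].values, side="right") - 1
    y_idx = np.searchsorted(-np.asarray(y_co), -df["y"].values, side="right") - 1
    # indices without a dict_coord entry (border cells, points outside the box) map to None
    return pd.Series([dict_coord.get((xi, yi)) for xi, yi in zip(x_idx, y_idx)],
                     index=df.index)
# e.g. waze["grid"] = grider_vectorized(waze)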
In [331]:
data_train = pd.DataFrame()
for index, row in waze.iterrows():
    grid = grider(row)
    if grid is not None:
        data_train = data_train.append(pd.DataFrame([1, grid, row['publicatedOn']]).T)
In [334]:
data_train.to_csv("../feature_for_grid.csv")
In [333]:
data_train.head()
Out[333]:
In [208]:
temp = pd.read_csv("../flitsmeister week 21 22/2017-05-22_00.csv")
In [ ]:
#func = {'Speed':['mean', 'max', "std", "min"], 'SessionID':['nunique']}
features = pd.DataFrame()
for file in os.listdir("../flitsmeister week 21 22/"):
    if file.endswith(".gz"):
        # open in text mode so pd.read_csv receives str, not bytes
        with gzip.open("../flitsmeister week 21 22/" + file, "rt") as f:
            try:
                temp = pd.read_csv(f)
                temp.columns = ['RowID', 'SessionID', 'y', 'x', 'Bearing', 'Speed',
                                'created_at', 'geohash', 'SessionCount']
                #temp["grid"] = temp.apply(lambda x: grider(x), axis=1)
                #to_concat = temp.groupby(["grid"])["Speed"].agg(func)
                #to_concat = pd.DataFrame(to_concat.reset_index().values, columns=['grid', "speed_mean", "speed_max", "speed_std", "speed_min", "count"])
                # per-session aggregates: speed statistics plus the session's mean position
                func = {'Speed': ['mean', 'max', 'std', 'min'], 'y': ['mean'], 'x': ['mean']}
                temp_agg = temp.groupby(["SessionID"]).agg(func)
                temp_agg = pd.DataFrame(temp_agg.values,
                                        columns=['speed_mean', 'speed_max', 'speed_std', 'speed_min',
                                                 'y', 'x'])
                # assign each session (via its mean position) to a grid cell
                temp_agg["grid"] = -1
                for ix, row in temp_agg.iterrows():
                    temp_agg.loc[ix, "grid"] = grider(row)
                temp_agg["date"] = file.split(".")[0]
                print(file.split(".")[0])
                features = features.append(temp_agg)
                print("Finished with " + file)
            except Exception:
                continue
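Side note (illustrative, assuming the same folder layout as the loop above): pandas infers gzip compression from the ".gz" suffix, so the files can also be read without the explicit gzip.open().
In [ ]:
gz_files = [f for f in os.listdir("../flitsmeister week 21 22/") if f.endswith(".gz")]
if gz_files:
    sample = pd.read_csv("../flitsmeister week 21 22/" + gz_files[0])
    print(sample.shape)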
In [367]:
#features.to_csv("../features.csv")
In [5]:
features = pd.read_csv("../features.csv")
features.head(10)
Out[5]:
In [7]:
times = pd.to_datetime(features["date"], format="%Y-%m-%d_%H")
In [18]:
times_hour = times.dt.strftime("%Y-%m-%d %H")
In [19]:
features["time"] = times_hour
In [126]:
features.head()
Out[126]:
In [134]:
func = {'speed_mean': ['mean'], 'speed_max': ['mean'], 'speed_std': ['mean'],
        'speed_min': ['mean'], 'date': ['count']}
features.groupby(["time", "grid"]).agg(func).reset_index().head()
Out[134]:
In [137]:
func = {'speed_mean': ['mean'], 'speed_max': ['mean'], 'speed_std': ['mean'],
        'speed_min': ['mean'], 'date': ['count']}
features = pd.DataFrame(features.groupby(["time", "grid"]).agg(func).reset_index().values,
                        columns=["time", "grid", "speed_mean", "speed_max",
                                 "speed_std", "speed_min", "traf_flow"])
In [32]:
weather = pd.read_csv("../weather.csv", encoding="latin-1")
weather["time"] = pd.to_datetime(weather["Time"])
In [33]:
weather["time"] = weather["time"].dt.strftime("%Y-%m-%d %H")
In [37]:
del weather["Time"]
In [42]:
weather = weather.groupby(["time"]).first().reset_index()
In [44]:
merged_feat_weat = pd.merge(weather, features)
In [45]:
merged_feat_weat.head()
Out[45]:
In [46]:
incidents_xy = pd.read_csv("../incident_with_xy.csv")
In [49]:
incidents_xy['time'] = incidents_xy["date"] + " " + incidents_xy["report_time"]
In [54]:
incidents_xy['time'] = pd.to_datetime(incidents_xy["time"]).dt.strftime("%Y-%m-%d %H")
In [88]:
# label the coordinate columns so grider() can read 'x' and 'y' ('grid' is added in the next cell)
incidents_xy.columns = ['Unnamed: 0', 'V1', 'date', 'report_time', 'gemelde.locatie', 'y', 'x',
                        'time']
In [89]:
incidents_xy["grid"] = -1
for ix, row in incidents_xy.iterrows():
incidents_xy.loc[ix, "grid"] = grider(row)
In [90]:
incidents_xy.head()
Out[90]:
In [93]:
incidents_xy = incidents_xy.dropna()
In [97]:
incidents_xy.head()
Out[97]:
In [99]:
incidents_small = incidents_xy[["time", "grid"]].copy()  # .copy() avoids SettingWithCopyWarning
incidents_small["output"] = 1
In [114]:
incidents_small.head()
Out[114]:
In [181]:
data_all_merged = pd.merge(features, incidents_small, on=['time', "grid"], how="outer")
In [182]:
data_all_merged["output"] = data_all_merged["output"].fillna(value=0)
In [183]:
data_all_merged = pd.merge(data_all_merged, weather)
In [184]:
data_all_merged.head()
Out[184]:
In [185]:
data_all_merged["Visibility"] = data_all_merged["Visibility"].fillna("2 km")
data_all_merged["Visibility"].value_counts(dropna=False)
Out[185]:
In [186]:
data_all_merged["Wind"] = data_all_merged["Wind"].str.replace("km/h", "")
data_all_merged["Humidity"] = data_all_merged["Humidity"].str.replace("%", "")
data_all_merged["Visibility"] = data_all_merged["Visibility"].str.replace("km", "")
In [187]:
data_all_merged["hour"] = pd.to_datetime(data_all_merged["time"]).dt.hour
data_all_merged["day"] = pd.to_datetime(data_all_merged["time"]).dt.weekday
In [189]:
times = pd.to_datetime(data_all_merged["time"])
In [190]:
data_all_merged["times"] = times
In [203]:
data_all_merged.sort_values(["grid", "times"], inplace=True)
In [204]:
data_all_merged["output_shifted"] = data_all_merged.groupby(["grid", "times" ])["output"].sum().shift(1).reset_index()["output"]
In [207]:
data_all_merged["output_shifted"].describe()
Out[207]:
In [210]:
data_all_merged["output_shifted"].loc[np.array(data_all_merged["output_shifted"] >= 1)] = 1
In [315]:
data_all_merged.head()
Out[315]:
In [364]:
#dummies_train = pd.concat([pd.get_dummies(train_feat_cat[col]) for col in train_feat_cat.columns], axis=1, keys=train_feat_cat.columns).as_matrix()
#dummies_test = pd.concat([pd.get_dummies(test_feat_cat[col]) for col in test_feat_cat.columns], axis=1, keys=train_feat_cat.columns).as_matrix()
# note: feat_cat is defined in the cell below (In [371]); that cell needs to run first
dummies = pd.concat([pd.get_dummies(feat_cat[col]) for col in feat_cat.columns], axis=1, keys=feat_cat.columns).values
In [367]:
# drop rows whose Wind value is non-numeric (contains "No")
data_all_merged = data_all_merged[~data_all_merged.Wind.str.contains("No", na=True)]
data_all_merged = data_all_merged.dropna()
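Optional hardening, not in the original flow: convert the cleaned weather columns explicitly to numbers here instead of relying on the scaler's implicit string-to-float coercion further down.
In [ ]:
for col in ["Wind", "Humidity", "Visibility"]:
    data_all_merged[col] = pd.to_numeric(data_all_merged[col].str.strip())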
In [370]:
features_to_model_numeric = ["Temp", "speed_mean", "speed_max", "speed_std",
"speed_min", "traf_flow", "Wind", "Humidity", "Visibility" ]
features_to_model_categoric = ["grid", "hour", "day", "Weather"]
features_to_model_label = ["Weather"]
In [371]:
feat_cat = data_all_merged[features_to_model_categoric]
feat_num = data_all_merged[features_to_model_numeric]
# NOTE: train_data / test_data only exist after the date-based split below (In [414]);
# these per-split frames are superseded by the iloc-based split that follows it.
train_feat_num = train_data[features_to_model_numeric]
train_feat_cat = train_data[features_to_model_categoric]
train_feat_lab = train_data[features_to_model_label]
y_train = train_data.output_shifted
test_feat_num = test_data[features_to_model_numeric]
test_feat_cat = test_data[features_to_model_categoric]
test_feat_lab = test_data[features_to_model_label]
y_test = test_data.output_shifted
In [400]:
rob_scale = RobustScaler()
feat_num = rob_scale.fit_transform(feat_num)
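One caveat with the cell above: the scaler is fit on all rows, including the later test period. A leakage-free variant (sketch only; it relies on the per-split numeric frames from the split cells) would fit on the training rows alone:
In [ ]:
# sketch, assuming train_feat_num / test_feat_num hold the numeric columns per split
# rob_scale = RobustScaler().fit(train_feat_num)
# x_train_num = rob_scale.transform(train_feat_num)
# x_test_num = rob_scale.transform(test_feat_num)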
In [401]:
# concatenate positionally in data_all_merged's current row order;
# reset_index avoids spurious NaN rows from mismatched index labels
all_data = pd.concat([data_all_merged["times"].reset_index(drop=True),
                      pd.DataFrame(feat_num),
                      pd.DataFrame(dummies),
                      data_all_merged['output_shifted'].reset_index(drop=True)],
                     ignore_index=True, axis=1)
In [402]:
all_data.dropna(inplace=True)
In [403]:
all_data.head()
Out[403]:
In [414]:
# time-based split: before 2017-06-02 for training, from 2017-06-02 onwards for testing
train_data = all_data.loc[pd.to_datetime(all_data[0]) < "2017-06-02"]
test_data = all_data.loc[pd.to_datetime(all_data[0]) >= "2017-06-02"]
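A quick check (illustrative) that the date-based split is disjoint and covers every row:
In [ ]:
print(pd.to_datetime(train_data[0]).max(), pd.to_datetime(test_data[0]).min())
print(len(train_data) + len(test_data) == len(all_data))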
In [415]:
x_train = train_data.iloc[:, 1:-1]   # drop the time column (0) and the target (last)
x_test = test_data.iloc[:, 1:-1]
y_train = train_data.iloc[:, -1]
y_test = test_data.iloc[:, -1]
In [416]:
print(x_train.shape)
print(y_train.shape)
In [417]:
rf = RandomForestClassifier()
In [418]:
rf.fit(X=x_train, y = y_train)
Out[418]:
In [419]:
rf.score(X=x_test, y = y_test)
Out[419]:
In [423]:
y_test.sum() / y_test.shape[0]   # positive-class base rate in the test set
Out[423]:
In [425]:
rf.predict_proba(X=x_test)
Out[425]:
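Since positives are this rare, accuracy (rf.score) largely mirrors the base rate, so a ranking metric is more informative; a sketch using scikit-learn's metrics module:
In [ ]:
from sklearn.metrics import roc_auc_score, classification_report
proba = rf.predict_proba(X=x_test)[:, 1]        # predicted probability of an incident
print("ROC AUC:", roc_auc_score(y_test, proba))
print(classification_report(y_test, rf.predict(X=x_test)))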
In [422]:
np.where(y_test == 1)  # indices of positive examples in the test set
Out[422]:
In [429]:
x_train
Out[429]:
In [430]:
all_data.columns
Out[430]:
In [ ]: