In [1]:
import emission.storage.decorations.location_queries as lq
In [2]:
reload(lq)
Out[2]:
In [3]:
import logging
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
logging.debug("test")
In [4]:
%matplotlib inline
Note that we need separate unit tests for the analysis functions that will run as part of the ongoing pipeline. This notebook only tests the part that puts them all together and displays the map.
In [5]:
ts = lq.get_section("20150120T083030-0800_0"); ts
Out[5]:
In [6]:
import emission.analysis.plotting.leaflet_osm.our_plotter as lo
In [7]:
reload(lo)
Out[7]:
In [8]:
import emission.analysis.classification.cleaning.speed_outlier_detection as cso
import emission.analysis.classification.cleaning.jump_smoothing as cjs
In [389]:
reload(cso)
reload(cjs)
Out[389]:
In [10]:
import emission.analysis.plotting.leaflet_osm.ipython_helper as ipy
Order of results
In [11]:
import itertools
# List every (outlier detector name, smoothing algorithm name) pairing, with the
# outlier detector varying slowest.
# NOTE(review): presumably this is the order in which the evaluation results are
# laid out further down - confirm against lo.evaluate_filtering.
for (o, a) in itertools.product(["BoxplotOutlier", "SimpleQuartileOutlier"],
                                ["SmoothPiecewiseRansac", "SmoothBoundary", "SmoothPosdap"]):
    # %-formatting yields the same "<outlier> <smoother>" line on Python 2 and 3
    print ("%s %s" % (o, a))
In [12]:
import json
In [288]:
# Load the ground truth JSON; its keys are later turned into section ids.
# NOTE(review): this is an absolute, machine-specific path - consider making it configurable.
# The context manager closes the file handle as soon as parsing finishes.
with open("/Users/shankari/cluster_ground_truth/smoothing/smoothing_removed_points_combined") as ground_truth_file:
    section_ground_truth_list = json.load(ground_truth_file)
In [289]:
def extract_raw_section_id(uuid_key):
    """Return ``uuid_key`` with its first '_'-separated component removed.

    The key is split on '_' and everything after the first component is
    re-joined with '_'. A key that contains no '_' yields an empty string.
    """
    return '_'.join(uuid_key.split('_')[1:])
In [290]:
section_id_list = [extract_raw_section_id(key) for key in section_ground_truth_list.keys()]
In [291]:
# Look each section id up exactly once and keep only the ids that resolve to a section
# (get_section returning None is treated as "not found").
fetched_sections = (lq.get_section(sid) for sid in section_id_list)
section_list = [section for section in fetched_sections if section is not None]
In [292]:
import emission.analysis.classification.cleaning.location_smoothing as ls
In [470]:
reload(ls)
Out[470]:
In [20]:
def two_rows(map_list):
    """Arrange the first eight entries of ``map_list`` into a two-row grid.

    The first row holds entries 0, 2, 3 and 4; the second row holds
    entries 1, 5, 6 and 7.

    :param map_list: an indexable sequence with at least eight entries
    :return: a list of two four-element lists
    :raises IndexError: if ``map_list`` has fewer than eight entries
    """
    first_row = [map_list[i] for i in (0, 2, 3, 4)]
    second_row = [map_list[i] for i in (1, 5, 6, 7)]
    return [first_row, second_row]
In [21]:
import emission.analysis.plotting.leaflet_osm.ipython_helper as ipy
In [22]:
import numpy as np
In [294]:
import copy as copy
In [331]:
import pandas as pd
In [295]:
section_train_only = copy.copy(section_list[18])
section_train_only.end_ts = 1427903481.0
In [246]:
section_train_only_df = ls.get_section_points(section_train_only)
In [248]:
with_features_train_only_df = ls.add_heading(ls.add_heading_change(ls.add_speed(ls.filter_accuracy(section_train_only_df))))
In [249]:
section_train_only_iqr_threshold = cso.BoxplotOutlier(ignore_zeros=True).get_threshold(with_features_train_only_df)
In [251]:
section_train_only_iqr_threshold
Out[251]:
In [252]:
with_features_train_only_df[with_features_train_only_df.speed > section_train_only_iqr_threshold]
Out[252]:
In [254]:
np.nonzero(abs(with_features_train_only_df.mLongitude + 122.46) < 0.01)
Out[254]:
In [255]:
np.nonzero(abs(with_features_train_only_df.mLongitude + 121.95) < 0.01)
Out[255]:
In [256]:
cleaned_train_only_df = with_features_train_only_df[np.logical_not(abs(with_features_train_only_df.mLongitude + 122.46) < 0.01)]
In [258]:
# Drop the previously derived feature columns in one call, then re-derive speed,
# heading and heading change from the cleaned points.
recomputed_cleaned_train_only_df = ls.add_heading_change(ls.add_heading(ls.add_speed(
    cleaned_train_only_df.drop(['speed', 'distance', 'heading', 'heading_change'], axis=1))))
In [268]:
with_features_train_only_df.heading_change.plot(kind="bar", figsize = (20,6))
Out[268]:
In [269]:
recomputed_cleaned_train_only_df.heading_change.plot(kind="bar", figsize=(20,6))
Out[269]:
In [281]:
ipy.inline_map(lo.get_map(cleaned_train_only_df))
Out[281]:
In [322]:
section_train_only_segmentation_points = with_features_train_only_df[with_features_train_only_df.speed > section_train_only_iqr_threshold].index
In [323]:
# Bracket the speed-outlier row labels with the first (0) and last row positions so
# that consecutive pairs of entries delimit contiguous segments.
# NOTE: this cell is not idempotent - re-running it inserts the boundaries again.
section_train_only_segmentation_points = section_train_only_segmentation_points.insert(0, 0)
# Inserting at the index's own length appends the final boundary at the end.
section_train_only_segmentation_points = section_train_only_segmentation_points.insert(
    len(section_train_only_segmentation_points), with_features_train_only_df.shape[0] - 1)
In [324]:
print section_train_only_segmentation_points
In [325]:
segments = zip(section_train_only_segmentation_points, section_train_only_segmentation_points[1:])
In [332]:
segments_df = pd.DataFrame(segments, columns=["start", "end"])
In [359]:
# Find the longest subsequence (the part with no zigzags). If any segment is good, this must be it.
# What is the definition of "longest"?
# We can't simply rely on the number of points - note that 23 - 29 has a bunch of points and very little distance.
# Using distance is also problematic - 1-4 has a huge distance because it has an undetected zig (part of a zigzag) in it.
# How about the largest number of points that are more than 100 meters apart? That should work.
In [351]:
# pf is otherwise only imported further down the notebook; import it here so this
# cell also works in a top-to-bottom run.
import emission.analysis.point_features as pf

# For every (start, end) pair, report the index span and the distance between the
# first and last point of the corresponding slice.
for (start, end) in segments:
    currSegment = with_features_train_only_df[start:end]
    # Compute the end-to-end distance once and reuse it in the report line
    currDistance = pf.calDistance(currSegment.iloc[0], currSegment.iloc[-1])
    print ("From %s to %s, number of points is %s, distance is %s" % (start, end, (end-start), currDistance))
In [353]:
good_segments = [segments[4]]; good_segments
Out[353]:
In [354]:
with_features_train_only_df[1:4]
Out[354]:
In [357]:
ls.add_speed(pd.DataFrame(with_features_train_only_df[1:4].to_dict('records')).drop('speed', axis=1).drop('distance', axis=1))
Out[357]:
In [296]:
section_bike_only = copy.copy(section_list[18])
section_bike_only.start_ts = 1427903482
In [297]:
section_bike_only_df = ls.get_section_points(section_bike_only)
In [298]:
with_features_bike_only_df = ls.add_heading(ls.add_heading_change(ls.add_speed(ls.filter_accuracy(section_bike_only_df))))
In [300]:
tml = lo.evaluate_filtering([section_bike_only], [cso.BoxplotOutlier(ignore_zeros=True)],
[cjs.SmoothPiecewiseRansac(), cjs.SmoothBoundary()])
In [302]:
ipy.inline_maps(tml, 1, 4)
Out[302]:
In [23]:
section_18_df = ls.get_section_points(section_list[18])
In [24]:
with_speeds_18_df = ls.add_speed(ls.filter_accuracy(section_18_df))
In [25]:
ipy.inline_map(lo.get_map(with_speeds_18_df))
Out[25]:
In [26]:
with_speeds_18_df[abs(with_speeds_18_df.mLongitude + 122.46) < 0.01]
Out[26]:
In [27]:
section_18_iqr_threshold = cso.BoxplotOutlier(ignore_zeros=True).get_threshold(with_speeds_18_df)
In [28]:
section_18_iqr_threshold_low = cso.BoxplotOutlier(ignore_zeros=True).get_lower_threshold(with_speeds_18_df)
In [29]:
(section_18_iqr_threshold, section_18_iqr_threshold_low)
Out[29]:
In [270]:
section_18_speedThresholdMap = with_speeds_18_df.speed > section_18_iqr_threshold
In [271]:
with_speeds_18_df[section_18_speedThresholdMap]
Out[271]:
In [272]:
section_18_segmentation_points = with_speeds_18_df[section_18_speedThresholdMap].index
In [273]:
section_18_segmentation_points
Out[273]:
In [274]:
# Bracket the speed-outlier row labels with the first (0) and last positions so that
# consecutive pairs of entries delimit contiguous segments.
# NOTE: this cell is not idempotent - re-running it inserts the boundaries again.
section_18_segmentation_points = section_18_segmentation_points.insert(0,0)
# Append (insert at the index's own length) the last position of the section 18 mask,
# which has one entry per row of with_speeds_18_df.
section_18_segmentation_points = section_18_segmentation_points.insert(
    len(section_18_segmentation_points), len(section_18_speedThresholdMap) - 1)
In [275]:
section_18_segmentation_points
Out[275]:
In [276]:
zip(section_18_segmentation_points, section_18_segmentation_points[1:])
Out[276]:
In [277]:
import emission.analysis.point_features as pf
In [278]:
# Collect the first-to-last-point distance of every section 18 segment, and flag
# segments that still contain a jump of more than 100 distance units internally.
# NOTE(review): nesting was reconstructed from a flattened export - the distance
# report and the append are assumed to run for every segment, not only for the
# re-broken ones; confirm against the original notebook.
segment_distance_list = []
for (start, end) in zip(section_18_segmentation_points, section_18_segmentation_points[1:]):
    currSegment = with_speeds_18_df[start:end]
    # Recompute speed/distance on the slice alone, after dropping the existing columns
    recalcSegment = ls.add_speed(currSegment.drop('speed', axis=1).drop('distance', axis=1))
    longJumps = recalcSegment[recalcSegment.distance > 100]
    if len(longJumps.index) > 0:
        print ("From %s to %s, re-break at %s" % (start, end, start + longJumps.index[0]))
    # Compute the end-to-end distance once and reuse it for both the report and the list
    currDistance = pf.calDistance(currSegment.iloc[0], currSegment.iloc[-1])
    print ("From %s to %s, distance is %s" % (start, end, currDistance))
    segment_distance_list.append(currDistance)
In [280]:
with_speeds_18_df[0:10]
Out[280]:
In [40]:
segment_distance_list
Out[40]:
In [41]:
# Show the section 18 rows whose speed exceeds the IQR threshold
with_speeds_18_df[section_18_speedThresholdMap]
Out[41]:
In [238]:
ground_truth_mask = np.logical_or(abs(with_speeds_18_df.mLongitude + 122.46) < 0.01, abs(with_speeds_18_df.mLongitude + 121.95) < 0.01)
In [239]:
np.nonzero(ground_truth_mask)
Out[239]:
In [145]:
import numpy as np
In [146]:
ipy.inline_map(lo.get_map(with_speeds_18_df[np.logical_not(ground_truth_mask)]))
Out[146]:
In [148]:
np.nonzero(ground_truth_mask)
Out[148]:
In [240]:
after_all_filtering_section_18 = ls.add_heading_change(ls.add_heading(ls.add_speed(ls.filter_accuracy(section_18_df)[np.logical_not(ground_truth_mask)])))
In [241]:
cso.BoxplotOutlier(ignore_zeros=True).get_threshold(after_all_filtering_section_18)
Out[241]:
In [243]:
after_all_filtering_section_18.distance.plot(kind="bar", figsize=(20,6))
Out[243]:
In [46]:
# Number of entries in the section 18 speed-outlier mask
section_18_speedThresholdMap.shape[0]
Out[46]:
In [47]:
# Force the first and last entries of the section 18 mask to True.
# NOTE: this mutates the mask in place, so earlier displays of it are stale afterwards.
section_18_speedThresholdMap.iloc[0] = True
section_18_speedThresholdMap.iloc[-1] = True
In [48]:
# Inspect both ends of the section 18 speed-outlier mask
section_18_speedThresholdMap.head(), section_18_speedThresholdMap.tail()
Out[48]:
In [49]:
fa = cjs.SmoothBoundary(maxSpeed = section_18_iqr_threshold)
fa.filter(with_speeds_18_df)
In [50]:
ipy.inline_map(lo.get_map(with_speeds_18_df[fa.inlier_mask_]))
Out[50]:
In [51]:
fa = cjs.SmoothPiecewiseRansac(maxSpeed = section_18_iqr_threshold)
fa.filter(with_speeds_18_df)
In [52]:
np.nonzero(np.logical_not(fa.inlier_mask_))
Out[52]:
In [53]:
ipy.inline_map(lo.get_map(with_speeds_18_df[1:11]))
Out[53]:
In [54]:
with_hcs_18_df = ls.add_heading_change(ls.add_heading(with_speeds_18_df))
In [152]:
np.count_nonzero(after_all_filtering_section_18.speed > 117228.81314806046)
Out[152]:
In [55]:
with_hcs_18_df.heading_change.plot(kind="bar", figsize=(20,6))
Out[55]:
In [242]:
after_all_filtering_section_18.heading_change.plot(kind="bar", figsize=(20,6))
Out[242]:
In [56]:
with_hcs_18_df.heading.plot(kind="bar", figsize=(20,6))
Out[56]:
In [57]:
with_hcs_18_df[np.logical_not(ground_truth_mask)].heading.plot(kind="bar", figsize=(15, 6))
Out[57]:
In [58]:
with_hcs_18_df[np.logical_not(ground_truth_mask)].heading_change.plot(kind="bar", figsize=(15, 6))
Out[58]:
In [59]:
with_hcs_18_df[abs(with_hcs_18_df.heading_change) > 135]
Out[59]:
In [60]:
from uuid import UUID
import attrdict as ad
In [61]:
section_new_data = ad.AttrDict({'user_id': UUID('b0d937d0-70ef-305e-9563-440369012b39'), 'loc_filter': 'distance',
'start_ts': 1437667649000.0, 'end_ts': 1437671636000.0, 'source': 'new'})
In [62]:
section_new_data_df = ls.get_section_points(section_new_data)
In [63]:
import emission.core.get_database as edb
In [64]:
edb.get_usercache_db().find({'data.mTime': 1437667649213}).count()
Out[64]:
In [65]:
with_speeds_new_data_df = ls.add_speed(ls.filter_accuracy(section_new_data_df))
In [66]:
new_data_iqr_threshold = cso.BoxplotOutlier(ignore_zeros=True).get_threshold(with_speeds_new_data_df)
In [67]:
new_data_iqr_threshold
Out[67]:
In [367]:
with_speeds_new_data_df[with_speeds_new_data_df.speed > new_data_iqr_threshold][["mLatitude", "mLongitude", "mAccuracy", "speed", "mTime", "formatted_time"]]
Out[367]:
In [368]:
section_new_data_segmentation_points = with_speeds_new_data_df[with_speeds_new_data_df.speed > new_data_iqr_threshold].index
In [369]:
section_new_data_segmentation_points = section_new_data_segmentation_points.insert(0,0)
section_new_data_segmentation_points = section_new_data_segmentation_points.insert(len(section_new_data_segmentation_points),
with_speeds_new_data_df.shape[0])
In [370]:
zip(section_new_data_segmentation_points, section_new_data_segmentation_points[1:])
Out[370]:
In [371]:
# For every pair of consecutive segmentation points, report the index span and the
# distance between the first and last point of the corresponding slice.
for (start, end) in zip(section_new_data_segmentation_points, section_new_data_segmentation_points[1:]):
    currSegment = with_speeds_new_data_df[start:end]
    # Compute the end-to-end distance once and reuse it in the report line
    currDistance = pf.calDistance(currSegment.iloc[0], currSegment.iloc[-1])
    print ("From %s to %s, number of points is %s, distance is %s" % (start, end, (end-start), currDistance))
In [379]:
pd.Series([14025.4359673, 0,0, 31915.3776788]).hist()
Out[379]:
In [378]:
with_speeds_new_data_df[0:10][["mLatitude", "mLongitude", "mAccuracy", "speed", "distance", "mTime", "formatted_time"]]
Out[378]:
In [382]:
with_speeds_new_data_df[10:15][["mLatitude", "mLongitude", "mAccuracy", "speed", "distance", "mTime", "formatted_time"]]
Out[382]:
In [69]:
ground_truth_mask = abs(with_speeds_new_data_df.mLongitude + 122.38) < 0.01
In [70]:
ground_truth_mask.iloc[0] = False
In [71]:
with_speeds_new_data_df[ground_truth_mask].index
Out[71]:
In [72]:
ipy.inline_map(lo.get_map(with_speeds_new_data_df))
Out[72]:
In [73]:
ipy.inline_map(lo.get_map(with_speeds_new_data_df[np.logical_not(ground_truth_mask)]))
Out[73]:
In [74]:
with_speeds_new_data_df[ground_truth_mask].index
Out[74]:
In [75]:
with_speeds_new_data_df[5:15][["mLatitude", "mLongitude", "mAccuracy", "speed", "mTime", "formatted_time"]]
Out[75]:
In [153]:
orig_ground_truth_mask_new_data = ground_truth_mask = abs(section_new_data_df.mLongitude + 122.38) < 0.01
In [156]:
after_all_filtering_section_new_data = ls.add_speed(ls.filter_accuracy(section_new_data_df[np.logical_not(orig_ground_truth_mask_new_data)]))
In [157]:
cso.BoxplotOutlier(ignore_zeros=True).get_threshold(after_all_filtering_section_new_data)
Out[157]:
In [158]:
np.count_nonzero(after_all_filtering_section_new_data.speed > 49.138563586884288)
Out[158]:
In [94]:
# (longitude, latitude) of the first 11 points as a 2-column numpy array.
# .values replaces the deprecated .as_matrix() and returns the same ndarray.
locArr = with_hcs_18_df[["mLongitude", "mLatitude"]].iloc[0:11].values
In [97]:
# mTime of the first 11 points as a 1-d numpy array.
# .values replaces the deprecated .as_matrix() and returns the same ndarray.
timeArr = with_hcs_18_df.mTime.iloc[0:11].values
In [208]:
with_hcs_18_df.index[0:11] * 1000
Out[208]:
In [218]:
from sklearn import linear_model
model_ransac = linear_model.RANSACRegressor(linear_model.LinearRegression())
model_ransac.fit(locArr, timeArr)
model_ransac.inlier_mask_
Out[218]:
In [212]:
from sklearn import linear_model
model_linear = linear_model.LinearRegression()
model_ransac = linear_model.RANSACRegressor(model_linear)
model_ransac.fit(locArr, with_hcs_18_df.index[0:11])
model_ransac.inlier_mask_
Out[212]:
In [216]:
model_linear = linear_model.LinearRegression().fit(locArr, timeArr)
model_linear.coef_
Out[216]:
In [219]:
model_ransac.predict(locArr)
Out[219]:
In [220]:
import datetime as pydt
In [223]:
[pydt.datetime.fromtimestamp(time) for time in model_ransac.predict(locArr)]
Out[223]:
In [226]:
from sklearn import linear_model
model_ransac = linear_model.RANSACRegressor(linear_model.LinearRegression())
model_ransac.fit(locArr, with_hcs_18_df.index[0:11] * 1000)
model_ransac.inlier_mask_
Out[226]:
In [228]:
[index for index in model_ransac.predict(locArr)]
Out[228]:
In [230]:
# Features: (time, latitude); target: longitude - both for the first 11 points.
# .values replaces the deprecated .as_matrix() and returns the same ndarrays.
timeLatArr = with_hcs_18_df[["mTime", "mLatitude"]].iloc[0:11].values
lonArr = with_hcs_18_df.mLongitude.iloc[0:11].values
In [231]:
model_ransac = linear_model.RANSACRegressor(linear_model.LinearRegression())
model_ransac.fit(timeLatArr, lonArr)
model_ransac.inlier_mask_
Out[231]:
In [225]:
from sklearn import linear_model
# Build the (longitude, latitude, time) feature matrix in this cell so the fit does
# not depend on a definition that only appears later in the notebook.
locAndTimeArr = with_hcs_18_df[["mLongitude", "mLatitude", "mTime"]].iloc[0:11].values
# Fit RANSAC over a linear model, using the row index of the first 11 points as the target
model_ransac = linear_model.RANSACRegressor(linear_model.LinearRegression())
model_ransac.fit(locAndTimeArr, with_hcs_18_df.index[0:11])
# Boolean mask of the samples RANSAC classified as inliers
model_ransac.inlier_mask_
Out[225]:
In [111]:
from sklearn import linear_model
model_ransac = linear_model.RANSACRegressor(linear_model.Lasso())
model_ransac.fit(locArr, timeArr)
model_ransac.inlier_mask_
Out[111]:
In [113]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
model_pipeline = Pipeline([('poly', PolynomialFeatures(degree=3)),
('linear', linear_model.LinearRegression(fit_intercept=False))])
model_ransac = linear_model.RANSACRegressor(model_pipeline)
model_ransac.fit(locArr, timeArr)
model_ransac.inlier_mask_
Out[113]:
In [115]:
np.nonzero(np.logical_not(model_ransac.inlier_mask_))
Out[115]:
In [108]:
# (longitude, latitude, time) of the first 11 points as a 3-column numpy array.
# .values replaces the deprecated .as_matrix() and returns the same ndarray.
locAndTimeArr = with_hcs_18_df[["mLongitude", "mLatitude", "mTime"]].iloc[0:11].values
In [102]:
from sklearn import svm
model_one_class_svm = svm.OneClassSVM()
model_one_class_svm.fit(locAndTimeArr)
Out[102]:
In [103]:
model_one_class_svm.decision_function(locAndTimeArr)
Out[103]:
In [99]:
import numpy as np
In [100]:
np.nonzero(np.logical_not(model_ransac.inlier_mask_))
Out[100]:
In [117]:
section_12_df = ls.get_section_points(section_list[12])
In [118]:
with_speeds_section_12_df = ls.add_speed(ls.filter_accuracy(section_12_df))
In [120]:
section_12_iqr_threshold = cso.BoxplotOutlier(ignore_zeros=True).get_threshold(with_speeds_section_12_df); section_12_iqr_threshold
Out[120]:
In [121]:
ipy.inline_map(lo.get_map(with_speeds_section_12_df))
Out[121]:
In [123]:
section_list[12].prev_section
Out[123]:
In [126]:
with_speeds_section_12_df[0:10]
Out[126]:
In [124]:
ipy.inline_map(lo.get_map(ls.get_section_points(lq.get_section(section_list[12].prev_section))))
Out[124]:
In [159]:
np.count_nonzero(with_speeds_section_12_df.speed > section_12_iqr_threshold)
Out[159]:
In [162]:
np.count_nonzero(with_speeds_section_12_df.speed >
cso.BoxplotOutlier(multiplier=cso.BoxplotOutlier.MINOR, ignore_zeros=True).get_threshold(with_speeds_section_12_df))
Out[162]:
In [163]:
cso.BoxplotOutlier(multiplier=cso.BoxplotOutlier.MINOR, ignore_zeros=True).get_threshold(with_speeds_section_12_df)
Out[163]:
In [165]:
with_speeds_section_12_df.speed.hist(bins=50, figsize=(20,6))
Out[165]:
In [167]:
np.count_nonzero(with_speeds_section_12_df.speed == 0)
Out[167]:
In [172]:
np.count_nonzero(with_speeds_section_12_df.speed < 1000)
Out[172]:
In [182]:
# First and third quartile of the non-negative speeds, taken directly on the speed column
quartile_vals = with_speeds_section_12_df.speed[with_speeds_section_12_df.speed >= 0].quantile([0.25, 0.75])
In [183]:
iqr = quartile_vals.iloc[1] - quartile_vals.iloc[0]
quartile_vals.iloc[1] + 1.5 * iqr, quartile_vals.iloc[1] + 3 * iqr
Out[183]:
Three main steps:
In [579]:
reload(cjs)
Out[579]:
In [552]:
train_only_zigzag_result = ls.filter_points(section_train_only_df, cso.BoxplotOutlier(ignore_zeros=True), cjs.SmoothZigzag())
In [553]:
ipy.inline_map(lo.get_map(train_only_zigzag_result))
Out[553]:
In [554]:
bike_only_zigzag_result = ls.filter_points(section_bike_only_df, cso.BoxplotOutlier(ignore_zeros=True), cjs.SmoothZigzag())
In [555]:
ipy.inline_map(lo.get_map(bike_only_zigzag_result))
Out[555]:
In [556]:
section_18_zigzag_result = ls.filter_points(section_18_df, cso.BoxplotOutlier(ignore_zeros=True), cjs.SmoothZigzag())
In [557]:
ipy.inline_map(lo.get_map(section_18_zigzag_result))
Out[557]:
In [558]:
section_12_zigzag_result = ls.filter_points(section_12_df, cso.BoxplotOutlier(ignore_zeros=True), cjs.SmoothZigzag())
In [559]:
ipy.inline_map(lo.get_map(section_12_zigzag_result))
Out[559]:
In [560]:
section_new_data_zigzag_result = ls.filter_points(section_new_data_df, cso.BoxplotOutlier(ignore_zeros=True), cjs.SmoothZigzag())
In [561]:
ipy.inline_map(lo.get_map(section_new_data_zigzag_result))
Out[561]:
In [565]:
section_17_df = ls.filter_accuracy(ls.get_section_points(section_list[17]))
In [567]:
with_speeds_17_df = ls.add_speed(section_17_df)
with_speeds_17_df[27:37]
Out[567]:
In [568]:
ipy.inline_map(lo.get_map(with_speeds_17_df))
Out[568]:
In [581]:
section_17_zigzag_result = ls.filter_points(section_17_df, cso.BoxplotOutlier(ignore_zeros=True), cjs.SmoothZigzag())
In [582]:
ipy.inline_map(lo.get_map(section_17_zigzag_result))
Out[582]:
In [ ]: