In [1]:
import functools
import geopy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyproj
import requests
import scipy as sp
import rtree
import seaborn as sb
from scipy import signal
# import shapely
import shapely.geometry
%pylab inline
import data_munging
In [2]:
reload(data_munging)
Out[2]:
In [4]:
rides, readings = data_munging.read_raw_data()
readings = data_munging.clean_readings(readings)
readings = data_munging.add_proj_to_readings(readings, data_munging.NAD83)
In [ ]:
n, p = readings.shape
print rides.columns
print readings.columns
In [ ]:
readings_idx = data_munging.insert_readings_rtree(readings)
In [ ]:
some_indices = np.random.choice(readings.index, 400)
In [ ]:
for i in readings_idx.intersection((358016, 580867, 358016, 580871)):
print i
In [ ]:
readings['gps_dist'].plot(kind='hist', bins=30)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
print sp.stats.describe(readings['gps_dist'])
print np.percentile(readings['gps_dist'], 5)
In [ ]:
moving_readings = readings.loc[readings.gps_dist > 1, :].index
In [ ]:
intersecting_segments = dict()
for i, bb in readings['bb'].iteritems():
reading_of_interest = data_munging.expand_bb(bb, 0.2)
ins_readings = list(readings_idx.intersection(reading_of_interest))
intersecting_segments[i] = ins_readings
In [ ]:
total_intsx = [len(intersecting_segments[i]) for i in intersecting_segments]
In [ ]:
sum(total_intsx)
In [ ]:
plt.hist(total_intsx, bins=70)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
print sp.stats.describe(readings['gps_dist'])
print np.percentile(readings['gps_dist'], 5)
A really good idea is to randomly sample points and look at all paths that cross within a small bounding box and compare their "bumpiness" values!
In [ ]:
readings.ix[0, :]['start_x']
In [ ]:
line_diffs = dict()
total_its = 0
for i in intersecting_segments:
for j in intersecting_segments[i]:
total_its += 1
if total_its % 10000 == 0:
print total_its
line_diffs[(i,j)] = data_munging.calc_reading_diffs(readings.ix[i, :], readings.ix[j, :])
In [ ]:
In [ ]:
readings['std_total'] = readings['num_accel_total'].apply(np.std)
In [ ]:
readings['std_z_adj_speed'] = readings['std_z'] * readings['gps_dist']
In [ ]:
readings['sum_abs_z'] = readings['num_accel_z'].apply(lambda x: np.sum(np.abs(x)))
In [ ]:
energy0 = list()
energy1 = list()
index = list()
for i, j in line_diffs:
if line_diffs[(i,j)] < 0.2 and i != j:
index.append((i,j))
energy0.append(readings.ix[i, 'std_z_adj_speed'])
energy1.append(readings.ix[j, 'std_z_adj_speed'])
compare_energy = pd.DataFrame({'energy0': energy0, 'energy1': energy1})
compare_energy.index = index
In [ ]:
compare_energy.plot(x='energy0', y='energy1', kind='scatter')
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
print 'std_total'
In [ ]:
compare_energy.plot(x='energy0', y='energy1', kind='scatter')
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
In [ ]:
np.corrcoef(compare_energy['energy0'], compare_energy['energy1'])
In [ ]:
np.corrcoef(compare_energy['energy0'], compare_energy['energy1'])
In [ ]:
np.corrcoef(compare_energy['energy0'], compare_energy['energy1'])
In [ ]:
compare_energy.plot(x='energy0', y='energy1', kind='scatter')
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
In [ ]:
plt.hist(line_diffs, bins=70)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
print sp.stats.describe(readings['gps_dist'])
print np.percentile(readings['gps_dist'], 5)
In [ ]:
readings.ix[0, :][['start_x', 'start_y']]
In [ ]:
np.linalg.norm(readings.ix[0, ['start_x', 'start_y']] - readings.ix[1, ['start_x', 'start_y']])
In [ ]:
print readings_idx.get_bounds(0)
print data_munging.expand_bb(readings_idx.get_bounds(0), 0.2)
In [ ]:
shapely.geometry.LineString([(0, 0), (1, 1)])
In [ ]:
#we will take the intersection of these two mitred segments and then set a threshold for overlap
# similar to Jaccard similarity
#we can also take the angle between these two segments and use that to determine if
#we should consider them the same (different angles)
In [ ]:
help(shapely.geometry.LineString)
shapely.geometry.LineString()
In [ ]:
for i in readings_idx.intersection(readings_idx.get_bounds(0)):
print
# print len(list(readings_idx.intersection(readings_idx.get_bounds(0))))
In [ ]:
help(readings_idx.intersection)
In [ ]:
# Locations plotted on map with coloring based on std
chi_readings.plot(x='start_lon', y='start_lat', kind='scatter')
fig = plt.gcf()
fig.set_size_inches(30, 30)
plt.show()
In [ ]:
# Locations plotted on map with coloring based on std
chi_readings.plot(x='start_x', y='start_y', kind='scatter')
fig = plt.gcf()
fig.set_size_inches(30, 30)
plt.show()
In [ ]:
kimball_readings = data_munging.filter_readings_by_bb(readings, data_munging.kimball_bounding_box)
In [ ]:
kimball_readings.plot(x='start_x', y='start_y', kind='scatter')
fig = plt.gcf()
fig.set_size_inches(30, 30)
plt.show()
In [ ]:
chi_readings = data_munging.filter_readings_to_chicago(readings)
In [ ]:
nyc_readings = data_munging.filter_readings_to_nyc(readings)
In [ ]:
print chi_readings.shape
print nyc_readings.shape
print readings.shape
In [ ]:
ax = sb.regplot(x="gps_speed", y="std_z", data=readings, scatter_kws={'alpha':0.3})
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
# ax = sb.regplot(x="total_bill", y="tip", data=tips, scatter_kws={'alpha':0.3})
In [ ]:
ax = sb.regplot(x="gps_speed", y="std_z", data=readings, scatter_kws={'alpha':0.05})
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
pylab.xlim([0, 40])
plt.show()
# ax = sb.regplot(x="total_bill", y="tip", data=tips, scatter_kws={'alpha':0.3})
In [ ]:
readings['gps_speed'].plot(kind='hist')
plt.show()
In [ ]:
for axis in ['x', 'y', 'z']:
readings['std_' + axis].plot(kind='hist')
fig = plt.gcf()
fig.set_size_inches(10, 4)
plt.show()
In [ ]:
sample_size = 100
indices = np.random.choice(n, sample_size)
for axis in ['x', 'y', 'z']:
for i in indices:
sb.tsplot(readings['num_accel_' + axis][i][0:100], alpha=0.05)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.xlabel('Accelerometer Values')
plt.ylabel('Force (Gravities)')
plt.title('Random sample of ' + str(sample_size) + ' ' + axis + ' Accelerometer Time Series')
plt.show()
In [ ]:
sample_size = 1000
indices = np.random.choice(n, sample_size)
for axis in ['x', 'y', 'z']:
for i in indices:
f, Pxx_den = signal.periodogram(readings['num_accel_' + axis][i][0:100])
plt.plot(f, Pxx_den)
plt.title('Power Spectrum for ' + axis + ' axis')
plt.xlabel('frequency [Hz]')
plt.ylabel('Power Spectrum Density')
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()