In [1]:
import functools
import geopy
from matplotlib import collections as mc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyproj
import requests
import scipy as sp
import rtree
import seaborn as sb
from scipy import signal
# import shapely
import shapely.geometry
%pylab inline
import data_munging
The data consist of rides, each of which is composed of readings. Readings are usually taken every second. Each reading has a start and end time, a start and end lat/long from GPS, and 100 samples (at 100 Hz) from each of the x, y, and z accelerometers. The accelerometers are not calibrated, so the z axis is not guaranteed to point in the direction of gravity! Also, the values are in units of gravity (g), and the measurements include gravity itself!
In [2]:
# Load the raw rides and readings, then pass the readings through the
# data_munging cleaning helper followed by the projection helper (called with
# data_munging.NAD83). NOTE(review): presumably the projection step is what
# provides the start_x/end_x/start_y/end_y columns plotted further down --
# confirm in data_munging.
rides, readings = data_munging.read_raw_data()
readings = data_munging.add_proj_to_readings(
    data_munging.clean_readings(readings),
    data_munging.NAD83,
)
In [3]:
# Report the most recent start_datetime found in the readings.
# print() with a single argument produces the same output on Python 2 and
# Python 3, and Series.max() is computed by pandas instead of iterating the
# column element by element with the builtin max().
print('This is our latest reading:')
print(readings['start_datetime'].max())
In [48]:
# Show the (rows, columns) shape of both tables.
# print() with a single argument behaves identically on Python 2 and 3.
print(rides.shape)
print(readings.shape)
# n = number of readings, p = number of columns; `n` is read by the random
# sampling cells further down, so this cell must run before them.
n, p = readings.shape
In [5]:
# Summary statistics for the first 14 columns of `readings`, selected by
# position. `.iloc` replaces `.ix`, which is deprecated and has been removed
# from pandas; with the string column labels used throughout this notebook,
# `.ix[:, 0:14]` was already slicing by position, so the selection is the same.
readings.iloc[:, 0:14].describe()
Out[5]:
In [6]:
# Summary statistics for the remaining columns of `readings` (position 14
# onwards). `.iloc` replaces `.ix`, which is deprecated and has been removed
# from pandas; with the string column labels used throughout this notebook,
# `.ix[:, 14:]` was already slicing by position, so the selection is the same.
readings.iloc[:, 14:].describe()
Out[6]:
In [7]:
# Summary statistics for the rides table; as the last expression in the cell,
# the resulting frame is what the notebook displays.
rides.describe()
Out[7]:
In [8]:
# Scatter each reading's duration against its total_readings. Per the title,
# this is a visual check that sampling happens at 100 Hz without gaps.
ax = readings.plot(kind='scatter', x='duration', y='total_readings')
ax.set_title('Verifying that we are sampling at 100 Hz With No Gaps in Data')
fig = ax.get_figure()
fig.set_size_inches(18.5, 10.5)
plt.show()
Check some random rides to make sure that our line segments line up and form a proper route, since there are some concerns that the GPS data is a bit too noisy for this!
In [17]:
# Spot-check up to 100 rides by drawing every reading of a ride as a line
# segment from (start_x, start_y) to (end_x, end_y), one figure per ride.
# Ride ids are sampled WITHOUT replacement so the same ride is never drawn
# twice, and the sample size is capped at the number of distinct rides so the
# draw cannot fail when fewer than 100 rides are available.
ride_ids = rides['id'].unique()
n_rides_to_plot = min(100, len(ride_ids))
for random_ride_id in np.random.choice(ride_ids, n_rides_to_plot, replace=False):
    ride_readings = readings.loc[readings['ride_id'] == random_ride_id, :]
    # A single plot call draws all segments: with 2 x k inputs, matplotlib
    # treats each column as its own line, so every segment is still a separate
    # line with its own colour -- without a Python-level loop over rows.
    plt.plot(ride_readings[['start_x', 'end_x']].values.T,
             ride_readings[['start_y', 'end_y']].values.T)
    plt.title('Plotting Ride ' + str(random_ride_id))
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5)
    plt.show()
In [27]:
# Histogram of gps_speed using 100 bins over the 0-29 range.
readings['gps_speed'].plot(kind='hist', bins=100, range=(0, 29))
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()

# Descriptive statistics and the 5th percentile of gps_dist.
# NOTE(review): the histogram above shows gps_speed while these statistics are
# for gps_dist -- confirm that mixing the two columns is intended.
# NOTE(review): `sp.stats` is only available if scipy.stats has been imported
# somewhere (the import cell only imports `scipy` and `scipy.signal`
# explicitly) -- confirm, or import scipy.stats in the import cell.
# print() with a single argument behaves identically on Python 2 and 3.
print(sp.stats.describe(readings['gps_dist']))
print(np.percentile(readings['gps_dist'], 5))
In [42]:
# Scatter of std_z against gps_speed (the title calls this the relationship
# between speed and vibration). The low alpha keeps dense regions readable;
# the view is clipped to speed 0-30 and std_z 0-5.
ax = readings.plot(kind='scatter', x='gps_speed', y='std_z', alpha=0.08)
fig = ax.get_figure()
ax.set_title('Relationship between Speed and Vibration')
fig.set_size_inches(18.5, 10.5)
ax.set_xlim(0, 30)
ax.set_ylim(0, 5)
plt.show()
# TODO: a seaborn regplot with a fitted trend line was considered here; the
# leftover snippet was an unadapted example (it referenced `tips`,
# "total_bill" and "tip" with scatter_kws={'alpha':0.3}), not this data.
In [46]:
# Scatter of abs_sum_z against gps_speed (per the title, a different measure
# of vibration than the std-based one). The low alpha keeps dense regions
# readable; the view is clipped to speed 0-30 and abs_sum_z 0-500.
ax = readings.plot(kind='scatter', x='gps_speed', y='abs_sum_z', alpha=0.08)
fig = ax.get_figure()
ax.set_title('Relationship between Speed and Vibration (Different Measure)')
fig.set_size_inches(18.5, 10.5)
ax.set_xlim(0, 30)
ax.set_ylim(0, 500)
plt.show()
# TODO: a seaborn regplot with a fitted trend line was considered here; the
# leftover snippet was an unadapted example (it referenced `tips`,
# "total_bill" and "tip" with scatter_kws={'alpha':0.3}), not this data.
In [52]:
# One 40-bin histogram per accelerometer axis of the per-reading std column
# (std_x, std_y, std_z), each shown as its own 10 x 4 inch figure.
for axis in ('x', 'y', 'z'):
    ax = readings['std_' + axis].plot(kind='hist', bins=40)
    fig = ax.get_figure()
    fig.set_size_inches(10, 4)
    ax.set_title('Std of ' + axis + ' axis')
    plt.show()
In [65]:
# Overlay the first 100 accelerometer samples of a few randomly chosen
# readings, one figure per axis (the same readings are used for x, y and z).
sample_size = 15
# Draw distinct row positions in [0, n); the size is capped at n so sampling
# without replacement cannot fail on a small table.
indices = np.random.choice(n, min(sample_size, n), replace=False)
for axis in ['x', 'y', 'z']:
    accel_series = readings['num_accel_' + axis]
    for i in indices:
        # `indices` are row positions, so use .iloc; a plain [i] is a label
        # lookup and can miss or pick the wrong row if the index has gaps.
        # plt.plot replaces sb.tsplot, which is deprecated and removed from
        # newer seaborn releases; each trace still gets a random RGB colour.
        plt.plot(accel_series.iloc[i][0:100], alpha=0.50, color=np.random.random(3))
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5)
    # The x axis is the sample position within the reading; the accelerometer
    # values themselves are on the y axis.
    plt.xlabel('Sample Number')
    plt.ylabel('Force (Gravities)')
    plt.title('Random sample of ' + str(len(indices)) + ' ' + axis + ' Accelerometer Time Series')
    plt.show()
In [64]:
# Overlay the periodograms of the first 100 accelerometer samples of randomly
# chosen readings, one figure per axis.
sample_size = 1000
# The notebook describes the accelerometers as sampled at 100 Hz. Passing this
# as `fs` makes the frequency axis really be in Hz, as the x label claims;
# with the default fs=1.0 the frequencies would be in cycles per sample.
sampling_rate_hz = 100.0
# Draw distinct row positions in [0, n); the size is capped at n so sampling
# without replacement cannot fail on a small table.
indices = np.random.choice(n, min(sample_size, n), replace=False)
for axis in ['x', 'y', 'z']:
    accel_series = readings['num_accel_' + axis]
    for i in indices:
        # `indices` are row positions, so use .iloc; a plain [i] is a label
        # lookup and can miss or pick the wrong row if the index has gaps.
        f, Pxx_den = signal.periodogram(accel_series.iloc[i][0:100], fs=sampling_rate_hz)
        plt.plot(f, Pxx_den)
    plt.title('Power Spectrum for ' + axis + ' axis')
    plt.xlabel('frequency [Hz]')
    plt.ylabel('Power Spectrum Density')
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5)
    plt.show()