In [1]:

    
import functools

import geopy
from matplotlib import collections as mc
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyproj
import requests
import rtree
import scipy as sp
from scipy import signal
from scipy import stats
import seaborn as sb
# import shapely
import shapely.geometry
%pylab inline

import data_munging









    



Populating the interactive namespace from numpy and matplotlib






    



/home/zblan/anaconda/lib/python2.7/site-packages/matplotlib/__init__.py:872: UserWarning: axes.color_cycle is deprecated and replaced with axes.prop_cycle; please use the latter.
  warnings.warn(self.msg_depr % (key, alt_key))

The data consist of rides, each of which is composed of readings. Readings are usually taken every second. Each reading has a start and end time, a start and end lat/long from GPS, and 100 samples (at 100 Hz) from each of the x, y, and z accelerometers. The accelerometer axes are not calibrated, so z does not necessarily point in the direction of gravity! Also, the units are gravities (g), and the measurements include gravity itself!



In [2]:

    
# Read the raw rides/readings tables, then pass the readings through the
# project's cleaning and projection helpers (NAD83 is handed to the latter).
# NOTE(review): the projection step presumably supplies the start_x/start_y/
# end_x/end_y columns used further down — confirm in data_munging.
rides, readings = data_munging.read_raw_data()
readings = data_munging.add_proj_to_readings(
    data_munging.clean_readings(readings),
    data_munging.NAD83,
)



In [3]:

    
# Report the most recent reading timestamp as a quick freshness check on the
# loaded data. The single-argument print(...) form behaves the same on
# Python 2 and Python 3, and Series.max() replaces a Python-level scan.
print('This is our latest reading:')
print(readings['start_datetime'].max())









    



This is our latest reading:
2016-01-08 19:32:22.057940



In [48]:

    
# Table dimensions as (rows, columns). The single-argument print(...) form
# behaves the same on Python 2 and Python 3.
print(rides.shape)
print(readings.shape)
# n (the number of readings) is reused by the random-sampling cells further
# down; p is the number of columns.
n, p = readings.shape









    



(94, 7)
(35884, 39)



In [5]:

    
# Summary statistics for the first 14 columns, selected by position.
# .iloc is the explicit positional indexer; the ambiguous .ix indexer is
# deprecated and absent from current pandas releases.
readings.iloc[:, 0:14].describe()









    Out[5]:






  
    
      
      id
      start_lat
      start_lon
      end_lat
      end_lon
      angle_x
      angle_y
      angle_z
      ride_id
    
  
  
    
      count
      35884.00000
      35884.000000
      35884.000000
      35884.000000
      35884.000000
      35884.000000
      35884.000000
      35884.000000
      35882.000000
    
    
      mean
      19227.71302
      41.074522
      -78.145393
      41.074524
      -78.145396
      1.412244
      2.203244
      2.349292
      81.662616
    
    
      std
      11020.35792
      0.552719
      6.315436
      0.552718
      6.315440
      0.181960
      0.399212
      0.440968
      37.261709
    
    
      min
      588.00000
      40.681685
      -87.714990
      40.681685
      -87.714990
      0.177325
      0.000000
      0.000000
      2.000000
    
    
      25%
      9577.75000
      40.699079
      -87.629375
      40.699088
      -87.629378
      1.305654
      2.038208
      2.199902
      52.000000
    
    
      50%
      19090.50000
      40.712922
      -73.967589
      40.712927
      -73.967589
      1.393604
      2.244046
      2.397128
      86.000000
    
    
      75%
      28682.25000
      41.886814
      -73.932991
      41.886814
      -73.932989
      1.506927
      2.428405
      2.576590
      111.000000
    
    
      max
      38523.00000
      41.973392
      -73.905131
      41.973393
      -73.905104
      3.014949
      3.141593
      3.141593
      146.000000



In [6]:

    
# Summary statistics for every column from position 14 onwards.
# .iloc is the explicit positional indexer; the ambiguous .ix indexer is
# deprecated and absent from current pandas releases.
readings.iloc[:, 14:].describe()









    Out[6]:






  
    
      
      start_time
      end_time
      mean_g
      abs_sum_x
      std_x
      abs_sum_y
      std_y
      abs_sum_z
      std_z
      std_total
      duration
      gps_dist
      abs_sum_total
      gps_speed
      total_readings
      start_x
      start_y
      end_x
      end_y
    
  
  
    
      count
      3.588400e+04
      3.588400e+04
      35884.000000
      35884.000000
      35884.000000
      35884.000000
      35884.000000
      35884.000000
      35884.000000
      35884.000000
      35884.000000
      35884.000000
      35884.000000
      35884.000000
      35884.000000
      35884.000000
      35884.000000
      35884.000000
      35884.000000
    
    
      mean
      1.448133e+09
      1.448133e+09
      0.335402
      24.529352
      0.160663
      67.191197
      0.295970
      92.051591
      0.502745
      0.615226
      1.079435
      5.573131
      125.032443
      5.634492
      108.715082
      1161867.858993
      559322.565057
      1161867.560246
      559322.817045
    
    
      std
      1.773318e+06
      1.773318e+06
      0.300768
      118.938536
      0.117447
      56.436329
      0.271950
      123.058644
      0.396454
      0.482272
      3.331302
      3.650427
      189.679344
      16.929376
      187.773519
      535000.218079
      15146.511831
      535000.549026
      15146.498089
    
    
      min
      1.445264e+09
      1.445264e+09
      0.003077
      0.034119
      0.000648
      0.128006
      0.000941
      0.018158
      0.000304
      0.001610
      0.000800
      0.080076
      1.934746
      0.007705
      2.000000
      351278.381668
      545746.517020
      351278.381668
      545747.044438
    
    
      25%
      1.446951e+09
      1.446951e+09
      0.168067
      19.824879
      0.094393
      61.231327
      0.150120
      77.563679
      0.257029
      0.323717
      0.993880
      3.748162
      106.542545
      3.727073
      101.000000
      358428.643891
      548734.167840
      358428.554771
      548734.790681
    
    
      50%
      1.448231e+09
      1.448231e+09
      0.252558
      23.317154
      0.135663
      65.020645
      0.224335
      81.940704
      0.393362
      0.480757
      1.000080
      5.537240
      110.488620
      5.529468
      102.000000
      1516014.447513
      549939.969546
      1516012.234198
      549940.180898
    
    
      75%
      1.449271e+09
      1.449271e+09
      0.392600
      26.188408
      0.190509
      71.166946
      0.353836
      94.700474
      0.629087
      0.755590
      1.006410
      6.777850
      120.124840
      6.762154
      103.000000
      1518812.913623
      579750.792407
      1518813.231135
      579750.892795
    
    
      max
      1.452303e+09
      1.452303e+09
      4.988518
      22168.323334
      1.892977
      3306.349960
      5.573329
      15608.470230
      4.214469
      6.569573
      588.258450
      29.923974
      27282.312452
      2882.408992
      27379.000000
      1521388.013816
      589355.055276
      1521389.726798
      589355.097554



In [7]:

    
# Summary statistics for the rides table (one row per ride).
rides.describe()









    Out[7]:






  
    
      
      id
      start_time
      end_time
      calibration_id
      scoreboard_id
    
  
  
    
      count
      94.000000
      9.400000e+01
      9.000000e+01
      93.000000
      91.000000
    
    
      mean
      80.457447
      1.448136e+09
      1.448087e+09
      56.408602
      53.307692
    
    
      std
      43.987697
      2.050464e+06
      2.051928e+06
      51.858762
      53.983885
    
    
      min
      2.000000
      1.445264e+09
      1.445264e+09
      1.000000
      1.000000
    
    
      25%
      44.250000
      1.446105e+09
      1.445688e+09
      1.000000
      1.000000
    
    
      50%
      86.500000
      1.448267e+09
      1.448231e+09
      48.000000
      1.000000
    
    
      75%
      118.750000
      1.449552e+09
      1.449553e+09
      101.000000
      101.000000
    
    
      max
      146.000000
      1.452302e+09
      1.452303e+09
      136.000000
      136.000000



In [8]:

    
# Scatter of reading duration against the number of accelerometer samples it
# contains; the title states what the plot is meant to verify.
ax = readings.plot(kind='scatter', x='duration', y='total_readings')
ax.set_title('Verifying that we are sampling at 100 Hz With No Gaps in Data')
fig = ax.get_figure()
fig.set_size_inches(18.5, 10.5)
plt.show()

Check some random rides to make sure that their line segments connect to form a proper route, since there are concerns that the GPS data may be too noisy for this.



In [17]:

    
# Plot every GPS segment of a random selection of rides (at most 100) to
# check visually that consecutive segments join into a continuous route.
# Sampling without replacement avoids drawing the same ride more than once,
# which the previous with-replacement draw of 100 ids did.
n_rides_to_plot = min(100, len(rides))
for random_ride_id in np.random.choice(rides['id'], n_rides_to_plot, replace=False):
    ride_readings = readings.loc[readings['ride_id'] == random_ride_id]
    # A 2 x N array makes matplotlib draw one two-point line per column, i.e.
    # one segment per reading, without a Python-level loop over rows.
    segment_xs = ride_readings[['start_x', 'end_x']].values.T
    segment_ys = ride_readings[['start_y', 'end_y']].values.T
    plt.plot(segment_xs, segment_ys)
    plt.title('Plotting Ride ' + str(random_ride_id))
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5)
    plt.show()



In [27]:

    
# Histogram of gps_speed restricted to the 0-29 range, so values outside that
# window are left out of the bins.
ax = readings['gps_speed'].plot(kind='hist', bins=100, range=(0, 29))
fig = ax.get_figure()
fig.set_size_inches(18.5, 10.5)
plt.show()
# NOTE(review): the histogram shows gps_speed, but the numeric summaries
# below describe gps_dist — confirm that this is intentional.
# `stats` is imported explicitly in the import cell; `sp.stats` only resolved
# when some other import had already loaded the scipy.stats submodule.
# The single-argument print(...) form behaves the same on Python 2 and 3.
print(stats.describe(readings['gps_dist']))
print(np.percentile(readings['gps_dist'], 5))









    












    



DescribeResult(nobs=35884, minmax=(0.080076318170318475, 29.923974297519049), mean=5.5731306942150241, variance=13.325615947217118, skewness=2.066897320530394, kurtosis=7.844937680198619)
0.418735713698



In [42]:

    
# Low-alpha scatter of gps_speed against std_z, with both axes clipped to a
# fixed window.
ax = readings.plot(kind='scatter', x='gps_speed', y='std_z', alpha=0.08)
fig = ax.get_figure()
ax.set_title('Relationship between Speed and Vibration')
fig.set_size_inches(18.5, 10.5)
ax.set_xlim(0, 30)
ax.set_ylim(0, 5)
plt.show()
# Possible follow-up: overlay a regression fit with seaborn, e.g.
# sb.regplot(x=..., y=..., data=..., scatter_kws={'alpha': 0.3})



In [46]:

    
# Same comparison as the std_z scatter, using abs_sum_z as the vibration
# measure; both axes are clipped to a fixed window.
ax = readings.plot(kind='scatter', x='gps_speed', y='abs_sum_z', alpha=0.08)
fig = ax.get_figure()
ax.set_title('Relationship between Speed and Vibration (Different Measure)')
fig.set_size_inches(18.5, 10.5)
ax.set_xlim(0, 30)
ax.set_ylim(0, 500)
plt.show()
# Possible follow-up: overlay a regression fit with seaborn, e.g.
# sb.regplot(x=..., y=..., data=..., scatter_kws={'alpha': 0.3})



In [52]:

    
# One 40-bin histogram per accelerometer axis of the per-reading std column.
for axis_name in ('x', 'y', 'z'):
    ax = readings['std_' + axis_name].plot(kind='hist', bins=40)
    fig = ax.get_figure()
    fig.set_size_inches(10, 4)
    ax.set_title('Std of {} axis'.format(axis_name))
    plt.show()



In [65]:

    
# Overlay the first 100 accelerometer samples of a few randomly chosen
# readings, one figure per axis, each trace in its own random colour.
sample_size = 15
indices = np.random.choice(n, sample_size)
for axis_name in ('x', 'y', 'z'):
    accel_series = readings['num_accel_' + axis_name]
    for idx in indices:
        trace = accel_series[idx][0:100]
        sb.tsplot(trace, alpha=0.50, color=np.random.random(3))
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5)
    ax = plt.gca()
    ax.set_xlabel('Accelerometer Values')
    ax.set_ylabel('Force (Gravities)')
    ax.set_title('Random sample of {} {} Accelerometer Time Series'.format(sample_size, axis_name))
    plt.show()



In [64]:

    
# Overlay the periodograms of a random sample of readings, one figure per
# accelerometer axis.
SAMPLE_RATE_HZ = 100  # sampling rate stated in the notebook's data description
sample_size = 1000
indices = np.random.choice(n, sample_size)
for axis in ['x', 'y', 'z']:
    accel_series = readings['num_accel_' + axis]
    for i in indices:
        # Passing fs makes the returned frequencies real Hz; the default
        # fs=1.0 yields cycles per sample, which contradicted the axis label.
        f, Pxx_den = signal.periodogram(accel_series[i][0:100], fs=SAMPLE_RATE_HZ)
        plt.plot(f, Pxx_den)
    # Title and labels only need to be set once per figure, not once per line.
    plt.title('Power Spectrum for ' + axis + ' axis')
    plt.xlabel('frequency [Hz]')
    plt.ylabel('Power Spectrum Density')
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5)
    plt.show()

	id	start_lat	start_lon	end_lat	end_lon	angle_x	angle_y	angle_z	ride_id
count	35884.00000	35884.000000	35884.000000	35884.000000	35884.000000	35884.000000	35884.000000	35884.000000	35882.000000
mean	19227.71302	41.074522	-78.145393	41.074524	-78.145396	1.412244	2.203244	2.349292	81.662616
std	11020.35792	0.552719	6.315436	0.552718	6.315440	0.181960	0.399212	0.440968	37.261709
min	588.00000	40.681685	-87.714990	40.681685	-87.714990	0.177325	0.000000	0.000000	2.000000
25%	9577.75000	40.699079	-87.629375	40.699088	-87.629378	1.305654	2.038208	2.199902	52.000000
50%	19090.50000	40.712922	-73.967589	40.712927	-73.967589	1.393604	2.244046	2.397128	86.000000
75%	28682.25000	41.886814	-73.932991	41.886814	-73.932989	1.506927	2.428405	2.576590	111.000000
max	38523.00000	41.973392	-73.905131	41.973393	-73.905104	3.014949	3.141593	3.141593	146.000000

	start_time	end_time	mean_g	abs_sum_x	std_x	abs_sum_y	std_y	abs_sum_z	std_z	std_total	duration	gps_dist	abs_sum_total	gps_speed	total_readings	start_x	start_y	end_x	end_y
count	3.588400e+04	3.588400e+04	35884.000000	35884.000000	35884.000000	35884.000000	35884.000000	35884.000000	35884.000000	35884.000000	35884.000000	35884.000000	35884.000000	35884.000000	35884.000000	35884.000000	35884.000000	35884.000000	35884.000000
mean	1.448133e+09	1.448133e+09	0.335402	24.529352	0.160663	67.191197	0.295970	92.051591	0.502745	0.615226	1.079435	5.573131	125.032443	5.634492	108.715082	1161867.858993	559322.565057	1161867.560246	559322.817045
std	1.773318e+06	1.773318e+06	0.300768	118.938536	0.117447	56.436329	0.271950	123.058644	0.396454	0.482272	3.331302	3.650427	189.679344	16.929376	187.773519	535000.218079	15146.511831	535000.549026	15146.498089
min	1.445264e+09	1.445264e+09	0.003077	0.034119	0.000648	0.128006	0.000941	0.018158	0.000304	0.001610	0.000800	0.080076	1.934746	0.007705	2.000000	351278.381668	545746.517020	351278.381668	545747.044438
25%	1.446951e+09	1.446951e+09	0.168067	19.824879	0.094393	61.231327	0.150120	77.563679	0.257029	0.323717	0.993880	3.748162	106.542545	3.727073	101.000000	358428.643891	548734.167840	358428.554771	548734.790681
50%	1.448231e+09	1.448231e+09	0.252558	23.317154	0.135663	65.020645	0.224335	81.940704	0.393362	0.480757	1.000080	5.537240	110.488620	5.529468	102.000000	1516014.447513	549939.969546	1516012.234198	549940.180898
75%	1.449271e+09	1.449271e+09	0.392600	26.188408	0.190509	71.166946	0.353836	94.700474	0.629087	0.755590	1.006410	6.777850	120.124840	6.762154	103.000000	1518812.913623	579750.792407	1518813.231135	579750.892795
max	1.452303e+09	1.452303e+09	4.988518	22168.323334	1.892977	3306.349960	5.573329	15608.470230	4.214469	6.569573	588.258450	29.923974	27282.312452	2882.408992	27379.000000	1521388.013816	589355.055276	1521389.726798	589355.097554

	id	start_time	end_time	calibration_id	scoreboard_id
count	94.000000	9.400000e+01	9.000000e+01	93.000000	91.000000
mean	80.457447	1.448136e+09	1.448087e+09	56.408602	53.307692
std	43.987697	2.050464e+06	2.051928e+06	51.858762	53.983885
min	2.000000	1.445264e+09	1.445264e+09	1.000000	1.000000
25%	44.250000	1.446105e+09	1.445688e+09	1.000000	1.000000
50%	86.500000	1.448267e+09	1.448231e+09	48.000000	1.000000
75%	118.750000	1.449552e+09	1.449553e+09	101.000000	101.000000
max	146.000000	1.452302e+09	1.452303e+09	136.000000	136.000000