notebook.community

Edit and run



In [1]:

    
import functools
import geopy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyproj
import requests
import scipy as sp
import rtree
import seaborn as sb
from scipy import signal
# import shapely
import shapely.geometry
%pylab inline

import data_munging









    



Populating the interactive namespace from numpy and matplotlib



In [2]:

    
reload(data_munging)









    Out[2]:





<module 'data_munging' from 'data_munging.pyc'>



In [4]:

    
rides, readings = data_munging.read_raw_data()
readings = data_munging.clean_readings(readings)
readings = data_munging.add_proj_to_readings(readings, data_munging.NAD83)









    



---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-4-d6f22def602f> in <module>()
----> 1 rides, readings = data_munging.read_raw_data()
      2 readings = data_munging.clean_readings(readings)
      3 readings = data_munging.add_proj_to_readings(readings, data_munging.NAD83)

/home/zblan/pavement_analysis/src/data_munging.pyc in read_raw_data()
     79 def read_raw_data():
     80     """Returns rides and readings Pandas dfs"""
---> 81     rides = pd.read_csv('../dat/rides.csv')
     82     readings = pd.read_csv('../dat/readings.csv')
     83     readings.sort_values(by=['ride_id', 'id'], inplace=True)

/home/zblan/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in parser_f(filepath_or_buffer, sep, dialect, compression, doublequote, escapechar, quotechar, quoting, skipinitialspace, lineterminator, header, index_col, names, prefix, skiprows, skipfooter, skip_footer, na_values, true_values, false_values, delimiter, converters, dtype, usecols, engine, delim_whitespace, as_recarray, na_filter, compact_ints, use_unsigned, low_memory, buffer_lines, warn_bad_lines, error_bad_lines, keep_default_na, thousands, comment, decimal, parse_dates, keep_date_col, dayfirst, date_parser, memory_map, float_precision, nrows, iterator, chunksize, verbose, encoding, squeeze, mangle_dupe_cols, tupleize_cols, infer_datetime_format, skip_blank_lines)
    489                     skip_blank_lines=skip_blank_lines)
    490 
--> 491         return _read(filepath_or_buffer, kwds)
    492 
    493     parser_f.__name__ = name

/home/zblan/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in _read(filepath_or_buffer, kwds)
    266 
    267     # Create the parser.
--> 268     parser = TextFileReader(filepath_or_buffer, **kwds)
    269 
    270     if (nrows is not None) and (chunksize is not None):

/home/zblan/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in __init__(self, f, engine, **kwds)
    581             self.options['has_index_names'] = kwds['has_index_names']
    582 
--> 583         self._make_engine(self.engine)
    584 
    585     def _get_options_with_defaults(self, engine):

/home/zblan/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in _make_engine(self, engine)
    722     def _make_engine(self, engine='c'):
    723         if engine == 'c':
--> 724             self._engine = CParserWrapper(self.f, **self.options)
    725         else:
    726             if engine == 'python':

/home/zblan/anaconda/lib/python2.7/site-packages/pandas/io/parsers.pyc in __init__(self, src, **kwds)
   1091         kwds['allow_leading_cols'] = self.index_col is not False
   1092 
-> 1093         self._reader = _parser.TextReader(src, **kwds)
   1094 
   1095         # XXX

pandas/parser.pyx in pandas.parser.TextReader.__cinit__ (pandas/parser.c:3229)()

pandas/parser.pyx in pandas.parser.TextReader._setup_parser_source (pandas/parser.c:6064)()

IOError: Initializing from file failed



In [ ]:

    
n, p = readings.shape
print rides.columns
print readings.columns



In [ ]:

    
readings_idx = data_munging.insert_readings_rtree(readings)



In [ ]:

    
some_indices = np.random.choice(readings.index, 400)



In [ ]:

    
for i in readings_idx.intersection((358016, 580867, 358016, 580871)):
    print i



In [ ]:

    
readings['gps_dist'].plot(kind='hist', bins=30)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
print sp.stats.describe(readings['gps_dist'])
print np.percentile(readings['gps_dist'], 5)



In [ ]:

    
moving_readings = readings.loc[readings.gps_dist > 1, :].index



In [ ]:

    
intersecting_segments = dict()
for i, bb in readings['bb'].iteritems():
    reading_of_interest = data_munging.expand_bb(bb, 0.2)
    ins_readings = list(readings_idx.intersection(reading_of_interest))
    intersecting_segments[i] = ins_readings



In [ ]:

    
total_intsx = [len(intersecting_segments[i]) for i in intersecting_segments]



In [ ]:

    
sum(total_intsx)



In [ ]:

    
plt.hist(total_intsx, bins=70)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
print sp.stats.describe(readings['gps_dist'])
print np.percentile(readings['gps_dist'], 5)

A really good idea is to randomly sample points and look at all paths that cross within a small bounding box and compare their "bumpiness" values!



In [ ]:

    
readings.ix[0, :]['start_x']



In [ ]:

    
line_diffs = dict()
total_its = 0
for i in intersecting_segments:
    for j in intersecting_segments[i]:
        total_its += 1
        if total_its % 10000 == 0:
            print total_its
        line_diffs[(i,j)] = data_munging.calc_reading_diffs(readings.ix[i, :], readings.ix[j, :])



In [ ]:



In [ ]:

    
readings['std_total'] = readings['num_accel_total'].apply(np.std)



In [ ]:

    
readings['std_z_adj_speed'] = readings['std_z'] * readings['gps_dist']



In [ ]:

    
readings['sum_abs_z'] = readings['num_accel_z'].apply(lambda x: np.sum(np.abs(x)))



In [ ]:

    
energy0 = list()
energy1 = list()
index = list()
for i, j in line_diffs:
    if line_diffs[(i,j)] < 0.2 and i != j:
        index.append((i,j))
        energy0.append(readings.ix[i, 'std_z_adj_speed'])
        energy1.append(readings.ix[j, 'std_z_adj_speed'])
compare_energy = pd.DataFrame({'energy0': energy0, 'energy1': energy1})
compare_energy.index = index



In [ ]:

    
compare_energy.plot(x='energy0', y='energy1', kind='scatter')
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
print 'std_total'



In [ ]:

    
compare_energy.plot(x='energy0', y='energy1', kind='scatter')
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()



In [ ]:

    
np.corrcoef(compare_energy['energy0'], compare_energy['energy1'])



In [ ]:

    
np.corrcoef(compare_energy['energy0'], compare_energy['energy1'])



In [ ]:

    
np.corrcoef(compare_energy['energy0'], compare_energy['energy1'])



In [ ]:

    
compare_energy.plot(x='energy0', y='energy1', kind='scatter')
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()



In [ ]:

    
plt.hist(line_diffs, bins=70)
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
print sp.stats.describe(readings['gps_dist'])
print np.percentile(readings['gps_dist'], 5)



In [ ]:

    
readings.ix[0, :][['start_x', 'start_y']]



In [ ]:

    
np.linalg.norm(readings.ix[0, ['start_x', 'start_y']] - readings.ix[1, ['start_x', 'start_y']])



In [ ]:

    
print readings_idx.get_bounds(0)
print data_munging.expand_bb(readings_idx.get_bounds(0), 0.2)



In [ ]:

    
shapely.geometry.LineString([(0, 0), (1, 1)])



In [ ]:

    
#we will take the intersection of these two mitred segments and then set a threshold for overlap
# similar to Jaccard similarity
#we can also take the angle between these two segments and use that to determine if 
#we should consider them the same (different angles)



In [ ]:

    
help(shapely.geometry.LineString)
shapely.geometry.LineString()



In [ ]:

    
for i in readings_idx.intersection(readings_idx.get_bounds(0)):
    print 
# print len(list(readings_idx.intersection(readings_idx.get_bounds(0))))



In [ ]:

    
help(readings_idx.intersection)



In [ ]:

    
# Locations plotted on map with coloring based on std
chi_readings.plot(x='start_lon', y='start_lat', kind='scatter')
fig = plt.gcf()
fig.set_size_inches(30, 30)
plt.show()



In [ ]:

    
# Locations plotted on map with coloring based on std
chi_readings.plot(x='start_x', y='start_y', kind='scatter')
fig = plt.gcf()
fig.set_size_inches(30, 30)
plt.show()



In [ ]:

    
kimball_readings = data_munging.filter_readings_by_bb(readings, data_munging.kimball_bounding_box)



In [ ]:

    
kimball_readings.plot(x='start_x', y='start_y', kind='scatter')
fig = plt.gcf()
fig.set_size_inches(30, 30)
plt.show()



In [ ]:

    
chi_readings = data_munging.filter_readings_to_chicago(readings)



In [ ]:

    
nyc_readings = data_munging.filter_readings_to_nyc(readings)



In [ ]:

    
print chi_readings.shape
print nyc_readings.shape
print readings.shape



In [ ]:

    
ax = sb.regplot(x="gps_speed", y="std_z", data=readings, scatter_kws={'alpha':0.3})
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
plt.show()
# ax = sb.regplot(x="total_bill", y="tip", data=tips, scatter_kws={'alpha':0.3})



In [ ]:

    
ax = sb.regplot(x="gps_speed", y="std_z", data=readings, scatter_kws={'alpha':0.05})
fig = plt.gcf()
fig.set_size_inches(18.5, 10.5)
pylab.xlim([0, 40])
plt.show()
# ax = sb.regplot(x="total_bill", y="tip", data=tips, scatter_kws={'alpha':0.3})



In [ ]:

    
readings['gps_speed'].plot(kind='hist')
plt.show()



In [ ]:

    
for axis in ['x', 'y', 'z']:
    readings['std_' + axis].plot(kind='hist')
    fig = plt.gcf()
    fig.set_size_inches(10, 4)
    plt.show()



In [ ]:

    
sample_size = 100
indices = np.random.choice(n, sample_size)
for axis in ['x', 'y', 'z']:
    for i in indices:
        sb.tsplot(readings['num_accel_' + axis][i][0:100], alpha=0.05)
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5)
    plt.xlabel('Accelerometer Values')
    plt.ylabel('Force (Gravities)')
    plt.title('Random sample of ' + str(sample_size) + ' ' + axis + ' Accelerometer Time Series')
    plt.show()



In [ ]:

    
sample_size = 1000
indices = np.random.choice(n, sample_size)
for axis in ['x', 'y', 'z']:
    for i in indices:
        f, Pxx_den = signal.periodogram(readings['num_accel_' + axis][i][0:100])
        plt.plot(f, Pxx_den)
        plt.title('Power Spectrum for ' + axis + ' axis')
        plt.xlabel('frequency [Hz]')
        plt.ylabel('Power Spectrum Density')
    fig = plt.gcf()
    fig.set_size_inches(18.5, 10.5)
    plt.show()