notebook.community

Edit and run



In [20]:

    
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sqlite3
import time
import sys
import os
sys.path.append('..')
from datetime import datetime
from util.Stopwatch import Stopwatch
import socket
%matplotlib inline

DB_PATH = '../local/db/telemetry.db'
assert(os.path.exists(DB_PATH))
IMG_OUT = 'plot_repair.png'
HOST = 'chuck'
LOCAL_HOSTNAME = socket.gethostname()

plt.style.use('ggplot')



In [21]:

    
def get_sensors(db_path, host):
    with sqlite3.connect(db_path) as con:
        cur = con.cursor()
        cur.execute("SELECT * FROM sensors WHERE host='{}';".format(host))
        sensors = {s[2]: s[0] for s in cur.fetchall()}
    return sensors

sensors = get_sensors(DB_PATH, HOST)
sensors_rev = {v:k for k, v in sensors.iteritems()}



In [22]:

    
def read_df(db_path, host=None, sw=None, from_id=0, deduplicate=False, table='telemetry', immediate=False, **kwargs):
    # TODO: incremental reads
    # TODO: Select only current host
    if sw is None:
        sw = Stopwatch('Dataframe loading')

    with sqlite3.connect(db_path) as con:
        if not immediate:
            df = pd.read_sql_query('SELECT * FROM {table} where id>{id};'.format(table=table, id=from_id), con)
        else:
            df = pd.read_sql_query('SELECT * FROM {table} where id>{id};'.format(table=table, id=from_id), con,
                                   index_col='timestamp', parse_dates={'timestamp': {'unit':'s'}})
    sw.event('SQL raw data into df' + '' if not immediate else 'with timestamp to dates')

    # timestamps as datetime array
    if not immediate:
        df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
        sw.event('convert timestamps to datetime')
    
    # type as category (sensor type)
    df['type'] = df['type'].astype('category')
    newcats = [sensors_rev[c] for c in df['type'].cat.categories]
    df['type'].cat.categories = newcats
    sw.event('type as category')
  
    #return df
    if deduplicate:
        df.drop_duplicates(subset=['timestamp', 'type'], inplace=True)
        sw.event('De-duplication')

    pivoted, _ = prepare_df(df, sw, **kwargs)
    
    return pivoted, sw



In [23]:

    
def prepare_df(df, sw=None, resample=False):
    if sw is None:
        sw = Stopwatch('Pivot and adjust data frame')
    
    pivoted = df.pivot(index = 'timestamp', columns='type', values='value') \
                .tz_localize('UTC').tz_convert('Europe/Amsterdam')
    sw.event('pivot table')
    
    if resample:
        pivoted = pivoted.resample(resample, how='mean')
        sw.event('resampling')

    # adjust time zone
    pivoted = pivoted
    sw.event('adjust timezone')

    # Get rid of false temp values
    pivoted.temp[(pivoted['light'] > 250) & (pivoted.index < '2016-03-10 07:03:57.603722+01:00')] = np.NaN
    sw.event('Removing false temp values')
    
    return pivoted, sw



In [24]:

    
def make_plot(db_path, host, sw=None, **kwargs):
    if sw is None:
        sw = Stopwatch('Make Plot')
    
    df, sw = read_df(db_path, host, sw=sw, **kwargs)
    plot(df, sw=sw)
    
    return sw



In [30]:

    
def plot(df, sw=None, width=None):
    if sw is None:
        sw = Stopwatch('Plotting')

    # Throwing the data into a plot
    axes = df.plot(secondary_y=['temp', 'ds_temp'], mark_right=False, style=['r', 'g', 'b', 'c'])
    sw.event('plot pivoted dataframe')

    fig = plt.gcf()
    
    # limits
    axes.set_ylim((0, 1024))
    axes.right_ax.set_ylim((10, 40))
    
    # Grid lines and ticks
    rticks = axes.right_ax.get_yticks()
    rmin, rmax = min(rticks), max(rticks)
    lmin, lmax = axes.get_ylim()
    lrange = lmax - lmin
    rrange = rmax - rmin
    factor = lrange/rrange
    axes.set_yticks([(t-rmin)*factor for t in rticks])
    axes.set_yticklabels([])
    
    axes.right_ax.grid([])
    axes.grid()
    
    # Fiddling with axis labeling
    axes.set_ylabel(u'Raw ADC values')
    axes.right_ax.set_ylabel(u'Temperature °C')

    axes.set_xlabel('')    
    fig.autofmt_xdate(bottom=0.2, rotation=0, ha='center')

    # Messing with the legend
    axes.legend(loc='upper left', shadow=True, fontsize='x-large')
    axes.right_ax.legend(loc='upper right', shadow=True, fontsize='x-large')

    # Annotation at bottom
    elapsed = sw.elapsed()
    axes.annotate('in {elapsed:.1f} s on {hostname}, {timestamp}'.format(elapsed=elapsed,
                                                                            hostname=LOCAL_HOSTNAME,
                                                                            timestamp=datetime.now().strftime('%Y-%m-%d %H:%M'),),
                  xy=(1, 0), xycoords='axes fraction', fontsize=10, xytext=(0, -55),
                  textcoords='offset points', ha='right', va='top')
    sw.event('annotate axes')

    # resampled to 6 minutes, 240 points per day
    # 1 inch = 2.54 cm, * 0.944881889764 = 2.4cm = .1mm/data point * number of days plotted
    if width is None:
        delta = (df.index[-1] - df.index[0]) / pd.Timedelta('1 Day')
        width = delta*0.944881889764

    fig.set_size_inches(width, 5.5)
    fig.savefig('plot.png', dpi=150, transparent=False, bbox_inches='tight', pad_inches=0)
    sw.event('save plot file')
    plt.close();
    return sw

timings = make_plot(DB_PATH, HOST, resample='6min')
timings.report()









    



Timings for Make Plot
2.64s (2.64s): SQL raw data into df
4.54s (1.90s): convert timestamps to datetime
4.56s (0.02s): type as category
5.28s (0.72s): pivot table
5.38s (0.10s): resampling
5.38s (0.00s): adjust timezone
5.38s (0.00s): Removing false temp values
5.63s (0.25s): plot pivoted dataframe
5.64s (0.01s): annotate axes
6.87s (1.23s): save plot file



In [29]:

    
!rsync -avh --progress VersedSquid:~/code/telemetry/db/telemetry.db ../local/db/
#timings = make_plot(DB_PATH, HOST, resample='5min', table='telemetry', from_id=int(1.2e6))
#timings.report()
#!scp plot.png lychnobite.me:~/projects/static/









    



receiving incremental file list
telemetry.db
         69.50M 100%   13.53MB/s    0:00:04 (xfr#1, to-chk=0/1)

sent 56.17K bytes  received 5.38M bytes  473.09K bytes/sec
total size is 69.50M  speedup is 12.78



In [26]:

    
p, s, d = read_df(DB_PATH, HOST)









    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-26-f92d302058a4> in <module>()
----> 1 p, s, d = read_df(DB_PATH, HOST)

ValueError: need more than 2 values to unpack



In [27]:

    
with sqlite3.connect(DB_PATH) as con:
    df = pd.read_sql_query('SELECT * FROM {table} where id>{id};'.format(table='telemetry', id=0), con,
                           index_col='timestamp', parse_dates={'timestamp': {'unit':'s'}})
print df.head()
#df.pivot(index='timestamp', columns='type', values='value')
%timeit df.value[df.type==1]









    



                            Id  type   value
timestamp                                   
2016-02-12 17:50:43.231437   1     3     NaN
2016-02-12 17:50:43.231437   2     2  116.00
2016-02-12 17:50:43.231437   3     1   25.38
2016-02-12 17:50:48.292462   4     3     NaN
2016-02-12 17:50:48.292462   5     2  114.62
1 loops, best of 3: 986 ms per loop



In [135]:

    
#%timeit make_plot(DB_PATH, HOST, resample=False)
#%timeit make_plot(DB_PATH, HOST, resample='30Min')
%timeit make_plot(DB_PATH, HOST, resample='6min')
%timeit make_plot(DB_PATH, HOST, resample='6min', immediate=True)









    



1 loops, best of 3: 3.42 s per loop






    



---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-135-eb4e309aaacf> in <module>()
      2 #%timeit make_plot(DB_PATH, HOST, resample='30Min')
      3 get_ipython().magic(u"timeit make_plot(DB_PATH, HOST, resample='6min')")
----> 4 get_ipython().magic(u"timeit make_plot(DB_PATH, HOST, resample='6min', immediate=True)")

/home/reichler/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in magic(self, arg_s)
   2305         magic_name, _, magic_arg_s = arg_s.partition(' ')
   2306         magic_name = magic_name.lstrip(prefilter.ESC_MAGIC)
-> 2307         return self.run_line_magic(magic_name, magic_arg_s)
   2308 
   2309     #-------------------------------------------------------------------------

/home/reichler/anaconda/lib/python2.7/site-packages/IPython/core/interactiveshell.pyc in run_line_magic(self, magic_name, line)
   2226                 kwargs['local_ns'] = sys._getframe(stack_depth).f_locals
   2227             with self.builtin_trap:
-> 2228                 result = fn(*args,**kwargs)
   2229             return result
   2230 

/home/reichler/anaconda/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in timeit(self, line, cell)

/home/reichler/anaconda/lib/python2.7/site-packages/IPython/core/magic.pyc in <lambda>(f, *a, **k)
    191     # but it's overkill for just that one bit of state.
    192     def magic_deco(arg):
--> 193         call = lambda f, *a, **k: f(*a, **k)
    194 
    195         if callable(arg):

/home/reichler/anaconda/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in timeit(self, line, cell)
   1034             number = 1
   1035             for _ in range(1, 10):
-> 1036                 time_number = timer.timeit(number)
   1037                 worst_tuning = max(worst_tuning, time_number / number)
   1038                 if time_number >= 0.2:

/home/reichler/anaconda/lib/python2.7/site-packages/IPython/core/magics/execution.pyc in timeit(self, number)
    130         gc.disable()
    131         try:
--> 132             timing = self.inner(it, self.timer)
    133         finally:
    134             if gcold:

<magic-timeit> in inner(_it, _timer)

<ipython-input-133-88c51396d626> in make_plot(db_path, host, sw, **kwargs)
      3         sw = Stopwatch('Make Plot')
      4 
----> 5     df, sw = read_df(db_path, host, sw=sw, **kwargs)
      6     plot(df, sw=sw)
      7 

<ipython-input-131-430965ae9f3a> in read_df(db_path, host, sw, from_id, deduplicate, table, immediate, **kwargs)
     29         sw.event('De-duplication')
     30 
---> 31     pivoted, _ = prepare_df(df, sw, **kwargs)
     32 
     33     return pivoted, sw

<ipython-input-132-87d0c1220ee2> in prepare_df(df, sw, resample)
      3         sw = Stopwatch('Pivot and adjust data frame')
      4 
----> 5     pivoted = df.pivot(index = 'timestamp', columns='type', values='value')                 .tz_localize('UTC').tz_convert('Europe/Amsterdam')
      6     sw.event('pivot table')
      7 

/home/reichler/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in pivot(self, index, columns, values)
   3507         """
   3508         from pandas.core.reshape import pivot
-> 3509         return pivot(self, index=index, columns=columns, values=values)
   3510 
   3511     def stack(self, level=-1, dropna=True):

/home/reichler/anaconda/lib/python2.7/site-packages/pandas/core/reshape.pyc in pivot(self, index, columns, values)
    324     else:
    325         indexed = Series(self[values].values,
--> 326                          index=MultiIndex.from_arrays([self[index],
    327                                                        self[columns]]))
    328         return indexed.unstack(columns)

/home/reichler/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
   1795             return self._getitem_multilevel(key)
   1796         else:
-> 1797             return self._getitem_column(key)
   1798 
   1799     def _getitem_column(self, key):

/home/reichler/anaconda/lib/python2.7/site-packages/pandas/core/frame.pyc in _getitem_column(self, key)
   1802         # get column
   1803         if self.columns.is_unique:
-> 1804             return self._get_item_cache(key)
   1805 
   1806         # duplicate columns & possible reduce dimensionaility

/home/reichler/anaconda/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
   1082         res = cache.get(item)
   1083         if res is None:
-> 1084             values = self._data.get(item)
   1085             res = self._box_item_values(item, values)
   1086             cache[item] = res

/home/reichler/anaconda/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item, fastpath)
   2849 
   2850             if not isnull(item):
-> 2851                 loc = self.items.get_loc(item)
   2852             else:
   2853                 indexer = np.arange(len(self.items))[isnull(self.items)]

/home/reichler/anaconda/lib/python2.7/site-packages/pandas/core/index.pyc in get_loc(self, key, method)
   1570         """
   1571         if method is None:
-> 1572             return self._engine.get_loc(_values_from_object(key))
   1573 
   1574         indexer = self.get_indexer([key], method=method)

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3824)()

pandas/index.pyx in pandas.index.IndexEngine.get_loc (pandas/index.c:3704)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12280)()

pandas/hashtable.pyx in pandas.hashtable.PyObjectHashTable.get_item (pandas/hashtable.c:12231)()

KeyError: 'timestamp'



In [31]:

    
!rsync -avh VersedSquid:~/code/telemetry/local/db/ ../local/db/
df = pd.read_csv('../local/db/chuck_2016-03-28.csv', names=['timestamp', 'type', 'value'])
df.timestamp = df.timestamp.astype('datetime64[s]')
df.set_index('timestamp', inplace=True)
df = df.drop(['timestamp'])
df = df.tz_localize('UTC').tz_convert('Europe/Amsterdam')
df.type.loc[df.type=='ds_temp'] = 1
df.type.loc[df.type=='light'] = 2
df.type.loc[df.type=='soil'] = 3;
df.type.astype('uint8');

fig = plt.figure()
axes = fig.add_subplot(111)

# night background
for span in spans(df):
    plt.axvspan(span[0], span[1], facecolor='0.2', alpha=0.3)

groups = df.groupby('type')
groups.get_group(2).value.resample('6min').plot(ax=axes, style='k', label=r'light')
groups.get_group(3).value.plot(ax=axes, style='b', label=r'$\theta_{soil}$')

axes.set_ylim((0, 1023))
# handles, labels = axes.get_legend_handles_labels()
# handles.append(ep_artist)
# axes.legend(handles=handles, loc=2)
axes.legend(loc=2)
axes.set_xlabel('')
axes.set_yticks([])
axes.set_ylabel(u'raw ADC values')

# temp
axes_r = axes.twinx()
groups.get_group(1).value.plot(ax=axes_r, style='r', label=r'$T_{ambient}$')

axes_r.set_ylim((15, 35))
axes_r.set_ylabel(u'Temperature (°C)')
axes_r.legend()


delta = (df.index[-1] - df.index[0]) / pd.Timedelta('1 Day')
width = delta*0.944881889764*8

fig.set_size_inches(width, 5)









    



receiving incremental file list
./
2016_w13-chuck.csv
chuck_2016-03-28.csv

sent 503 bytes  received 17.91K bytes  12.28K bytes/sec
total size is 67.80K  speedup is 3.68






    



/home/reichler/anaconda/lib/python2.7/site-packages/pandas/core/indexing.py:115: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)






    



---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-31-bead4cd609e7> in <module>()
     14 
     15 # night background
---> 16 for span in spans(df):
     17     plt.axvspan(span[0], span[1], facecolor='0.2', alpha=0.3)
     18 

NameError: name 'spans' is not defined



In [ ]: