Database


In [1]:
import time
import os

import sqlite3

import pandas as pd
import numpy as np
import scipy as sp

Unprocessed accelerometer data

Extract data from the accelerometer app CSV files and trim each recording to the event window defined by the start and end times.


In [2]:
def disco_parser(inputdir,outputdir,startstring,endstring):
    ''' Extracts data from accelerometer app CSV files    
    inputdir = Input data directory
    outputdir = Output data directory (timeseries)
    startstring = Event start time (HH:MM:SS:MS)
    endstring = Event end time (HH:MM:SS:MS) 
    Returns: timeseries = Subjects' data (id,t,x,y,z) ''' 

    function = 'disco_parser'
    print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),'Running',function)
    
    # Event start and end times in milliseconds
    startnumber = [int(num) for num in startstring.split(':')]
    endnumber = [int(num) for num in endstring.split(':')]
    starttime = startnumber[0]*60*60*1000 + startnumber[1]*60*1000 + startnumber[2]*1000 + startnumber[3]
    endtime = endnumber[0]*60*60*1000 + endnumber[1]*60*1000 + endnumber[2]*1000 + endnumber[3]
    
    timeseries = pd.DataFrame({})
    # samplerateMS,samplerateHZ = {},{}
    for file in os.listdir(os.path.join('.',inputdir)):
        
        print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),file)
            
        temp = pd.read_csv(os.path.join('.',inputdir,file),
                           skiprows=[0],names=['h','m','s','ms','x','y','z'],
                           sep=':|;',engine='python')

        # Sample times in milliseconds from 00:00:00:000
        millisecond = (np.array(temp.h)*60*60*1000 + np.array(temp.m)*60*1000 +
                       np.array(temp.s)*1000 + np.array(temp.ms))

        try: 
            startindex = np.where(millisecond <= starttime)[0][-1] # Last timepoint at or before the event start time
            endindex = np.where(millisecond >= endtime)[0][0] # First timepoint at or after the event end time
        except IndexError: # Recording does not span the event start and end times
            continue
        
        trim = pd.DataFrame({'t':millisecond[startindex:endindex+1],
                             'x':temp.x[startindex:endindex+1],
                             'y':temp.y[startindex:endindex+1],
                             'z':temp.z[startindex:endindex+1],
                             'id':[file[:-4]]*(endindex-startindex+1)})
        
        timeseries = pd.concat([timeseries,trim])
        
        # Time between each sampling time point (ms)
        # difference = np.diff(millisecond[1:-1]) # Exclude first(startindex) and last(endindex) timepoints        
        # samplerateMS[file[:-4]] = np.mean(difference) # Average sampling rate for each subject (ms)
        # samplerateHZ[file[:-4]] = np.divide(1000,samplerateMS[file[:-4]]) # Average sampling rate for each subject (Hz)
        
    print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),'Saving',function+'.pkl')
    timeseries.to_pickle(os.path.join('.',outputdir,function+'.pkl')) # To re-load: timeseries = pd.read_pickle(file_name)
        
    print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),'Done') 
    return timeseries

In [3]:
timeseries = disco_parser('testdata','','15:25:44:580','16:02:05:830')


11/07/2016 13:19:41 Running disco_parser
11/07/2016 13:19:41 1a.csv
11/07/2016 13:19:44 1b.csv
11/07/2016 13:19:46 2a.csv
11/07/2016 13:19:48 2b.csv
11/07/2016 13:19:50 3a.csv
11/07/2016 13:19:52 3b.csv
11/07/2016 13:19:54 4a.csv
11/07/2016 13:19:56 4b.csv
11/07/2016 13:19:58 5b.csv
11/07/2016 13:20:00 6a.csv
11/07/2016 13:20:03 6b.csv
11/07/2016 13:20:05 Saving disco_parser.pkl
11/07/2016 13:20:07 Done

In [4]:
timeseries.head()


Out[4]:
       id         t      x      y      z
15837  1a  55544529 -0.833  0.151  0.054
15838  1a  55544606 -0.848  0.218  0.078
15839  1a  55544610 -0.844  0.300  0.074
15840  1a  55544612 -0.833  0.313  0.093
15841  1a  55544617 -0.835  0.261  0.132
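
The average sampling rate calculation is left commented out inside disco_parser above; it can also be estimated from the returned timeseries. A minimal sketch, assuming the timeseries DataFrame from In [3] is still in memory (the groupby approach here is an illustration, not part of the original pipeline):

# Approximate per-subject sampling rate from the trimmed time series
# (mirrors the commented-out samplerateMS/samplerateHZ code in disco_parser)
samplerateMS = timeseries.groupby('id')['t'].apply(lambda t: np.mean(np.diff(np.sort(t.values)))) # Mean gap between samples (ms)
samplerateHZ = 1000/samplerateMS # Mean sampling rate (Hz)
print(samplerateHZ)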

Create an SQL database for the unprocessed accelerometer data.


In [5]:
def get_db(dbfile):
    '''Connect to the SQLite database file.'''
    sqlite_db = sqlite3.connect(os.path.join('.',dbfile))
    return sqlite_db

def init_db(dbfile,schema):
    '''Create the database tables defined in the schema.'''
    db = get_db(dbfile)
    db.cursor().executescript(schema)
    db.commit()
    return db

In [6]:
disco_schema = """
DROP TABLE IF EXISTS "unprocessed";
CREATE TABLE "unprocessed" (
    "id" VARCHAR,
    "t" FLOAT,
    "x" FLOAT,
    "y" FLOAT,
    "z" FLOAT
);
"""

db = init_db("disco_parser.db",disco_schema) # Initialize the database
timeseries.to_sql("unprocessed",db,if_exists="replace",index=False) # Populate the database
# Note: if_exists="replace" drops and re-creates the table, so pandas (not the schema above)
# determines the stored column types

db.commit() # Commit changes to the database
db.close() # Close the connection to the database
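
To confirm that the table was populated, the data can be read back with a query. A minimal sketch, re-opening the disco_parser.db file created above (the per-subject count query is just an illustration):

db = get_db("disco_parser.db") # Re-open the database created above
check = pd.read_sql_query("SELECT id, COUNT(*) AS samples FROM unprocessed GROUP BY id", db)
print(check)
db.close()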

Synchrony condition times

Convert condition times to milliseconds from 00:00:00:000.
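
For example, the first condition time 15:25:44:580 becomes 15*60*60*1000 + 25*60*1000 + 44*1000 + 580 = 55,544,580 ms, which is the msec value in the first row of the output below.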


In [7]:
def disco_millisecond(outputdir,fileID,events):
    ''' Converts time to milliseconds from 00:00:00:000
    outputdir = Output data directory (timeMSEC,timeNAME)
    fileID = File identifier ('conditions','wholesongs','songparts')
    events = Event time and name strings ('HH:MM:SS:MS','NAME')
    Returns: eventtimes = Events' data (name,msec) '''
    
    function = 'disco_millisecond'
    print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),'Running',function,fileID)
    
    timeMSEC,timeNAME = [],[]
    for eventstring,eventname in events: # Each event is a ('HH:MM:SS:MS','NAME') tuple
        
        print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),eventname)
        
        timeNUM = [int(num) for num in eventstring.split(':')]
        timeMSEC.append(timeNUM[0]*60*60*1000 + timeNUM[1]*60*1000 + timeNUM[2]*1000 + timeNUM[3])
        timeNAME.append(eventname)
    
    eventtimes = pd.DataFrame({'name':timeNAME,'msec':timeMSEC})
    
    print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),'Saving',function+'_'+fileID+'.pkl')
    eventtimes.to_pickle(os.path.join('.',outputdir,function+'_'+fileID+'.pkl'))
    
    print(time.strftime("%m/%d/%Y"),time.strftime("%H:%M:%S"),'Done') 
    return eventtimes

In [8]:
eventtimes = disco_millisecond('','conditions',
   [('15:25:44:580','Non-Synchrony 1'),
    ('15:29:53:240','Synchrony 1 (Choreographed)'),
    ('15:33:37:380','Non-Synchrony 2'),
    ('15:36:51:310','Synchrony 2 (Anti-Phase)'),
    ('15:40:56:120','Non-Synchrony 3'),
    ('15:44:34:950','Synchrony 3 (Magnitude)'),
    ('15:47:48:610','Non-Synchrony 4'),
    ('15:50:59:640','Synchrony 4 (Delay)'),
    ('15:54:25:740','Non-Synchrony 5'),
    ('15:58:17:370','Synchrony 5 (Non-Choreographed)'),
    ('16:02:05:830','End Time')])


11/07/2016 13:20:21 Running disco_millisecond conditions
11/07/2016 13:20:21 Non-Synchrony 1
11/07/2016 13:20:21 Synchrony 1 (Choreographed)
11/07/2016 13:20:21 Non-Synchrony 2
11/07/2016 13:20:21 Synchrony 2 (Anti-Phase)
11/07/2016 13:20:21 Non-Synchrony 3
11/07/2016 13:20:21 Synchrony 3 (Magnitude)
11/07/2016 13:20:21 Non-Synchrony 4
11/07/2016 13:20:21 Synchrony 4 (Delay)
11/07/2016 13:20:21 Non-Synchrony 5
11/07/2016 13:20:21 Synchrony 5 (Non-Choreographed)
11/07/2016 13:20:21 End Time
11/07/2016 13:20:21 Saving disco_millisecond_conditions.pkl
11/07/2016 13:20:21 Done

In [9]:
eventtimes


Out[9]:
        msec                             name
0   55544580                  Non-Synchrony 1
1   55793240      Synchrony 1 (Choreographed)
2   56017380                  Non-Synchrony 2
3   56211310         Synchrony 2 (Anti-Phase)
4   56456120                  Non-Synchrony 3
5   56674950          Synchrony 3 (Magnitude)
6   56868610                  Non-Synchrony 4
7   57059640              Synchrony 4 (Delay)
8   57265740                  Non-Synchrony 5
9   57497370  Synchrony 5 (Non-Choreographed)
10  57725830                         End Time

Create an SQL database for the condition times.


In [10]:
msec_schema = """
DROP TABLE IF EXISTS "conditions";
CREATE TABLE "conditions" (
    "name" VARCHAR,
    "msec" FLOAT
);
"""

db = init_db("disco_millisecond_conditions.db",msec_schema)
eventtimes.to_sql("conditions",db,if_exists="replace",index=False)

db.commit()
db.close()
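
With both databases in place, each accelerometer sample can be labelled with the condition it falls in. A minimal sketch, assuming the two .db files created above; the interval lookup with pandas.cut is an illustration, not part of the original pipeline:

# Label each accelerometer sample with the condition whose time window contains it
db1 = get_db("disco_parser.db")
db2 = get_db("disco_millisecond_conditions.db")
unprocessed = pd.read_sql_query("SELECT * FROM unprocessed", db1)
conditions = pd.read_sql_query("SELECT * FROM conditions ORDER BY msec", db2)
db1.close()
db2.close()

# Each condition runs from its start time up to the next event's start time;
# 'End Time' only closes the final window, so it supplies a boundary but no label
unprocessed['condition'] = pd.cut(unprocessed['t'], bins=conditions['msec'],
                                  labels=list(conditions['name'][:-1]), right=False)
print(unprocessed.groupby('condition').size())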