test_data_clean

  • author: Ieuan Clay
  • started: March 2015
  • last update: April 2015

test_data_clean is intended to provide some cleaned and organised data for development of QC methods

Functionality provided:

  • Data import
  • Parsing and reorganisation
  • Saving in a convenient format for re-use

References

Set up session


In [2]:
# analytics
import numpy as np
import pandas as pd
import scipy.io as scio
import scipy.signal as sp
import os

# utils
from os.path import isfile, join

In [4]:
# flags controlling notebook behaviour
TEST = True     # read from the bundled test_data/ directory instead of the cwd
VERBOSE = True  # print extra diagnostics (frame shapes, head/info dumps)

In [5]:
# test data path
# todo: flexible input
datapath = os.path.abspath("test_data/") if TEST else os.getcwd()
print(datapath)


C:\Users\Ieuan and Katharina\ieuan_work\delft\dataQC\test_data

Import data

Data Overview

Test data

  • Test 1 and Test 2 are events with good data.
  • Error 1 has the unexpected peaks every 1Hz in the spectrum.
  • Error 2 has a bug caused by 32 to 16 bit conversion.

In [6]:
def process_mat_file(f):
    """Load a MATLAB .mat file *f* and return its contents as one DataFrame.

    Two layouts are handled, chosen per stored variable by its shape:
      * a 1-row object array of nested structs: each struct is flattened into
        accelerometer columns ('x', 'y', 'z') plus a per-sample 'time' column
        and the originating file name;
      * anything else: the variable is wrapped directly in a DataFrame.

    All parsed variables are concatenated row-wise and returned.
    """
    mat = scio.loadmat(f)
    # convert to dataframe (see: http://poquitopicante.blogspot.nl/2014/05/loading-matlab-mat-file-into-pandas.html)
    # data key name is not consistent, so drop loadmat's '__header__'-style
    # metadata keys and keep everything else
    mat = {k:v for k, v in mat.items() if k[0] != '_'}
    tmp = list()
    for k,v in mat.items() : 
        # depending on shape of data, read it in
        if np.shape(v)[0] == 1 :
            # 1-row object array: iterate the nested structs it contains
            for i in v[0] :
                for j in i :
                    df = pd.DataFrame(j)
                    # struct field names become columns; normalise their case
                    df.columns = [c[0].lower() for c in df.columns]
                    df = df[ sorted(df.columns.values.tolist()) ] # make sure columns are sorted
                    # extract 'file' (in variable depth list of lists)
                    # NOTE(review): after sorting, positions 0/1/2 are assumed
                    # to hold accel data / file name / time — TODO confirm
                    # against the actual .mat field names
                    tmp_file = df.iloc[0,1]
                    while len(np.shape(tmp_file)) > 1 : # i.e. not 1D
                        tmp_file = tmp_file[0] # iteratively flatten
                    tmp_file = tmp_file.flat[0]
                    if type(tmp_file) is np.ndarray :
                        # still wrapped in an array: take the scalar as a string
                        tmp_file = str(tmp_file.flat[0])
                    # combine accel samples, their timestamps and the file name
                    # into one frame per struct
                    tmp.extend(
                                [
                                pd.concat(
                                    [
                                        pd.DataFrame(df.iloc[:,0].tolist()[0], columns=['x', 'y', 'z']), # accel data
                                        pd.DataFrame({'time' : [e[0] for e in df.iloc[:,2][0]], 'file' : tmp_file})
                                     ], axis=1, ignore_index=True
                                    )
                                ]
                            )

        else :
            # plain 2D numeric matrix: wrap as-is
            tmp.extend([pd.DataFrame(v)])

    # concat dataframe and return
    tmp = pd.concat(tmp)
    return(tmp)

In [20]:
def process_file(f):
    """Read a single accelerometer data file into a DataFrame.

    Dispatches on the file extension (.mat via process_mat_file, .csv,
    .tsv, .txt assumed semicolon-separated). Output files produced by an
    earlier run of this notebook are skipped.

    Returns the parsed DataFrame with an added 'src' column naming the
    source file, or None if the file was skipped or could not be parsed.
    """
    print("Reading in file {}".format(f))

    base = os.path.basename(f)
    # skip output files this notebook itself generates
    if base in ('test_data_full.tsv', 'test_data_small.tsv'):
        print("\tNot processing")
        return None

    # parse file depending on the file extension (computed once)
    ext = os.path.splitext(f)[1]
    if ext == ".mat":
        tmp = process_mat_file(f)
    elif ext == ".csv":
        tmp = pd.read_csv(f)
    elif ext == ".tsv":
        tmp = pd.read_csv(f, sep="\t")
    elif ext == ".txt":
        ## assume semi-colons
        tmp = pd.read_csv(f, sep=";")
    else:
        print("\tFile cannot be read.")
        tmp = None

    # collect data if we got some
    if tmp is None:
        print("\tNothing to return")
        return None

    # tag every row with its source file for later (re-)indexing
    tmp['src'] = base
    if VERBOSE: print(np.shape(tmp))
    return tmp

In [21]:
# collect all regular files from the data directory
files = [path
         for path in (join(datapath, name) for name in os.listdir(datapath))
         if isfile(path)]
# read each one in, dropping any that could not be parsed
accel = [frame for frame in map(process_file, files) if frame is not None]


Reading in file C:\Users\Ieuan and Katharina\ieuan_work\delft\dataQC\test_data\desktop.ini
	File cannot be read.
	Nothing to return
Reading in file C:\Users\Ieuan and Katharina\ieuan_work\delft\dataQC\test_data\error_1.mat
(4088167, 6)
Reading in file C:\Users\Ieuan and Katharina\ieuan_work\delft\dataQC\test_data\error_2.mat
(11540660, 6)
Reading in file C:\Users\Ieuan and Katharina\ieuan_work\delft\dataQC\test_data\test_1.mat
(14015, 5)
Reading in file C:\Users\Ieuan and Katharina\ieuan_work\delft\dataQC\test_data\test_2.txt
(199520, 5)
Reading in file C:\Users\Ieuan and Katharina\ieuan_work\delft\dataQC\test_data\test_data_full.tsv
	Not processing

In [22]:
## re-parse data into pandas structure and clean up:
## harmonise each source's column names/order, then index on (src, file)
if TEST :
    if VERBOSE :
        print(">>> BEFORE")
        for a in accel:
            print("TYPE")
            print(type(a))
            print("HEAD")
            print(a.head(2))
            print("INFO")
            print(a.info())

    ### dirty hack to get formats the same,
    ### would be better if formats actually matched or were distinguishable
    ### based on file extension or header, etc

    for (i, k) in enumerate(accel) :
        src = k.iloc[0]["src"]  # every row of a frame shares one src value
        if src in ("error_1.mat", "error_2.mat") :
            # both .mat error sets share the same layout (merged duplicate branches)
            k.columns = ["x", "y", "z", "file", "t", "src"]
            k.set_index(["src", "file"], inplace=True)
        elif src == "test_1.mat" :
            k.columns = ["x", "y", "z", "t", "src"]
            k["file"] = None # add new empty placeholder
            # reset time to 20Hz: replace the stored time column with an
            # evenly-spaced series at 0.05 s intervals
            k.drop(['t'], axis=1,inplace=True)
            k['t'] = pd.Series(np.arange(np.shape(k)[0])/20)
            k.set_index(["src", "file"], inplace=True)
        elif src == "test_2.txt" :
            k.columns = ["t", "x", "y", "z", "src"]
            k["file"] = None # add new empty placeholder
            k.set_index(["src", "file"], inplace=True)

    if VERBOSE: 
        print(">>> AFTER")
        for a in accel:
            print("TYPE")
            print(type(a))
            print("HEAD")
            print(a.head())
            print("INFO")
            print(a.info())


>>> BEFORE
TYPE
<class 'pandas.core.frame.DataFrame'>
HEAD
    0    1    2         3    4          src
0  35  461  745  4097.log  0.0  error_1.mat
1  35  461  745  4097.log  0.1  error_1.mat
INFO
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4088167 entries, 0 to 131880
Data columns (total 6 columns):
0      int16
1      int16
2      int16
3      object
4      float64
src    object
dtypes: float64(1), int16(3), object(2)None
TYPE
<class 'pandas.core.frame.DataFrame'>
HEAD
      0      1      2                     3     4          src
0  8979 -27136 -25601  accel_node000001.csv  1.00  error_2.mat
1  8960 -26881 -25853  accel_node000001.csv  1.05  error_2.mat
INFO
<class 'pandas.core.frame.DataFrame'>
Int64Index: 11540660 entries, 0 to 263399
Data columns (total 6 columns):
0      int16
1      int16
2      int16
3      object
4      float64
src    object
dtypes: float64(1), int16(3), object(2)None
TYPE
<class 'pandas.core.frame.DataFrame'>
HEAD
          0         1         2   3         src
0 -0.060333  9.759333  4.559333  20  test_1.mat
1 -0.245000  9.544000  4.141000  20  test_1.mat
INFO
<class 'pandas.core.frame.DataFrame'>
Int64Index: 14015 entries, 0 to 14014
Data columns (total 5 columns):
0      14015 non-null float64
1      14015 non-null float64
2      14015 non-null float64
3      14015 non-null float64
src    14015 non-null object
dtypes: float64(4), object(1)None
TYPE
<class 'pandas.core.frame.DataFrame'>
HEAD
   time     x    y      z         src
0  1.00  1044  256 -12288  test_2.txt
1  1.05  1024  256 -12289  test_2.txt
INFO
<class 'pandas.core.frame.DataFrame'>
Int64Index: 199520 entries, 0 to 199519
Data columns (total 5 columns):
time    199520 non-null float64
x       199520 non-null int64
y       199520 non-null int64
z       199520 non-null int64
src     199520 non-null object
dtypes: float64(1), int64(3), object(1)None
>>> AFTER
TYPE
<class 'pandas.core.frame.DataFrame'>
HEAD
                       x    y    z    t
src         file                       
error_1.mat 4097.log  35  461  745  0.0
            4097.log  35  461  745  0.1
            4097.log  35  497  781  0.2
            4097.log   0  497  745  0.3
            4097.log  71  497  710  0.4
INFO
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 4088167 entries, (error_1.mat, 4097.log) to (error_1.mat, 4158.log)
Data columns (total 4 columns):
x    int16
y    int16
z    int16
t    float64
dtypes: float64(1), int16(3)None
TYPE
<class 'pandas.core.frame.DataFrame'>
HEAD
                                      x      y      z     t
src         file                                           
error_2.mat accel_node000001.csv   8979 -27136 -25601  1.00
            accel_node000001.csv   8960 -26881 -25853  1.05
            accel_node000001.csv  18176  29439 -25853  1.10
            accel_node000001.csv  18176 -17921  30467  1.15
            accel_node000001.csv   8960 -17921  30467  1.20
INFO
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 11540660 entries, (error_2.mat, accel_node000001.csv) to (error_2.mat, accel_node000106.csv)
Data columns (total 4 columns):
x    int16
y    int16
z    int16
t    float64
dtypes: float64(1), int16(3)None
TYPE
<class 'pandas.core.frame.DataFrame'>
HEAD
                        x         y         z     t
src        file                                    
test_1.mat NaN  -0.060333  9.759333  4.559333  0.00
           NaN  -0.245000  9.544000  4.141000  0.05
           NaN  -0.186667  9.492333  3.850000  0.10
           NaN  -0.147000  9.586000  3.972000  0.15
           NaN  -0.233000  9.483000  4.140000  0.20
INFO
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 14015 entries, (test_1.mat, nan) to (test_1.mat, nan)
Data columns (total 4 columns):
x    14015 non-null float64
y    14015 non-null float64
z    14015 non-null float64
t    14015 non-null float64
dtypes: float64(4)None
TYPE
<class 'pandas.core.frame.DataFrame'>
HEAD
                    t     x    y      z
src        file                        
test_2.txt NaN   1.00  1044  256 -12288
           NaN   1.05  1024  256 -12289
           NaN   1.10  1024  256 -12289
           NaN   1.15   768  256 -12289
           NaN   1.20  1024  256 -12289
INFO
<class 'pandas.core.frame.DataFrame'>
MultiIndex: 199520 entries, (test_2.txt, nan) to (test_2.txt, nan)
Data columns (total 4 columns):
t    199520 non-null float64
x    199520 non-null int64
y    199520 non-null int64
z    199520 non-null int64
dtypes: float64(1), int64(3)None

combine and save data


In [23]:
# combine: stack all per-source frames into one DataFrame; the (src, file)
# MultiIndex set during re-parsing is preserved through the concat
accel = pd.concat(accel)
accel.info()


<class 'pandas.core.frame.DataFrame'>
MultiIndex: 15842362 entries, (error_1.mat, 4097.log) to (test_2.txt, nan)
Data columns (total 4 columns):
t    float64
x    float64
y    float64
z    float64
dtypes: float64(4)

In [79]:
# subset and print: keep a few representative index ranges from each source
# and write them out as a small fixture for faster iteration
#accel.to_csv('test_data/test_data_full.tsv', sep="\t", index=True, header=True)
# NOTE(review): the test_2.txt slice compares the string 'nan' against a
# 'file' level that holds actual NaN placeholders — confirm this selects
# the intended rows rather than relying on lexicographic slicing quirks
pd.concat([     
    accel.loc[('error_1.mat','4097.log'):('error_1.mat','4099.log'),:], 
    accel.loc[('error_2.mat','accel_node000001.csv'):('error_2.mat','accel_node000003.csv'),:],
    accel.loc[('test_2.txt', 'nan'):('test_2.txt', 'nan'),:]
              ]).to_csv('test_data/test_data_small.tsv', sep="\t", index=True, header=True)

In [ ]: