test_data_clean is intended to provide cleaned and organised data for the development of QC methods.
In [2]:
# analytics
import numpy as np
import pandas as pd
import scipy.io as scio
import scipy.signal as sp
import os
# utils
from os.path import isfile, join
In [4]:
# flags
TEST = True
VERBOSE = True
In [5]:
# test data path
# todo: flexible input
if TEST:
    datapath = os.path.abspath("test_data/")
else:
    datapath = os.getcwd()
print(datapath)
In [6]:
def process_mat_file(f):
    mat = scio.loadmat(f)
    # convert to a dataframe (see: http://poquitopicante.blogspot.nl/2014/05/loading-matlab-mat-file-into-pandas.html)
    # data key names are not consistent; drop the '__globals__' etc. entries
    mat = {k: v for k, v in mat.items() if k[0] != '_'}
    tmp = list()
    for k, v in mat.items():
        # read the data in depending on its shape
        if np.shape(v)[0] == 1:
            for i in v[0]:
                for j in i:
                    df = pd.DataFrame(j)
                    df.columns = [c[0].lower() for c in df.columns]
                    df = df[sorted(df.columns.values.tolist())]  # make sure columns are sorted
                    # extract 'file' (nested in a variable-depth list of lists)
                    tmp_file = df.iloc[0, 1]
                    while len(np.shape(tmp_file)) > 1:  # i.e. not 1D
                        tmp_file = tmp_file[0]  # iteratively flatten
                    tmp_file = tmp_file.flat[0]
                    if type(tmp_file) is np.ndarray:
                        tmp_file = str(tmp_file.flat[0])
                    tmp.extend([
                        pd.concat([
                            pd.DataFrame(df.iloc[:, 0].tolist()[0], columns=['x', 'y', 'z']),  # accel data
                            # note: 'file' before 'time' so the positional column
                            # renaming further down lines up on any pandas version
                            pd.DataFrame({'file': tmp_file, 'time': [e[0] for e in df.iloc[:, 2][0]]})
                        ], axis=1, ignore_index=True)
                    ])
        else:
            tmp.extend([pd.DataFrame(v)])
    # concatenate the dataframes and return
    tmp = pd.concat(tmp)
    return tmp
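The iterative flattening above is needed because MATLAB structs round-trip through scipy.io as object arrays wrapped in extra singleton dimensions. A minimal, self-contained sketch of the effect (the file name toy.mat and the struct layout are illustrative only, not one of the test files):
In [ ]:
import numpy as np
import scipy.io as scio

# write a toy struct and read it back; loadmat wraps each field in (1, 1) arrays
scio.savemat('toy.mat', {'rec': {'file': 'a.log', 'data': np.zeros((3, 3))}})
rec = scio.loadmat('toy.mat')['rec']
print(rec.shape, rec.dtype.names)   # (1, 1) ('file', 'data')

field = rec['file']                 # object array, still shape (1, 1)
while len(np.shape(field)) > 1:     # same loop as in process_mat_file
    field = field[0]
field = field.flat[0]
if isinstance(field, np.ndarray):   # may still be a 1-element string array
    field = str(field.flat[0])
print(field)                        # 'a.log'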
In [20]:
def process_file(f):
    print("Reading in file {}".format(f))
    tmp = None
    # skip the output files if they already exist
    if os.path.basename(f) in ('test_data_full.tsv', 'test_data_small.tsv'):
        print("\tNot processing")
        return None
    # parse the file depending on its extension
    ext = os.path.splitext(f)[1]
    if ext == ".mat":
        tmp = process_mat_file(f)
    elif ext == ".csv":
        tmp = pd.read_csv(f)
    elif ext == ".tsv":
        tmp = pd.read_csv(f, sep="\t")
    elif ext == ".txt":
        # assume semicolon-separated
        tmp = pd.read_csv(f, sep=";")
    else:
        print("\tFile cannot be read.")
    # collect the data if we got some
    if tmp is not None:
        # add a new column recording the source file
        tmp['src'] = os.path.basename(f)
        if VERBOSE:
            print(np.shape(tmp))
        return tmp
    else:
        print("\tNothing to return")
        return None
In [21]:
# collect all files from the data directory
files = [join(datapath, f) for f in os.listdir(datapath) if isfile(join(datapath, f))]
# report the files found and read them in
accel = [process_file(f) for f in files]
accel = [a for a in accel if a is not None]  # drop files that did not process
In [22]:
## re-parse the data into a single pandas structure and clean up
if TEST:
    if VERBOSE:
        print(">>> BEFORE")
        for a in accel:
            print("TYPE")
            print(type(a))
            print("HEAD")
            print(a.head(2))
            print("INFO")
            print(a.info())
    ### dirty hack to get the formats the same;
    ### it would be better if the formats actually matched, or were distinguishable
    ### based on file extension or header, etc.
    ### expected sources: error_1.mat, error_2.mat, test_1.mat, test_2.txt
    for (i, k) in enumerate(accel):
        if k.iloc[0]["src"] == "error_1.mat":
            k.columns = ["x", "y", "z", "file", "t", "src"]
            k.set_index(["src", "file"], inplace=True)
        elif k.iloc[0]["src"] == "error_2.mat":
            k.columns = ["x", "y", "z", "file", "t", "src"]
            k.set_index(["src", "file"], inplace=True)
        elif k.iloc[0]["src"] == "test_1.mat":
            k.columns = ["x", "y", "z", "t", "src"]
            k["file"] = None  # add a new empty placeholder
            # reset time assuming a 20 Hz sampling rate
            k.drop(['t'], axis=1, inplace=True)
            k['t'] = pd.Series(np.arange(np.shape(k)[0]) / 20)
            k.set_index(["src", "file"], inplace=True)
        elif k.iloc[0]["src"] == "test_2.txt":
            k.columns = ["t", "x", "y", "z", "src"]
            k["file"] = None  # add a new empty placeholder
            k.set_index(["src", "file"], inplace=True)
    if VERBOSE:
        print(">>> AFTER")
        for a in accel:
            print("TYPE")
            print(type(a))
            print("HEAD")
            print(a.head())
            print("INFO")
            print(a.info())
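The four branches above repeat the same rename/index pattern; a table-driven helper could replace them. A hedged sketch of that idea (the name standardise is hypothetical; it reuses the column layouts hard-coded above and omits the extra 20 Hz time reset that test_1.mat needs):
In [ ]:
# hypothetical refactor of the per-source branches above
COLS = {
    "error_1.mat": ["x", "y", "z", "file", "t", "src"],
    "error_2.mat": ["x", "y", "z", "file", "t", "src"],
    "test_1.mat":  ["x", "y", "z", "t", "src"],
    "test_2.txt":  ["t", "x", "y", "z", "src"],
}

def standardise(df):
    # look up the canonical column layout for this source file
    df.columns = COLS[df.iloc[0]["src"]]
    if "file" not in df.columns:
        df["file"] = None  # placeholder so every frame gets the same index
    return df.set_index(["src", "file"])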
In [23]:
# combine into one dataframe; sort the MultiIndex so the .loc range
# slices in the next cell work on an ordered index
accel = pd.concat(accel).sort_index()
accel.info()
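Before slicing, it can help to confirm which (src, file) keys actually exist in the combined index; a quick check:
In [ ]:
# inspect the MultiIndex keys available for the .loc slices in the next cell
print(accel.index.get_level_values('src').unique())
print(accel.index.get_level_values('file').unique())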
In [79]:
# subset and print
#accel.to_csv('test_data/test_data_full.tsv', sep="\t", index=True, header=True)
pd.concat([
    accel.loc[('error_1.mat', '4097.log'):('error_1.mat', '4099.log'), :],
    accel.loc[('error_2.mat', 'accel_node000001.csv'):('error_2.mat', 'accel_node000003.csv'), :],
    accel.loc[('test_2.txt', 'nan'):('test_2.txt', 'nan'), :]
]).to_csv('test_data/test_data_small.tsv', sep="\t", index=True, header=True)
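As a sanity check, the exported subset can be read straight back with the same (src, file) index; a minimal sketch, assuming the cell above has run:
In [ ]:
# round-trip check: read the small TSV back, restoring the MultiIndex
small = pd.read_csv('test_data/test_data_small.tsv', sep="\t",
                    index_col=["src", "file"])
print(np.shape(small))
small.head()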