In [1]:
%matplotlib inline

import os
import warnings
import os.path as op
import numpy   as np
import pandas  as pd

warnings.filterwarnings("ignore")

In [27]:
import socket


# Per-host data locations. Every branch must define the SAME set of names
# (root_dir, datadir, resultDir) so that the derived paths and all later
# cells work regardless of which machine runs the notebook.
hn = socket.gethostname()

if hn == 'AyerdiBorja':
    root_dir  = '/Users/ayerdi/Desktop'
    datadir   = '/Users/ayerdi/Desktop/data'
    resultDir = '/Users/ayerdi/Desktop/Dropbox (Neurita)/IJCRS15_code'
elif hn == 'calm.local':
    root_dir  = op.expanduser('~/data/coal_mining')
    datadir   = op.join(root_dir, 'data')
    resultDir = op.join(root_dir, 'results')
else:
    # Fail loudly instead of leaving names undefined (the original code
    # raised NameError further down for unknown hosts).
    raise RuntimeError('Unknown host {}: add its data paths here.'.format(hn))

# Derived paths, shared by all hosts.
hdf_file      = op.join(root_dir, 'coal_mining_data.hdf')
test_csv_file = op.join(datadir, 'testData.csv')

print('Root working dir: {}'.format(root_dir))


Root working dir: /Users/alexandre/data/coal_mining

In [28]:
# Total Warnings: 1429
# Total Warnings 1: 80
# Total Warnings 2: 1208
# Total Warnings 3: 141
# Total Normal: 118571
# Acc. all normal: 0.988091666667

In [29]:
def read_csv_test_data(filepath=test_csv_file):
    """Load the test set from a comma-separated CSV with no header row."""
    return pd.read_csv(filepath, header=None, sep=',')

In [30]:
def read_csv_training_data():
    """Load and stack the four training CSV chunks from ``datadir``.

    Reads trainingData{1..4}.csv (samples) and trainingLabels{1..4}.csv
    (labels). Each of the three label columns holds the string 'warning'
    for positives; labels are binarized to 0/1 integers.

    Returns
    -------
    X : pd.DataFrame
        All training samples stacked, index reset to 0..n-1.
    Y : pd.DataFrame
        Corresponding 0/1 labels (3 columns), same length as X.
    """
    sample_chunks = []
    label_chunks  = []

    for i in range(1, 5):
        train_file  = 'trainingData{}.csv'.format(i)
        labels_file = 'trainingLabels{}.csv'.format(i)  # was misleadingly named 'testFile'

        print('Reading data from {}'.format(train_file))

        # Load training samples
        train = pd.read_csv(op.join(datadir, train_file), sep=',', header=None)

        # Load labels and binarize: 'warning' -> 1, anything else -> 0
        labels = pd.read_csv(op.join(datadir, labels_file), sep=',', header=None)
        y = pd.concat([(labels[col] == 'warning') * 1 for col in (0, 1, 2)], axis=1)

        sample_chunks.append(train)
        label_chunks.append(y)

    # Concatenate once at the end (avoids quadratic repeated concat) and
    # drop the per-chunk indices. reset_index(drop=True) replaces the
    # deprecated reset_index() + drop('index', 1) dance (positional axis
    # arguments were removed in pandas 2.0).
    X = pd.concat(sample_chunks).reset_index(drop=True)
    Y = pd.concat(label_chunks).reset_index(drop=True)

    return X, Y

In [33]:
#save to hdf file
# you need to install PyTables for this: !pip install tables
# save to hdf file
# you need to install PyTables for this: !pip install tables
def convert_csv_to_hdf(hdf_filepath=hdf_file):
    """Read all the CSV data and cache it into a single HDF5 file."""
    print('Reading test data.')
    test = read_csv_test_data()

    print('Reading training data.')
    samples, labels = read_csv_training_data()

    print('Saving data into {}'.format(hdf_filepath))
    for key, frame in (('test_data', test), ('samples', samples), ('labels', labels)):
        frame.to_hdf(hdf_filepath, key=key)


def read_data_from_hdf(hdf_filepath=hdf_file):
    """Load the cached frames back from the HDF5 file.

    Returns a (test_data, train_samples, train_labels) tuple of DataFrames,
    read from the 'test_data', 'samples' and 'labels' keys respectively.
    """
    keys = ('test_data', 'samples', 'labels')
    test_data, train_samples, train_labels = (
        pd.read_hdf(hdf_filepath, key=k) for k in keys
    )
    return test_data, train_samples, train_labels

In [24]:
#remove hdf file
#!rm $hdf_file
#convert_csv_to_hdf()

In [35]:
# Read the cached test/train data back from the HDF file (built by convert_csv_to_hdf above).
test_data, train_samples, train_labels = read_data_from_hdf(hdf_filepath=hdf_file)

In [34]:
# We now have all the data in X and Y — time to get creative.

In [36]:
train_samples.shape


Out[36]:
(40000, 16800)

In [ ]: