In [ ]:
# Imports
import os
from datetime import datetime
import pandas as pd
import numpy as np

In [ ]:
# Input file names. The loader cells join these onto DATASET_DIR, so only the
# bare file name (with extension) goes here. Edit to point at other files.
train_FILENAME = 'train.csv'
test_FILENAME = 'test.csv'

In [ ]:
# Constants Declaration
# Directory the input files are read from, and directory results are written to.
DATASET_DIR = './data/'
RESULT_DIR = './result/'

# File extension -> name of the pandas attribute fetched with getattr().
# 'read' names are looked up on the `pd` module by the loader cells;
# 'save' names are looked up on the object being saved.
EXTENSION_MAPPING = {
    'read': {'csv': 'read_csv', 'json': 'read_json', 'xlsx': 'read_excel'},
    'save': {'csv': 'to_csv', 'json': 'to_json', 'xlsx': 'to_excel'},
}

# Seed NumPy's global random state so repeated runs draw the same numbers.
np.random.seed(42)

In [ ]:
# Test Dataset Loader
# Builds the path to the test file, picks the pandas reader registered for its
# extension in EXTENSION_MAPPING['read'], and loads the file into `test_df`.
# Raises ValueError when the extension has no registered reader.
test_DATASET_FILE = os.path.join(DATASET_DIR, test_FILENAME)
test_file_path, test_file_extension = os.path.splitext(test_DATASET_FILE)
# basename() honours every separator the platform accepts. Splitting on
# os.path.sep alone returned the whole './data/...' path on Windows, where
# os.path.sep is '\\' but DATASET_DIR is written with '/'.
test_file_name = os.path.basename(test_file_path)
# splitext() keeps the leading dot on the extension; the mapping keys have none.
test_file_extension = test_file_extension.strip('.')
test_dataset_extracter = EXTENSION_MAPPING['read'].get(test_file_extension)
if test_dataset_extracter is None:
    raise ValueError('Dataset type not supported')
test_df = getattr(pd, test_dataset_extracter)(test_DATASET_FILE)

In [ ]:
# Train Dataset Loader
# Full path of the training file inside the dataset directory.
train_DATASET_FILE = os.path.join(DATASET_DIR, train_FILENAME)

# Separate the extension from the rest of the path, then drop its dot so it
# matches the keys of EXTENSION_MAPPING.
train_file_path, train_file_extension = os.path.splitext(train_DATASET_FILE)
train_file_extension = train_file_extension.strip('.')

# Name of the pandas reader for this extension (None when unsupported).
train_dataset_extracter = EXTENSION_MAPPING['read'].get(train_file_extension)
if train_dataset_extracter is not None:
    train_df = getattr(pd, train_dataset_extracter)(train_DATASET_FILE)
else:
    raise ValueError('Dataset type not supported')

Exploratory Analysis


In [ ]:
# Report the dimensions of the training frame...
train_rows, train_cols = train_df.shape
print('TRAIN dataset shape (rows X columns) :', train_rows, ' X ', train_cols)

# ...and of the test frame.
test_rows, test_cols = test_df.shape
print('TEST dataset shape (rows X columns) :', test_rows, ' X ', test_cols)

In [ ]:
# Preview the top of the training data. The bare expression on the last line
# is the cell's output, so it must stay last.
print('First 5 records in TRAIN dataset')
train_df.head(5)

In [ ]:
# Preview the top of the test data. The bare expression on the last line
# is the cell's output, so it must stay last.
print('First 5 records in TEST dataset')
test_df.head(5)

In [ ]:
# Descriptive statistics (pandas `describe`) for the 'Id' column of the training frame.
train_df['Id'].describe()

In [ ]:
# Descriptive statistics (pandas `describe`) for the training frame.
train_df.describe()

In [ ]:
# Descriptive statistics (pandas `describe`) for the test frame.
test_df.describe()

In [ ]:
# Storage of results.
# Writes a frame to RESULT_DIR as '<name>.result.<epoch seconds>.<extension>',
# using the writer registered for the extension in EXTENSION_MAPPING['save'].
# Raises ValueError when the extension has no registered writer.
#
# This cell previously used `df`, `file_name` and `file_extension`, none of
# which are defined anywhere in the notebook, so it raised NameError on a
# fresh kernel. The names below are the ones set by the "Test Dataset Loader"
# cell.
# NOTE(review): assumes the TEST frame is what should be persisted (it is the
# only loader that computes a file name) - confirm, or substitute the frame
# that holds the final results.

# The pandas writers do not create missing directories.
os.makedirs(RESULT_DIR, exist_ok=True)

# Epoch seconds as a filename suffix. strftime('%s') is a platform-specific
# extension that is not available everywhere; timestamp() is portable.
result_time = str(int(datetime.now().timestamp()))

# test_file_extension already has its leading dot removed by the loader cell.
save_dataset_fn = EXTENSION_MAPPING['save'].get(test_file_extension)
if save_dataset_fn is None:
    raise ValueError('Dataset type not supported')

result_path = os.path.join(
    RESULT_DIR,
    '{}.result.{}.{}'.format(test_file_name, result_time, test_file_extension),
)
getattr(test_df, save_dataset_fn)(result_path)