In [1]:
# Imports
import os
import pickle
from datetime import datetime

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [2]:
# Update the filename
FILENAME = 'dummy.csv'

In [3]:
# Constants Declaration
DATASET_DIR = './data/'
RESULT_DIR = './result/'
RANDOM_SEED = 42
EXTENSION_MAPPING = {
    'read': {
        'csv': 'read_csv',
        'json': 'read_json',
        'xlsx': 'read_excel'   
    },
    'save': {
        'csv': 'to_csv',
        'json': 'to_json',
        'xlsx': 'to_excel'      
    }
}
np.random.seed(seed=RANDOM_SEED)

In [4]:
# Dataset Loader
DATASET_FILE = os.path.join(DATASET_DIR, FILENAME)
file_path, file_extension = os.path.splitext(DATASET_FILE)
file_name = os.path.basename(file_path)
file_extension = file_extension.strip('.')
dataset_extracter = EXTENSION_MAPPING['read'].get(file_extension)
if dataset_extracter is None:
    raise ValueError('Dataset type not supported')
df = getattr(pd, dataset_extracter)(DATASET_FILE)
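
The loader dispatches on the file extension: getattr(pd, dataset_extracter) resolves the mapped name to the matching pandas reader, so for a .csv file the call above is equivalent to pd.read_csv(DATASET_FILE). A one-line sanity check of that dispatch:

# The mapped name is just the bound pandas function
assert getattr(pd, EXTENSION_MAPPING['read']['csv']) is pd.read_csv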

In [5]:
df.head()


Out[5]:
   id   name   age    year                            message
0   1   john  23.0  2004.0             Hey how are you doing?
1   2    tom  45.0  2006.0                  Was chasing Jerry
2   3    sam  64.0     NaN      I have something urgent to do
3   4  harry   NaN  2012.0  The protagonist in Rowling's book
4   5    jim  23.0  2014.0     There you found me protagonist

In [6]:
target_columns = ['age']
# List comprehension (rather than set difference) preserves the original column order
dependent_columns = [column for column in df.columns if column not in target_columns]

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    df[dependent_columns], df[target_columns],
    test_size=0.2, random_state=RANDOM_SEED)

Dealing with missing values

  • Replace with Mean Values
  • Replace with Median Values
  • Replace with Most Common Values
  • Replace with Specific Value
  • Drop records with Missing Values
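
As a concrete instance of the first strategy on the dummy data above, a minimal sketch (assuming year, the one numeric feature with a NaN in the sample, is the column to fill; age is the target and is handled separately):

from sklearn.impute import SimpleImputer

sketch_imputer = SimpleImputer(strategy='mean')
# Fit on the training split only, then reuse the learned means on the test split
X_train[['year']] = sketch_imputer.fit_transform(X_train[['year']])
X_test[['year']] = sketch_imputer.transform(X_test[['year']])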

In [8]:
# Preprocessing with sklearn: fill with mean values for the required columns.

required_columns = []
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
if len(required_columns) > 0:
    X_train[required_columns] = pd.DataFrame(imputer.fit_transform(X_train[required_columns]), index=X_train.index, columns=required_columns)
    X_test[required_columns] = pd.DataFrame(imputer.transform(X_test[required_columns]), index=X_test.index, columns=required_columns)

In [9]:
# Preprocessing with sklearn: fill with median values for the required columns.

required_columns = []
imputer = SimpleImputer(missing_values=np.nan, strategy="median")
if len(required_columns) > 0:
    X_train[required_columns] = pd.DataFrame(imputer.fit_transform(X_train[required_columns]), index=X_train.index, columns=required_columns)
    X_test[required_columns] = pd.DataFrame(imputer.transform(X_test[required_columns]), index=X_test.index, columns=required_columns)

In [10]:
# Preprocessing with sklearn: fill with the most frequent values for the required columns.

required_columns = []
imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
if len(required_columns) > 0:
    X_train[required_columns] = pd.DataFrame(imputer.fit_transform(X_train[required_columns]), index=X_train.index, columns=required_columns)
    X_test[required_columns] = pd.DataFrame(imputer.transform(X_test[required_columns]), index=X_test.index, columns=required_columns)

In [11]:
# Preprocessing with pandas: fill with a specific value.

value = 0
required_columns = []
if len(required_columns) > 0:
    X_train[required_columns] = X_train[required_columns].fillna(value)
    X_test[required_columns] = X_test[required_columns].fillna(value)

In [12]:
# Preprocessing with pandas: drop records with missing values.

required_columns = []
if len(required_columns) > 0:
    X_train.dropna(subset=required_columns, inplace=True, how='any')
    X_test.dropna(subset=required_columns, inplace=True, how='any')
    # Keep the targets aligned with the rows that survived the drop
    y_train = y_train.loc[X_train.index]
    y_test = y_test.loc[X_test.index]

Encoding Features

  • Target Features
    • Multiclass Classification
      • Binary
      • Non Binary
    • Multilabel Classification
  • Dependent Features
    • Encode Classes to Labels
    • One Hot Encoding of categorical data
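
As a concrete instance of "Encode Classes to Labels", a toy illustration using names from the dummy data (illustrative values only):

from sklearn.preprocessing import LabelEncoder

toy_encoder = LabelEncoder()
# Classes are sorted first ('john' < 'sam' < 'tom'), then numbered from 0
print(toy_encoder.fit_transform(['john', 'tom', 'sam']))  # [0 2 1]
print(toy_encoder.inverse_transform([0, 2, 1]))           # ['john' 'tom' 'sam']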

In [13]:
# Non Binary Multiclass Classification / Encode Classes to Labels

required_columns = []
label_encoders = {}
for column in required_columns:
    label_encoders[column] = LabelEncoder()
    if column in X_train.columns:
        X_train[column] = label_encoders[column].fit_transform(X_train[column])
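        # Note: transform() raises on categories unseen during fit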
        X_test[column] = label_encoders[column].transform(X_test[column])
    elif column in y_train.columns:
        y_train[column] = label_encoders[column].fit_transform(y_train[column])
        y_test[column] = label_encoders[column].transform(y_test[column])

In [14]:
# Multiclass Binary Classification

# Only a single column is expected
required_columns = []
label_binarizer = None
if len(required_columns) > 0:
    column = required_columns[0]
    if column in X_train.columns:
        label_binarizer = LabelBinarizer()
        # LabelBinarizer returns a 2-D (n, 1) array for binary labels; flatten it for column assignment
        X_train[column] = label_binarizer.fit_transform(X_train[column]).ravel()
        X_test[column] = label_binarizer.transform(X_test[column]).ravel()
    elif column in y_train.columns:
        label_binarizer = LabelBinarizer()
        y_train[column] = label_binarizer.fit_transform(y_train[column]).ravel()
        y_test[column] = label_binarizer.transform(y_test[column]).ravel()

In [15]:
# Multilabel Binary Classification

# Only a single column is expected
required_columns = []

multi_label_binarizer = None
if len(required_columns) > 0:
    column = required_columns[0]
    if column in y_train.columns:
        multi_label_binarizer = MultiLabelBinarizer()
        # MultiLabelBinarizer yields one indicator column per class, so the
        # single label column is replaced by the full indicator frame
        y_train = pd.DataFrame(multi_label_binarizer.fit_transform(y_train[column]),
                               index=y_train.index, columns=multi_label_binarizer.classes_)
        y_test = pd.DataFrame(multi_label_binarizer.transform(y_test[column]),
                              index=y_test.index, columns=multi_label_binarizer.classes_)

In [16]:
# One Hot Encoding of dependent features

required_columns = []
if len(required_columns) > 0:
    # Avoid dummy variable trap with n-1 columns
    total = pd.get_dummies(pd.concat([X_train, X_test]), columns=required_columns, drop_first=True)
    X_train = total.loc[X_train.index]
    X_test = total.loc[X_test.index]
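
The drop_first=True above avoids the dummy variable trap: all k indicator columns for a feature sum to 1, so one of them is perfectly collinear with an intercept, and keeping k-1 columns removes the redundancy. A toy illustration (hypothetical values):

toy = pd.DataFrame({'colour': ['red', 'green', 'blue', 'green']})
print(pd.get_dummies(toy, columns=['colour'], drop_first=True))
# 'blue' (first in sorted order) is dropped; an all-zero row therefore means blue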

In [17]:
# Text preprocessing: bag-of-words features with TF-IDF weighting

# Only a single column is expected
required_columns = []
tfidf_vect = None
if len(required_columns) > 0:
    # Ignore terms that appear in more than 95% of the documents (max_df)
    # or in fewer than two documents (min_df)
    tfidf_vect = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)
    column = required_columns[0]
    # Note: fitting the vocabulary on train + test leaks test data into training;
    # fit on X_train alone for a stricter split
    tfidf_vect.fit(pd.concat([X_train, X_test])[column])
    train_numerical_features = tfidf_vect.transform(X_train[column]).toarray()
    X_train = pd.concat([X_train, pd.DataFrame(train_numerical_features, index=X_train.index).add_prefix(column + '_')], axis=1)
    test_numerical_features = tfidf_vect.transform(X_test[column]).toarray()
    X_test = pd.concat([X_test, pd.DataFrame(test_numerical_features, index=X_test.index).add_prefix(column + '_')], axis=1)

In [18]:
# Feature Selection with Chi2 Test
max_num_features = None
required_columns = []

selector_chi2 = None
if max_num_features is not None and len(required_columns) > 0:
    selector_chi2 = SelectKBest(chi2, k=max_num_features)
    # ravel() flattens the single-column target frame to the 1-D y that SelectKBest expects
    print(selector_chi2.fit_transform(X_train[required_columns], y_train.values.ravel()))
    print(selector_chi2.transform(X_test[required_columns]))
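
Note that chi2 only applies when the feature values are non-negative (counts or TF-IDF weights, for instance) and the target is categorical; for a continuous target such as age, f_regression or mutual_info_regression from sklearn.feature_selection would be the usual substitutes, e.g.:

from sklearn.feature_selection import f_regression

# A regression-friendly alternative to chi2 for a continuous target
regression_selector = SelectKBest(f_regression, k=max_num_features)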

Scaling Features

  • Scaling
  • Normalisation
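
The two options above differ in the axis they operate on: StandardScaler standardises each column to zero mean and unit variance, while Normalizer rescales each row to unit Euclidean norm. A toy comparison:

toy = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
print(StandardScaler().fit_transform(toy))  # per column: mean 0, variance 1
print(Normalizer().fit_transform(toy))      # per row: norm 1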

In [19]:
# Utilise Standard Scaler
required_columns = []

scaler = None
if len(required_columns) > 0:
    scaler = StandardScaler()
    X_train[required_columns] = pd.DataFrame(scaler.fit_transform(X_train[required_columns]), index=X_train.index, columns=required_columns)
    X_test[required_columns] = pd.DataFrame(scaler.transform(X_test[required_columns]), index=X_test.index, columns=required_columns)

In [20]:
# Utilise Normalisation
required_columns = []

normalizer = None
if len(required_columns) > 0:
    normalizer = Normalizer()
    X_train[required_columns] = pd.DataFrame(normalizer.fit_transform(X_train[required_columns]), index=X_train.index, columns=required_columns)
    X_test[required_columns] = pd.DataFrame(normalizer.transform(X_test[required_columns]), index=X_test.index, columns=required_columns)

In [21]:
# Storage of results.
# Portable UTC timestamp; strftime('%s') is platform-specific and ignores the UTC offset
result_time = datetime.utcnow().strftime('%Y%m%d%H%M%S')
save_dataset_fn = EXTENSION_MAPPING['save'].get(file_extension)
getattr(pd.concat([X_train, y_train], axis=1), save_dataset_fn)(os.path.join(RESULT_DIR, '{}.result.train.{}.{}'.format(file_name, result_time, file_extension)))
getattr(pd.concat([X_test, y_test], axis=1), save_dataset_fn)(os.path.join(RESULT_DIR, '{}.result.test.{}.{}'.format(file_name, result_time, file_extension)))

In [22]:
if len(label_encoders) > 0:
    with open(os.path.join(RESULT_DIR, '{}.result.label_encoders.{}.{}.pkl'.format(file_name, result_time, file_extension)), 'wb') as encoder_fp:
        pickle.dump(label_encoders, encoder_fp)

In [23]:
if label_binarizer is not None:
    with open(os.path.join(RESULT_DIR, '{}.result.label_binarizer.{}.{}.pkl'.format(file_name, result_time, file_extension)), 'wb') as encoder_fp:
        pickle.dump(label_binarizer, encoder_fp)

In [24]:
if multi_label_binarizer is not None:
    with open(os.path.join(RESULT_DIR, '{}.result.multi_label_binarizer.{}.{}.pkl'.format(file_name, result_time, file_extension)), 'wb') as encoder_fp:
        pickle.dump(multi_label_binarizer, encoder_fp)

In [25]:
if scaler is not None:
    with open(os.path.join(RESULT_DIR, '{}.result.scaler.{}.{}.pkl'.format(file_name, result_time, file_extension)), 'wb') as encoder_fp:
        pickle.dump(scaler, encoder_fp)

In [26]:
if normalizer is not None:
    with open(os.path.join(RESULT_DIR, '{}.result.normalizer.{}.{}.pkl'.format(file_name, result_time, file_extension)), 'wb') as encoder_fp:
        pickle.dump(normalizer, encoder_fp)

In [27]:
if tfidf_vect is not None:
    with open(os.path.join(RESULT_DIR, '{}.result.tfidf_vect.{}.{}.pkl'.format(file_name, result_time, file_extension)), 'wb') as encoder_fp:
        pickle.dump(tfidf_vect, encoder_fp)

In [28]:
if selector_chi2 is not None:
    with open(os.path.join(RESULT_DIR, '{}.result.selector_chi2.{}.{}.pkl'.format(file_name, result_time, file_extension)), 'wb') as encoder_fp:
        pickle.dump(selector_chi2, encoder_fp)
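
For completeness, the persisted transformers can be restored later with pickle.load. A minimal sketch (the filename below is hypothetical; substitute the actual timestamped name written above):

# Hypothetical filename: use the real timestamped pickle from the result directory
with open(os.path.join(RESULT_DIR, 'dummy.result.label_encoders.<timestamp>.csv.pkl'), 'rb') as encoder_fp:
    restored_label_encoders = pickle.load(encoder_fp)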