In [1]:
# Imports
import os
import pickle
from datetime import datetime, timezone
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
In [2]:
# Update the filename
FILENAME = 'dummy.csv'
In [3]:
# Constants Declaration
DATASET_DIR = './data/'
RESULT_DIR = './result/'
RANDOM_SEED = 42
EXTENSION_MAPPING = {
    'read': {
        'csv': 'read_csv',
        'json': 'read_json',
        'xlsx': 'read_excel'
    },
    'save': {
        'csv': 'to_csv',
        'json': 'to_json',
        'xlsx': 'to_excel'
    }
}
np.random.seed(seed=RANDOM_SEED)
In [4]:
# Dataset Loader
DATASET_FILE = os.path.join(DATASET_DIR, FILENAME)
file_path, file_extension = os.path.splitext(DATASET_FILE)
file_name = file_path.split(os.path.sep)[-1]
file_extension = file_extension.strip('.')
dataset_reader = EXTENSION_MAPPING['read'].get(file_extension)
if dataset_reader is None:
    raise ValueError('Dataset type not supported: {}'.format(file_extension))
df = getattr(pd, dataset_reader)(DATASET_FILE)
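A quick sanity check of the dispatch above, using a made-up file name (illustrative only; it merely exercises the mapping defined in this notebook):
In [ ]:
# Illustrative only: resolve the pandas reader for a hypothetical '.json' file.
demo_path, demo_ext = os.path.splitext('./data/example.json')
demo_reader = EXTENSION_MAPPING['read'].get(demo_ext.strip('.'))
print(demo_reader)               # 'read_json'
print(getattr(pd, demo_reader))  # the bound pandas.read_json function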
In [5]:
df.head()
Out[5]:
In [6]:
target_columns = ['age']
# Feature (independent) columns are everything except the targets; a list
# comprehension keeps the original column order, unlike a set difference.
feature_columns = [column for column in df.columns if column not in target_columns]
In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns], df[target_columns],
    test_size=0.2, random_state=RANDOM_SEED)
In [8]:
# Preprocessing with scikit-learn: fill missing values with the column mean.
required_columns = []
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
if len(required_columns) > 0:
    # Pass columns= as well: assigning a DataFrame aligns on column names,
    # and the default integer columns would otherwise produce NaNs.
    X_train[required_columns] = pd.DataFrame(imputer.fit_transform(X_train[required_columns]), index=X_train.index, columns=required_columns)
    X_test[required_columns] = pd.DataFrame(imputer.transform(X_test[required_columns]), index=X_test.index, columns=required_columns)
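For reference, a minimal sketch of what the mean strategy does, on toy values invented for illustration:
In [ ]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

# Toy data: the NaN in column 'a' is replaced by the column mean, (1 + 3) / 2 = 2.
toy = pd.DataFrame({'a': [1.0, np.nan, 3.0]})
print(SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(toy))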
In [9]:
# Preprocessing with scikit-learn: fill missing values with the column median.
required_columns = []
imputer = SimpleImputer(missing_values=np.nan, strategy="median")
if len(required_columns) > 0:
    X_train[required_columns] = pd.DataFrame(imputer.fit_transform(X_train[required_columns]), index=X_train.index, columns=required_columns)
    X_test[required_columns] = pd.DataFrame(imputer.transform(X_test[required_columns]), index=X_test.index, columns=required_columns)
In [10]:
# Preprocessing with scikit-learn: fill missing values with the most frequent value.
required_columns = []
imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
if len(required_columns) > 0:
    X_train[required_columns] = pd.DataFrame(imputer.fit_transform(X_train[required_columns]), index=X_train.index, columns=required_columns)
    X_test[required_columns] = pd.DataFrame(imputer.transform(X_test[required_columns]), index=X_test.index, columns=required_columns)
In [11]:
# Preprocessing with pandas: fill missing values with a specific constant.
value = 0
required_columns = []
if len(required_columns) > 0:
    X_train[required_columns] = X_train[required_columns].fillna(value)
    X_test[required_columns] = X_test[required_columns].fillna(value)
In [12]:
# Preprocessing with pandas: drop rows with missing values in the required columns.
required_columns = []
if len(required_columns) > 0:
    X_train.dropna(subset=required_columns, inplace=True, how='any')
    X_test.dropna(subset=required_columns, inplace=True, how='any')
    # Keep the targets aligned with the rows that survive the drop.
    y_train = y_train.loc[X_train.index]
    y_test = y_test.loc[X_test.index]
In [13]:
# Non-binary multiclass columns: encode class names as integer labels.
required_columns = []
label_encoders = {}
for column in required_columns:
    label_encoders[column] = LabelEncoder()
    # Note: transform raises on labels that were not seen during fit.
    if column in X_train.columns:
        X_train[column] = label_encoders[column].fit_transform(X_train[column])
        X_test[column] = label_encoders[column].transform(X_test[column])
    elif column in y_train.columns:
        y_train[column] = label_encoders[column].fit_transform(y_train[column])
        y_test[column] = label_encoders[column].transform(y_test[column])
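A small demonstration of LabelEncoder's behaviour on toy labels (the colour values are invented for illustration):
In [ ]:
from sklearn.preprocessing import LabelEncoder

demo_encoder = LabelEncoder()
codes = demo_encoder.fit_transform(['red', 'green', 'red', 'blue'])
print(codes)                                  # [2 1 2 0]: classes are sorted alphabetically
print(demo_encoder.classes_)                  # ['blue' 'green' 'red']
print(demo_encoder.inverse_transform(codes))  # recovers the original labels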
In [14]:
# Binary classification labels: encode the two classes as 0/1 with LabelBinarizer.
# Only a single (binary) column is expected; with 3+ classes LabelBinarizer
# returns one indicator column per class and no longer fits in one column.
required_columns = []
label_binarizer = None
if len(required_columns) > 0:
    column = required_columns[0]
    if column in X_train.columns:
        label_binarizer = LabelBinarizer()
        # fit_transform returns an (n, 1) array for binary labels; flatten it
        # with .ravel() before assigning to a single column.
        X_train[column] = label_binarizer.fit_transform(X_train[column]).ravel()
        X_test[column] = label_binarizer.transform(X_test[column]).ravel()
    elif column in y_train.columns:
        label_binarizer = LabelBinarizer()
        y_train[column] = label_binarizer.fit_transform(y_train[column]).ravel()
        y_test[column] = label_binarizer.transform(y_test[column]).ravel()
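A toy sketch of why the binary case can be flattened into a single column, and what happens with three or more classes (all values invented):
In [ ]:
from sklearn.preprocessing import LabelBinarizer

# Two classes -> a single 0/1 column, which is why .ravel() above is safe.
print(LabelBinarizer().fit_transform(['yes', 'no', 'yes']))   # [[1] [0] [1]]
# Three or more classes -> one indicator column per class, no longer a single column.
print(LabelBinarizer().fit_transform(['a', 'b', 'c']))        # 3x3 indicator matrix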
In [15]:
# Multilabel targets: expand a column of label sets into one indicator column per label.
# Only a single column is expected.
required_columns = []
multi_label_binarizer = None
if len(required_columns) > 0:
    column = required_columns[0]
    if column in y_train.columns:
        multi_label_binarizer = MultiLabelBinarizer()
        # The output is 2-D (one column per label), so it replaces the target frame
        # rather than being assigned back into the original single column.
        y_train = pd.DataFrame(multi_label_binarizer.fit_transform(y_train[column]),
                               index=y_train.index, columns=multi_label_binarizer.classes_)
        y_test = pd.DataFrame(multi_label_binarizer.transform(y_test[column]),
                              index=y_test.index, columns=multi_label_binarizer.classes_)
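A toy demonstration of MultiLabelBinarizer on invented label sets, showing the one-column-per-label expansion:
In [ ]:
from sklearn.preprocessing import MultiLabelBinarizer

demo_mlb = MultiLabelBinarizer()
# Each row holds a *set* of labels; the result has one indicator column per label.
print(demo_mlb.fit_transform([{'news', 'sports'}, {'news'}, set()]))
print(demo_mlb.classes_)   # ['news' 'sports']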
In [16]:
# One-hot encoding of categorical feature columns.
required_columns = []
if len(required_columns) > 0:
    # Encode train and test together so both see the same categories, and use
    # drop_first=True (n-1 dummy columns) to avoid the dummy-variable trap.
    total = pd.get_dummies(pd.concat([X_train, X_test]), columns=required_columns, drop_first=True)
    X_train = total.loc[X_train.index]
    X_test = total.loc[X_test.index]
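A minimal illustration of drop_first on invented data: three categories yield two indicator columns, with the dropped level implied by all-zero indicators:
In [ ]:
import pandas as pd

toy = pd.DataFrame({'colour': ['red', 'green', 'blue', 'green']})
# Three categories produce two indicator columns; 'blue' is the implied
# baseline, encoded by both remaining indicators being zero.
print(pd.get_dummies(toy, columns=['colour'], drop_first=True))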
In [17]:
# Text preprocessing: bag-of-words features with a TF-IDF vectorizer.
# Only a single text column is expected.
required_columns = []
tfidf_vect = None
if len(required_columns) > 0:
    # max_df=0.95 drops terms appearing in more than 95% of the documents;
    # min_df=2 keeps only terms that appear in at least 2 documents.
    tfidf_vect = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)
    column = required_columns[0]
    tfidf_vect.fit(pd.concat([X_train, X_test])[column])
    train_numerical_features = tfidf_vect.transform(X_train[column]).toarray()
    X_train = pd.concat([X_train, pd.DataFrame(train_numerical_features, index=X_train.index).add_prefix('message_')], axis=1)
    test_numerical_features = tfidf_vect.transform(X_test[column]).toarray()
    X_test = pd.concat([X_test, pd.DataFrame(test_numerical_features, index=X_test.index).add_prefix('message_')], axis=1)
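A small sketch of the vectorizer on an invented three-document corpus (max_df/min_df are omitted here because the corpus is tiny):
In [ ]:
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ['the cat sat', 'the cat ran', 'dogs ran fast']
demo_vect = TfidfVectorizer(stop_words='english')
matrix = demo_vect.fit_transform(docs)
print(demo_vect.get_feature_names_out())   # the learned vocabulary (stop words removed)
print(matrix.shape)                        # (3 documents, vocabulary size)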
In [18]:
# Feature selection with the chi-squared test.
max_num_features = None
required_columns = []
selector_chi2 = None
if max_num_features is not None and len(required_columns) > 0:
    # chi2 requires non-negative feature values.
    selector_chi2 = SelectKBest(chi2, k=max_num_features)
    # Keep the reduced matrices: fit on train only, then transform test.
    X_train_selected = selector_chi2.fit_transform(X_train[required_columns], y_train.values.ravel())
    X_test_selected = selector_chi2.transform(X_test[required_columns])
    print(X_train_selected)
    print(X_test_selected)
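A toy demonstration of SelectKBest with chi2 on invented non-negative features:
In [ ]:
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

# Invented counts in which the second feature tracks the class labels inversely.
X_demo = np.array([[1, 9], [2, 8], [9, 1], [8, 2]])
y_demo = np.array([0, 0, 1, 1])
demo_selector = SelectKBest(chi2, k=1)
X_reduced = demo_selector.fit_transform(X_demo, y_demo)
print(demo_selector.scores_)   # one chi-squared statistic per feature
print(X_reduced.shape)         # (4, 1): only the top-scoring feature survives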
In [19]:
# Standardise features with StandardScaler (zero mean, unit variance per column).
required_columns = []
scaler = None
if len(required_columns) > 0:
    scaler = StandardScaler()
    X_train[required_columns] = pd.DataFrame(scaler.fit_transform(X_train[required_columns]), index=X_train.index, columns=required_columns)
    X_test[required_columns] = pd.DataFrame(scaler.transform(X_test[required_columns]), index=X_test.index, columns=required_columns)
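For reference, a minimal demonstration that StandardScaler yields zero mean and unit variance per column (toy values):
In [ ]:
import numpy as np
from sklearn.preprocessing import StandardScaler

scaled = StandardScaler().fit_transform(np.array([[1.0], [2.0], [3.0]]))
print(scaled.mean(axis=0))   # ~0 per column
print(scaled.std(axis=0))    # ~1 per column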
In [20]:
# Normalise samples with Normalizer (rescales each row, not each column, to unit norm).
required_columns = []
normalizer = None
if len(required_columns) > 0:
    normalizer = Normalizer()
    X_train[required_columns] = pd.DataFrame(normalizer.fit_transform(X_train[required_columns]), index=X_train.index, columns=required_columns)
    X_test[required_columns] = pd.DataFrame(normalizer.transform(X_test[required_columns]), index=X_test.index, columns=required_columns)
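A toy check that Normalizer rescales each row to unit L2 norm:
In [ ]:
import numpy as np
from sklearn.preprocessing import Normalizer

normalised = Normalizer().fit_transform(np.array([[3.0, 4.0], [1.0, 0.0]]))
print(normalised)                           # [[0.6 0.8] [1.  0. ]]
print(np.linalg.norm(normalised, axis=1))   # every row now has unit L2 norm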
In [21]:
# Persist the processed train/test splits.
# A UTC Unix timestamp; strftime('%s') is platform-dependent and avoided here.
result_time = str(int(datetime.now(timezone.utc).timestamp()))
save_dataset_fn = EXTENSION_MAPPING['save'].get(file_extension)
getattr(pd.concat([X_train, y_train], axis=1), save_dataset_fn)(os.path.join(RESULT_DIR, '{}.result.train.{}.{}'.format(file_name, result_time, file_extension)))
getattr(pd.concat([X_test, y_test], axis=1), save_dataset_fn)(os.path.join(RESULT_DIR, '{}.result.test.{}.{}'.format(file_name, result_time, file_extension)))
In [22]:
if len(label_encoders) > 0:
    with open(os.path.join(RESULT_DIR, '{}.result.label_encoders.{}.{}.pkl'.format(file_name, result_time, file_extension)), 'wb') as encoder_fp:
        pickle.dump(label_encoders, encoder_fp)
In [23]:
if label_binarizer is not None:
    with open(os.path.join(RESULT_DIR, '{}.result.label_binarizer.{}.{}.pkl'.format(file_name, result_time, file_extension)), 'wb') as encoder_fp:
        pickle.dump(label_binarizer, encoder_fp)
In [24]:
if multi_label_binarizer is not None:
    with open(os.path.join(RESULT_DIR, '{}.result.multi_label_binarizer.{}.{}.pkl'.format(file_name, result_time, file_extension)), 'wb') as encoder_fp:
        pickle.dump(multi_label_binarizer, encoder_fp)
In [25]:
if scaler is not None:
    with open(os.path.join(RESULT_DIR, '{}.result.scaler.{}.{}.pkl'.format(file_name, result_time, file_extension)), 'wb') as encoder_fp:
        pickle.dump(scaler, encoder_fp)
In [26]:
if normalizer is not None:
    with open(os.path.join(RESULT_DIR, '{}.result.normalizer.{}.{}.pkl'.format(file_name, result_time, file_extension)), 'wb') as encoder_fp:
        pickle.dump(normalizer, encoder_fp)
In [27]:
if tfidf_vect is not None:
    with open(os.path.join(RESULT_DIR, '{}.result.tfidf_vect.{}.{}.pkl'.format(file_name, result_time, file_extension)), 'wb') as encoder_fp:
        pickle.dump(tfidf_vect, encoder_fp)
In [28]:
if selector_chi2 is not None:
    with open(os.path.join(RESULT_DIR, '{}.result.selector_chi2.{}.{}.pkl'.format(file_name, result_time, file_extension)), 'wb') as encoder_fp:
        pickle.dump(selector_chi2, encoder_fp)
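Finally, a hedged sketch of the pickle round trip these cells rely on, using an in-memory transformer rather than the files written above:
In [ ]:
import pickle
from sklearn.preprocessing import StandardScaler

# Dump a fitted transformer to bytes and load it back unchanged.
demo_scaler = StandardScaler().fit([[0.0], [10.0]])
restored = pickle.loads(pickle.dumps(demo_scaler))
print(restored.transform([[5.0]]))   # [[0.]], identical to demo_scaler.transform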