Master CSV: Store all Features as a Single File

Imports

The `pygoose` utility package imports `numpy`, `pandas`, `matplotlib` and a helper `kg` module into the root namespace.


In [ ]:
from pygoose import *

Config


In [ ]:
# Obtain the project handle; later cells read `project.features_dir` and call
# `project.load_feature_lists` on it.
# NOTE(review): how `discover()` locates the project is not visible here — confirm in pygoose.
project = kg.Project.discover()

Read Data

Load all feature lists.


In [ ]:
# Names of the feature lists to combine, assembled group by group.
# The concatenation order is the order handed to `project.load_feature_lists`.
feature_lists = (
    # Lists without a source prefix.
    [
        'simple_summaries',
        'jaccard_ngrams',
        'fuzzy',
        'tfidf',
        'lda',
        'nlp_tags',
        'wordnet_similarity',
        'phrase_embedding',
        'wmd',
        'wm_intersect',
    ]
    # Lists carrying the `3rdparty_` prefix.
    + [
        '3rdparty_abhishek',
        '3rdparty_dasolmar_whq',
        '3rdparty_mephistopheies',
        '3rdparty_image_similarity',
    ]
    # Lists carrying the `magic_` prefix.
    + [
        'magic_pagerank',
        'magic_frequencies',
        'magic_cooccurrence_matrix',
    ]
    # Lists carrying the `oofp_` prefix.
    + [
        'oofp_nn_mlp_with_magic',
        'oofp_nn_cnn_with_magic',
        'oofp_nn_bi_lstm_with_magic',
        'oofp_nn_siamese_lstm_attention',
    ]
)

In [ ]:
# Load the feature lists named in `feature_lists` into a train and a test frame.
# `feature_list_ix` is tabulated further down with the columns
# (feature_list, start_index, end_index); presumably each row is one list's
# column span in the combined frames — TODO confirm against pygoose.
df_train, df_test, feature_list_ix = project.load_feature_lists(feature_lists)

In [ ]:
# Load the training target from `y_train.pickle` in the features directory.
# NOTE(review): the file name is appended by plain string concatenation, so this
# assumes `project.features_dir` already ends with a path separator — confirm.
y_train = kg.io.load(project.features_dir + 'y_train.pickle')

View feature summary.


In [ ]:
# Report the dimensions of the training features, the training target and
# the test features, one line each.
for caption, dataset in (
    ('X train:', df_train),
    ('y train:', y_train),
    ('X test: ', df_test),
):
    print(caption, dataset.shape)

In [ ]:
# Show `feature_list_ix` as a table, one row per feature list.
pd.DataFrame(
    feature_list_ix,
    columns=[
        'feature_list',
        'start_index',
        'end_index',
    ],
)

Train

Append the target to the training set.


In [ ]:
# Fail fast if the target and the feature frame disagree on row count.
# If `y_train` is a pandas Series, a mismatched assignment is aligned by index
# and filled with NaN rather than raising, which would silently corrupt the
# exported labels.
assert len(y_train) == len(df_train), (
    'y_train has {} rows but df_train has {}'.format(len(y_train), len(df_train))
)

# Attach the target as an extra `is_duplicate` column (modifies `df_train` in place).
df_train['is_duplicate'] = y_train

In [ ]:
# Preview the first five rows to check that the `is_duplicate` column was appended.
df_train.head(5)

In [ ]:
# Destination of the combined training CSV inside the features directory.
train_csv_path = project.features_dir + 'X_train_all_features.csv'

# Write the training frame with a header row; the index is written as the
# `id` column and floats are formatted with six decimal places.
df_train.to_csv(
    train_csv_path,
    float_format='%.6f',
    index_label='id',
    index=True,
    header=True,
)

Test


In [ ]:
# Preview the first five rows of the test features before exporting them.
df_test.head(5)

In [ ]:
# Destination of the combined test CSV inside the features directory.
test_csv_path = project.features_dir + 'X_test_all_features.csv'

# Write the test frame with a header row; the index is written as the
# `id` column and floats are formatted with six decimal places.
df_test.to_csv(
    test_csv_path,
    float_format='%.6f',
    index_label='id',
    index=True,
    header=True,
)