The `pygoose` utility package imports numpy, pandas, matplotlib, and a helper `kg` module into the root namespace.
In [ ]:
from pygoose import *
In [ ]:
# Obtain the project handle; later cells use it to load feature lists and to
# resolve the features directory path.
# NOTE(review): how `discover()` locates the project is defined in pygoose, not here.
project = kg.Project.discover()
Load all feature lists.
In [ ]:
# Names of the feature lists to load, in the order they are passed to
# `project.load_feature_lists`. Keep the order stable: it is handed to the
# loader as-is. Grouping comments below only reflect the naming prefixes.
feature_lists = [
    # Text-derived feature lists.
    'simple_summaries',
    'jaccard_ngrams',
    'fuzzy',
    'tfidf',
    'lda',
    'nlp_tags',
    'wordnet_similarity',
    'phrase_embedding',
    'wmd',
    'wm_intersect',

    # Lists carrying the `3rdparty_` prefix.
    '3rdparty_abhishek',
    '3rdparty_dasolmar_whq',
    '3rdparty_mephistopheies',
    '3rdparty_image_similarity',

    # Lists carrying the `magic_` prefix.
    'magic_pagerank',
    'magic_frequencies',
    'magic_cooccurrence_matrix',

    # Lists carrying the `oofp_nn_` prefix
    # (presumably out-of-fold neural-network predictions; confirm).
    'oofp_nn_mlp_with_magic',
    'oofp_nn_cnn_with_magic',
    'oofp_nn_bi_lstm_with_magic',
    'oofp_nn_siamese_lstm_attention',
]
In [ ]:
# Load every list named in `feature_lists`. The call yields the training
# frame, the test frame, and a feature-list index that is tabulated further
# down as (feature_list, start_index, end_index).
df_train, df_test, feature_list_ix = project.load_feature_lists(feature_lists)
In [ ]:
# Load the training target from `y_train.pickle` in the project's features directory.
# NOTE(review): presumably aligned row-for-row with df_train; confirm before relying on it.
y_train = kg.io.load(project.features_dir + 'y_train.pickle')
View feature summary.
In [ ]:
# Report the shapes of the loaded training features, training target, and test features.
# The trailing space in 'X test: ' keeps the three labels the same width in the output.
print('X train:', df_train.shape)
print('y train:', y_train.shape)
print('X test: ', df_test.shape)
In [ ]:
# Display the feature-list index as a table with named columns.
# NOTE(review): start/end presumably delimit each list's column range in the
# combined frame; confirm against `load_feature_lists`.
pd.DataFrame(feature_list_ix, columns=['feature_list', 'start_index', 'end_index'])
Append the target to the training set.
In [ ]:
# Attach the target as an `is_duplicate` column so the training CSV exported
# below carries the labels alongside the features. Mutates df_train in place.
df_train['is_duplicate'] = y_train
In [ ]:
# Preview the first five training rows before exporting.
df_train.head(5)
In [ ]:
# Export the training frame (which by now includes the `is_duplicate` column
# appended earlier in this notebook) to a CSV in the features directory.
df_train.to_csv(
    project.features_dir + 'X_train_all_features.csv',
    # Write the index as the first column, labelled `id`.
    index=True,
    index_label='id',
    # Include the column-name row.
    header=True,
    # Fix floating-point values at six digits after the decimal point.
    float_format='%.6f',
)
In [ ]:
# Preview the first five test rows before exporting.
df_test.head(5)
In [ ]:
# Export the test frame to a CSV in the features directory, using the same
# layout options as the training export.
df_test.to_csv(
    project.features_dir + 'X_test_all_features.csv',
    # Write the index as the first column, labelled `id`.
    index=True,
    index_label='id',
    # Include the column-name row.
    header=True,
    # Fix floating-point values at six digits after the decimal point.
    float_format='%.6f',
)