Explore Pairwise Feature Correlations & Distributions

Imports


In [1]:
from pygoose import *

Config


In [2]:
# Obtain the pygoose project handle. Later cells use it to load feature lists
# (`project.load_feature_lists`) and to build paths under `project.features_dir`.
# NOTE(review): `discover()` presumably locates the project root from the
# current working directory — confirm against the pygoose documentation.
project = kg.Project.discover()

Read Data


In [3]:
# Names of the saved feature lists to load, in the order their columns should
# appear in the combined frame. Grouped by naming prefix for readability; the
# resulting list is a single flat list of 17 names.
feature_lists = (
    # Text-derived features (lengths, n-gram overlap, fuzzy matching,
    # TF-IDF / LDA distances, POS/NER tags, WordNet, embeddings, WMD).
    [
        'simple_summaries',
        'jaccard_ngrams',
        'fuzzy',
        'tfidf',
        'lda',
        'nlp_tags',
        'wordnet_similarity',
        'phrase_embedding',
        'wmd',
        'wm_intersect',
    ]
    # Feature lists carrying the `magic_` prefix.
    + [
        'magic_pagerank',
        'magic_frequencies',
        'magic_cooccurrence_matrix',
    ]
    # Feature lists carrying the `oofp_nn_` prefix
    # (presumably out-of-fold neural-network predictions — TODO confirm).
    + [
        'oofp_nn_mlp_with_magic',
        'oofp_nn_cnn_with_magic',
        'oofp_nn_bi_lstm_with_magic',
        'oofp_nn_siamese_lstm_attention',
    ]
)

In [4]:
# Load every feature list named in `feature_lists` through the project handle.
# The call returns three values, unpacked as the training frame, the test
# frame, and `feature_ranges` (presumably the column span each feature list
# occupies in the combined frames — TODO confirm against pygoose).
df_train, df_test, feature_ranges = project.load_feature_lists(
    feature_lists,
)

In [5]:
# Attach the training labels to the feature frame as a `target` column.
# NOTE: this mutates `df_train` in place; the cells below (`describe`, the
# correlation heatmap) see `target` as an extra, trailing column.
# The path is built by plain string concatenation, so `project.features_dir`
# is assumed to already end with a path separator — TODO confirm.
# NOTE(review): the file is a `.pickle`; `kg.io.load` presumably unpickles it,
# so only load files from a trusted source.
df_train['target'] = kg.io.load(project.features_dir + 'y_train.pickle')

Explore


In [6]:
# Summary statistics for every numeric column, transposed so that each
# feature is a row and each statistic (count, mean, std, ...) is a column.
df_train.describe().transpose()


Out[6]:
count mean std min 25% 50% 75% max
shorter_char_len_log 404290.000000 3.839168 0.412446 0.000000 3.583519 3.806662 4.077537 5.872118
longer_char_len_log 404290.000000 4.156210 0.447655 1.386294 3.850148 4.110874 4.454347 7.064759
char_len_diff_log 404290.000000 2.482964 1.115496 0.000000 1.609438 2.564949 3.295837 6.985642
char_len_ratio 404290.000000 0.753405 0.187965 0.000000 0.631579 0.790123 0.910891 1.000000
shorter_token_len_log 404290.000000 2.028468 0.318315 0.693147 1.791759 1.945910 2.197225 3.828641
longer_token_len_log 404290.000000 2.265450 0.375445 1.098612 1.945910 2.197225 2.484907 4.955827
token_len_diff_log 404290.000000 0.927092 0.724175 0.000000 0.693147 0.693147 1.386294 4.882802
token_len_ratio 404290.000000 0.790111 0.180231 0.047619 0.666667 0.833333 0.937500 1.000000
word_diff_ratio 404290.000000 0.419394 0.241534 0.000000 0.230769 0.400000 0.600000 0.964286
jaccard_ix_2gram 404290.000000 0.490047 0.206499 0.000000 0.337838 0.465753 0.634615 1.000000
jaccard_ix_norm_q1_2gram 404290.000000 0.645478 0.205335 0.000000 0.500000 0.656250 0.806452 1.000000
jaccard_ix_norm_q2_2gram 404290.000000 0.647694 0.204177 0.000000 0.500000 0.655172 0.807692 1.000000
jaccard_ix_3gram 404290.000000 0.363329 0.224465 0.000000 0.190083 0.325000 0.510417 1.000000
jaccard_ix_norm_q1_3gram 404290.000000 0.507695 0.248321 0.000000 0.322917 0.500000 0.700000 1.000000
jaccard_ix_norm_q2_3gram 404290.000000 0.509297 0.249370 0.000000 0.321429 0.500000 0.703125 1.000000
jaccard_ix_4gram 404290.000000 0.311363 0.222294 0.000000 0.137931 0.265060 0.446429 1.000000
jaccard_ix_norm_q1_4gram 404290.000000 0.445527 0.256115 0.000000 0.246377 0.430769 0.638889 1.000000
jaccard_ix_norm_q2_4gram 404290.000000 0.447284 0.257838 0.000000 0.245902 0.431034 0.641509 1.000000
jaccard_ix_5gram 404290.000000 0.272967 0.218302 0.000000 0.102941 0.219512 0.394737 1.000000
jaccard_ix_norm_q1_5gram 404290.000000 0.397291 0.258410 0.000000 0.190141 0.369565 0.585366 1.000000
jaccard_ix_norm_q2_5gram 404290.000000 0.399191 0.260414 0.000000 0.189873 0.370370 0.588235 1.000000
jaccard_ix_diff_2_3 404290.000000 0.126758 0.050682 0.000000 0.092414 0.127150 0.161093 0.386243
jaccard_ix_diff_3_4 404290.000000 0.052029 0.027240 0.000000 0.032967 0.050296 0.068966 0.308081
jaccard_ix_diff_4_5 404290.000000 0.038427 0.022740 0.000000 0.021739 0.036630 0.052313 0.217035
fuzz_ratio 404290.000000 0.703729 0.149434 0.040000 0.600000 0.700000 0.820000 1.000000
fuzz_partial_ratio 404290.000000 0.745187 0.113950 0.290000 0.650000 0.730000 0.830000 1.000000
fuzz_token_sort_ratio 404290.000000 0.641560 0.167938 0.000000 0.520000 0.640000 0.770000 1.000000
fuzz_token_set_ratio 404290.000000 0.734267 0.180913 0.000000 0.600000 0.750000 0.890000 1.000000
fuzz_partial_token_sort_ratio 404290.000000 0.674988 0.148020 0.000000 0.560000 0.670000 0.790000 1.000000
jaro 404290.000000 0.731897 0.100242 0.000000 0.663919 0.722668 0.793774 1.000000
jaro_winkler 404290.000000 0.767030 0.123626 0.000000 0.663919 0.765536 0.873223 1.000000
tfidf_cosine 404290.000000 0.495047 0.286285 0.000000 0.259862 0.486038 0.703605 1.000000
tfidf_euclidean 404290.000000 0.935267 0.339532 0.000000 0.720919 0.985939 1.186191 1.414214
lda_cosine 404290.000000 0.382863 0.313404 0.000000 0.106059 0.315955 0.608643 0.999209
lda_euclidean 404290.000000 0.380960 0.201139 0.000000 0.254028 0.376629 0.503320 1.271443
pos_q1_adj 404290.000000 1.053689 1.053692 0.000000 0.000000 1.000000 2.000000 16.000000
pos_q1_adv 404290.000000 0.721272 0.846085 0.000000 0.000000 1.000000 1.000000 12.000000
pos_q1_noun 404290.000000 2.813901 1.763558 0.000000 2.000000 3.000000 4.000000 32.000000
pos_q1_propn 404290.000000 0.886512 1.329234 0.000000 0.000000 0.000000 1.000000 28.000000
pos_q1_num 404290.000000 0.433874 1.455259 0.000000 0.000000 0.000000 0.000000 44.000000
pos_q1_verb 404290.000000 2.380331 1.443083 0.000000 1.000000 2.000000 3.000000 24.000000
ner_q1_gpe 404290.000000 0.176025 0.458078 0.000000 0.000000 0.000000 0.000000 10.000000
ner_q1_loc 404290.000000 0.015162 0.128628 0.000000 0.000000 0.000000 0.000000 4.000000
ner_q1_org 404290.000000 0.206109 0.477373 0.000000 0.000000 0.000000 0.000000 6.000000
ner_q1_norp 404290.000000 0.051663 0.260826 0.000000 0.000000 0.000000 0.000000 8.000000
ner_q1_person 404290.000000 0.119763 0.369682 0.000000 0.000000 0.000000 0.000000 6.000000
ner_q1_product 404290.000000 0.003008 0.056275 0.000000 0.000000 0.000000 0.000000 3.000000
ner_q1_date 404290.000000 0.047859 0.233279 0.000000 0.000000 0.000000 0.000000 6.000000
ner_q1_time 404290.000000 0.008118 0.096404 0.000000 0.000000 0.000000 0.000000 3.000000
ner_q1_quantity 404290.000000 0.008595 0.098231 0.000000 0.000000 0.000000 0.000000 5.000000
ner_q1_cardinal 404290.000000 0.214131 0.756232 0.000000 0.000000 0.000000 0.000000 29.000000
pos_q2_adj 404290.000000 1.076663 1.103128 0.000000 0.000000 1.000000 2.000000 26.000000
pos_q2_adv 404290.000000 0.742385 0.882115 0.000000 0.000000 1.000000 1.000000 18.000000
pos_q2_noun 404290.000000 2.810767 1.840310 0.000000 2.000000 3.000000 4.000000 42.000000
pos_q2_propn 404290.000000 0.886416 1.332241 0.000000 0.000000 0.000000 1.000000 39.000000
pos_q2_num 404290.000000 0.447805 1.475642 0.000000 0.000000 0.000000 0.000000 39.000000
pos_q2_verb 404290.000000 2.461686 1.638655 0.000000 1.000000 2.000000 3.000000 59.000000
ner_q2_gpe 404290.000000 0.178575 0.465500 0.000000 0.000000 0.000000 0.000000 9.000000
ner_q2_loc 404290.000000 0.015363 0.130694 0.000000 0.000000 0.000000 0.000000 4.000000
ner_q2_org 404290.000000 0.206676 0.481635 0.000000 0.000000 0.000000 0.000000 8.000000
ner_q2_norp 404290.000000 0.051560 0.261434 0.000000 0.000000 0.000000 0.000000 8.000000
ner_q2_person 404290.000000 0.118064 0.366384 0.000000 0.000000 0.000000 0.000000 7.000000
ner_q2_product 404290.000000 0.003094 0.056643 0.000000 0.000000 0.000000 0.000000 3.000000
ner_q2_date 404290.000000 0.051624 0.245515 0.000000 0.000000 0.000000 0.000000 7.000000
ner_q2_time 404290.000000 0.007957 0.096019 0.000000 0.000000 0.000000 0.000000 6.000000
ner_q2_quantity 404290.000000 0.008793 0.098641 0.000000 0.000000 0.000000 0.000000 4.000000
ner_q2_cardinal 404290.000000 0.219877 0.767847 0.000000 0.000000 0.000000 0.000000 21.000000
pos_tag_cosine 404259.000000 0.128317 0.139989 -0.000000 0.031335 0.083333 0.177794 1.000000
pos_tag_euclidean 404290.000000 2.625738 2.159862 0.000000 1.414214 2.236068 3.464102 69.728043
ner_tag_euclidean 404290.000000 0.653005 0.912025 0.000000 0.000000 0.000000 1.000000 28.017851
ner_tag_count_diff 404290.000000 0.541168 0.933578 0.000000 0.000000 0.000000 1.000000 27.000000
wordnet_similarity_raw 404271.000000 0.567604 0.202009 0.000000 0.419883 0.559209 0.718843 1.000000
wordnet_similarity_brown 404271.000000 0.579054 0.223361 0.000000 0.414539 0.594537 0.759180 1.000000
phrase_emb_mean_cosine 404290.000000 0.135807 0.108565 -0.000000 0.059154 0.109366 0.178982 0.813473
phrase_emb_mean_cityblock_log 404290.000000 2.609448 0.600744 0.000000 2.427022 2.705685 2.940278 4.639107
phrase_emb_mean_euclidean 404290.000000 1.040869 0.460618 0.000000 0.746320 1.009677 1.295476 7.370496
phrase_emb_normsum_cosine 404290.000000 0.135807 0.108565 -0.000000 0.059154 0.109366 0.178982 0.813473
phrase_emb_normsum_cityblock_log 404290.000000 1.938521 0.486354 0.000000 1.750216 2.010682 2.227286 2.923522
phrase_emb_normsum_euclidean 404290.000000 0.478727 0.205997 0.000000 0.343958 0.467688 0.598301 1.275518
wmd 404290.000000 1.964857 1.096723 0.000000 1.106430 1.851734 2.716353 10.444951
q1_q2_intersect 404290.000000 1.892211 5.689603 0.000000 0.000000 0.000000 1.000000 75.000000
q1_q2_wm_ratio 404290.000000 0.151356 0.271282 0.000000 0.000000 0.000000 0.234043 1.000000
pagerank_q1 404290.000000 0.000295 0.000289 0.000039 0.000209 0.000209 0.000305 0.012546
pagerank_q2 404290.000000 0.000310 0.000470 0.000039 0.000209 0.000209 0.000305 0.012546
magic_freq_q1 404290.000000 5.122924 15.508751 1.000000 1.000000 2.000000 4.000000 2744.000000
magic_freq_q2 404290.000000 5.585013 17.648034 1.000000 1.000000 2.000000 4.000000 2744.000000
magic_freq_q1_q2_ratio 404290.000000 1.620231 9.046853 0.000364 0.750000 1.000000 1.500000 2744.000000
magic_freq_q2_q1_ratio 404290.000000 1.813279 9.171552 0.000364 0.666667 1.000000 1.333333 2744.000000
magic_comatrix_cosine 404290.000000 0.851011 0.265152 0.013889 0.750000 1.000000 1.000000 1.000000
magic_comatrix_euclidean 404290.000000 2.231701 1.214244 1.414214 1.414214 1.732051 2.449490 17.776389
magic_comatrix_svd_cosine 404290.000000 0.549098 0.630828 0.000000 0.000000 0.256272 1.069122 1.992126
magic_comatrix_svd_euclidean 404290.000000 0.112156 0.627414 0.000000 0.000000 0.000000 0.000000 15.120028
magic_comatrix_svd_manhattan 404290.000000 0.144463 0.941726 0.000000 0.000000 0.000000 0.000000 31.659183
oofp_nn_mlp_with_magic 404290.000000 0.364452 0.375347 0.000000 0.036410 0.199208 0.691840 1.000000
oofp_nn_cnn_with_magic 404290.000000 0.378076 0.389407 0.000000 0.027116 0.195153 0.780654 1.000000
oofp_nn_bi_lstm_with_magic 404290.000000 0.391990 0.391737 0.000000 0.020232 0.229321 0.808289 1.000000
oofp_nn_siamese_lstm_attention 404290.000000 0.408822 0.388498 0.000000 0.017568 0.289125 0.822654 1.000000
target 404290.000000 0.369198 0.482588 0.000000 0.000000 0.000000 1.000000 1.000000

In [7]:
# Plot the pairwise correlation heatmap over the feature columns only and
# save it as `eda_heatmap.png` under the project's features directory.
#
# The label column is excluded by name rather than by position (`[:-1]`):
# a positional slice silently drops a real feature whenever `target` is not
# the last column (e.g. the cell that adds it was skipped or columns were
# reordered). With `target` last, the selected columns are unchanged.
kg.eda.plot_feature_correlation_heatmap(
    df_train,
    [column for column in df_train.columns if column != 'target'],
    font_size=3,
    save_filename=project.features_dir + 'eda_heatmap.png'
)