In [1]:
from pygoose import *
In [2]:
# Discover the project's directory layout (e.g. project.features_dir used below).
project = kg.Project.discover()
In [3]:
feature_lists = [
    'simple_summaries',
    'jaccard_ngrams',
    'fuzzy',
    'tfidf',
    'lda',
    'nlp_tags',
    'wordnet_similarity',
    'phrase_embedding',
    'wmd',
    'wm_intersect',
    'magic_pagerank',
    'magic_frequencies',
    'magic_cooccurrence_matrix',
    'oofp_nn_mlp_with_magic',
    'oofp_nn_cnn_with_magic',
    'oofp_nn_bi_lstm_with_magic',
    'oofp_nn_siamese_lstm_attention',
]
In [4]:
# Load the precomputed feature lists and combine them into train/test feature matrices.
df_train, df_test, feature_ranges = project.load_feature_lists(feature_lists)
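A quick way to sanity-check what was just assembled is to look at the shapes of the combined frames and at feature_ranges, which presumably records how the individual lists map onto the resulting columns. A minimal sketch, assuming only that df_train and df_test are pandas DataFrames:

    # Shapes of the assembled train/test feature matrices.
    print('train:', df_train.shape, 'test:', df_test.shape)

    # How the loaded feature lists map onto the columns of the combined frames.
    print(feature_ranges)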
In [5]:
# Attach the training labels to the feature matrix.
df_train['target'] = kg.io.load(project.features_dir + 'y_train.pickle')
In [6]:
df_train.describe().T
Out[6]:
                                          count      mean       std       min       25%       50%       75%       max
shorter_char_len_log              404290.000000  3.839168  0.412446  0.000000  3.583519  3.806662  4.077537  5.872118
longer_char_len_log               404290.000000  4.156210  0.447655  1.386294  3.850148  4.110874  4.454347  7.064759
char_len_diff_log                 404290.000000  2.482964  1.115496  0.000000  1.609438  2.564949  3.295837  6.985642
char_len_ratio                    404290.000000  0.753405  0.187965  0.000000  0.631579  0.790123  0.910891  1.000000
shorter_token_len_log             404290.000000  2.028468  0.318315  0.693147  1.791759  1.945910  2.197225  3.828641
longer_token_len_log              404290.000000  2.265450  0.375445  1.098612  1.945910  2.197225  2.484907  4.955827
token_len_diff_log                404290.000000  0.927092  0.724175  0.000000  0.693147  0.693147  1.386294  4.882802
token_len_ratio                   404290.000000  0.790111  0.180231  0.047619  0.666667  0.833333  0.937500  1.000000
word_diff_ratio                   404290.000000  0.419394  0.241534  0.000000  0.230769  0.400000  0.600000  0.964286
jaccard_ix_2gram                  404290.000000  0.490047  0.206499  0.000000  0.337838  0.465753  0.634615  1.000000
jaccard_ix_norm_q1_2gram          404290.000000  0.645478  0.205335  0.000000  0.500000  0.656250  0.806452  1.000000
jaccard_ix_norm_q2_2gram          404290.000000  0.647694  0.204177  0.000000  0.500000  0.655172  0.807692  1.000000
jaccard_ix_3gram                  404290.000000  0.363329  0.224465  0.000000  0.190083  0.325000  0.510417  1.000000
jaccard_ix_norm_q1_3gram          404290.000000  0.507695  0.248321  0.000000  0.322917  0.500000  0.700000  1.000000
jaccard_ix_norm_q2_3gram          404290.000000  0.509297  0.249370  0.000000  0.321429  0.500000  0.703125  1.000000
jaccard_ix_4gram                  404290.000000  0.311363  0.222294  0.000000  0.137931  0.265060  0.446429  1.000000
jaccard_ix_norm_q1_4gram          404290.000000  0.445527  0.256115  0.000000  0.246377  0.430769  0.638889  1.000000
jaccard_ix_norm_q2_4gram          404290.000000  0.447284  0.257838  0.000000  0.245902  0.431034  0.641509  1.000000
jaccard_ix_5gram                  404290.000000  0.272967  0.218302  0.000000  0.102941  0.219512  0.394737  1.000000
jaccard_ix_norm_q1_5gram          404290.000000  0.397291  0.258410  0.000000  0.190141  0.369565  0.585366  1.000000
jaccard_ix_norm_q2_5gram          404290.000000  0.399191  0.260414  0.000000  0.189873  0.370370  0.588235  1.000000
jaccard_ix_diff_2_3               404290.000000  0.126758  0.050682  0.000000  0.092414  0.127150  0.161093  0.386243
jaccard_ix_diff_3_4               404290.000000  0.052029  0.027240  0.000000  0.032967  0.050296  0.068966  0.308081
jaccard_ix_diff_4_5               404290.000000  0.038427  0.022740  0.000000  0.021739  0.036630  0.052313  0.217035
fuzz_ratio                        404290.000000  0.703729  0.149434  0.040000  0.600000  0.700000  0.820000  1.000000
fuzz_partial_ratio                404290.000000  0.745187  0.113950  0.290000  0.650000  0.730000  0.830000  1.000000
fuzz_token_sort_ratio             404290.000000  0.641560  0.167938  0.000000  0.520000  0.640000  0.770000  1.000000
fuzz_token_set_ratio              404290.000000  0.734267  0.180913  0.000000  0.600000  0.750000  0.890000  1.000000
fuzz_partial_token_sort_ratio     404290.000000  0.674988  0.148020  0.000000  0.560000  0.670000  0.790000  1.000000
jaro                              404290.000000  0.731897  0.100242  0.000000  0.663919  0.722668  0.793774  1.000000
jaro_winkler                      404290.000000  0.767030  0.123626  0.000000  0.663919  0.765536  0.873223  1.000000
tfidf_cosine                      404290.000000  0.495047  0.286285  0.000000  0.259862  0.486038  0.703605  1.000000
tfidf_euclidean                   404290.000000  0.935267  0.339532  0.000000  0.720919  0.985939  1.186191  1.414214
lda_cosine                        404290.000000  0.382863  0.313404  0.000000  0.106059  0.315955  0.608643  0.999209
lda_euclidean                     404290.000000  0.380960  0.201139  0.000000  0.254028  0.376629  0.503320  1.271443
pos_q1_adj                        404290.000000  1.053689  1.053692  0.000000  0.000000  1.000000  2.000000  16.000000
pos_q1_adv                        404290.000000  0.721272  0.846085  0.000000  0.000000  1.000000  1.000000  12.000000
pos_q1_noun                       404290.000000  2.813901  1.763558  0.000000  2.000000  3.000000  4.000000  32.000000
pos_q1_propn                      404290.000000  0.886512  1.329234  0.000000  0.000000  0.000000  1.000000  28.000000
pos_q1_num                        404290.000000  0.433874  1.455259  0.000000  0.000000  0.000000  0.000000  44.000000
pos_q1_verb                       404290.000000  2.380331  1.443083  0.000000  1.000000  2.000000  3.000000  24.000000
ner_q1_gpe                        404290.000000  0.176025  0.458078  0.000000  0.000000  0.000000  0.000000  10.000000
ner_q1_loc                        404290.000000  0.015162  0.128628  0.000000  0.000000  0.000000  0.000000  4.000000
ner_q1_org                        404290.000000  0.206109  0.477373  0.000000  0.000000  0.000000  0.000000  6.000000
ner_q1_norp                       404290.000000  0.051663  0.260826  0.000000  0.000000  0.000000  0.000000  8.000000
ner_q1_person                     404290.000000  0.119763  0.369682  0.000000  0.000000  0.000000  0.000000  6.000000
ner_q1_product                    404290.000000  0.003008  0.056275  0.000000  0.000000  0.000000  0.000000  3.000000
ner_q1_date                       404290.000000  0.047859  0.233279  0.000000  0.000000  0.000000  0.000000  6.000000
ner_q1_time                       404290.000000  0.008118  0.096404  0.000000  0.000000  0.000000  0.000000  3.000000
ner_q1_quantity                   404290.000000  0.008595  0.098231  0.000000  0.000000  0.000000  0.000000  5.000000
ner_q1_cardinal                   404290.000000  0.214131  0.756232  0.000000  0.000000  0.000000  0.000000  29.000000
pos_q2_adj                        404290.000000  1.076663  1.103128  0.000000  0.000000  1.000000  2.000000  26.000000
pos_q2_adv                        404290.000000  0.742385  0.882115  0.000000  0.000000  1.000000  1.000000  18.000000
pos_q2_noun                       404290.000000  2.810767  1.840310  0.000000  2.000000  3.000000  4.000000  42.000000
pos_q2_propn                      404290.000000  0.886416  1.332241  0.000000  0.000000  0.000000  1.000000  39.000000
pos_q2_num                        404290.000000  0.447805  1.475642  0.000000  0.000000  0.000000  0.000000  39.000000
pos_q2_verb                       404290.000000  2.461686  1.638655  0.000000  1.000000  2.000000  3.000000  59.000000
ner_q2_gpe                        404290.000000  0.178575  0.465500  0.000000  0.000000  0.000000  0.000000  9.000000
ner_q2_loc                        404290.000000  0.015363  0.130694  0.000000  0.000000  0.000000  0.000000  4.000000
ner_q2_org                        404290.000000  0.206676  0.481635  0.000000  0.000000  0.000000  0.000000  8.000000
ner_q2_norp                       404290.000000  0.051560  0.261434  0.000000  0.000000  0.000000  0.000000  8.000000
ner_q2_person                     404290.000000  0.118064  0.366384  0.000000  0.000000  0.000000  0.000000  7.000000
ner_q2_product                    404290.000000  0.003094  0.056643  0.000000  0.000000  0.000000  0.000000  3.000000
ner_q2_date                       404290.000000  0.051624  0.245515  0.000000  0.000000  0.000000  0.000000  7.000000
ner_q2_time                       404290.000000  0.007957  0.096019  0.000000  0.000000  0.000000  0.000000  6.000000
ner_q2_quantity                   404290.000000  0.008793  0.098641  0.000000  0.000000  0.000000  0.000000  4.000000
ner_q2_cardinal                   404290.000000  0.219877  0.767847  0.000000  0.000000  0.000000  0.000000  21.000000
pos_tag_cosine                    404259.000000  0.128317  0.139989  -0.000000  0.031335  0.083333  0.177794  1.000000
pos_tag_euclidean                 404290.000000  2.625738  2.159862  0.000000  1.414214  2.236068  3.464102  69.728043
ner_tag_euclidean                 404290.000000  0.653005  0.912025  0.000000  0.000000  0.000000  1.000000  28.017851
ner_tag_count_diff                404290.000000  0.541168  0.933578  0.000000  0.000000  0.000000  1.000000  27.000000
wordnet_similarity_raw            404271.000000  0.567604  0.202009  0.000000  0.419883  0.559209  0.718843  1.000000
wordnet_similarity_brown          404271.000000  0.579054  0.223361  0.000000  0.414539  0.594537  0.759180  1.000000
phrase_emb_mean_cosine            404290.000000  0.135807  0.108565  -0.000000  0.059154  0.109366  0.178982  0.813473
phrase_emb_mean_cityblock_log     404290.000000  2.609448  0.600744  0.000000  2.427022  2.705685  2.940278  4.639107
phrase_emb_mean_euclidean         404290.000000  1.040869  0.460618  0.000000  0.746320  1.009677  1.295476  7.370496
phrase_emb_normsum_cosine         404290.000000  0.135807  0.108565  -0.000000  0.059154  0.109366  0.178982  0.813473
phrase_emb_normsum_cityblock_log  404290.000000  1.938521  0.486354  0.000000  1.750216  2.010682  2.227286  2.923522
phrase_emb_normsum_euclidean      404290.000000  0.478727  0.205997  0.000000  0.343958  0.467688  0.598301  1.275518
wmd                               404290.000000  1.964857  1.096723  0.000000  1.106430  1.851734  2.716353  10.444951
q1_q2_intersect                   404290.000000  1.892211  5.689603  0.000000  0.000000  0.000000  1.000000  75.000000
q1_q2_wm_ratio                    404290.000000  0.151356  0.271282  0.000000  0.000000  0.000000  0.234043  1.000000
pagerank_q1                       404290.000000  0.000295  0.000289  0.000039  0.000209  0.000209  0.000305  0.012546
pagerank_q2                       404290.000000  0.000310  0.000470  0.000039  0.000209  0.000209  0.000305  0.012546
magic_freq_q1                     404290.000000  5.122924  15.508751  1.000000  1.000000  2.000000  4.000000  2744.000000
magic_freq_q2                     404290.000000  5.585013  17.648034  1.000000  1.000000  2.000000  4.000000  2744.000000
magic_freq_q1_q2_ratio            404290.000000  1.620231  9.046853  0.000364  0.750000  1.000000  1.500000  2744.000000
magic_freq_q2_q1_ratio            404290.000000  1.813279  9.171552  0.000364  0.666667  1.000000  1.333333  2744.000000
magic_comatrix_cosine             404290.000000  0.851011  0.265152  0.013889  0.750000  1.000000  1.000000  1.000000
magic_comatrix_euclidean          404290.000000  2.231701  1.214244  1.414214  1.414214  1.732051  2.449490  17.776389
magic_comatrix_svd_cosine         404290.000000  0.549098  0.630828  0.000000  0.000000  0.256272  1.069122  1.992126
magic_comatrix_svd_euclidean      404290.000000  0.112156  0.627414  0.000000  0.000000  0.000000  0.000000  15.120028
magic_comatrix_svd_manhattan      404290.000000  0.144463  0.941726  0.000000  0.000000  0.000000  0.000000  31.659183
oofp_nn_mlp_with_magic            404290.000000  0.364452  0.375347  0.000000  0.036410  0.199208  0.691840  1.000000
oofp_nn_cnn_with_magic            404290.000000  0.378076  0.389407  0.000000  0.027116  0.195153  0.780654  1.000000
oofp_nn_bi_lstm_with_magic        404290.000000  0.391990  0.391737  0.000000  0.020232  0.229321  0.808289  1.000000
oofp_nn_siamese_lstm_attention    404290.000000  0.408822  0.388498  0.000000  0.017568  0.289125  0.822654  1.000000
target                            404290.000000  0.369198  0.482588  0.000000  0.000000  0.000000  1.000000  1.000000
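Two things stand out in the summary above: the mean of target is about 0.369, i.e. roughly 37% of the training pairs are labeled as duplicates, and a few columns (pos_tag_cosine, wordnet_similarity_raw, wordnet_similarity_brown) have counts below 404290, so they contain missing values. Both are easy to confirm with plain pandas; a quick sketch, nothing pygoose-specific:

    # Class balance of the duplicate label.
    print(df_train['target'].value_counts(normalize=True))

    # Feature columns that contain missing values.
    na_counts = df_train.isnull().sum()
    print(na_counts[na_counts > 0])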
In [7]:
# Plot the pairwise correlation heatmap over all feature columns
# (columns[:-1] excludes the 'target' column appended above) and save it to disk.
kg.eda.plot_feature_correlation_heatmap(
    df_train,
    df_train.columns[:-1].tolist(),
    font_size=3,
    save_filename=project.features_dir + 'eda_heatmap.png'
)
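If pygoose's plotting helper is unavailable, a comparable correlation heatmap can be produced with seaborn. The sketch below makes that assumption; the figure size, colormap, DPI, and output filename are arbitrary choices, not part of the original notebook:

    import matplotlib.pyplot as plt
    import seaborn as sns

    # Pearson correlations between all feature columns (the last column is the target).
    corr = df_train[df_train.columns[:-1]].corr()

    plt.figure(figsize=(24, 20))
    sns.heatmap(corr, cmap='coolwarm', center=0, xticklabels=True, yticklabels=True)
    plt.savefig(project.features_dir + 'eda_heatmap_seaborn.png', dpi=200, bbox_inches='tight')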
Content source: YuriyGuts/kaggle-quora-question-pairs