In [1]:
# TODO: move all the merge code from model-selection-pe-coff.ipynb
import warnings
import numpy as np
import scipy as sp
import pandas as pd

1. VirusShare 251 PE/COFF Feature Merge.


In [ ]:
# NOTE: move this merging stuff to merge-feature-sets-pe-coff.ipynb
# it is making the notebook too long.

# Merge all the vs251 Feature sets:
# 1. PE ASM features.
# 2. Entropy features.
# 3. File ID features.
# 4. Packer ID features.
# 5. PE Call Graph features.
# 6. Trid ID features.
# 7. PE Header features.
#
# NOTE: keep function count features and ASM/Binary images separate, train on different models.
# 8. TODO: function count features.
# 9. TODO: Binary or ASM images (RNN/CNN models).
#
# Final Combination: combine all PE/COFF features then use chi2 tests to 
# reduce to 10% best feature set ->
#     all-combined-pe-features-10perc-vs251.csv

sorted_train_labels = pd.read_csv('data/sorted-train-labels-vs251.csv')
#sorted_asm_features = pd.read_csv('data/sorted-pe-asm-features-vs251.csv')
sorted_asm_features_fixed = pd.read_csv('data/sorted-pe-asm-features-vs251-fixed.csv')

sorted_entropy_features = pd.read_csv('data/sorted-entropy-features-vs251.csv')
sorted_file_id_features = pd.read_csv('data/sorted-file-id-features-vs251.csv')
sorted_packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs251.csv')
sorted_call_graph_features = pd.read_csv('data/sorted-pe-call-graph-features-vs251.csv')
sorted_trid_id_features = pd.read_csv('data/sorted-trid-id-features-vs251.csv') # Select only scalar columns.
sorted_header_features = pd.read_csv('data/sorted-pe-header-features-10percent-vs251.csv')
sorted_packer_id_features.head()
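
Every merge below joins on file_name, which assumes each feature set has exactly one row per sample. A quick sanity-check sketch over the frames just loaded:

In [ ]:
# Sanity check: the merge key should be unique in every feature set,
# otherwise the inner joins below will duplicate rows.
feature_sets = [('asm', sorted_asm_features_fixed),
                ('entropy', sorted_entropy_features),
                ('file_id', sorted_file_id_features),
                ('packer_id', sorted_packer_id_features),
                ('call_graph', sorted_call_graph_features),
                ('trid_id', sorted_trid_id_features),
                ('header', sorted_header_features)]
for name, df in feature_sets:
    dups = df['file_name'].duplicated().sum()
    print("{:s}: {:d} rows, {:d} duplicate file_name values".format(name, df.shape[0], dups))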

In [ ]:
sorted_asm_features_fixed.head()

In [ ]:
fileidfeatures = pd.DataFrame(sorted_file_id_features.iloc[:,[0,2]])
fileidfeatures.head()

In [ ]:
packeridfeatures = pd.DataFrame(sorted_packer_id_features.iloc[:,[0,2]])
packeridfeatures.head()

In [ ]:
trididfeatures = pd.DataFrame(sorted_trid_id_features.iloc[:,[0,2,3]])
trididfeatures.head()
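
The iloc calls above pick columns by position, which breaks silently if the CSV column order ever changes. An equivalent name-based selection (a sketch, assuming the vs251 files use the same column names shown in the vs252 Out[] tables further down: file_id, packer_id, percentage, trid_id):

In [ ]:
# Name-based equivalents of the positional selections above.
fileidfeatures = sorted_file_id_features[['file_name', 'file_id']]
packeridfeatures = sorted_packer_id_features[['file_name', 'packer_id']]
trididfeatures = sorted_trid_id_features[['file_name', 'percentage', 'trid_id']]
trididfeatures.head()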

In [ ]:
# NOTE: the file-name problem has been fixed now, so get_training_data()
# is no longer needed; it is kept for reference and label validation.

def get_training_data(sorted_train_features_df, sorted_train_labels_df, train_labels_file_out):
    
    X = sorted_train_features_df.iloc[:,1:]
    # No offset adjustment is needed: the family class labels already start at zero.
    sorted_sample_names = sorted_train_features_df.loc[:,'file_name']
    
    # Now get the labels of the PE malware samples from the label set.
    counter = 0
    y = []
    for fname in sorted_sample_names:
        counter += 1
        
        # The file names in the ASM feature sets still have a stray
        # '.pe' suffix on the end, so strip it before matching.
        idx = fname.find('.pe')
        if idx > 0:
            fname = fname[:idx]
            
        if counter % 100 == 1:
            print("Appending {:d} -> {:s}".format(counter, fname))
        # Use the label DataFrame passed in, not the global.
        for label_idx, fname2 in enumerate(sorted_train_labels_df['file_name']):
            if fname2 == fname:
                y.append(sorted_train_labels_df.iloc[label_idx, 4]) # Append the family class label.
                break
    
    # Write out the PE/COFF sample train labels for later use and validation.
    with open(train_labels_file_out, 'w') as fop:
        fop.write("\n".join(str(x) for x in y))
    
    return X, y
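
The nested loop above is O(n^2) in the number of samples. A single pandas merge does the same lookup; a minimal sketch under the same assumptions (the family class label sits in column 4 of the label set, and ASM file names may carry a stray '.pe' suffix):

In [ ]:
def get_training_data_merged(features_df, labels_df, train_labels_file_out):
    # Strip the stray '.pe' suffix (the pandas equivalent of the sed
    # one-liner noted in the next cell).
    keys = features_df['file_name'].str.replace(r'\.pe$', '', regex=True)
    label_col = labels_df.columns[4]
    # how='inner' preserves the order of the left keys; like the loop
    # version, samples missing from the label set are simply skipped.
    merged = keys.to_frame().merge(labels_df[['file_name', label_col]],
                                   on='file_name', how='inner')
    y = merged[label_col].tolist()
    with open(train_labels_file_out, 'w') as fop:
        fop.write("\n".join(str(x) for x in y))
    return features_df.iloc[:, 1:], y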

In [ ]:
# Combine all the feature sets, ensure we drop all the rows that are not in the PE/COFF sample set.
# NOTE: use stream editor to fix ASM file_name field: echo 'as;dflkj;a.pe,123' | sed s/\.pe//

combined_train_features = sorted_asm_features_fixed.merge(sorted_header_features, on='file_name', how='inner', suffixes=('_asm','_hd'))
combined_train_features = combined_train_features.merge(sorted_call_graph_features, on='file_name', how='inner', suffixes=('_asm','_cg'))
combined_train_features = combined_train_features.merge(sorted_entropy_features, on='file_name', how='inner', suffixes=('_asm','_ent'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_asm','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_asm','_tid'))
combined_train_features = combined_train_features.merge(packeridfeatures, on='file_name', how='inner', suffixes=('_asm','_pid'))

combined_train_features.head()


combined_train_features.to_csv('data/combined-pe-features-vs251.csv', index=False)
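
The "Final Combination" chi-squared step described in the comments above can be sketched with scikit-learn's SelectPercentile. A minimal sketch, assuming scikit-learn is available, that class labels y aligned with the combined rows have been produced (e.g. via get_training_data() above), and that all feature values are non-negative, which chi2 requires:

In [ ]:
from sklearn.feature_selection import SelectPercentile, chi2

X = combined_train_features.iloc[:, 1:]            # drop file_name
selector = SelectPercentile(chi2, percentile=10)   # keep the best 10% of features
X_10perc = selector.fit_transform(X, y)

best_columns = combined_train_features.columns[1:][selector.get_support()]
reduced = combined_train_features[['file_name']].join(
    pd.DataFrame(X_10perc, columns=best_columns))
reduced.to_csv('data/all-combined-pe-features-10perc-vs251.csv', index=False)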

2. VirusShare 252 PE/COFF Feature Merge.


In [2]:
# Merge all the vs252 Feature sets:
# 1. PE ASM features.
# 2. Entropy features.
# 3. File ID features.
# 4. Packer ID features.
# 5. PE Call Graph features.
# 6. Trid ID features.
# 7. PE Header features.
# 8. TODO: function count features.
# 9. TODO: Binary or ASM images (RNN/CNN models).
#
# Final Combination: combine all PE/COFF features then use chi2 tests to 
# reduce to 10% best feature set ->
#     all-combined-pe-features-10perc-vs252.csv

sorted_train_labels = pd.read_csv('data/sorted-train-labels-vs252.csv')
#sorted_asm_features = pd.read_csv('data/sorted-pe-asm-features-vs252.csv')
sorted_asm_features_fixed = pd.read_csv('data/sorted-pe-asm-features-vs252-fixed.csv')

sorted_entropy_features = pd.read_csv('data/sorted-entropy-features-vs252.csv')
sorted_file_id_features = pd.read_csv('data/sorted-file-id-features-vs252.csv')
sorted_packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs252.csv')
sorted_call_graph_features = pd.read_csv('data/sorted-pe-call-graph-features-vs252.csv')
sorted_trid_id_features = pd.read_csv('data/sorted-trid-id-features-vs252.csv') # Select only scalar columns.
sorted_header_features = pd.read_csv('data/sorted-pe-header-features-10percent-vs252.csv')

# Use a separate model for function counts, otherwise the csv files are too big for GitHub.
#sorted_function_count_features = pd.read_csv('data/sorted-pe-function-counts-10percent-vs252.csv')

sorted_asm_features_fixed.head()


Out[2]:
file_name edx esi es fs ds ss gs cs ah ... sidt stc std sti stos sub test wait xchg xor
0 000118d12cbf9ad6103e8b914a6e1ac3 2151 2782 2 2 2 2 2 2 37 ... 0 0 9 0 0 395 836 0 1 688
1 00093d5fa5cb7ce77f6eaf39962daa12 4207 11425 4 2 6 6 2 3 53 ... 0 0 9 0 0 893 2560 8 2 2530
2 00099926d51b44c6f8c93a48c2567891 270 1033 0 0 0 0 0 0 0 ... 0 0 0 0 0 111 225 0 0 129
3 000a2db4762dc06628a086c9e117f884 43 1281 0 0 0 0 0 0 0 ... 0 0 0 0 0 100 184 0 0 180
4 000ae2c63ba69fc93dfc395b40bfe03a 4334 12557 11 3 8 7 3 11 41 ... 0 3 16 0 0 964 2884 1 9 2104

5 rows × 120 columns



In [3]:
fileidfeatures = pd.DataFrame(sorted_file_id_features.iloc[:,[0,2]])
fileidfeatures.head()


Out[3]:
file_name file_id
0 00002e640cafb741bea9a48eaee27d6f 133
1 000118d12cbf9ad6103e8b914a6e1ac3 1
2 0001776237ac37a69fcef93c1bac0988 1
3 000403e4e488356b7535cc613fbeb80b 1
4 0004c8b2a0f4680a5694d74199b40ea2 1

In [4]:
packeridfeatures = pd.DataFrame(sorted_packer_id_features.iloc[:,[0,2]])
packeridfeatures.head()


Out[4]:
file_name packer_id
0 00002e640cafb741bea9a48eaee27d6f 0
1 000118d12cbf9ad6103e8b914a6e1ac3 1101
2 0001776237ac37a69fcef93c1bac0988 0
3 000403e4e488356b7535cc613fbeb80b 1101
4 0004c8b2a0f4680a5694d74199b40ea2 1111

In [5]:
trididfeatures = pd.DataFrame(sorted_trid_id_features.iloc[:,[0,2,3]])
trididfeatures.head()


Out[5]:
file_name percentage trid_id
0 00002e640cafb741bea9a48eaee27d6f 2.3 2
1 000118d12cbf9ad6103e8b914a6e1ac3 2.2 1
2 0001776237ac37a69fcef93c1bac0988 2.7 14
3 000403e4e488356b7535cc613fbeb80b 4.6 21
4 0004c8b2a0f4680a5694d74199b40ea2 2.2 1

In [6]:
# Combine all the feature sets, ensure we drop all the rows that are not in the PE/COFF sample set.
# NOTE: use stream editor to fix ASM file_name field: echo 'as;dflkj;a.pe,123' | sed s/\.pe//

combined_train_features = sorted_asm_features_fixed.merge(sorted_header_features, on='file_name', how='inner', suffixes=('_asm','_hd'))
combined_train_features = combined_train_features.merge(sorted_call_graph_features, on='file_name', how='inner', suffixes=('_asm','_cg'))
combined_train_features = combined_train_features.merge(sorted_entropy_features, on='file_name', how='inner', suffixes=('_asm','_ent'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_asm','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_asm','_tid'))
combined_train_features = combined_train_features.merge(packeridfeatures, on='file_name', how='inner', suffixes=('_asm','_pid'))

combined_train_features.head()


Out[6]:
file_name edx esi es fs ds ss gs cs ah ... vertex_count edge_count delta_max density entropy file_size file_id percentage trid_id packer_id
0 000118d12cbf9ad6103e8b914a6e1ac3 2151 2782 2 2 2 2 2 2 37 ... 2213 3300 101 0.002277 0.834382 201600 1 2.2 1 1101
1 00093d5fa5cb7ce77f6eaf39962daa12 4207 11425 4 2 6 6 2 3 53 ... 4640 12158 204 0.005596 0.803481 742064 1 8.1 17 1101
2 00099926d51b44c6f8c93a48c2567891 270 1033 0 0 0 0 0 0 0 ... 522 868 284 0.267901 0.997032 725288 4 4.8 3 0
3 000a2db4762dc06628a086c9e117f884 43 1281 0 0 0 0 0 0 0 ... 871 959 70 0.035789 0.535436 61551 5 3.5 13 153
4 000ae2c63ba69fc93dfc395b40bfe03a 4334 12557 11 3 8 7 3 11 41 ... 3939 7218 82 0.004668 0.899481 487386 1 4.0 17 0

5 rows × 272 columns


In [7]:
combined_train_features.to_csv('data/combined-pe-features-vs252.csv', index=False)
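
Because every merge above is an inner join, the combined set can be no larger than the smallest input; a quick check sketch before relying on the saved file:

In [ ]:
# Inner joins only keep samples present in every feature set.
inputs = [sorted_asm_features_fixed, sorted_header_features,
          sorted_call_graph_features, sorted_entropy_features,
          fileidfeatures, trididfeatures, packeridfeatures]
print("smallest input: {:d} rows".format(min(df.shape[0] for df in inputs)))
print("combined:       {:d} rows".format(combined_train_features.shape[0]))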

In [ ]:
# DEPRECATED: do not use get_training_data() now, the file-name problem has been fixed.
X,y = get_training_data(sorted_asm_features_fixed, sorted_train_labels, 'data/sorted-pe-coff-train-labels-vs252.csv')
print("Length of y: {:d}".format(len(y)))
print("Shape of X: {:d} {:d}".format(X.shape[0], X.shape[1]))

In [ ]:
X,y = get_training_data(combined_train_features, sorted_train_labels, 'data/sorted-pe-coff-train-labels-vs252.csv')
print("Length of y: {:d}".format(len(y)))
print("Shape of X: {:d} {:d}".format(X.shape[0], X.shape[1]))

3. VirusShare 263 PE/COFF Feature Merge.


In [3]:
# vs263 Feature sets:
# 1. PE ASM features.
# 2. Entropy features.
# 3. File ID features.
# 4. Packer ID features.
# 5. PE Call Graph features.
# 6. Trid ID features.
# 7. PE Header features.
# 8. function count features.
# 9. TODO: Binary or ASM images (RNN/CNN models).
#
# Final Combination: combine all PE/COFF features then use chi2 tests to 
# reduce to 10% best feature set ->
#     all-combined-pe-features-10perc-vs263.csv

sorted_train_labels = pd.read_csv('data/sorted-train-labels-vs263.csv')
#sorted_asm_features = pd.read_csv('data/sorted-pe-asm-features-vs263.csv')
sorted_asm_features_fixed = pd.read_csv('data/sorted-pe-asm-features-vs263-fixed.csv')

sorted_entropy_features = pd.read_csv('data/sorted-entropy-features-vs263.csv')
sorted_file_id_features = pd.read_csv('data/sorted-file-id-features-vs263.csv')
sorted_packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs263.csv')
sorted_call_graph_features = pd.read_csv('data/sorted-pe-call-graph-features-vs263.csv')
sorted_trid_id_features = pd.read_csv('data/sorted-trid-id-features-vs263.csv') # Select only scalar columns.
sorted_header_features = pd.read_csv('data/sorted-pe-header-features-10percent-vs263.csv')
# Adding function counts makes the csv file too big for GitHub and the models too slow,
# so model them separately.
#sorted_function_count_features = pd.read_csv('data/sorted-pe-function-counts-10percent-vs263.csv')

sorted_asm_features_fixed.head()


Out[3]:
file_name edx esi es fs ds ss gs cs ah ... sidt stc std sti stos sub test wait xchg xor
0 0002b2f621ea5786be03bf4153532dce 52 490 0 0 0 0 0 0 4 ... 0 0 0 0 0 47 118 0 0 129
1 000401419eccde59975c713cfadc974c 4378 3246 1 0 0 0 0 0 34 ... 0 0 1 0 0 343 649 88 42 1031
2 0004824a60ff9fe1fb30d669a5baa627 4374 3295 1 0 0 0 0 0 34 ... 0 0 1 0 0 343 653 88 42 1032
3 0006d2cd674c8501ffe59dae330ffcb5 2100 959 0 0 0 0 0 0 0 ... 0 0 1 0 0 154 307 2 6 472
4 0007893715059c51a92d3ce2b10d9cf5 163 180 0 0 0 0 0 0 31 ... 0 0 0 0 0 73 86 0 29 34

5 rows × 120 columns


In [ ]:
X,y = get_training_data(sorted_asm_features_fixed, sorted_train_labels, 'data/sorted-pe-coff-train-labels-vs263.csv')
print("Length of y: {:d}".format(len(y)))
print("Shape of X: {:d} {:d}".format(X.shape[0], X.shape[1]))

In [4]:
fileidfeatures = pd.DataFrame(sorted_file_id_features.iloc[:,[0,2]])
fileidfeatures.head()


Out[4]:
file_name file_id
0 0002b2f621ea5786be03bf4153532dce 5
1 000401419eccde59975c713cfadc974c 1
2 00042f23bc15b89d9c6a7bde0e316f8b 1
3 0004824a60ff9fe1fb30d669a5baa627 1
4 0004c49071481789f1c8c80656638497 13

In [5]:
packeridfeatures = pd.DataFrame(sorted_packer_id_features.iloc[:,[0,2]])
packeridfeatures.head()


Out[5]:
file_name packer_id
0 0002b2f621ea5786be03bf4153532dce 0
1 000401419eccde59975c713cfadc974c 317
2 00042f23bc15b89d9c6a7bde0e316f8b 0
3 0004824a60ff9fe1fb30d669a5baa627 317
4 0004c49071481789f1c8c80656638497 0

In [6]:
trididfeatures = pd.DataFrame(sorted_trid_id_features.iloc[:,[0,2,3]])
trididfeatures.head()


Out[6]:
file_name percentage trid_id
0 0002b2f621ea5786be03bf4153532dce 7.4 1
1 000401419eccde59975c713cfadc974c 5.4 4
2 00042f23bc15b89d9c6a7bde0e316f8b 7.4 1
3 0004824a60ff9fe1fb30d669a5baa627 5.4 4
4 0004c49071481789f1c8c80656638497 0.0 0

In [8]:
# Combine all the feature sets, ensure we drop all the rows that are not in the PE/COFF sample set.
# NOTE: use stream editor to fix ASM file_name field: echo 'as;dflkj;a.pe,123' | sed s/\.pe//

combined_train_features = sorted_asm_features_fixed.merge(sorted_header_features, on='file_name', how='inner', suffixes=('_asm','_hd'))
combined_train_features = combined_train_features.merge(sorted_call_graph_features, on='file_name', how='inner', suffixes=('_asm','_cg'))
combined_train_features = combined_train_features.merge(sorted_entropy_features, on='file_name', how='inner', suffixes=('_asm','_ent'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_asm','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_asm','_tid'))
combined_train_features = combined_train_features.merge(packeridfeatures, on='file_name', how='inner', suffixes=('_asm','_pid'))

combined_train_features.head()


Out[8]:
file_name edx esi es fs ds ss gs cs ah ... vertex_count edge_count delta_max density entropy file_size file_id percentage trid_id packer_id
0 0002b2f621ea5786be03bf4153532dce 52 490 0 0 0 0 0 0 4 ... 365 518 74 0.234283 0.684706 81812 5 7.4 1 0
1 000401419eccde59975c713cfadc974c 4378 3246 1 0 0 0 0 0 34 ... 1593 3310 56 0.006680 0.800788 137131 1 5.4 4 317
2 0004824a60ff9fe1fb30d669a5baa627 4374 3295 1 0 0 0 0 0 34 ... 1593 3325 56 0.006710 0.802050 137630 1 5.4 4 317
3 0006d2cd674c8501ffe59dae330ffcb5 2100 959 0 0 0 0 0 0 0 ... 717 2167 285 0.049634 0.762935 83819 1 8.3 13 0
4 0007893715059c51a92d3ce2b10d9cf5 163 180 0 0 0 0 0 0 31 ... 795 1193 355 0.052839 0.812079 120832 1 3.5 13 0

5 rows × 204 columns


In [9]:
combined_train_features.to_csv('data/combined-pe-features-vs263.csv', index=False)

4. VirusShare 264 PE/COFF Feature Merge.


In [4]:
# vs264 Feature sets:
# 1. PE ASM features.
# 2. Entropy features.
# 3. File ID features.
# 4. Packer ID features.
# 5. PE Call Graph features.
# 6. Trid ID features.
# 7. PE Header features.
# 8. ASM function count features.
# 9. TODO: Binary or ASM images (RNN/CNN models).
#
# Final Combination: combine all PE/COFF features then use chi2 tests to 
# reduce to 10% best feature set ->
#     all-combined-pe-features-10perc-vs264.csv

sorted_train_labels = pd.read_csv('data/sorted-train-labels-vs264.csv')
#sorted_asm_features = pd.read_csv('data/sorted-pe-asm-features-vs264.csv')
sorted_asm_features_fixed = pd.read_csv('data/sorted-pe-asm-features-vs264-fixed.csv')

sorted_entropy_features = pd.read_csv('data/sorted-entropy-features-vs264.csv')
sorted_file_id_features = pd.read_csv('data/sorted-file-id-features-vs264.csv')
sorted_packer_id_features = pd.read_csv('data/sorted-packer-id-features-vs264.csv')
sorted_call_graph_features = pd.read_csv('data/sorted-pe-call-graph-features-vs264.csv')
sorted_trid_id_features = pd.read_csv('data/sorted-trid-id-features-vs264.csv') # Select only scalar columns.
sorted_header_features = pd.read_csv('data/sorted-pe-header-features-10percent-vs264.csv')
sorted_function_count_features = pd.read_csv('data/sorted-pe-function-count-features-10percent-vs264.csv') # Modeled separately; not merged below.

sorted_asm_features_fixed.head()


Out[4]:
file_name edx esi es fs ds ss gs cs ah ... sidt stc std sti stos sub test wait xchg xor
0 0006bac4260d377e3bf2b9cfea530eb9 8714 16348 39 8 9 12 8 8 73 ... 0 1 4 3 0 1736 2990 0 7 2176
1 0006e6b536f6622bf308e80f837ceaeb 4374 3295 1 0 0 0 0 0 34 ... 0 0 1 0 0 343 653 88 42 1032
2 0007106c237e8689cc68b5111db1a174 3549 13709 8 0 0 0 0 0 76 ... 0 0 1 0 0 1179 2575 1 0 3059
3 00094c34bd1a0f622b49c1b1b9274ae6 1167 4780 0 0 0 0 0 0 0 ... 0 0 0 0 0 377 996 0 0 869
4 000b01ac8e485e5a143ff577be88b853 1703 1181 0 0 0 0 0 0 32 ... 0 0 2 0 0 174 247 5 11 395

5 rows × 120 columns


In [5]:
fileidfeatures = pd.DataFrame(sorted_file_id_features.iloc[:,[0,2]])
fileidfeatures.head()


Out[5]:
file_name file_id
0 000070db76b6dc1ee3497a3f9319848c 16
1 00009cbc0a90337e4c30950a51ae3d67 1
2 0003c05a1320e64fe72438ab48da7ecf 15
3 0003e52a9267b657d9b08b2cbc0a2593 16
4 0005743596135fe65f61da7a0eba0bb6 15

In [6]:
packeridfeatures = pd.DataFrame(sorted_packer_id_features.iloc[:,[0,2]])
packeridfeatures.head()


Out[6]:
file_name packer_id
0 000070db76b6dc1ee3497a3f9319848c 0
1 00009cbc0a90337e4c30950a51ae3d67 317
2 0003c05a1320e64fe72438ab48da7ecf 0
3 0003e52a9267b657d9b08b2cbc0a2593 0
4 0005743596135fe65f61da7a0eba0bb6 0

In [7]:
trididfeatures = pd.DataFrame(sorted_trid_id_features.iloc[:,[0,2,3]])
trididfeatures.head()


Out[7]:
file_name percentage trid_id
0 000070db76b6dc1ee3497a3f9319848c 0.0 0
1 00009cbc0a90337e4c30950a51ae3d67 5.5 4
2 0003c05a1320e64fe72438ab48da7ecf 0.0 0
3 0003e52a9267b657d9b08b2cbc0a2593 0.0 0
4 0005743596135fe65f61da7a0eba0bb6 0.6 8

In [8]:
# Combine all the feature sets, ensure we drop all the rows that are not in the PE/COFF sample set.
# NOTE: use stream editor to fix ASM file_name field: echo 'as;dflkj;a.pe,123' | sed s/\.pe//

combined_train_features = sorted_asm_features_fixed.merge(sorted_header_features, on='file_name', how='inner', suffixes=('_asm','_hd'))
combined_train_features = combined_train_features.merge(sorted_call_graph_features, on='file_name', how='inner', suffixes=('_asm','_cg'))
combined_train_features = combined_train_features.merge(sorted_entropy_features, on='file_name', how='inner', suffixes=('_asm','_ent'))
combined_train_features = combined_train_features.merge(fileidfeatures, on='file_name', how='inner', suffixes=('_asm','_fid'))
combined_train_features = combined_train_features.merge(trididfeatures, on='file_name', how='inner', suffixes=('_asm','_tid'))
combined_train_features = combined_train_features.merge(packeridfeatures, on='file_name', how='inner', suffixes=('_asm','_pid'))

combined_train_features.head()


Out[8]:
file_name edx esi es fs ds ss gs cs ah ... vertex_count edge_count delta_max density entropy file_size file_id percentage trid_id packer_id
0 0006bac4260d377e3bf2b9cfea530eb9 8714 16348 39 8 9 12 8 8 73 ... 4689 8093 377 0.005650 0.820715 2015064.0 1 2.6 7 1101
1 0006e6b536f6622bf308e80f837ceaeb 4374 3295 1 0 0 0 0 0 34 ... 1593 3325 56 0.006710 0.804215 138620.0 1 5.4 4 317
2 0007106c237e8689cc68b5111db1a174 3549 13709 8 0 0 0 0 0 76 ... 5753 11775 168 0.013797 0.826531 524877.0 1 8.1 17 1101
3 00094c34bd1a0f622b49c1b1b9274ae6 1167 4780 0 0 0 0 0 0 0 ... 2737 3884 125 0.011075 0.833980 325328.0 1 7.4 1 1101
4 000b01ac8e485e5a143ff577be88b853 1703 1181 0 0 0 0 0 0 32 ... 575 1052 59 0.018804 0.983293 304458.0 1 7.7 12 0

5 rows × 244 columns


In [9]:
combined_train_features.to_csv('data/combined-pe-features-vs264.csv', index=False)
