Torch is really bad at handling external data. Its native data format is t7. This is a collection of notes on getting data loaded in for use in the ML contest.
I've included Scott Locklin's script in this directory for converting CSV to t7. The raw CSV files must be stripped of all text, including headers, before feeding them into the script (t7 only accepts numeric values). I also had to condition the NaNs in the PE logs in the 'facies_vectors' data. They are now 99999s.
In [68]:
-- Convert the contest CSV files into Torch's t7 format.
--
-- For every dataset this cell:
--   1. drops the CSV header line,
--   2. replaces every empty (NaN) field with the 99999 sentinel,
--   3. removes the textual columns (formation and well names),
--   4. runs csv2t7.sh on the cleaned, purely numeric CSV.
-- The intermediate CSV files in dat/ are deleted at the end.

local SRC_DIR = "~/2016-ml-contest"  -- kept unquoted in commands so the shell expands '~'
local OUT_DIR = "dat"
local NAN_FILL = "99999"

-- Run a shell command and warn on stderr when it reports failure.
-- Lua 5.1/LuaJIT return the numeric exit status from os.execute, while
-- Lua 5.2+ return true/nil first, so both conventions are accepted.
-- Returns whatever os.execute returned as its first value.
local function run(cmd)
   local status = os.execute(cmd)
   if status ~= 0 and status ~= true then
      io.stderr:write("WARNING: command failed: " .. cmd .. "\n")
   end
   return status
end

-- Fill empty fields one by one. A plain gsub(/,,/, ...) skips the second of
-- two consecutive empty fields (",,," -> ",99999,,") and any empty field at
-- the start or end of a line.
local fill_nans = [[awk 'BEGIN { FS = OFS = "," } ]]
   .. [[{ for (i = 1; i <= NF; i++) if ($i == "") $i = "]] .. NAN_FILL .. [[" } 1']]

-- name:  base name of the CSV file (without extension)
-- label: text printed in the statistics banner
-- drop:  1-based columns holding text (formation name, well name)
local datasets = {
   {name = "facies_vectors",           label = "FACIES_VECTORS",            drop = "2,3"},
   {name = "nofacies_data",            label = "NOFACIES_DATA",             drop = "1,2"},
   {name = "training_data",            label = "TRAINING_DATA",             drop = "2,3"},
   {name = "validation_data_nofacies", label = "VALIDATION_DATA_NO_FACIES", drop = "1,2"},
   {name = "well_data_with_facies",    label = "WELL_DATA_WITH_FACIES",     drop = "2,3"},
}

-- The output directory may not exist on a fresh checkout.
run("mkdir -p " .. OUT_DIR)

for _, d in ipairs(datasets) do
   local csv = OUT_DIR .. "/" .. d.name .. ".csv"
   local t7 = OUT_DIR .. "/" .. d.name .. ".t7"

   -- One pipeline per file: no shared temp file is needed.
   -- NOTE: `cut --complement` is a GNU coreutils extension.
   run("sed '1d' " .. SRC_DIR .. "/" .. d.name .. ".csv | "
      .. fill_nans
      .. " | cut --complement -d, -f " .. d.drop
      .. " > " .. csv)

   -- convert CSV to t7
   print("******" .. d.label .. " FILE STATISTICS*******")
   run("bash csv2t7.sh " .. csv .. " " .. t7)
end

-- clean
run("rm -f " .. OUT_DIR .. "/*.csv")
Out[68]:
Out[68]:
Out[68]:
Out[68]:
Out[68]:
Out[68]:
Out[68]:
Out[68]:
Out[68]:
Out[68]:
Out[68]:
Out[68]:
Out[68]:
Out[68]:
Out[68]:
Out[68]:
Out[68]:
Out[68]:
Out[68]:
Out[68]:
Out[68]:
In [69]:
-- Load the converted t7 files into the globals used by the following cells.

-- Read the single object stored in the t7 file at `path` and return it.
-- The file handle is closed even when reading fails; the original read
-- error is then re-raised unchanged.
local function load_t7(path)
   local handle = torch.DiskFile(path, 'r')
   local ok, result = pcall(function() return handle:readObject() end)
   handle:close()
   if not ok then
      error(result, 0)
   end
   return result
end

facies   = load_t7('dat/facies_vectors.t7')
nofacies = load_t7('dat/nofacies_data.t7')
training = load_t7('dat/training_data.t7')
validate = load_t7('dat/validation_data_nofacies.t7')
well     = load_t7('dat/well_data_with_facies.t7')
In [74]:
-- Print the sizes of the first two dimensions of every loaded dataset.
local loaded = {
   {"facies", facies},
   {"nofacies", nofacies},
   {"training", training},
   {"validate", validate},
   {"well", well},
}

for _, entry in ipairs(loaded) do
   local label, tensor = entry[1], entry[2]
   print(label .. " size: ", tensor:size()[1], "x", tensor:size()[2])
end
Out[74]:
In [ ]: