In [2]:
# Parses the BeatlesISO dataset
import difflib
import glob
import os
import shutil
output_labs_dir = "/Users/uriadmin/NYU/Dissertation/SegmenterMIREX14/beatlesISO"
audio_wavs = glob.glob("/Users/uriadmin/datasets/Segments/originalDatasets/Beatles/originalAudio/*.wav")
segment_labs = glob.glob("/Users/uriadmin/datasets/Segments/originalDatasets/Beatles/originalGT/SegmentsISO/*/*.lab")

# Keep only the extension-less file name of each wav file. splitext strips
# just the trailing extension, whereas str.replace would also remove a
# ".wav" occurring in the middle of a name.
audio_wavs = [os.path.splitext(os.path.basename(f))[0] for f in audio_wavs]


def clean_segment_lines(lines, source="<unknown>"):
    """Return the cleaned, tab-separated contents of one segment lab file.

    Each non-blank line must hold at least three fields: start time, end
    time and, as the last field, the label.  Fields are tab-delimited; a
    line with fewer than three tab fields is re-split on single spaces.
    Lines whose start time is not strictly smaller than the end time (the
    strange ISO-Beatles bug) are dropped.  Blank lines are skipped.

    Args:
        lines: iterable of text lines (e.g. an open file), with their
            line endings still attached.
        source: name of the file being parsed; only used in error messages.

    Returns:
        A single string with one "start<TAB>end<TAB>label" row per kept
        line.  The start, end and label texts are copied verbatim, so each
        row ends with whatever line ending the input label carried.

    Raises:
        ValueError: if a non-blank line has fewer than three fields, an
            empty start/end/label field, or a non-numeric start/end time.
    """
    kept = []
    for line in lines:
        # Blank lines (typically a trailing one) carry no segment.
        if not line.strip():
            continue

        values = line.split("\t")
        if len(values) < 3:
            # Not tab-delimited: fall back to single-space delimiters.
            # NOTE: with this fallback only the last space-separated token
            # is used as the label.
            values = line.split(" ")

        # Validate BEFORE converting to float so a malformed line reports
        # the offending file instead of a bare conversion error.
        if len(values) < 3 or "" in (values[0], values[1], values[-1]):
            raise ValueError(
                "Error parsing file %s: malformed line %r" % (source, line))
        try:
            start = float(values[0])
            end = float(values[1])
        except ValueError:
            raise ValueError(
                "Error parsing file %s: non-numeric time in line %r"
                % (source, line))

        # Check for the strange ISO-Beatles bug
        if start >= end:
            continue

        kept.append(values[0] + "\t" + values[1] + "\t" + values[-1])
    return "".join(kept)


# Match audio files with segment lab files
for segment_lab in segment_labs:
    file_name = os.path.splitext(os.path.basename(segment_lab))[0]
    lab_file = difflib.get_close_matches(file_name, audio_wavs, n=1, cutoff=0.5)
    if not lab_file:
        # No audio name is similar enough; without this guard lab_file[0]
        # would raise IndexError and abort the whole conversion.
        print("Warning: no audio file matches %s, skipping" % segment_lab)
        continue
    # NOTE: two lab files that match the same audio name write to the same
    # output path, so the later one overwrites the earlier one.
    out_lab = os.path.join(output_labs_dir, lab_file[0] + ".lab")

    # File found, let's parse it
    with open(segment_lab, "r") as f:
        out_str = clean_segment_lines(f, segment_lab)

    # Write results
    with open(out_lab, "w") as f:
        f.write(out_str)
In [ ]: