In [2]:
# Parses the BeatlesISO dataset
import difflib
import glob
import os
import shutil


# Converts the Beatles ISO segment annotations into tab-separated .lab files
# named after the audio file each annotation most closely matches.
#
# NOTE(review): the locations below are absolute paths on one machine; edit
# `beatles_dir` / `output_labs_dir` before running this cell anywhere else.
beatles_dir = "/Users/uriadmin/datasets/Segments/originalDatasets/Beatles"
output_labs_dir = "/Users/uriadmin/NYU/Dissertation/SegmenterMIREX14/beatlesISO"
audio_wavs = glob.glob(os.path.join(beatles_dir, "originalAudio", "*.wav"))
segment_labs = glob.glob(
    os.path.join(beatles_dir, "originalGT", "SegmentsISO", "*", "*.lab"))

# Keep only the base name (no directory, no extension) of every wav file.
# splitext removes just the trailing extension, whereas str.replace would also
# eat a ".wav" that happens to appear in the middle of a name.
audio_wavs = [os.path.splitext(os.path.basename(f))[0] for f in audio_wavs]


def find_audio_name(file_name, candidates, cutoff=0.5):
    """Return the candidate most similar to `file_name`, or None.

    Args:
        file_name: annotation base name to look up.
        candidates: list of audio base names to search.
        cutoff: minimum difflib similarity ratio (0..1) for a match.

    Returns:
        The best-matching candidate string, or None when no candidate reaches
        `cutoff` (including when `candidates` is empty).
    """
    matches = difflib.get_close_matches(file_name, candidates, n=1,
                                        cutoff=cutoff)
    if not matches:
        return None
    return matches[0]


def parse_segment_line(line):
    """Parse one annotation line into (start, end, label) strings.

    Fields are tab-separated; when a line has fewer than three tab-separated
    fields it is split on runs of whitespace instead. The label is the last
    field of the line.

    Args:
        line: one raw line of a .lab file, with or without its line ending.

    Returns:
        A (start, end, label) tuple of stripped strings, or None when the line
        must be skipped: it is blank, or its start time is not strictly
        smaller than its end time (the "strange ISO-Beatles bug").

    Raises:
        ValueError: the line has fewer than three fields, or a time field is
            not a valid float.
    """
    stripped = line.strip()
    if not stripped:
        return None

    values = stripped.split("\t")
    if len(values) < 3:
        # No-argument split collapses repeated blanks, so "0.0  1.0 verse"
        # does not produce empty fields.
        values = stripped.split()
    if len(values) < 3:
        raise ValueError("expected 'start end label', got %r" % line)

    start = values[0].strip()
    end = values[1].strip()
    label = values[-1].strip()

    # Check for the strange ISO-Beatles bug (empty or inverted segments).
    if float(start) >= float(end):
        return None
    return start, end, label


def clean_segment_lines(lines, source="<unknown>"):
    """Build the cleaned, tab-separated content for one annotation file.

    Args:
        lines: iterable of raw annotation lines.
        source: name of the file the lines came from; only used in error
            messages.

    Returns:
        A string with one "start<TAB>end<TAB>label" line (newline-terminated)
        per kept segment; empty when no segment is kept.

    Raises:
        ValueError: a line cannot be parsed. The message names `source` and
            the 1-based line number.
    """
    cleaned = []
    for line_no, line in enumerate(lines, 1):
        try:
            parsed = parse_segment_line(line)
        except ValueError as err:
            raise ValueError("Error parsing file %s (line %d): %s"
                             % (source, line_no, err))
        if parsed is not None:
            cleaned.append("\t".join(parsed) + "\n")
    return "".join(cleaned)


# Match audio files with segment lab files
for segment_lab in segment_labs:
    file_name = os.path.splitext(os.path.basename(segment_lab))[0]
    audio_name = find_audio_name(file_name, audio_wavs)
    if audio_name is None:
        # Nothing similar enough: report it and move on instead of crashing
        # with an IndexError on an empty match list.
        print("WARNING: no audio file matches %s, skipping" % segment_lab)
        continue
    out_lab = os.path.join(output_labs_dir, audio_name + ".lab")

    # File found, let's parse it. Parsing happens before the output file is
    # opened, so a malformed annotation never leaves a truncated result.
    with open(segment_lab, "r") as f:
        out_str = clean_segment_lines(f.readlines(), source=segment_lab)

    # Write results, creating the output directory on first use.
    if not os.path.isdir(output_labs_dir):
        os.makedirs(output_labs_dir)
    with open(out_lab, "w") as f:
        f.write(out_str)

In [ ]: