In [1]:
import pandas as pd
import json

In [2]:
from collections import defaultdict

In [3]:
from tqdm import tqdm_notebook as tqdm

In [4]:
import imp

Load the instrument class ids


In [5]:
ont = json.load(open('../schema/ontology.json', 'r'))

In [6]:
omap = {o['id']: o for o in ont}

In [7]:
# ('Musical instrument', '/m/04szw')
instrument_classes = omap['/m/04szw']['child_ids']
instrument_classes.append('/m/015lz1') # Singing
instrument_classes.append('/m/01swy6') # Yodeling
instrument_classes.append('/t/dd00003') # Male singing
instrument_classes.append('/t/dd00004') # Female singing
instrument_classes.append('/t/dd00005') # Child singing

In [8]:
good_classes = []

In [9]:
candidates = [omap[i]['id'] for i in instrument_classes]

In [10]:
while candidates:
    new_class = candidates.pop()
    good_classes.append(new_class)
    candidates.extend(omap[new_class]['child_ids'])

In [11]:
good_classes = sorted(list(set(good_classes)))

In [12]:
class_map = {omap[i]['name']: i for i in good_classes}

In [13]:
instruments = [omap[i]['name'] for i in good_classes]

In [14]:
ns = imp.load_source('ns', '../scripts/namespaces.py')

In [15]:
openmic25_to_audioset_ids = defaultdict(list)

for k in ns.AUDIOSET_MAP:
    openmic25_to_audioset_ids[ns.AUDIOSET_MAP[k]].append(class_map[k])
    
openmic25_to_audioset_ids = dict(openmic25_to_audioset_ids)

In [16]:
ids_to_om25 = {}
for k in openmic25_to_audioset_ids:
    for aid in openmic25_to_audioset_ids[k]:
        ids_to_om25[aid] = k

In [17]:
good_ids = []
for k in openmic25_to_audioset_ids:
    good_ids.extend(openmic25_to_audioset_ids[k])

Load a slice of the training data


In [18]:
df_bal = pd.read_csv('../data/audioset/balanced_train_segments.csv',
                     skiprows=3, names=['YTID', 'start', 'end', 'labels'], quoting=2, sep=',\s+')


/home/bmcfee/miniconda/envs/py35/lib/python3.5/site-packages/ipykernel/__main__.py:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  from ipykernel import kernelapp as app

In [19]:
df_un = pd.read_csv('../data/audioset/unbalanced_train_segments.csv',
                     skiprows=3, names=['YTID', 'start', 'end', 'labels'], quoting=2, sep=',\s+')


/home/bmcfee/miniconda/envs/py35/lib/python3.5/site-packages/ipykernel/__main__.py:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  from ipykernel import kernelapp as app

In [20]:
df_ev = pd.read_csv('../data/audioset/eval_segments.csv',
                     skiprows=3, names=['YTID', 'start', 'end', 'labels'], quoting=2, sep=',\s+')


/home/bmcfee/miniconda/envs/py35/lib/python3.5/site-packages/ipykernel/__main__.py:2: ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators (separators > 1 char and different from '\s+' are interpreted as regex); you can avoid this warning by specifying engine='python'.
  from ipykernel import kernelapp as app

In [21]:
inst_set = set(good_ids)

In [22]:
mapping = defaultdict(set)

for i, (ytid, start, end, labels) in tqdm(df_bal.iterrows(), desc='Balanced set', total=len(df_bal)):
    for tag in labels.strip('"').split(','):
        if tag in inst_set:
            mapping[ids_to_om25[tag]].add((ytid, start, end))

for i, (ytid, start, end, labels) in tqdm(df_un.iterrows(), desc='Unbalanced set', total=len(df_un)):
    for tag in labels.strip('"').split(','):
        if tag in inst_set:
            mapping[ids_to_om25[tag]].add((ytid, start, end))
            
for i, (ytid, start, end, labels) in tqdm(df_ev.iterrows(), desc='Eval set', total=len(df_ev)):
    for tag in labels.strip('"').split(','):
        if tag in inst_set:
            mapping[ids_to_om25[tag]].add((ytid, start, end))
            
mapping = dict(mapping)






In [23]:
for k in mapping:
    mapping[k] = sorted(list(mapping[k]))

In [24]:
[(k, len(mapping[k])) for k in sorted(mapping)]


Out[24]:
[('accordion', 2894),
 ('bagpipes', 1776),
 ('banjo', 2456),
 ('bass', 8669),
 ('cello', 5282),
 ('clarinet', 2121),
 ('cymbals', 5435),
 ('drums', 26599),
 ('flute', 4781),
 ('guitar', 57322),
 ('harmonica', 2216),
 ('harp', 2043),
 ('mallet_percussion', 7385),
 ('mandolin', 2375),
 ('organ', 3701),
 ('piano', 12838),
 ('saxophone', 3075),
 ('synthesizer', 5041),
 ('trombone', 2795),
 ('trumpet', 3834),
 ('ukulele', 5292),
 ('violin', 28125),
 ('voice', 60028)]

In [25]:
json.dump(mapping,
          open('/home/bmcfee/git/cosmir/dev-set-builder/data/audioset/openmic25_label_index.json', 'w'),
          indent=2)