In [ ]:
%matplotlib inline

import pandas as pd
import numpy as np

In [ ]:
def load_data(dataset, input_dir='../data/input'):
    """Read ``<input_dir>/<dataset>.csv`` into a DataFrame.

    Parameters
    ----------
    dataset : str
        File name without the ``.csv`` extension.
    input_dir : str, optional
        Directory that holds the CSV file. Defaults to ``'../data/input'``,
        the location that used to be hard-coded, so existing one-argument
        calls behave exactly as before.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv`` with default options.
    """
    csv_path = '{}/{}.csv'.format(input_dir, dataset)
    return pd.read_csv(csv_path)

Load data


In [ ]:
# Contents of labeled_audios.csv, read through load_data.
audios = load_data(dataset='labeled_audios')

In [ ]:
# Contents of unlabeled_audios.csv, read through load_data.
unl_audios = load_data(dataset='unlabeled_audios')

In [ ]:
# Contents of manual_selection.csv, read through load_data; preview the first rows.
manual = load_data(dataset='manual_selection')
manual.head()

In [ ]:
# Keep only a fixed subset of columns from each frame.
labeled_cols = [
    "Document Title",
    "Document Url",
    "answer_El audio parece estar relacionado con:",
    "length",
]
unlabeled_cols = ["Document Title", "Document Url", "length"]
manual_cols = ["Document Title", "Document Url", "columbia_words"]

audios = audios[labeled_cols]
unl_audios = unl_audios[unlabeled_cols]
manual = manual[manual_cols]

Sample labeled features


In [ ]:
# Left join on 'Document Title': every row of `audios` is kept, and the
# columns of `manual` are attached wherever a title matches.
audios_merged = audios.merge(manual, how='left', on='Document Title')
audios_merged.head()

In [ ]:
# Both merge inputs carried a 'Document Url' column, so the merge suffixed
# them: '_x' comes from `audios`, '_y' from `manual`. Keep the `audios` one
# under its original name.
# `columns=` is used instead of a positional axis (`drop(label, 1)`): the
# positional form was deprecated in pandas 1.3 and raises TypeError in 2.0.
m = (
    audios_merged
    .drop(columns='Document Url_y')
    .rename(columns={'Document Url_x': 'Document Url'})
)

# Rows that received a `columbia_words` value from the merge form the manual
# selection; all remaining rows go to `m_rest`.
has_columbia_words = m['columbia_words'].notnull()
manual_selection = m[has_columbia_words]
m_rest = m[~has_columbia_words]

In [ ]:
# Split `m_rest` by whether the answer text begins with 'Descartar:'.
# NOTE(review): if the answer column can contain NaN, the mask will contain
# NaN too and the boolean indexing below will fail — confirm it is fully populated.
answer_col = "answer_El audio parece estar relacionado con:"
is_discard = m_rest[answer_col].str.startswith('Descartar:')

l_discard = m_rest[is_discard]
l_keep = m_rest[~is_discard]

In [ ]:
# Fixed seed so that Restart & Run All draws the same rows every time;
# without it each run would produce a different sample.
SAMPLE_SEED = 42
# Target number of rows for each of the two groups (discard / keep).
PER_GROUP_SAMPLE = 1500

l_discard_sample = l_discard.sample(n=PER_GROUP_SAMPLE, random_state=SAMPLE_SEED)

# The "keep" draw is reduced by the size of `manual_selection`, so that the two
# together add up to PER_GROUP_SAMPLE rows.
l_keep_sample = l_keep.sample(
    n=PER_GROUP_SAMPLE - len(manual_selection),
    random_state=SAMPLE_SEED,
)

In [ ]:
# Stack the three pieces row-wise: manual selection first, then the sampled
# discard rows, then the sampled keep rows.
sample_parts = [manual_selection, l_discard_sample, l_keep_sample]
labeled_sample = pd.concat(sample_parts)
labeled_sample.head()

Sample unlabeled features


In [ ]:
# Randomly draw 3000 rows from the unlabeled audios. The fixed random_state
# makes the draw reproducible across kernel restarts.
unlabeled_sample = unl_audios.sample(n=3000, random_state=42)
unlabeled_sample.head()

Export to CSV files


In [ ]:
import os

# Directory that receives the exported sample CSV files.
export_path = '../data/output/sample/'

# exist_ok=True creates the directory (and missing parents) when it is absent
# and does nothing when it already exists, replacing the separate
# exists-then-create check.
os.makedirs(export_path, exist_ok=True)

In [ ]:
# Write the labeled sample under export_path, leaving the index out of the file.
labeled_csv_path = '{}/labeled_audios.csv'.format(export_path)
labeled_sample.to_csv(labeled_csv_path, index=False)

In [ ]:
# Write the unlabeled sample under export_path, leaving the index out of the file.
unlabeled_csv_path = '{}/unlabeled_audios.csv'.format(export_path)
unlabeled_sample.to_csv(unlabeled_csv_path, index=False)

In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [ ]: