In [1]:
import json
import numpy as np 
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.svm import SVC
import csv

#uncomment the line below to print whole numpy arrays (printing will be much slower)
#np.set_printoptions(threshold=np.nan)  #on newer numpy versions, use sys.maxsize instead of np.nan

#open and load the JSON files, assigning each to its corresponding variable

dict_bright = json.load(open("json_files/bright_analysis.json", 'rb'))
dict_metal = json.load(open("json_files/metal_analysis.json",'rb'))
dict_hard = json.load(open("json_files/hard_analysis.json",'rb'))
dict_reverb = json.load(open("json_files/reverb_analysis.json",'rb'))
dict_rough = json.load(open("json_files/rough_analysis.json",'rb'))
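
The five loads follow the same pattern, so they can also be written as a loop. A minimal sketch (assuming the same json_files/ layout), using a context manager so each file handle is closed:

analyses = {}
for name in ['bright', 'metal', 'hard', 'reverb', 'rough']:
    with open('json_files/%s_analysis.json' % name) as f:
        analyses[name] = json.load(f)  #analyses['bright'] holds the same mapping as dict_bright above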

In [2]:
#dict_bright  #uncomment to inspect the contents

In [3]:
#importing the CSV files; each file holds one sound id per row,
#so we keep only the first column of every row

def load_id_list(path):
    with open(path, 'rb') as f:
        return [row[0] for row in csv.reader(f)]

bright_list = load_id_list('csv_files/bright.csv')
warm_list = load_id_list('csv_files/warm.csv')
rough_list = load_id_list('csv_files/rough.csv')
reverb_list = load_id_list('csv_files/reverb.csv')
clear_list = load_id_list('csv_files/clear.csv')
hollow_list = load_id_list('csv_files/hollow.csv')
deep_list = load_id_list('csv_files/deep.csv')
punchy_list = load_id_list('csv_files/punchy.csv')
metallic_list = load_id_list('csv_files/metallic.csv')
sharp_list = load_id_list('csv_files/sharp.csv')
hard_list = load_id_list('csv_files/hard.csv')

In [4]:
#hard_list  #uncomment to inspect the contents; change the name accordingly for the other lists

In [5]:
len(hard_list)


Out[5]:
502

In [6]:
#the number of unique sounds in the dataset:
nb_sounds = len(set(bright_list + hard_list + warm_list + rough_list + reverb_list + clear_list + hollow_list + deep_list + punchy_list + metallic_list + sharp_list))

In [7]:
sounds_list = set(bright_list + hard_list + warm_list + rough_list + reverb_list + clear_list + hollow_list + deep_list + punchy_list + metallic_list + sharp_list)

In [8]:
with open("sounds_list.txt", "w") as output:
    output.write(str(sounds_list))
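
Writing str(sounds_list) stores the Python repr of the set, which is awkward to parse back later. A sketch that writes one id per line instead (sounds_list_lines.txt is a hypothetical filename):

with open("sounds_list_lines.txt", "w") as output:
    for sound_id in sorted(sounds_list):
        output.write("%s\n" % sound_id)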

In [9]:
print "bright length:",len(dict_bright)
print "metal length:",len(dict_metal)
print "hard length:",len(dict_hard)
print "reverb length:",len(dict_reverb)
print "rough length:",len(dict_rough)


bright length: 2745
metal length: 2645
hard length: 2735
reverb length: 2422
rough length: 2736

In [10]:
#cleaning: dropping entries whose descriptor value is NaN
clean_dict_bright = filter(lambda k: not np.isnan(dict_bright[k]), dict_bright)
clean_dict_metal = filter(lambda k: not np.isnan(dict_metal[k]), dict_metal)
clean_dict_hard = filter(lambda k: not np.isnan(dict_hard[k]), dict_hard)
clean_dict_reverb = filter(lambda k: not np.isnan(dict_reverb[k]), dict_reverb)
clean_dict_rough = filter(lambda k: not np.isnan(dict_rough[k]), dict_rough)
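
Note that in Python 2, filter over a dict returns a list of its keys, so the clean_dict_* variables above are key lists, not dicts. If the key-to-value mapping is needed later, a dict comprehension keeps it; a sketch with the same cleaning rule (clean_bright_values is a new name):

clean_bright_values = {k: v for k, v in dict_bright.items() if not np.isnan(v)}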

In [11]:
print "bright length:",len(clean_dict_bright)
print "metal length:",len(clean_dict_metal)
print "hard length:",len(clean_dict_hard)
print "reverb length:",len(clean_dict_reverb)
print "rough length:",len(clean_dict_rough)


bright length: 2745
metal length: 2562
hard length: 2721
reverb length: 2422
rough length: 2736

In [12]:
#clean_dict_bright  #uncomment to inspect one of them

In [13]:
#intersecting the cleaned key lists: keep only the ids that have all four descriptors
all_ids_intersection = list(set(clean_dict_bright) & set(clean_dict_metal) & set(clean_dict_hard) & set(clean_dict_rough))
len(all_ids_intersection)


Out[13]:
2550
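
With more descriptors the chain of & operators grows unwieldy; set.intersection accepts any number of iterables and gives the same result (a sketch):

all_ids_intersection = list(set(clean_dict_bright).intersection(
    clean_dict_metal, clean_dict_hard, clean_dict_rough))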

In [14]:
#creating the feature matrix X: one row per sound, one column per descriptor
X = []

for fs_id in all_ids_intersection:
    feature_vector = [dict_bright[fs_id], dict_metal[fs_id], dict_hard[fs_id], dict_rough[fs_id]]
    X.append(feature_vector)

X = np.array(X)
#X  #uncomment to print the matrix
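
Since the NaN filtering happened per descriptor, a cheap sanity check that nothing slipped through into X (a sketch):

assert not np.isnan(X).any(), "X still contains NaN values"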

In [15]:
len(X)


Out[15]:
2550

In [16]:
X.shape


Out[16]:
(2550, 4)

In [17]:
#confirming that the sizes match, as expected
print len(all_ids_intersection)


2550

In [18]:
NB_SOUNDS = len(X)      #same result as len(all_ids_intersection)
NB_LABELS = X.shape[1]  #four descriptor labels

y = np.zeros((NB_SOUNDS, NB_LABELS), dtype=int)

for idx, sound_id in enumerate(all_ids_intersection):  #iterate over all sounds (rows)
    if sound_id in bright_list:    #if the sound is tagged bright,
        y[idx][0] = 1              #set a 1 at row idx (sound), column 0 (label)
    if sound_id in metallic_list:
        y[idx][1] = 1
    if sound_id in hard_list:
        y[idx][2] = 1
    if sound_id in rough_list:
        y[idx][3] = 1
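
Each `in` test above scans a Python list linearly; converting the label lists to sets once makes the 2550 x 4 membership checks constant-time. A sketch building the same y:

label_sets = [set(bright_list), set(metallic_list), set(hard_list), set(rough_list)]
for idx, sound_id in enumerate(all_ids_intersection):
    for col, label_set in enumerate(label_sets):
        if sound_id in label_set:
            y[idx][col] = 1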

In [19]:
y.shape


Out[19]:
(2550, 4)

In [20]:
#y  #printing out y matrix

In [21]:
#SPLIT
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [22]:
#only the last expression in a cell is echoed, so Out shows the test-set shapes
X_train.shape, y_train.shape
X_test.shape, y_test.shape


Out[22]:
((510, 4), (510, 4))
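
train_test_split was not asked to stratify here (and plain stratification does not apply cleanly to multilabel targets), so it is worth confirming that the per-label positive rates look similar in both splits (a sketch):

print y_train.mean(axis=0)  #fraction of positives per label in the training split
print y_test.mean(axis=0)   #should be roughly comparable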

In [151]:
#X_train  #checking X_train matrix values

In [23]:
#TRAIN
#note: fitting on all of X means X_test has already been seen during training,
#so the test metrics below are optimistic; see the leakage-free sketch after the score cell
clf = OneVsRestClassifier(SVC(kernel='rbf'))
clf.fit(X, y)


Out[23]:
OneVsRestClassifier(estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
          n_jobs=1)

In [25]:
clf.score(X, y)  #mean subset accuracy, measured on the same data the model was fitted on


Out[25]:
0.76313725490196083
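
Because the classifier was fitted on all of X, X_test has already been seen and the held-out metrics below are optimistic. A sketch of the leakage-free evaluation, fitting on the training split only (clf_holdout is a new name):

clf_holdout = OneVsRestClassifier(SVC(kernel='rbf'))
clf_holdout.fit(X_train, y_train)
print clf_holdout.score(X_test, y_test)  #subset accuracy on genuinely unseen sounds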

In [26]:
#clf.predict(X_test)

In [27]:
#Recall is the ratio tp / (tp + fn),
#where tp is the number of true positives and fn the number of false negatives.
#Intuitively, recall is the ability of the classifier to find all the positive samples.
y_true = y_test
y_pred = clf.predict(X_test)

micro_rec = recall_score(y_true, y_pred, average='micro')
macro_rec = recall_score(y_true, y_pred, average='macro')
weigh_rec = recall_score(y_true, y_pred, average='weighted')
micro_rec, macro_rec, weigh_rec


Out[27]:
(0.68449197860962563, 0.68469416227912205, 0.68449197860962563)
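
A tiny worked example of how the averaging modes differ (toy arrays, not the dataset above):

yt = np.array([[1, 0], [1, 0], [0, 1]])
yp = np.array([[1, 0], [0, 0], [0, 1]])
print recall_score(yt, yp, average='macro')  #(0.5 + 1.0) / 2 = 0.75
print recall_score(yt, yp, average='micro')  #2 tp out of 3 true positives = 0.666...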

In [28]:
#Precision is the ratio tp / (tp + fp),
#where tp is the number of true positives and fp the number of false positives.
#Intuitively, precision is the ability of the classifier not to label as positive a sample that is negative.
#Precision here is far higher than the recall above: the classifier returns few
#false positives but misses many true positives.

micro_prec = precision_score(y_true, y_pred, average='micro')
macro_prec = precision_score(y_true, y_pred, average='macro')
weigh_prec = precision_score(y_true, y_pred, average='weighted')

micro_prec, macro_prec, weigh_prec


Out[28]:
(0.98841698841698844, 0.98584905660377364, 0.98713550600343059)

In [29]:
#ACCURACY

#accuracy_score compares the true and predicted label arrays;
#for multilabel y this is the exact-match (subset) ratio
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)
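
For multilabel targets, accuracy_score only counts a row when all four labels agree. hamming_loss gives the complementary per-cell view (a sketch):

from sklearn.metrics import hamming_loss
print hamming_loss(y_test, y_pred)  #fraction of individual label cells that are wrong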

In [30]:
#Classification report

y_pred = clf.predict(X_test)  #already computed above; repeated so the cell is self-contained
categories = ['bright', 'metal', 'hard', 'rough']
print(classification_report(y_test, y_pred, target_names=categories))


             precision    recall  f1-score   support

     bright       1.00      0.84      0.91        87
      metal       1.00      0.70      0.82       107
       hard       0.94      0.59      0.72        85
      rough       1.00      0.61      0.76        95

avg / total       0.99      0.68      0.81       374
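
If the per-label numbers are needed programmatically rather than as printed text, precision_recall_fscore_support returns them as arrays (a sketch):

from sklearn.metrics import precision_recall_fscore_support
prec, rec, f1, support = precision_recall_fscore_support(y_test, y_pred)
print prec  #one value per label, in the order bright, metal, hard, rough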