In [2]:
#-------------------------------------------------------------------------------
# Name: CSC492 - Coding Assignment #1
# Purpose:
#
# Author: Marion
#
# Created: 13/10/2017
# Copyright: (c) Marion 2017
# Licence: <your licence>
#-------------------------------------------------------------------------------
import csv  # NOTE(review): unused in this cell; kept in case later cells need it
import numpy as np
from numpy.random import randn  # NOTE(review): unused; weights use np.random.random below

# Seed the RNG so the weight initialization is reproducible (helps debugging)
np.random.seed(1)

# Hyperparameters
LEARNING_RATE = 0.01
NB_FEATURES = 26
NB_TRAININGEX = 13000
NB_CLASSES = 13
NB_HIDDEN_NEURONS = 16
NB_TEST = 10400

# Test features from the csv file: skip the header row and the Id column
# (columns 1..26 are the 26 audio features).
# NOTE(review): float16 keeps only ~3 significant digits — confirm the
# features survive this precision, otherwise prefer float32.
testDatatemp = np.loadtxt(open("../data/kaggle-music-genre/test.x.csv", "rb"),
                          dtype=np.float16, delimiter=',', skiprows=1,
                          usecols=range(1, 27))
# Transpose so examples are columns: (NB_FEATURES, NB_TEST) = 26 x 10400
testDatatemp = testDatatemp.T

# Training features, same layout as the test file
# (renamed from `input`, which shadowed the Python builtin)
trainData = np.loadtxt(open("../data/kaggle-music-genre/train.x.csv", "rb"),
                       dtype=np.float16, delimiter=',', skiprows=1,
                       usecols=range(1, 27))
# Transpose so examples are columns: (NB_FEATURES, NB_TRAININGEX) = 26 x 13000
inputFinal = trainData.T
testData = testDatatemp

# Training labels: one genre-name string per example (column 1 of train.y.csv)
outputtemp = np.genfromtxt(open("../data/kaggle-music-genre/train.y.csv", "rb"),
                           dtype='str', delimiter=',', skip_header=1, usecols=(1))
# One-hot label matrix, filled in by a later cell: (NB_CLASSES, NB_TRAININGEX)
output = np.zeros((NB_CLASSES, NB_TRAININGEX))

# Random weight initialization in [0, 1) for both layers
syn1 = np.random.random((NB_FEATURES, NB_HIDDEN_NEURONS))
syn2 = np.random.random((NB_HIDDEN_NEURONS, NB_CLASSES))
In [54]:
testData.shape, inputFinal.shape, outputtemp.shape, syn1.shape, syn2.shape, output.shape, outputtemp.shape
Out[54]:
In [88]:
outputtemp[:20]
Out[88]:
In [91]:
# Load the training labels with pandas for easier inspection
import pandas as pd

train_y = pd.read_csv('../data/kaggle-music-genre/train.y.csv')
train_y.head()
Out[91]:
In [93]:
# Load the provided random-submission sample to see the expected file format
test_y_sample = pd.read_csv('../data/kaggle-music-genre/submission-random.csv')
test_y_sample.head()
Out[93]:
In [94]:
test_y_sample[:0]
Out[94]:
In [95]:
# Pull the label column out of the DataFrame as a NumPy array of strings
train_Y = np.array(train_y['class_label'])
train_Y.shape
Out[95]:
In [96]:
from collections import Counter

# Tally how often each genre appears among the training labels
counted_labels = Counter()
counted_labels.update(train_Y)
labels_keys = counted_labels.keys()
labels_keys
Out[96]:
In [97]:
# Alphabetically ordered list of the distinct genre names
labels_keys_sorted = list(labels_keys)
labels_keys_sorted.sort()
labels_keys_sorted
Out[97]:
In [98]:
# This for loop for creating a dictionary/ vocab
key_to_val = {key: val for val, key in enumerate(labels_keys_sorted)}
key_to_val['Country']
key_to_val
Out[98]:
In [114]:
# Integer-encode every training label through the key_to_val vocabulary
# (list comprehension replaces the original append loop, same result)
Y_train_vec = [key_to_val[label] for label in train_y['class_label']]
Y_train_val = np.array(Y_train_vec)
Y_train_val.shape, Y_train_vec[:10], Y_train_val[:10]
Out[114]:
In [101]:
# Fill the one-hot label matrix: output[c, j] = 1 when example j is class c
# (13 x 13000; the column stays all-zero if the label is not recognized,
# matching the original if-chain behavior).
# NOTE(review): this manual ordering does NOT match the alphabetical
# key_to_val encoding defined earlier — confirm which one is intended.
# (The original used `str` as a variable name, shadowing the builtin.)
GENRE_TO_INDEX = {
    'International': 0,
    'Vocal': 1,
    'Latin': 2,
    'Blues': 3,
    'Country': 4,
    'Electronic': 5,
    'Folk': 6,
    'Jazz': 7,
    'New_Age': 8,
    'Pop_Rock': 9,
    'Rap': 10,
    'Reggae': 11,
    'RnB': 12,
}
for j in range(NB_TRAININGEX):
    genre = outputtemp[j]  # label string for training example j
    if genre in GENRE_TO_INDEX:
        output[GENRE_TO_INDEX[genre], j] = 1
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + e^-x).

    Evaluated via exp(-|x|) in both branches so np.exp never overflows
    (the original 1/(1+exp(-x)) form overflows and warns for large
    negative x). Accepts scalars or ndarrays; returns an ndarray
    (0-d for scalar input), values in (0, 1).
    """
    z = np.exp(-np.abs(x))
    return np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
def forwardPass(inputLayer, weights1, weights2):
    """Forward pass of the 1-hidden-layer net.

    inputLayer: (NB_FEATURES, n) matrix with one example per column.
    weights1:   (NB_FEATURES, NB_HIDDEN_NEURONS) input->hidden weights.
    weights2:   (NB_HIDDEN_NEURONS, NB_CLASSES) hidden->output weights
                (parameter renamed from the original misspelling `weigths2`;
                all visible call sites pass it positionally).
    Returns the (NB_CLASSES, n) matrix of raw class scores — the output
    layer is deliberately linear because softmax is applied later inside
    the loss functions.

    (Removed the leftover debug prints and the commented-out duplicate
    sigmoid/sigmoidDeriv definitions.)
    """
    hiddenLayer = sigmoid(weights1.T.dot(inputLayer))
    return weights2.T.dot(hiddenLayer)
In [102]:
# "Model output" below means the raw class scores (logits) before softmax.

def softmax(X):
    """Row-wise softmax of a 2-D score matrix X of shape (m, n_classes)."""
    # Subtract the row max before exponentiating for numerical stability
    # (keepdims form is algebraically identical to the original
    # transpose-broadcast-transpose trick).
    eX = np.exp(X - X.max(axis=1, keepdims=True))
    return eX / eX.sum(axis=1, keepdims=True)

def cross_entropy(y_pred, y_train):
    """Mean cross-entropy loss.

    y_pred:  (m, n_classes) raw scores.
    y_train: (m,) integer class labels.
    Returns a scalar loss. (Removed a leftover debug print.)
    """
    m = y_pred.shape[0]
    prob = softmax(y_pred)
    # Negative log-probability of the true class for each example.
    # NOTE(review): prob can still underflow to 0 for extreme scores,
    # making the log infinite — clip if that ever occurs.
    log_like = -np.log(prob[range(m), y_train])
    data_loss = np.sum(log_like) / m
    return data_loss

def dcross_entropy(y_pred, y_train):
    """Gradient of the mean cross-entropy w.r.t. the raw scores.

    Equals (softmax(y_pred) - onehot(y_train)) / m, shape (m, n_classes).
    """
    m = y_pred.shape[0]
    grad_y = softmax(y_pred)
    grad_y[range(m), y_train] -= 1.
    grad_y /= m
    return grad_y

def loss_function(y, y_train):
    """Return (loss, dloss/dy) for raw scores y and integer labels y_train."""
    loss = cross_entropy(y, y_train)  # softmax applied inside
    dy = dcross_entropy(y, y_train)   # softmax applied inside
    return loss, dy
In [113]:
#result2 = forwardPass(inputFinal,syn1,syn2)
#calculate the error
#testing
# Run the net on the first 10 test examples; forwardPass returns raw scores
# of shape (NB_CLASSES, 10) = (13, 10).
testing = forwardPass(testData[:, :10],syn1,syn2)
# testing = testing.T
# NOTE(review): reshape(10, -1) on a (13, 10) matrix reorders the flat buffer
# and scrambles scores across examples — transpose (.T) is almost certainly
# what is meant wherever example-rows are needed; confirm before relying on it.
testing.reshape(10, -1).shape, output.shape, output[:, :10].shape, output[:, :10].dtype, testing.dtype, outputtemp[:10].shape
output[:2], train_Y.shape, testing.shape, Y_train_vec[:10]
Out[113]:
In [118]:
# `testing` has shape (NB_CLASSES, n_examples); loss_function expects
# (n_examples, NB_CLASSES). Use a transpose — the original reshape(10, -1)
# re-read the (13, 10) buffer row-major and scrambled scores across examples.
loss, dy = loss_function(y=testing.T, y_train=Y_train_val[:10])
loss, dy.shape
Out[118]:
In [5]:
# Build the submission matrix: a 1-based Id column followed by the 13 class
# scores of each test example (vectorized; replaces the original while loop,
# and drops a no-op `finalOutput.astype(np.int32)` whose result was discarded).
# NOTE(review): `testing` comes out of forwardPass as (NB_CLASSES, n) but is
# indexed here by example-rows — it likely needs a full NB_TEST forward pass
# plus a transpose (testing.T) before this cell; confirm.
ids = np.arange(1, NB_TEST + 1).reshape(-1, 1)
finalOutput = np.hstack((ids, testing[:NB_TEST]))
# Open in 'w' mode — the original 'a' appended duplicate rows on every re-run
with open('submission.csv', 'w') as f_handle:
    np.savetxt(f_handle, finalOutput,
               fmt='%i,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f',
               delimiter=",")
In [ ]: