In [2]:
#-------------------------------------------------------------------------------
# Name:        CSC492 - Coding Assignment #1
# Purpose:
#
# Author:      Marion
#
# Created:     13/10/2017
# Copyright:   (c) Marion 2017
# Licence:     <your licence>
#-------------------------------------------------------------------------------

import csv
import numpy as np
from numpy.random import randn

#seed the random numbers to help debbugging

np.random.seed(1)

#define hyperparameters

LEARNING_RATE = 0.01
NB_FEATURES = 26
NB_TRAININGEX = 13000
NB_CLASSES = 13
NB_HIDDEN_NEURONS = 16
NB_TEST = 10400

#getting the test data from the csv file
testDatatemp = np.loadtxt(open("../data/kaggle-music-genre/test.x.csv","rb"), dtype =np.float16,delimiter = ',',skiprows=1, usecols=range(1,27))

#invert testData to get a 26 * 13000 matrix
testDatatemp = testDatatemp.T

#getting the input data from the csv file
input = np.loadtxt(open("../data/kaggle-music-genre/train.x.csv","rb"), dtype =np.float16,delimiter = ',',skiprows=1, usecols=range(1,27))

#invert input to get a 26 * 10400 matrix
inputFinal = input.T

testData = testDatatemp

#getting the output data from the csv file
outputtemp = np.genfromtxt(open("../data/kaggle-music-genre/train.y.csv","rb"), dtype = 'str', delimiter=',',skip_header=1, usecols=(1))
output = np.zeros((NB_CLASSES,NB_TRAININGEX))

#initializing all the weigths randomly
syn1 = np.random.random((NB_FEATURES,NB_HIDDEN_NEURONS)) 
syn2 = np.random.random((NB_HIDDEN_NEURONS, NB_CLASSES))

In [54]:
testData.shape, inputFinal.shape, outputtemp.shape, syn1.shape, syn2.shape, output.shape, outputtemp.shape


Out[54]:
((26, 10400), (26, 13000), (13000,), (26, 16), (16, 13), (13, 13000), (13000,))

In [88]:
outputtemp[:20]


Out[88]:
array(['International', 'Vocal', 'Latin', 'Blues', 'Vocal', 'Jazz', 'Folk',
       'Folk', 'RnB', 'Pop_Rock', 'Latin', 'Latin', 'New_Age', 'Vocal',
       'Rap', 'Reggae', 'Rap', 'Jazz', 'Rap', 'Jazz'], 
      dtype='<U13')

In [91]:
import pandas as pd
train_y = pd.read_csv(filepath_or_buffer='../data/kaggle-music-genre/train.y.csv')
train_y.head()


Out[91]:
Id class_label
0 1 International
1 2 Vocal
2 3 Latin
3 4 Blues
4 5 Vocal

In [93]:
test_y_sample = pd.read_csv(filepath_or_buffer='../data/kaggle-music-genre/submission-random.csv')
test_y_sample.head()


Out[93]:
Id Blues Country Electronic Folk International Jazz Latin New_Age Pop_Rock Rap Reggae RnB Vocal
0 1 0.0964 0.0884 0.0121 0.1004 0.0137 0.1214 0.0883 0.0765 0.0332 0.0445 0.1193 0.1019 0.1038
1 2 0.0121 0.0804 0.0376 0.0289 0.1310 0.0684 0.1044 0.0118 0.1562 0.0585 0.1633 0.1400 0.0073
2 3 0.1291 0.0985 0.0691 0.0356 0.0788 0.0529 0.1185 0.1057 0.1041 0.0075 0.0481 0.1283 0.0238
3 4 0.0453 0.1234 0.0931 0.0126 0.1224 0.0627 0.0269 0.0764 0.0812 0.1337 0.0357 0.0937 0.0930
4 5 0.0600 0.0915 0.0667 0.0947 0.0509 0.0335 0.1251 0.0202 0.1012 0.0365 0.1310 0.0898 0.0991

In [94]:
test_y_sample[:0]


Out[94]:
Id Blues Country Electronic Folk International Jazz Latin New_Age Pop_Rock Rap Reggae RnB Vocal

In [95]:
# import numpy as np

# train_X = np.array(train_x)
train_Y = np.array(train_y[:]['class_label'])
# test_X = np.array(test_x)

# Getting rid of the first and the last column: Id and msd_track_id
# X_train_val = np.array(train_X[:, 1:-1], dtype=float)
# X_test = np.array(test_X[:, 1:], dtype=float)

train_Y.shape


Out[95]:
(13000,)

In [96]:
from collections import Counter

# Count the freq of the keys in the training labels
counted_labels = Counter(train_Y)
labels_keys = counted_labels.keys()
labels_keys


Out[96]:
dict_keys(['Blues', 'Jazz', 'Electronic', 'Rap', 'Folk', 'New_Age', 'Country', 'Pop_Rock', 'Vocal', 'Latin', 'RnB', 'International', 'Reggae'])

In [97]:
labels_keys_sorted = sorted(labels_keys)
labels_keys_sorted


Out[97]:
['Blues',
 'Country',
 'Electronic',
 'Folk',
 'International',
 'Jazz',
 'Latin',
 'New_Age',
 'Pop_Rock',
 'Rap',
 'Reggae',
 'RnB',
 'Vocal']

In [98]:
# This for loop for creating a dictionary/ vocab
key_to_val = {key: val for val, key in enumerate(labels_keys_sorted)}
key_to_val['Country']
key_to_val


Out[98]:
{'Blues': 0,
 'Country': 1,
 'Electronic': 2,
 'Folk': 3,
 'International': 4,
 'Jazz': 5,
 'Latin': 6,
 'New_Age': 7,
 'Pop_Rock': 8,
 'Rap': 9,
 'Reggae': 10,
 'RnB': 11,
 'Vocal': 12}

In [114]:
Y_train_vec = []
for each in train_y[:]['class_label']:
#     print(each, key_to_val[each])
    Y_train_vec.append(key_to_val[each])

Y_train_val = np.array(Y_train_vec)
Y_train_val.shape, Y_train_vec[:10], Y_train_val[:10]


Out[114]:
((13000,),
 [4, 12, 6, 0, 12, 5, 3, 3, 11, 8],
 array([ 4, 12,  6,  0, 12,  5,  3,  3, 11,  8]))

In [101]:
#initializing the output matrix for the training data, we map the classes, we get a 13 * 13000 matrix, 1 for the good class, 0 for the others
j = 0
while j < NB_TRAININGEX:
	str = outputtemp[j]
	if str == 'International':
		output[0,j] = 1
	if str == 'Vocal':
		output[1,j] = 1
	if str == 'Latin':
		output[2,j] = 1
	if str == 'Blues':
		output[3,j] = 1
	if str == 'Country':
		output[4,j] = 1
	if str == 'Electronic':
		output[5,j] = 1
	if str == 'Folk':
		output[6,j] = 1
	if str == 'Jazz':
		output[7,j] = 1
	if str == 'New_Age':
		output[8,j] = 1
	if str == 'Pop_Rock':
		output[9,j] = 1
	if str == 'Rap':
		output[10,j] = 1
	if str == 'Reggae':
		output[11,j] = 1
	if str == 'RnB':
		output[12,j] = 1
	j = j+1

#values for reference
#international = 0
#vocal = 1
#latin = 2
#blues = 3
#country = 4
#electronic = 5
#folk = 6
#jazz= 7
#new-age=8
#pop_rock = 9
#rap = 10
#reggae = 11
#rnb = 12

def sigmoid(x):
	return 1/(1 + np.exp(-x))

# def sigmoidDeriv(x):
# 	return x *(1 - x)
# def sigmoid(X):
#     return 1 / (1 + np.exp(-X))

#1 hidden layer = 2 synapses = 2 
def forwardPass(inputLayer, weights1, weigths2):

    hiddenLayer = weights1.T.dot(inputLayer)
    print('hiddenLayer.shape', hiddenLayer.shape)

    # apply sigmoid on all activations
    hiddenLayer = sigmoid(hiddenLayer)
    print('hiddenLayer.shape', hiddenLayer.shape)

# # 	i = 0
# # 	while i < NB_HIDDEN_NEURONS:
# # 		hiddenLayer[i] = sigmoid(hiddenLayer[i])
# # 		i = i + 1

    result = weigths2.T.dot(hiddenLayer)
    print('result.shape', result.shape)
    
    return result

In [102]:
# The model here is refering to the last layer model before softmax function applied.
def softmax(X):
    eX = np.exp((X.T - np.max(X, axis=1)).T)
    return (eX.T / eX.sum(axis=1)).T

def cross_entropy(y_pred, y_train):
    m = int(y_pred.shape[0])

    prob = softmax(y_pred)
    print(prob.shape, y_train.shape, m)
    log_like = -np.log(prob[range(m), y_train]) # to avoid the division/dividing by zero
    data_loss = np.sum(log_like) / m

    return data_loss

def dcross_entropy(y_pred, y_train): # this is equal for both since the reg_loss (noise) derivative is ZERO.
    m = y_pred.shape[0]

    grad_y = softmax(y_pred)
    grad_y[range(m), y_train] -= 1.
    grad_y /= m

    return grad_y

def loss_function(y, y_train):

    loss = cross_entropy(y, y_train) # softmax is included
    dy = dcross_entropy(y, y_train) # dsoftmax is included

    return loss, dy

In [113]:
#result2 = forwardPass(inputFinal,syn1,syn2)

#calculate the error

#testing
testing = forwardPass(testData[:, :10],syn1,syn2)
# testing = testing.T
testing.reshape(10, -1).shape, output.shape, output[:, :10].shape, output[:, :10].dtype, testing.dtype, outputtemp[:10].shape
output[:2], train_Y.shape, testing.shape, Y_train_vec[:10]


hiddenLayer.shape (16, 10)
hiddenLayer.shape (16, 10)
result.shape (13, 10)
Out[113]:
(array([[ 1.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  1.,  0., ...,  1.,  0.,  0.]]),
 (13000,),
 (13, 10),
 [4, 12, 6, 0, 12, 5, 3, 3, 11, 8])

In [118]:
loss, dy = loss_function(y=testing.reshape(10, -1), y_train=Y_train_val[:10])
# (1300, 13) (1300,)
loss, dy.shape


(10, 13) (10,) 10
Out[118]:
(2.3792356624941791, (10, 13))

In [5]:
#add index column
finalOutput = np.zeros((NB_TEST,NB_CLASSES+1))
i = 0
while i < NB_TEST:
	finalOutput[i] = np.hstack((i+1,testing[i]))
	i = i + 1

finalOutput.astype(np.int32)

#test data&
with open('submission.csv','a') as f_handle:
	np.savetxt(f_handle, finalOutput, fmt='%i,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f',delimiter=",")


/home/arasdar/anaconda3/envs/arasdar-DL-env/lib/python3.5/site-packages/ipykernel_launcher.py:49: RuntimeWarning: overflow encountered in exp
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
~/anaconda3/envs/arasdar-DL-env/lib/python3.5/site-packages/numpy/lib/npyio.py in savetxt(fname, X, fmt, delimiter, newline, header, footer, comments)
   1157                 try:
-> 1158                     fh.write(asbytes(format % tuple(row) + newline))
   1159                 except TypeError:

TypeError: write() argument must be str, not bytes

During handling of the above exception, another exception occurred:

TypeError                                 Traceback (most recent call last)
<ipython-input-5-1a577239c49e> in <module>()
     19 #test data&
     20 with open('submission.csv','a') as f_handle:
---> 21         np.savetxt(f_handle, finalOutput, fmt='%i,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f',delimiter=",")

~/anaconda3/envs/arasdar-DL-env/lib/python3.5/site-packages/numpy/lib/npyio.py in savetxt(fname, X, fmt, delimiter, newline, header, footer, comments)
   1160                     raise TypeError("Mismatch between array dtype ('%s') and "
   1161                                     "format specifier ('%s')"
-> 1162                                     % (str(X.dtype), format))
   1163         if len(footer) > 0:
   1164             footer = footer.replace('\n', '\n' + comments)

TypeError: Mismatch between array dtype ('float64') and format specifier ('%i,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f,%1.4f')

In [ ]: