The sound of Silence

Dependencias

  • https://github.com/jiaaro/pydub or pip3 install pydub
  • sudo apt install ffmpeg
  • sudo apt install python3-pip
  • pip3 install keras
  • pip3 install tensorflow
  • pip3 install pandas
  • pip3 install h5py

Note

This procedure started out using a `data` module, but the code got too messy, so I started writing a small "system" to manage the research; thus, the cells that import `data` are somewhat deprecated.

In [1]:
from keras.models import Sequential
from keras.layers import Convolution1D,Dropout,Dense, MaxPooling1D, Activation,Reshape
from keras import metrics


Using TensorFlow backend.
/usr/lib/python3.6/importlib/_bootstrap.py:219: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6
  return f(*args, **kwds)

In [3]:
# Dataset locations (absolute local paths -- machine-specific, adjust as needed).
datasetMusan = "/home/joseildo/codigos/voiceDetection/musan/"
datasetMusanSepareted = "/media/joseildo/DATA/Bmusan"
datasetCodeTest = "/home/joseildo/codigos/voiceDetection/dataCodeTest/"
# Persist the paths across kernel restarts / other notebooks (`%store -r` to reload).
%store datasetMusan
%store datasetMusanSepareted
%store datasetCodeTest


Stored 'datasetMusan' (str)
Stored 'datasetMusanSepareted' (str)
Stored 'datasetCodeTest' (str)

In [4]:
# Dense baseline model: 1600-sample windows -> 2-way softmax
# (classes correspond to the "0"/"1" labels built in the data cells below).
modelo_2 = Sequential([
    Dense(1600, activation="sigmoid", input_shape=(None, 1600)),
    Dense(800, activation="sigmoid"),
    Dropout(0.2),
    Dense(100, activation="relu"),
    Dense(2, activation="softmax"),
])
modelo_2.compile(
    loss="categorical_crossentropy",
    optimizer="sgd",
    metrics=[metrics.categorical_accuracy],
)

In [5]:
# 1-D convolutional model: two conv+pool stages followed by a small dense head.
modelo_3 = Sequential([
    Convolution1D(1600, 100, padding="same", input_shape=(None, 1600)),
    MaxPooling1D(50, padding="same"),
    Activation("relu"),
    Convolution1D(400, 40, padding="same"),
    MaxPooling1D(50, padding="same"),
    Activation("relu"),
    Dense(100),
    Activation("tanh"),
    Dropout(0.5),
    Dense(2),
    Activation("softmax"),
])
modelo_3.compile(
    loss="categorical_crossentropy",
    optimizer="sgd",
    metrics=["acc"],
)

In [12]:
from Folder import Folder

from keras import metrics
from pydub import AudioSegment
import numpy as np
from keras.utils import np_utils


def _windowed_samples(segments, label, tempo):
    """Cut each audio segment into consecutive `tempo`-ms windows.

    Returns (data, labels): one raw-sample array per window plus one copy of
    `label` per window.  Any trailing partial window is dropped.
    """
    data, labels = [], []
    for segment in segments:
        # pydub slicing is in milliseconds; step == width -> non-overlapping windows
        for start in range(0, len(segment) - tempo, tempo):
            data.append(segment[start:start + tempo].get_array_of_samples())
            labels.append(label)
    return data, labels


## folders
pasta = Folder(src=datasetMusanSepareted)

pasta.regex = "(?=silence)"
silencio = pasta.sliceF(0.52).loadLowerSlicedToMemory()
pasta.regex = "(?=speech)"
voz = pasta.sliceF(0.035).loadLowerSlicedToMemory()

tempo = 100       # window length in ms -- TODO confirm units against Folder/pydub
dataSize = 50000  # cap per class so both classes contribute equally

# Previously two copy-pasted loops that also did `labels += "0"` -- list +=
# string only works because the label is a single character; the helper uses
# an explicit .append instead and keeps both classes in sync.
silencioData, silencioLabel = _windowed_samples(silencio, "0", tempo)
vozData, vozLabel = _windowed_samples(voz, "1", tempo)

data = silencioData[:dataSize] + vozData[:dataSize]
label = silencioLabel[:dataSize] + vozLabel[:dataSize]

print("quantidade de silencio:",len(silencioData),"quantidade de voz",len(vozData))

## test model

data, label = np.asarray([data]), np.asarray([np_utils.to_categorical(label, 2)])

print(data.shape)

modelo_2.fit(data, label, shuffle=True)

# NOTE(review): evaluation reuses the full *training* windows (vozData /
# silencioData), so these scores are not from a held-out set.
print(modelo_2.evaluate(np.asarray([vozData]), np.asarray([np_utils.to_categorical(vozLabel, 2)])))
print(modelo_2.evaluate(np.asarray([silencioData]), np.asarray([np_utils.to_categorical(silencioLabel, 2)])))


quantidade de silencio: 119928 quantidade de voz 57309
(1, 100000, 1600)
Epoch 1/10
1/1 [==============================] - 49s - loss: 1.0286 - categorical_accuracy: 0.5001
Epoch 2/10
1/1 [==============================] - 42s - loss: 0.7335 - categorical_accuracy: 0.5016
Epoch 3/10
1/1 [==============================] - 40s - loss: 0.7282 - categorical_accuracy: 0.4969
Epoch 4/10
1/1 [==============================] - 40s - loss: 0.7271 - categorical_accuracy: 0.4968
Epoch 5/10
1/1 [==============================] - 40s - loss: 0.7264 - categorical_accuracy: 0.4983
Epoch 6/10
1/1 [==============================] - 40s - loss: 0.7254 - categorical_accuracy: 0.5002
Epoch 7/10
1/1 [==============================] - 40s - loss: 0.7253 - categorical_accuracy: 0.5001
Epoch 8/10
1/1 [==============================] - 40s - loss: 0.7257 - categorical_accuracy: 0.4972
Epoch 9/10
1/1 [==============================] - 40s - loss: 0.7242 - categorical_accuracy: 0.4998
Epoch 10/10
1/1 [==============================] - 40s - loss: 0.7250 - categorical_accuracy: 0.5007
1/1 [==============================] - 10s
[0.65908843278884888, 0.65183478593826294]
1/1 [==============================] - 21s
[0.73986673355102539, 0.31953337788581848]

In [ ]:
from Folder import Folder

from keras import metrics
from pydub import AudioSegment
import numpy as np
from keras.utils import np_utils


def _windowed_samples_conv(segments, label, tempo):
    """Cut each audio segment into consecutive `tempo`-ms windows.

    Returns (data, labels): one raw-sample array per window plus one copy of
    `label` per window.  Stops 2*tempo before the end (matches the original
    loop bound of this cell; the dense-model cell stops at len - tempo).
    """
    data, labels = [], []
    for segment in segments:
        # pydub slicing is in milliseconds; step == width -> non-overlapping windows
        for start in range(0, len(segment) - 2 * tempo, tempo):
            data.append(segment[start:start + tempo].get_array_of_samples())
            labels.append(label)
    return data, labels


## folders
pasta = Folder(src=datasetCodeTest)

pasta.regex = "(?=silence)"
silencio = pasta.sliceF(1).loadLowerSlicedToMemory()
pasta.regex = "(?=speech)"
voz = pasta.sliceF(1).loadLowerSlicedToMemory()

tempo = 100       # window length in ms -- TODO confirm units against Folder/pydub
dataSize = 50000  # cap per class so both classes contribute equally

# Previously two copy-pasted loops using `labels += "1"` (list += string);
# the helper makes the .append explicit and keeps both classes in sync.
silencioData, silencioLabel = _windowed_samples_conv(silencio, "0", tempo)
vozData, vozLabel = _windowed_samples_conv(voz, "1", tempo)

data = silencioData[:dataSize] + vozData[:dataSize]
label = silencioLabel[:dataSize] + vozLabel[:dataSize]

print("quantidade de silencio:",len(silencioData),"quantidade de voz",len(vozData))

## test model

data, label = np.asarray([data]), np.asarray([np_utils.to_categorical(label, 2)])

# Reshape from (1, N, features) to (N, 1, features) for the Conv1D model.
data, label = data.reshape(data.shape[1], 1, data.shape[2]), label.reshape(label.shape[1], 1, label.shape[2])

print(data.shape)
print(label.shape)

modelo_3.fit(data, label, shuffle=True)

# NOTE(review): these evaluate calls pass 2-D arrays while the model was fit
# on 3-D (N, 1, features) input -- confirm the intended shape before trusting
# the reported scores.
print(modelo_3.evaluate(np.asarray(vozData), np.asarray(np_utils.to_categorical(vozLabel, 2))))
print(modelo_3.evaluate(np.asarray(silencioData), np.asarray(np_utils.to_categorical(silencioLabel, 2))))

Statistics

Todo-List

  • Produzir 20% de silencio
  • Produzir ruido
  • Gerar musicas misturadas com fala
  • Separar os silencio dos audios
  • Testes
    • testar usando 10% de silencio mais 10% de musica sem voz contra os 20% de fala
    • testar usando 10% de silencio mais 10% de musica com voz contra 20% de fala
    • testar usando (20/3)% de silencio mais (20/3)% de musica sem voz mais (20/3)% ruido contra 20% de fala

      fma contains voice


In [1]:
from keras.models import load_model
import data

audios = "/home/gtad/gapsSom/voiceDetection/musan/"
data.iterateOnFold(audios)

# Per-class window counts, read from the one-hot rows of y[0]
# (column order by variable name: silence, speech, music, noise).
silence_quant = 0
speech_quant = 0
music_quant = 0
noise_quant = 0
skipped = 0  # folds that raised while loading/counting

while data.hasNext():
    # Best-effort: a broken fold must not abort the whole count, but the
    # original bare `except: pass` hid every failure (including interrupts).
    # Catch Exception only, and keep track of how many folds were skipped.
    try:
        x, y = data.next()
        for i in y[0]:
            if i[0] == 1:
                silence_quant += 1
            elif i[1] == 1:
                speech_quant += 1
            elif i[2] == 1:
                music_quant += 1
            elif i[3] == 1:
                noise_quant += 1
    except Exception:
        skipped += 1

count = silence_quant + speech_quant + music_quant + noise_quant
if count:  # guard: avoid ZeroDivisionError when every fold failed to load
    print("\nsilence percentage: %.2f"%(silence_quant/count))
    print("speech percentage: %.2f"%(speech_quant/count))
    print("music percentage: %.2f"%(music_quant/count))
    print("noise percentage: %.2f"%(noise_quant/count))
else:
    print("\nno labelled windows counted (%d folds skipped)" % skipped)


Using TensorFlow backend.
[============================================================] 100.0% ...loading and running fold
silence percentage: 0.06
speech percentage: 0.51
music percentage: 0.38
noise percentage: 0.05

In [9]:
from pydub import AudioSegment as AS
from pydub import silence as dub
from pydub.utils import mediainfo
import os

# Accumulated duration (seconds) and file count per MUSAN category.
music = 0
mC = 0
noise = 0
nC = 0
speech = 0
sC = 0

# Use explicit paths instead of os.chdir(): chdir mutates process-wide state
# and silently changes what any later cell's relative paths resolve to.
musanDir = "/home/joseildo/codigos/voiceDetection/musan/"
for audio in os.listdir(musanDir):
    # mediainfo returns metadata as strings; "duration" is seconds.
    sound = mediainfo(os.path.join(musanDir, audio))
    # Category is encoded in the file name; anything that is neither
    # speech nor music is counted as noise.
    if "speech" in audio:
        speech += float(sound["duration"])
        sC += 1
    elif "music" in audio:
        music += float(sound["duration"])
        mC += 1
    else:
        noise += float(sound["duration"])
        nC += 1

# NOTE: `time` here shadows the stdlib module name in the notebook namespace
# until a later cell re-imports it.
time = music + noise + speech
total = mC + nC + sC
print("music time:",music,"noise time:",noise,"speech time:",speech,"\n")
print("music quant:",mC,"noise quant:",nC,"speech quant:",sC)
print("total time:",time)
print("music percentage of seconds: %.2f"%(music/time))
print("noise percentage of seconds: %.2f"%(noise/time))
print("speech percentage of seconds: %.2f"%(speech/time))
print("music percentage of files: %.2f"%(mC/total))
print("noise percentage of files: %.2f"%(nC/total))
print("speech percentage of files: %.2f"%(sC/total))


music time: 153412.7663940001 noise time: 22414.717253999992 speech time: 217615.099637 

music quant: 660 noise quant: 930 speech quant: 426
total time: 393442.58328500006
music percentage of seconds: 0.39
noise percentage of seconds: 0.06
speech percentage of seconds: 0.55
music percentage of files: 0.33
noise percentage of files: 0.46
speech percentage of files: 0.21

Split Silence from sound


In [3]:
from Folder import Folder
from Data import Data

folder = Folder(src =  "/media/joseildo/DATA/Linux/musan",output="/home/joseildo/codigos/voiceDetection/Bmusan/")
i = 0  # running index so every stored silence file gets a unique name
with Data(folder = folder,thresh = -42) as data:
    # The same split/store procedure applies to every category; it was
    # originally copy-pasted three times with only the regex changed.
    for pattern in ("(?=music)", "(?=noise)", "(?=speech)"):
        folder.regex = pattern
        for audio in iter(data):
            # Separate each file into its silent and non-silent parts,
            # then store both (sounds keep their name, silences are numbered).
            audio.splitSilence().appendSilences()
            audio.splitSound().appendSounds()
            audio.storeSounds(name=audio.audiosName)
            audio.storeSilences(name="silence-"+str(i))
            i += 1


660/660    loading
930/930    loading
426/426    loading
do nothing

In [ ]:
import time

# Console progress demo: rewrite the same line via a carriage return.
for i in range(100):
    time.sleep(0.5)
    # flush=True so each update is actually rendered -- without it stdout may
    # buffer and the line never moves.  (str("\r") was a redundant no-op call.)
    print("\r", i, end="", flush=True)