Dataset statistics


In [41]:
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm

In [42]:
# Directory holding the AISHELL-2 iOS metadata files; edit this single line to
# point the notebook at a different copy of the dataset.
DATA_DIR = '/data/hktxt/AISHELL-2/iOS/data'

wavlist = DATA_DIR + '/wav.scp'        # list of wav files (utterance ID + relative path)
trans = DATA_DIR + '/trans.txt'        # transcripts (the original comment said "transition")
spk_info = DATA_DIR + '/spk_info.txt'  # speaker info

In [43]:
# wav.scp has no header row; each line is "<utterance ID> <relative wav path>"
# separated by whitespace. Note that the column named "folder" holds the utterance ID.
# The separator is a raw string: "\s" in a plain literal is an invalid escape sequence
# that newer Python versions warn about.
wav = pd.read_csv(wavlist, header=None, sep=r"\s+", names=["folder", "file"])

In [44]:
wav


Out[44]:
folder file
0 IC0001W0001 wav/C0001/IC0001W0001.wav
1 IC0001W0002 wav/C0001/IC0001W0002.wav
2 IC0001W0003 wav/C0001/IC0001W0003.wav
3 IC0001W0004 wav/C0001/IC0001W0004.wav
4 IC0001W0005 wav/C0001/IC0001W0005.wav
5 IC0001W0006 wav/C0001/IC0001W0006.wav
6 IC0001W0007 wav/C0001/IC0001W0007.wav
7 IC0001W0008 wav/C0001/IC0001W0008.wav
8 IC0001W0009 wav/C0001/IC0001W0009.wav
9 IC0001W0010 wav/C0001/IC0001W0010.wav
10 IC0001W0011 wav/C0001/IC0001W0011.wav
11 IC0001W0012 wav/C0001/IC0001W0012.wav
12 IC0001W0013 wav/C0001/IC0001W0013.wav
13 IC0001W0014 wav/C0001/IC0001W0014.wav
14 IC0001W0015 wav/C0001/IC0001W0015.wav
15 IC0001W0016 wav/C0001/IC0001W0016.wav
16 IC0001W0017 wav/C0001/IC0001W0017.wav
17 IC0001W0018 wav/C0001/IC0001W0018.wav
18 IC0001W0019 wav/C0001/IC0001W0019.wav
19 IC0001W0020 wav/C0001/IC0001W0020.wav
20 IC0001W0021 wav/C0001/IC0001W0021.wav
21 IC0001W0022 wav/C0001/IC0001W0022.wav
22 IC0001W0023 wav/C0001/IC0001W0023.wav
23 IC0001W0024 wav/C0001/IC0001W0024.wav
24 IC0001W0025 wav/C0001/IC0001W0025.wav
25 IC0001W0026 wav/C0001/IC0001W0026.wav
26 IC0001W0027 wav/C0001/IC0001W0027.wav
27 IC0001W0028 wav/C0001/IC0001W0028.wav
28 IC0001W0029 wav/C0001/IC0001W0029.wav
29 IC0001W0030 wav/C0001/IC0001W0030.wav
... ... ...
1009193 ID2166W0491 wav/D2166/ID2166W0491.wav
1009194 ID2166W0492 wav/D2166/ID2166W0492.wav
1009195 ID2166W0493 wav/D2166/ID2166W0493.wav
1009196 ID2166W0494 wav/D2166/ID2166W0494.wav
1009197 ID2166W0495 wav/D2166/ID2166W0495.wav
1009198 ID2166W0496 wav/D2166/ID2166W0496.wav
1009199 ID2166W0497 wav/D2166/ID2166W0497.wav
1009200 ID2166W0498 wav/D2166/ID2166W0498.wav
1009201 ID2166W0499 wav/D2166/ID2166W0499.wav
1009202 ID2166W0500 wav/D2166/ID2166W0500.wav
1009203 ID2166W0501 wav/D2166/ID2166W0501.wav
1009204 ID2166W0502 wav/D2166/ID2166W0502.wav
1009205 ID2166W0503 wav/D2166/ID2166W0503.wav
1009206 ID2166W0504 wav/D2166/ID2166W0504.wav
1009207 ID2166W0505 wav/D2166/ID2166W0505.wav
1009208 ID2166W0506 wav/D2166/ID2166W0506.wav
1009209 ID2166W0507 wav/D2166/ID2166W0507.wav
1009210 ID2166W0508 wav/D2166/ID2166W0508.wav
1009211 ID2166W0509 wav/D2166/ID2166W0509.wav
1009212 ID2166W0510 wav/D2166/ID2166W0510.wav
1009213 ID2166W0511 wav/D2166/ID2166W0511.wav
1009214 ID2166W0512 wav/D2166/ID2166W0512.wav
1009215 ID2166W0513 wav/D2166/ID2166W0513.wav
1009216 ID2166W0514 wav/D2166/ID2166W0514.wav
1009217 ID2166W0515 wav/D2166/ID2166W0515.wav
1009218 ID2166W0516 wav/D2166/ID2166W0516.wav
1009219 ID2166W0517 wav/D2166/ID2166W0517.wav
1009220 ID2166W0518 wav/D2166/ID2166W0518.wav
1009221 ID2166W0519 wav/D2166/ID2166W0519.wav
1009222 ID2166W0520 wav/D2166/ID2166W0520.wav

1009223 rows × 2 columns


In [45]:
len(wav) # total wav files 1009223


Out[45]:
1009223

wav duration statistics


In [46]:
wav.folder[1]


Out[46]:
'IC0001W0002'

In [47]:
wav.folder[1:9]


Out[47]:
1    IC0001W0002
2    IC0001W0003
3    IC0001W0004
4    IC0001W0005
5    IC0001W0006
6    IC0001W0007
7    IC0001W0008
8    IC0001W0009
Name: folder, dtype: object

In [48]:
# Characters 1-5 of an utterance ID are the speaker ID (here 'C0001').
wav.folder[1][1:6]


Out[48]:
'C0001'

In [49]:
wav.folder[1:3]


Out[49]:
1    IC0001W0002
2    IC0001W0003
Name: folder, dtype: object

Extract the speaker ID (characters 1-5 of each utterance ID in wav.folder) into the list l


In [50]:
# Collect the speaker ID (characters 1-5 of the utterance ID) for every row.
# Iterating over the column's values avoids one label-based Series lookup
# (wav.folder[i]) per row, which dominated the runtime of the index-based loop.
# tqdm is kept so the cell still shows a progress bar.
l = [utt_id[1:6] for utt_id in tqdm(wav.folder)]


100%|██████████| 1009223/1009223 [00:44<00:00, 22473.61it/s]

In [51]:
len(l)


Out[51]:
1009223

Remove the duplicate elements. Two methods are shown: np.unique and set. After removing duplicates, 1991 unique elements remain.


In [52]:
l1 = np.unique(l)  # original note: "order unchanged" -- np.unique returns the unique values in sorted order

In [53]:
len(l1)


Out[53]:
1991

In [54]:
l2 = list(set(l))  # original note: "order changed" -- a set does not keep the input order

In [55]:
len(l2)


Out[55]:
1991

In [56]:
l1


Out[56]:
array(['C0001', 'C0002', 'C0003', ..., 'D2164', 'D2165', 'D2166'],
      dtype='<U5')

In [57]:
l2[:10]


Out[57]:
['D0093',
 'C0448',
 'D0177',
 'C0094',
 'C0872',
 'C9108',
 'C0576',
 'D0390',
 'C9001',
 'D0245']

In [58]:
# Sort in place so that anything indexed from l2 afterwards follows speaker-ID order.
l2.sort()

In [59]:
l2[:10]


Out[59]:
['C0001',
 'C0002',
 'C0003',
 'C0004',
 'C0005',
 'C0006',
 'C0007',
 'C0008',
 'C0009',
 'C0010']

Use a dict to relabel the speaker IDs as consecutive integers: C0001->0, C0002->1, ... , D2166->1990


In [60]:
# Build the speaker-ID -> class-index mapping: position in l2 becomes the index.
label = {}
for class_idx, speaker_id in enumerate(tqdm(l2)):
    label[speaker_id] = class_idx


100%|██████████| 1991/1991 [00:00<00:00, 583894.51it/s]

In [100]:
label['C0001'] #the first one


Out[100]:
0

In [101]:
label['D2166'] #the last one


Out[101]:
1990

In [102]:
label['C0754']


Out[102]:
751

save label to csv file


In [63]:
# Persist the speaker-ID -> class-index mapping as a two-column CSV (name, label).
dataframe = pd.DataFrame(list(label.items()), columns=['name', 'label'])
dataframe.to_csv("label.csv", sep=',', index=False)

read csv file to dict, which is used for training


In [64]:
# Check: read the saved mapping back from label.csv.
t = pd.read_csv('label.csv', sep=',')

In [65]:
t[:10]


Out[65]:
name label
0 C0001 0
1 C0002 1
2 C0003 2
3 C0004 3
4 C0005 4
5 C0006 5
6 C0007 6
7 C0008 7
8 C0009 8
9 C0010 9

In [66]:
# Rebuild the speaker-ID -> class-index dict from the two CSV columns.
# Bracket access is used so the 'name' column can never be confused with an attribute.
lt = dict(zip(t['name'], t['label']))

In [67]:
# Print the first five (speaker ID, class index) pairs of the mapping.
print(list(lt.items())[:5])


[('C0001', 0), ('C0002', 1), ('C0003', 2), ('C0004', 3), ('C0005', 4)]

In [68]:
# Split the dataset into train (80%) and test (20%).
from sklearn.model_selection import train_test_split

# random_state is fixed so that a kernel restart reproduces the same split.
train, test = train_test_split(wav, test_size=0.2, random_state=0)

In [69]:
len(train)


Out[69]:
807378

In [70]:
len(test)


Out[70]:
201845

In [71]:
wav[1:10]


Out[71]:
folder file
1 IC0001W0002 wav/C0001/IC0001W0002.wav
2 IC0001W0003 wav/C0001/IC0001W0003.wav
3 IC0001W0004 wav/C0001/IC0001W0004.wav
4 IC0001W0005 wav/C0001/IC0001W0005.wav
5 IC0001W0006 wav/C0001/IC0001W0006.wav
6 IC0001W0007 wav/C0001/IC0001W0007.wav
7 IC0001W0008 wav/C0001/IC0001W0008.wav
8 IC0001W0009 wav/C0001/IC0001W0009.wav
9 IC0001W0010 wav/C0001/IC0001W0010.wav

In [72]:
# Class index of every utterance: slice the speaker ID out of the utterance ID,
# then look it up in the speaker-ID -> class-index dict.
speaker_ids = (utt_id[1:6] for utt_id in wav.folder)
ll = [lt[speaker_id] for speaker_id in speaker_ids]

In [73]:
len(ll)


Out[73]:
1009223

In [74]:
max(ll)


Out[74]:
1990

In [75]:
min(ll)


Out[75]:
0

In [76]:
ll[499]


Out[76]:
0

In [77]:
ll[500]


Out[77]:
1

In [78]:
wav.folder[499]


Out[78]:
'IC0001W0500'

In [79]:
wav.folder[500]


Out[79]:
'IC0002W0001'

In [80]:
# Add a leading 'label' column holding each utterance's class index.
# DataFrame.insert raises ValueError when the column already exists, so on a
# re-run of this cell the existing column is overwritten instead.
utt_labels = [lt[utt_id[1:6]] for utt_id in wav.folder]
if 'label' in wav.columns:
    wav['label'] = utt_labels
else:
    wav.insert(0, 'label', utt_labels)

In [81]:
wav[495:505]


Out[81]:
label folder file
495 0 IC0001W0496 wav/C0001/IC0001W0496.wav
496 0 IC0001W0497 wav/C0001/IC0001W0497.wav
497 0 IC0001W0498 wav/C0001/IC0001W0498.wav
498 0 IC0001W0499 wav/C0001/IC0001W0499.wav
499 0 IC0001W0500 wav/C0001/IC0001W0500.wav
500 1 IC0002W0001 wav/C0002/IC0002W0001.wav
501 1 IC0002W0002 wav/C0002/IC0002W0002.wav
502 1 IC0002W0003 wav/C0002/IC0002W0003.wav
503 1 IC0002W0004 wav/C0002/IC0002W0004.wav
504 1 IC0002W0005 wav/C0002/IC0002W0005.wav

In [82]:
wav[995:1005]


Out[82]:
label folder file
995 1 IC0002W0497 wav/C0002/IC0002W0497.wav
996 1 IC0002W0498 wav/C0002/IC0002W0498.wav
997 1 IC0002W0499 wav/C0002/IC0002W0499.wav
998 2 IC0003W0001 wav/C0003/IC0003W0001.wav
999 2 IC0003W0002 wav/C0003/IC0003W0002.wav
1000 2 IC0003W0003 wav/C0003/IC0003W0003.wav
1001 2 IC0003W0004 wav/C0003/IC0003W0004.wav
1002 2 IC0003W0005 wav/C0003/IC0003W0005.wav
1003 2 IC0003W0006 wav/C0003/IC0003W0006.wav
1004 2 IC0003W0007 wav/C0003/IC0003W0007.wav

In [83]:
from sklearn.model_selection import train_test_split

# 60/20/20 split of the utterance IDs in two steps. random_state is fixed on both
# calls so the three subsets are identical after a kernel restart.
train, test = train_test_split(wav.folder, test_size=0.2, random_state=0)  # 0.8 train+valid, 0.2 test
train, valid = train_test_split(train, test_size=0.25, random_state=0)  # 0.25 * 0.8 = 0.2 valid, leaving 0.6 train

In [84]:
len(train)


Out[84]:
605533

In [85]:
len(test)


Out[85]:
201845

In [86]:
len(valid)


Out[86]:
201845

In [87]:
len(train)/len(wav)


Out[87]:
0.5999992073109709

In [88]:
len(test)/len(wav)


Out[88]:
0.20000039634451455

In [89]:
len(valid)/len(wav)


Out[89]:
0.20000039634451455

In [90]:
# Draw one phase code per utterance with probabilities 0.9 / 0.05 / 0.05
# (presumably 1 = train, 2 = validation, 3 = test -- confirm against the training code).
# A dedicated seeded RandomState makes the assignment reproducible without
# touching NumPy's global random state.
phase = np.random.RandomState(0).choice([1, 2, 3], size=len(wav), p=[.9, .05, .05])

In [91]:
len(phase)


Out[91]:
1009223

In [92]:
np.count_nonzero(phase == 1)  # number of utterances with phase 1 (expected share: 0.9)


Out[92]:
908789

In [93]:
np.count_nonzero(phase == 2)  # number of utterances with phase 2 (expected share: 0.05)


Out[93]:
50250

In [94]:
np.count_nonzero(phase == 3)  # number of utterances with phase 3 (expected share: 0.05)


Out[94]:
50184

In [96]:
# Shuffle the rows (frac=1 samples every row once); the original index labels are kept.
# random_state is fixed so the permutation is reproducible.
shuffered_wav = wav.sample(frac=1, random_state=0)

In [97]:
len(shuffered_wav)


Out[97]:
1009223

In [103]:
shuffered_wav[:10]


Out[103]:
label folder file
373742 751 IC0754W0215 wav/C0754/IC0754W0215.wav
369257 742 IC0745W0215 wav/C0745/IC0745W0215.wav
277018 556 IC0559W0105 wav/C0559/IC0559W0105.wav
644178 1283 IC9316W0031 wav/C9316/IC9316W0031.wav
401453 806 IC0810W0471 wav/C0810/IC0810W0471.wav
639720 1274 IC9307W0242 wav/C9307/IC9307W0242.wav
906602 1791 ID0446W0250 wav/D0446/ID0446W0250.wav
381361 766 IC0769W0345 wav/C0769/IC0769W0345.wav
681457 1355 ID0009W0078 wav/D0009/ID0009W0078.wav
521823 1046 IC9053W0129 wav/C9053/IC9053W0129.wav

In [104]:
# Insert phase into shuffered_wav as the leading column. phase is a plain ndarray,
# so values are attached by row position, not by index label.
# DataFrame.insert raises ValueError when the column already exists, so on a
# re-run of this cell the existing column is overwritten instead.
if 'phase' in shuffered_wav.columns:
    shuffered_wav['phase'] = phase
else:
    shuffered_wav.insert(0, 'phase', phase)

In [105]:
len(shuffered_wav)


Out[105]:
1009223

In [106]:
shuffered_wav[:10]


Out[106]:
phase label folder file
373742 1 751 IC0754W0215 wav/C0754/IC0754W0215.wav
369257 1 742 IC0745W0215 wav/C0745/IC0745W0215.wav
277018 1 556 IC0559W0105 wav/C0559/IC0559W0105.wav
644178 1 1283 IC9316W0031 wav/C9316/IC9316W0031.wav
401453 1 806 IC0810W0471 wav/C0810/IC0810W0471.wav
639720 1 1274 IC9307W0242 wav/C9307/IC9307W0242.wav
906602 1 1791 ID0446W0250 wav/D0446/ID0446W0250.wav
381361 1 766 IC0769W0345 wav/C0769/IC0769W0345.wav
681457 1 1355 ID0009W0078 wav/D0009/ID0009W0078.wav
521823 1 1046 IC9053W0129 wav/C9053/IC9053W0129.wav

In [107]:
# Sort shuffered_wav by its index labels; sort_index returns a new frame and
# leaves shuffered_wav itself in shuffled order.
new_wav = shuffered_wav.sort_index()

In [108]:
len(new_wav)


Out[108]:
1009223

In [109]:
new_wav[:10]


Out[109]:
phase label folder file
0 1 0 IC0001W0001 wav/C0001/IC0001W0001.wav
1 1 0 IC0001W0002 wav/C0001/IC0001W0002.wav
2 1 0 IC0001W0003 wav/C0001/IC0001W0003.wav
3 1 0 IC0001W0004 wav/C0001/IC0001W0004.wav
4 1 0 IC0001W0005 wav/C0001/IC0001W0005.wav
5 1 0 IC0001W0006 wav/C0001/IC0001W0006.wav
6 1 0 IC0001W0007 wav/C0001/IC0001W0007.wav
7 2 0 IC0001W0008 wav/C0001/IC0001W0008.wav
8 2 0 IC0001W0009 wav/C0001/IC0001W0009.wav
9 2 0 IC0001W0010 wav/C0001/IC0001W0010.wav

In [110]:
new_wav[1000:1001]


Out[110]:
phase label folder file
1000 1 2 IC0003W0003 wav/C0003/IC0003W0003.wav

In [111]:
wav[1000:1001]


Out[111]:
label folder file
1000 2 IC0003W0003 wav/C0003/IC0003W0003.wav

In [112]:
# Save phase, label, folder and file for every utterance to wav91.csv (index not written).
dataframe = new_wav[['phase', 'label', 'folder', 'file']]
dataframe.to_csv("wav91.csv", sep=',', index=False)

In [113]:
# Read the saved file back to check the round trip.
wwav = pd.read_csv('wav91.csv', sep=',')

In [114]:
wwav[:10]


Out[114]:
phase label folder file
0 1 0 IC0001W0001 wav/C0001/IC0001W0001.wav
1 1 0 IC0001W0002 wav/C0001/IC0001W0002.wav
2 1 0 IC0001W0003 wav/C0001/IC0001W0003.wav
3 1 0 IC0001W0004 wav/C0001/IC0001W0004.wav
4 1 0 IC0001W0005 wav/C0001/IC0001W0005.wav
5 1 0 IC0001W0006 wav/C0001/IC0001W0006.wav
6 1 0 IC0001W0007 wav/C0001/IC0001W0007.wav
7 2 0 IC0001W0008 wav/C0001/IC0001W0008.wav
8 2 0 IC0001W0009 wav/C0001/IC0001W0009.wav
9 2 0 IC0001W0010 wav/C0001/IC0001W0010.wav

In [288]:
# Label of a single file, selected with one .loc call (boolean row mask + column name).
# NOTE(review): this rebinds `lt`, which earlier in the notebook held the
# speaker-ID -> class-index dict; from here on it is a one-element Series.
lt = wwav.loc[wwav['file'] == 'wav/C0001/IC0001W0004.wav', 'label']

In [289]:
lt


Out[289]:
3    0
Name: label, dtype: int64

In [290]:
type(lt)


Out[290]:
pandas.core.series.Series

In [286]:
llt = np.array(lt)

In [287]:
type(llt)


Out[287]:
numpy.ndarray

In [291]:
# Calling int() directly on a one-element Series is deprecated in recent pandas;
# take the scalar out by position first, then convert.
int(lt.iloc[0])


Out[291]:
0

In [24]:
import numpy as np
# NOTE(review): randint's upper bound is exclusive, so randint(0, 1) can only
# produce zeros -- confirm whether randint(0, 2) was intended.
s = np.random.randint(0, 1, size=3)

In [20]:
s


Out[20]:
array([0, 0, 0])

In [21]:
e = s + 3

In [22]:
e


Out[22]:
array([3, 3, 3])

In [17]:
e[0]


Out[17]:
4

In [26]:
# NOTE(review): the upper bound is exclusive, so this always evaluates to 1.
np.random.randint(1, 2)


Out[26]:
1

In [28]:
# np.random.choice requires the population as its first argument; calling it with
# no arguments raises TypeError.
# NOTE(review): [0, 1] is a guess consistent with the recorded output -- confirm
# the intended population.
np.random.choice([0, 1])


Out[28]:
0

In [30]:
import os

# Build the snapshot path for the epoch after epoch_num inside directory p.
p = 'a'
epoch_num = 2
next_epoch = epoch_num + 1
pa = os.path.join(p, 'model_snapshot_{}.pkl'.format(next_epoch))

In [31]:
pa


Out[31]:
'a/model_snapshot_3.pkl'

In [ ]: