WSJ0数据集处理:将原始 .wv1/.wv2 文件转换为 .wav 文件的方法

需要的工具:

  1. wsj0原数据集(LDC93S6A 或者 LDC93S6B)
  2. python3
  3. sph2pipe

python code:

FOR LDC93S6B:

"""
# example:
# 11-1.1/wsj0/si_tr_s/01t/01to030v.wv1 is converted to wav and 
# stored in YOUR_PATH/wsj0/si_tr_s/01t/01to030v.wav
"""
import os
import subprocess
import sys

# the root dir for WSJ0 corpus
root_dir = ""  # here add your original wsj0 dataset directory

# the directory that receives the converted .wav files
my_path = "YOUR PATH\\wsj0"  # here add your target directory

# the sph2pipe executable used for the conversion
sph2pipe_path = "YOUR PATH\\sph2pipe"  # here add the path of your sph2pipe

# entries of root_dir that do not contain .wv files
SKIPPED_DISCS = ("doc", "11-13.1")

# source suffix -> suffix appended to the file stem of the converted file.
# Only .wv1 files are converted; add '".wv2": "_1.wav"' to convert the
# .wv2 files as well without overwriting the .wv1 results.
SUFFIX_MAP = {".wv1": ".wav"}


def find_disc_dirs(corpus_root, skipped=SKIPPED_DISCS):
    """Return the ``<disc>/wsj0`` data directories directly below *corpus_root*.

    Entries named in *skipped* and entries without a ``wsj0`` sub-directory
    (stray files, unrelated folders) are ignored instead of raising later.
    The result is sorted so that repeated runs process files in the same order.
    """
    disc_dirs = []
    for entry in sorted(os.listdir(corpus_root)):
        if entry in skipped:
            continue
        data_dir = os.path.join(corpus_root, entry, "wsj0")
        if os.path.isdir(data_dir):
            disc_dirs.append(data_dir)
    return disc_dirs


def plan_conversions(disc_dirs, target_root, suffix_map=None):
    """Yield ``(source, target)`` path pairs for every file to convert.

    The ``<subset>/<speaker>/`` layout below each disc directory is mirrored
    below *target_root*.  Only sub-directories whose name starts with ``s``
    are visited, and only files whose suffix is a key of *suffix_map*
    (default: ``SUFFIX_MAP``) are selected.
    """
    if suffix_map is None:
        suffix_map = SUFFIX_MAP
    for disc_dir in disc_dirs:
        for subset in sorted(os.listdir(disc_dir)):
            subset_dir = os.path.join(disc_dir, subset)
            if not subset.startswith("s") or not os.path.isdir(subset_dir):
                continue
            for speaker in sorted(os.listdir(subset_dir)):
                speaker_dir = os.path.join(subset_dir, speaker)
                if not os.path.isdir(speaker_dir):
                    continue
                for wv_file in sorted(os.listdir(speaker_dir)):
                    # splitext keeps dots inside the stem, unlike split('.')[0]
                    stem, suffix = os.path.splitext(wv_file)
                    if suffix not in suffix_map:
                        continue
                    source = os.path.join(speaker_dir, wv_file)
                    target = os.path.join(
                        target_root, subset, speaker, stem + suffix_map[suffix]
                    )
                    yield source, target


def convert_corpus(corpus_root, target_root, sph2pipe, output_format="rif",
                   suffix_map=None, run=subprocess.run):
    """Convert every selected file below *corpus_root* with sph2pipe.

    Args:
        corpus_root: directory holding the disc folders of the corpus.
        target_root: directory that receives the converted files; it and any
            missing parent directories are created on demand.
        sph2pipe: path of the sph2pipe executable.
        output_format: value passed to sph2pipe's ``-f`` option.
        suffix_map: source suffix -> target suffix, see ``SUFFIX_MAP``.
        run: callable that executes the argument list and returns an object
            with a ``returncode`` attribute (``subprocess.run`` by default).

    Returns:
        The list of source files for which sph2pipe reported a non-zero
        exit status.

    Raises:
        OSError: if *corpus_root* cannot be listed or the sph2pipe executable
            cannot be started.
    """
    failed = []
    disc_dirs = find_disc_dirs(corpus_root)
    for source, target in plan_conversions(disc_dirs, target_root, suffix_map):
        os.makedirs(os.path.dirname(target), exist_ok=True)
        # An argument list (no shell) keeps paths containing spaces intact.
        result = run([sph2pipe, "-f", output_format, source, target])
        if result.returncode != 0:
            print("sph2pipe failed (exit status %s): %s"
                  % (result.returncode, source), file=sys.stderr)
            failed.append(source)
    return failed


def main():
    """Run the conversion with the module-level settings."""
    if not os.path.isdir(root_dir):
        sys.exit("root_dir is not an existing directory: %r" % (root_dir,))
    failed = convert_corpus(root_dir, my_path, sph2pipe_path)
    if failed:
        sys.exit("%d file(s) could not be converted" % len(failed))


if __name__ == "__main__":
    main()

FOR LDC93S6A:

"""
# example:
# 11-1.1/wsj0/si_tr_s/01t/01to030v.wv1 is converted to wav and
# stored in YOUR_PATH/wsj0/si_tr_s/01t/01to030v.wav
"""
import os
import subprocess
import sys

# the root dir for WSJ0 corpus
root_dir = ""  # here add your original wsj0 dataset directory

# the directory that receives the converted .wav files
my_path = "YOUR PATH\\wsj0"  # here add your target directory

# the sph2pipe executable used for the conversion
sph2pipe_path = "YOUR PATH\\sph2pipe"  # here add the path of your sph2pipe

# entries of root_dir that are skipped when looking for disc folders
SKIPPED_DISCS = ("text", "11-13.1")

# only sub-directories of <disc>/wsj0 starting with one of these are converted
SUBSET_PREFIXES = ("si", "sd")

# source suffix -> suffix appended to the file stem of the converted file;
# the "_1" marker keeps a .wv2 result from overwriting the .wv1 result
SUFFIX_MAP = {".wv1": ".wav", ".wv2": "_1.wav"}


def find_disc_dirs(corpus_root, skipped=SKIPPED_DISCS):
    """Return the ``<disc>/wsj0`` data directories directly below *corpus_root*.

    Entries named in *skipped* and entries without a ``wsj0`` sub-directory
    (stray files, unrelated folders) are ignored instead of raising later.
    The result is sorted so that repeated runs process files in the same order.
    """
    disc_dirs = []
    for entry in sorted(os.listdir(corpus_root)):
        if entry in skipped:
            continue
        data_dir = os.path.join(corpus_root, entry, "wsj0")
        if os.path.isdir(data_dir):
            disc_dirs.append(data_dir)
    return disc_dirs


def plan_conversions(disc_dirs, target_root, suffix_map=None,
                     subset_prefixes=SUBSET_PREFIXES):
    """Yield ``(source, target)`` path pairs for every file to convert.

    The ``<subset>/<speaker>/`` layout below each disc directory is mirrored
    below *target_root*.  Only sub-directories whose name starts with one of
    *subset_prefixes* are visited, and only files whose suffix is a key of
    *suffix_map* (default: ``SUFFIX_MAP``) are selected.
    """
    if suffix_map is None:
        suffix_map = SUFFIX_MAP
    prefixes = tuple(subset_prefixes)
    for disc_dir in disc_dirs:
        for subset in sorted(os.listdir(disc_dir)):
            subset_dir = os.path.join(disc_dir, subset)
            if not subset.startswith(prefixes) or not os.path.isdir(subset_dir):
                continue
            for speaker in sorted(os.listdir(subset_dir)):
                speaker_dir = os.path.join(subset_dir, speaker)
                if not os.path.isdir(speaker_dir):
                    continue
                for wv_file in sorted(os.listdir(speaker_dir)):
                    # splitext always yields the real suffix, so the target
                    # name is defined for every selected file, even when the
                    # stem itself contains dots
                    stem, suffix = os.path.splitext(wv_file)
                    if suffix not in suffix_map:
                        continue
                    source = os.path.join(speaker_dir, wv_file)
                    target = os.path.join(
                        target_root, subset, speaker, stem + suffix_map[suffix]
                    )
                    yield source, target


def convert_corpus(corpus_root, target_root, sph2pipe, output_format="wav",
                   suffix_map=None, run=subprocess.run):
    """Convert every selected file below *corpus_root* with sph2pipe.

    Args:
        corpus_root: directory holding the disc folders of the corpus.
        target_root: directory that receives the converted files; it and any
            missing parent directories are created on demand.
        sph2pipe: path of the sph2pipe executable.
        output_format: value passed to sph2pipe's ``-f`` option.
        suffix_map: source suffix -> target suffix, see ``SUFFIX_MAP``.
        run: callable that executes the argument list and returns an object
            with a ``returncode`` attribute (``subprocess.run`` by default).

    Returns:
        The list of source files for which sph2pipe reported a non-zero
        exit status.

    Raises:
        OSError: if *corpus_root* cannot be listed or the sph2pipe executable
            cannot be started.
    """
    failed = []
    disc_dirs = find_disc_dirs(corpus_root)
    for source, target in plan_conversions(disc_dirs, target_root, suffix_map):
        os.makedirs(os.path.dirname(target), exist_ok=True)
        # An argument list (no shell) keeps paths containing spaces intact.
        result = run([sph2pipe, "-f", output_format, source, target])
        if result.returncode != 0:
            print("sph2pipe failed (exit status %s): %s"
                  % (result.returncode, source), file=sys.stderr)
            failed.append(source)
    return failed


def main():
    """Run the conversion with the module-level settings."""
    if not os.path.isdir(root_dir):
        sys.exit("root_dir is not an existing directory: %r" % (root_dir,))
    failed = convert_corpus(root_dir, my_path, sph2pipe_path)
    if failed:
        sys.exit("%d file(s) could not be converted" % len(failed))


if __name__ == "__main__":
    main()

之后便可以使用MATLAB脚本制作wsj0-mix数据集

posted @ 2022-03-02 15:31  wsl_lld  阅读(1411)  评论(6)  编辑  收藏  举报