-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscript_preprocess.py
More file actions
65 lines (54 loc) · 2.28 KB
/
script_preprocess.py
File metadata and controls
65 lines (54 loc) · 2.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
from fileOI import filename_script_pair_tolist
import script_manipulate
from tqdm import tqdm
import fileOI
import random
def merge_script_like_libris(file_list, divider=' ', encoding='utf8'):
    """Load every transcript in *file_list* and return the cleaned entries.

    Each item is read with ``filename_script_pair_tolist``; element ``[1]`` of
    the result is treated as the transcript text. Entries rejected by
    ``script_manipulate.is_remove_line`` are skipped; every other entry has
    its text replaced by ``remove_options(text)`` and is appended to the
    result.

    Args:
        file_list: Iterable of items accepted by
            ``filename_script_pair_tolist`` (presumably transcript file
            paths -- confirm against ``fileOI``).
        divider: Unused by this function; kept so existing callers that pass
            it keep working.
        encoding: Forwarded to ``filename_script_pair_tolist``.

    Returns:
        list: The kept entries, in input order, each with its text cleaned.
    """
    dataset = []
    # step 01: read the script of every file in the file list
    for file in file_list:
        entry = filename_script_pair_tolist(file, encoding)
        # step 01-01: drop this sentence (per the original note: sentences
        # that contain a 'u/' tag). Skip only this entry -- the previous
        # `break` silently discarded every remaining file as well.
        if script_manipulate.is_remove_line(entry[1]):
            continue
        entry[1] = remove_options(entry[1])
        dataset.append(entry)
        # Echo of each kept entry, retained from the original implementation.
        print(entry)
    return dataset
def remove_options(line):
    """Run one transcript line through the cleaning steps and return it.

    The steps are applied in a fixed order, each receiving the output of the
    previous one. All of them live in ``script_manipulate``; the step
    descriptions below are translated from the original comments.

    Args:
        line: Transcript text to clean.

    Returns:
        The value produced by the last step (``replace_alphabet``).
    """
    noise_tags = ('b/', 'i/', 'o/', 'n/', 'l/')
    punctuation_marks = ('.', ',', '?', '!', '+', '*')

    # step 02: remove noise tags
    cleaned = script_manipulate.remove_noise_id(line, *noise_tags)
    # step 03: remove punctuation
    cleaned = script_manipulate.remove_punctuation_rules(cleaned, *punctuation_marks)

    single_argument_steps = (
        # step 04: handle dual transcription
        script_manipulate.change_number_to_pron,
        # step 05: handle dual transcription (orthographic / phonetic)
        script_manipulate.change_pron_to_dic,
        # step 06: write frequently occurring English letters in Hangul
        script_manipulate.replace_alphabet,
    )
    for step in single_argument_steps:
        cleaned = step(cleaned)
    return cleaned
def merge_script_like_clova_call(file_list, encoding='utf8'):
    """Build a list of ``{'wav', 'text', 'speaker_id'}`` records from files.

    Every item of *file_list* is read with ``filename_script_pair_tolist``
    (progress is shown through ``tqdm``). Element ``[0]`` of the result is
    used as the file name and element ``[1]`` as the transcript text.
    Entries rejected by ``script_manipulate.is_remove_line`` produce no
    record.

    Args:
        file_list: Iterable of items accepted by
            ``filename_script_pair_tolist`` (presumably transcript file
            paths -- confirm against ``fileOI``).
        encoding: Forwarded to ``filename_script_pair_tolist``.

    Returns:
        list[dict]: One dict per kept entry with keys ``'wav'`` (file name
        plus ``.wav``), ``'text'`` (text after ``remove_options``) and
        ``'speaker_id'`` (always ``0``).
    """
    records = []
    # step 01: read the script of every file in the file list
    for path in tqdm(file_list):
        pair = filename_script_pair_tolist(path, encoding)
        # step 01-01: sentences flagged for removal (per the original note:
        # those containing a 'u/' tag) are left out of the result
        if script_manipulate.is_remove_line(pair[1]):
            continue
        pair[1] = remove_options(pair[1])
        records.append({
            'wav': '{}.wav'.format(pair[0]),
            'text': pair[1],
            'speaker_id': 0,
        })
    return records
def split_train_test_dataset_with_json(dataset, split_rate=0.2, encoding='utf8'):
    """Randomly split *dataset* into a train part and a test part.

    The items are shuffled with ``random.shuffle`` and the first
    ``int(len(dataset) * split_rate)`` shuffled items become the test set;
    the rest become the train set.

    Args:
        dataset: Sequence (or any iterable) of records to split. It is not
            modified.
        split_rate: Fraction of the items assigned to the test set.
        encoding: Unused by this function; kept so existing callers that pass
            it keep working.

    Returns:
        tuple[list, list]: ``(trainset, testset)``.
    """
    # Shuffle a copy: `random.shuffle` reorders its argument in place, and the
    # previous `json_data = dataset` alias meant the caller's list was
    # reordered too (and tuples or other immutable sequences raised).
    shuffled = list(dataset)
    random.shuffle(shuffled)
    test_length = int(len(shuffled) * split_rate)
    testset = shuffled[:test_length]
    trainset = shuffled[test_length:]
    return trainset, testset
def create_small_json(dataset, count=100, output_path='./output_test_small.json'):
    """Write the first *count* items of *dataset* to a JSON file.

    The slice ``dataset[:count]`` is handed to ``fileOI.write_json_file``
    together with *output_path*.

    Args:
        dataset: Sliceable collection of records.
        count: Maximum number of leading items to write.
        output_path: Destination passed to ``fileOI.write_json_file``.
            Defaults to the path that was previously hard-coded, so existing
            calls behave as before.
    """
    fileOI.write_json_file(dataset[:count], output_path)