SohomToom committed (verified)
Commit 3e75327 · 1 Parent(s): f0352da

Update openvoice/se_extractor.py

Files changed (1):
  1. openvoice/se_extractor.py +153 -153
openvoice/se_extractor.py CHANGED
@@ -1,153 +1,153 @@
-import os
-import glob
-import torch
-import hashlib
-import librosa
-import base64
-from glob import glob
-import numpy as np
-from pydub import AudioSegment
-from faster_whisper import WhisperModel
-import hashlib
-import base64
-import librosa
-from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
-
-model_size = "medium"
-# Run on GPU with FP16
-model = None
-def split_audio_whisper(audio_path, audio_name, target_dir='processed'):
-    global model
-    if model is None:
-        model = WhisperModel(model_size, device="cuda", compute_type="float16")
-    audio = AudioSegment.from_file(audio_path)
-    max_len = len(audio)
-
-    target_folder = os.path.join(target_dir, audio_name)
-
-    segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
-    segments = list(segments)
-
-    # create directory
-    os.makedirs(target_folder, exist_ok=True)
-    wavs_folder = os.path.join(target_folder, 'wavs')
-    os.makedirs(wavs_folder, exist_ok=True)
-
-    # segments
-    s_ind = 0
-    start_time = None
-
-    for k, w in enumerate(segments):
-        # process with the time
-        if k == 0:
-            start_time = max(0, w.start)
-
-        end_time = w.end
-
-        # calculate confidence
-        if len(w.words) > 0:
-            confidence = sum([s.probability for s in w.words]) / len(w.words)
-        else:
-            confidence = 0.
-        # clean text
-        text = w.text.replace('...', '')
-
-        # left 0.08s for each audios
-        audio_seg = audio[int( start_time * 1000) : min(max_len, int(end_time * 1000) + 80)]
-
-        # segment file name
-        fname = f"{audio_name}_seg{s_ind}.wav"
-
-        # filter out the segment shorter than 1.5s and longer than 20s
-        save = audio_seg.duration_seconds > 1.5 and \
-            audio_seg.duration_seconds < 20. and \
-            len(text) >= 2 and len(text) < 200
-
-        if save:
-            output_file = os.path.join(wavs_folder, fname)
-            audio_seg.export(output_file, format='wav')
-
-        if k < len(segments) - 1:
-            start_time = max(0, segments[k+1].start - 0.08)
-
-        s_ind = s_ind + 1
-    return wavs_folder
-
-
-def split_audio_vad(audio_path, audio_name, target_dir, split_seconds=10.0):
-    SAMPLE_RATE = 16000
-    audio_vad = get_audio_tensor(audio_path)
-    segments = get_vad_segments(
-        audio_vad,
-        output_sample=True,
-        min_speech_duration=0.1,
-        min_silence_duration=1,
-        method="silero",
-    )
-    segments = [(seg["start"], seg["end"]) for seg in segments]
-    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s,e in segments]
-    print(segments)
-    audio_active = AudioSegment.silent(duration=0)
-    audio = AudioSegment.from_file(audio_path)
-
-    for start_time, end_time in segments:
-        audio_active += audio[int( start_time * 1000) : int(end_time * 1000)]
-
-    audio_dur = audio_active.duration_seconds
-    print(f'after vad: dur = {audio_dur}')
-    target_folder = os.path.join(target_dir, audio_name)
-    wavs_folder = os.path.join(target_folder, 'wavs')
-    os.makedirs(wavs_folder, exist_ok=True)
-    start_time = 0.
-    count = 0
-    num_splits = int(np.round(audio_dur / split_seconds))
-    assert num_splits > 0, 'input audio is too short'
-    interval = audio_dur / num_splits
-
-    for i in range(num_splits):
-        end_time = min(start_time + interval, audio_dur)
-        if i == num_splits - 1:
-            end_time = audio_dur
-        output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav"
-        audio_seg = audio_active[int(start_time * 1000): int(end_time * 1000)]
-        audio_seg.export(output_file, format='wav')
-        start_time = end_time
-        count += 1
-    return wavs_folder
-
-def hash_numpy_array(audio_path):
-    array, _ = librosa.load(audio_path, sr=None, mono=True)
-    # Convert the array to bytes
-    array_bytes = array.tobytes()
-    # Calculate the hash of the array bytes
-    hash_object = hashlib.sha256(array_bytes)
-    hash_value = hash_object.digest()
-    # Convert the hash value to base64
-    base64_value = base64.b64encode(hash_value)
-    return base64_value.decode('utf-8')[:16].replace('/', '_^')
-
-def get_se(audio_path, vc_model, target_dir='processed', vad=True):
-    device = vc_model.device
-    version = vc_model.version
-    print("OpenVoice version:", version)
-
-    audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{version}_{hash_numpy_array(audio_path)}"
-    se_path = os.path.join(target_dir, audio_name, 'se.pth')
-
-    # if os.path.isfile(se_path):
-    #     se = torch.load(se_path).to(device)
-    #     return se, audio_name
-    # if os.path.isdir(audio_path):
-    #     wavs_folder = audio_path
-
-    if vad:
-        wavs_folder = split_audio_vad(audio_path, target_dir=target_dir, audio_name=audio_name)
-    else:
-        wavs_folder = split_audio_whisper(audio_path, target_dir=target_dir, audio_name=audio_name)
-
-    audio_segs = glob(f'{wavs_folder}/*.wav')
-    if len(audio_segs) == 0:
-        raise NotImplementedError('No audio segments found!')
-
-    return vc_model.extract_se(audio_segs, se_save_path=se_path), audio_name
-
+import os
+import glob
+import torch
+import hashlib
+import librosa
+import base64
+from glob import glob
+import numpy as np
+from pydub import AudioSegment
+from faster_whisper import WhisperModel
+import hashlib
+import base64
+import librosa
+from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
+
+model_size = "medium"
+# Run on GPU with FP16
+model = None
+def split_audio_whisper(audio_path, audio_name, target_dir='processed'):
+    global model
+    if model is None:
+        model = WhisperModel(model_size, device="cpu", compute_type="int8")
+    audio = AudioSegment.from_file(audio_path)
+    max_len = len(audio)
+
+    target_folder = os.path.join(target_dir, audio_name)
+
+    segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
+    segments = list(segments)
+
+    # create directory
+    os.makedirs(target_folder, exist_ok=True)
+    wavs_folder = os.path.join(target_folder, 'wavs')
+    os.makedirs(wavs_folder, exist_ok=True)
+
+    # segments
+    s_ind = 0
+    start_time = None
+
+    for k, w in enumerate(segments):
+        # process with the time
+        if k == 0:
+            start_time = max(0, w.start)
+
+        end_time = w.end
+
+        # calculate confidence
+        if len(w.words) > 0:
+            confidence = sum([s.probability for s in w.words]) / len(w.words)
+        else:
+            confidence = 0.
+        # clean text
+        text = w.text.replace('...', '')
+
+        # left 0.08s for each audios
+        audio_seg = audio[int( start_time * 1000) : min(max_len, int(end_time * 1000) + 80)]
+
+        # segment file name
+        fname = f"{audio_name}_seg{s_ind}.wav"
+
+        # filter out the segment shorter than 1.5s and longer than 20s
+        save = audio_seg.duration_seconds > 1.5 and \
+            audio_seg.duration_seconds < 20. and \
+            len(text) >= 2 and len(text) < 200
+
+        if save:
+            output_file = os.path.join(wavs_folder, fname)
+            audio_seg.export(output_file, format='wav')
+
+        if k < len(segments) - 1:
+            start_time = max(0, segments[k+1].start - 0.08)
+
+        s_ind = s_ind + 1
+    return wavs_folder
+
+
+def split_audio_vad(audio_path, audio_name, target_dir, split_seconds=10.0):
+    SAMPLE_RATE = 16000
+    audio_vad = get_audio_tensor(audio_path)
+    segments = get_vad_segments(
+        audio_vad,
+        output_sample=True,
+        min_speech_duration=0.1,
+        min_silence_duration=1,
+        method="silero",
+    )
+    segments = [(seg["start"], seg["end"]) for seg in segments]
+    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s,e in segments]
+    print(segments)
+    audio_active = AudioSegment.silent(duration=0)
+    audio = AudioSegment.from_file(audio_path)
+
+    for start_time, end_time in segments:
+        audio_active += audio[int( start_time * 1000) : int(end_time * 1000)]
+
+    audio_dur = audio_active.duration_seconds
+    print(f'after vad: dur = {audio_dur}')
+    target_folder = os.path.join(target_dir, audio_name)
+    wavs_folder = os.path.join(target_folder, 'wavs')
+    os.makedirs(wavs_folder, exist_ok=True)
+    start_time = 0.
+    count = 0
+    num_splits = int(np.round(audio_dur / split_seconds))
+    assert num_splits > 0, 'input audio is too short'
+    interval = audio_dur / num_splits
+
+    for i in range(num_splits):
+        end_time = min(start_time + interval, audio_dur)
+        if i == num_splits - 1:
+            end_time = audio_dur
+        output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav"
+        audio_seg = audio_active[int(start_time * 1000): int(end_time * 1000)]
+        audio_seg.export(output_file, format='wav')
+        start_time = end_time
+        count += 1
+    return wavs_folder
+
+def hash_numpy_array(audio_path):
+    array, _ = librosa.load(audio_path, sr=None, mono=True)
+    # Convert the array to bytes
+    array_bytes = array.tobytes()
+    # Calculate the hash of the array bytes
+    hash_object = hashlib.sha256(array_bytes)
+    hash_value = hash_object.digest()
+    # Convert the hash value to base64
+    base64_value = base64.b64encode(hash_value)
+    return base64_value.decode('utf-8')[:16].replace('/', '_^')
+
+def get_se(audio_path, vc_model, target_dir='processed', vad=True):
+    device = vc_model.device
+    version = vc_model.version
+    print("OpenVoice version:", version)
+
+    audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{version}_{hash_numpy_array(audio_path)}"
+    se_path = os.path.join(target_dir, audio_name, 'se.pth')
+
+    # if os.path.isfile(se_path):
+    #     se = torch.load(se_path).to(device)
+    #     return se, audio_name
+    # if os.path.isdir(audio_path):
+    #     wavs_folder = audio_path
+
+    if vad:
+        wavs_folder = split_audio_vad(audio_path, target_dir=target_dir, audio_name=audio_name)
+    else:
+        wavs_folder = split_audio_whisper(audio_path, target_dir=target_dir, audio_name=audio_name)
+
+    audio_segs = glob(f'{wavs_folder}/*.wav')
+    if len(audio_segs) == 0:
+        raise NotImplementedError('No audio segments found!')
+
+    return vc_model.extract_se(audio_segs, se_save_path=se_path), audio_name
+
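The one substantive change in this diff is the WhisperModel constructor in split_audio_whisper: the model now loads with device="cpu" and compute_type="int8" instead of device="cuda" with compute_type="float16", so segment extraction runs without a GPU (the "# Run on GPU with FP16" comment is now stale on the new side). Below is a minimal sketch, not part of the commit, of how the two configurations could be combined into a runtime fallback; it uses only the faster_whisper and torch APIs this file already imports, and the load_whisper_model helper name is hypothetical:

    import torch
    from faster_whisper import WhisperModel

    def load_whisper_model(model_size: str = "medium") -> WhisperModel:
        # Prefer FP16 inference on GPU when CUDA is available ...
        if torch.cuda.is_available():
            return WhisperModel(model_size, device="cuda", compute_type="float16")
        # ... otherwise fall back to quantized INT8 inference on CPU,
        # which is what this commit hard-codes.
        return WhisperModel(model_size, device="cpu", compute_type="int8")

Since faster-whisper is backed by CTranslate2, the int8 compute type quantizes the model at load time, trading a small amount of accuracy for lower memory use and faster inference on CPU.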