Update app.py
app.py CHANGED
@@ -25,7 +25,20 @@ model.eval()
 
 
 
-def inference(audio_file_path: str, text_p: str, text_n: str):
+def inference(audio_file_path: str, text_p: str, audio_file_path_p: str, text_n: str, audio_file_path_n: str):
+    # handling queries
+    with torch.no_grad():
+        embed_pos, embed_neg = torch.chunk(model.clap_model.get_text_embedding([text_p, text_n],
+                                                                               use_tensor=True), dim=0, chunks=2)
+        embed_pos = torch.zeros_like(embed_pos) if text_p == '' else embed_pos
+        embed_neg = torch.zeros_like(embed_neg) if text_n == '' else embed_neg
+        embed_pos += (model.clap_model.get_audio_embedding_from_filelist(
+            [audio_file_path_p]) if audio_file_path_p is not None else torch.zeros_like(embed_pos))
+        embed_neg += (model.clap_model.get_audio_embedding_from_filelist(
+            [audio_file_path_n]) if audio_file_path_n is not None else torch.zeros_like(embed_neg))
+
+
+
     print(f"Separate audio from [{audio_file_path}] with textual query p: [{text_p}] and n: [{text_n}]")
 
     mixture, _ = librosa.load(audio_file_path, sr=32000)
@@ -38,7 +51,7 @@ def inference(audio_file_path: str, text_p: str, text_n: str):
     sep_segments = []
     for chunk in mixture_chunks:
         with torch.no_grad():
-            sep_segments.append(model.inference_from_data(chunk.unsqueeze(0),
+            sep_segments.append(model.inference_from_data(chunk.unsqueeze(0), embed_pos, embed_neg))
 
     sep_segment = torch.concat(sep_segments, dim=1)
 
@@ -49,8 +62,10 @@ with gr.Blocks(title="CLAPSep") as demo:
     with gr.Row():
         with gr.Column():
            input_audio = gr.Audio(label="Mixture", type="filepath")
-            text_p = gr.Textbox(label="Positive Query")
-            text_n = gr.Textbox(label="Negative Query")
+            text_p = gr.Textbox(label="Positive Query Text")
+            text_n = gr.Textbox(label="Negative Query Text")
+            query_audio_p = gr.Audio(label="Positive Query Audio (optional)", type="filepath")
+            query_audio_n = gr.Audio(label="Negative Query Audio (optional)", type="filepath")
         with gr.Column():
             with gr.Column():
                 output_audio = gr.Audio(label="Separation Result", scale=10)
@@ -62,7 +77,7 @@ with gr.Blocks(title="CLAPSep") as demo:
         interactive=True,
     )
     button.click(
-        fn=inference, inputs=[input_audio, text_p, text_n], outputs=[output_audio]
+        fn=inference, inputs=[input_audio, text_p, query_audio_p, text_n, query_audio_n], outputs=[output_audio]
    )
 
 
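The hunks above only show the changed lines, so how mixture_chunks is built is not visible in this commit. Below is a minimal sketch of one plausible chunking step, assuming 10-second windows at 32 kHz with a zero-padded final chunk; the chunk length, the padding, and the helper name make_chunks are illustrative assumptions, not code from app.py.

import librosa
import torch

SR = 32000                 # matches librosa.load(audio_file_path, sr=32000) in app.py
CHUNK_SAMPLES = 10 * SR    # assumption: 10-second windows; the real length is set outside these hunks

def make_chunks(path: str):
    # Load the mixture and split it into fixed-length, zero-padded chunks (illustrative only).
    mixture, _ = librosa.load(path, sr=SR)
    wav = torch.tensor(mixture, dtype=torch.float32)
    pad = (-wav.shape[0]) % CHUNK_SAMPLES
    wav = torch.nn.functional.pad(wav, (0, pad))
    return list(wav.split(CHUNK_SAMPLES))

As a usage note, an empty gr.Audio(type="filepath") input arrives at the callback as None and an empty gr.Textbox as an empty string, which is why the new query handling falls back to zero embeddings via the "is not None" and == '' checks. A hypothetical direct call to the updated callback (the file name is a placeholder) would look like:

inference("mixture.wav", "acoustic guitar", None, "", None)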