AisakaMikoto committed (verified)
Commit 33171a6 · Parent(s): f5e7d93

Update app.py
Files changed (1): app.py (+20, -5)
app.py CHANGED
@@ -25,7 +25,20 @@ model.eval()
 
 
 
-def inference(audio_file_path: str, text_p: str, text_n: str):
+def inference(audio_file_path: str, text_p: str, audio_file_path_p: str, text_n: str, audio_file_path_n: str):
+    # handling queries
+    with torch.no_grad():
+        embed_pos, embed_neg = torch.chunk(model.clap_model.get_text_embedding([text_p, text_n],
+                                                                               use_tensor=True), dim=0, chunks=2)
+        embed_pos = torch.zeros_like(embed_pos) if text_p == '' else embed_pos
+        embed_neg = torch.zeros_like(embed_neg) if text_n == '' else embed_neg
+        embed_pos += (model.clap_model.get_audio_embedding_from_filelist(
+            [audio_file_path_p]) if audio_file_path_p is not None else torch.zeros_like(embed_pos))
+        embed_neg += (model.clap_model.get_audio_embedding_from_filelist(
+            [audio_file_path_n]) if audio_file_path_n is not None else torch.zeros_like(embed_neg))
+
+
+
     print(f"Separate audio from [{audio_file_path}] with textual query p: [{text_p}] and n: [{text_n}]")
 
     mixture, _ = librosa.load(audio_file_path, sr=32000)
@@ -38,7 +51,7 @@ def inference(audio_file_path: str, text_p: str, text_n: str):
     sep_segments = []
     for chunk in mixture_chunks:
         with torch.no_grad():
-            sep_segments.append(model.inference_from_data(chunk.unsqueeze(0), [text_p], [text_n]))
+            sep_segments.append(model.inference_from_data(chunk.unsqueeze(0), embed_pos, embed_neg))
 
     sep_segment = torch.concat(sep_segments, dim=1)
 
@@ -49,8 +62,10 @@ with gr.Blocks(title="CLAPSep") as demo:
     with gr.Row():
         with gr.Column():
             input_audio = gr.Audio(label="Mixture", type="filepath")
-            text_p = gr.Textbox(label="Positive Query")
-            text_n = gr.Textbox(label="Negative Query")
+            text_p = gr.Textbox(label="Positive Query Text")
+            text_n = gr.Textbox(label="Negative Query Text")
+            query_audio_p = gr.Audio(label="Positive Query Audio (optional)", type="filepath")
+            query_audio_n = gr.Audio(label="Negative Query Audio (optional)", type="filepath")
         with gr.Column():
             with gr.Column():
                 output_audio = gr.Audio(label="Separation Result", scale=10)
@@ -62,7 +77,7 @@ with gr.Blocks(title="CLAPSep") as demo:
                 interactive=True,
            )
    button.click(
-        fn=inference, inputs=[input_audio, text_p, text_n], outputs=[output_audio]
+        fn=inference, inputs=[input_audio, text_p, query_audio_p, text_n, query_audio_n], outputs=[output_audio]
    )
 
 
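
Below is a minimal usage sketch (not part of the commit) of the updated inference signature, assuming app.py has been run so that model and inference are defined as above; the audio file paths are hypothetical placeholders. Because an empty text query falls back to a zero text embedding and a missing audio query path falls back to a zero audio embedding, text-only, audio-only, and mixed queries all go through the same code path.

# Hypothetical usage sketch; "mixture.wav" and "traffic_noise.wav" are placeholder paths.

# Text-only positive query, no negative query at all:
result = inference(
    audio_file_path="mixture.wav",   # mixture to separate
    text_p="a dog barking",          # positive text query
    audio_file_path_p=None,          # no positive audio query -> zero audio embedding added
    text_n="",                       # empty negative text -> zero text embedding
    audio_file_path_n=None,          # no negative audio query -> zero audio embedding added
)

# Positive text query combined with a negative audio example:
result = inference(
    audio_file_path="mixture.wav",
    text_p="a dog barking",
    audio_file_path_p=None,
    text_n="",
    audio_file_path_n="traffic_noise.wav",
)

When both a text and an audio query are supplied for the same side, the new code simply sums their CLAP embeddings (embed_pos += ..., embed_neg += ...), so either modality can be omitted without special-casing the separation loop; the returned value is whatever the demo's gr.Audio "Separation Result" component expects.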