Commit: cfg

This commit exposes the classifier-free guidance strength (cfg_strength) and the ODE solver method (odeint_method) as user-facing options, threading both from the Gradio UI through inference() into CFM.sample.

Files changed:
- app.py (+19 -3)
- diffrhythm/infer/infer.py (+3 -2)
- diffrhythm/model/cfm.py (+4 -1)
app.py CHANGED

```diff
@@ -31,7 +31,7 @@ cfm, tokenizer, muq, vae = prepare_model(device)
 cfm = torch.compile(cfm)
 
 @spaces.GPU(duration=20)
-def infer_music(lrc, ref_audio_path, text_prompt, current_prompt_type, seed=42, randomize_seed=False, steps=32, file_type='wav', max_frames=2048, device='cuda'):
+def infer_music(lrc, ref_audio_path, text_prompt, current_prompt_type, seed=42, randomize_seed=False, steps=32, cfg_strength=4.0, file_type='wav', odeint_method='euler', max_frames=2048, device='cuda'):
 
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
@@ -56,10 +56,12 @@ def infer_music(lrc, ref_audio_path, text_prompt, current_prompt_type, seed=42,
         style_prompt=style_prompt,
         negative_style_prompt=negative_style_prompt,
         steps=steps,
+        cfg_strength=cfg_strength,
         sway_sampling_coef=sway_sampling_coef,
         start_time=start_time,
         file_type=file_type,
-        vocal_flag=vocal_flag
+        vocal_flag=vocal_flag,
+        odeint_method=odeint_method,
     )
     return generated_song
 
@@ -223,6 +225,10 @@ with gr.Blocks(css=css) as demo:
     4. **Supported Languages**
        - **Chinese and English**
        - More languages coming soon
+
+    5. **Others**
+       - If the generated audio loads slowly, select mp3 as the Output Format in Advanced Settings.
+
     """)
 
     lyrics_btn = gr.Button("Generate", variant="primary")
@@ -246,6 +252,16 @@ with gr.Blocks(css=css) as demo:
                 interactive=True,
                 elem_id="step_slider"
             )
+            cfg_strength = gr.Slider(
+                minimum=1,
+                maximum=10,
+                value=4.0,
+                step=0.5,
+                label="CFG Strength",
+                interactive=True,
+                elem_id="step_slider"
+            )
+            odeint_method = gr.Radio(["euler", "midpoint", "rk4", "implicit_adams"], label="ODE Solver", value="euler")
             file_type = gr.Dropdown(["wav", "mp3", "ogg"], label="Output Format", value="wav")
 
 
```
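For reference, here is a minimal, self-contained sketch of the two new controls wired to a stub callback; the component arguments mirror the hunk above, while `fake_infer` and the layout are illustrative stand-ins. One caveat visible in the diff: the new slider reuses `elem_id="step_slider"` from the Steps slider, so a unique id (e.g. `"cfg_slider"`) would avoid duplicate element ids on the page.

```python
# Standalone sketch of the new CFG Strength slider and ODE Solver radio.
# `fake_infer` is a hypothetical stub, not part of the repo.
import gradio as gr

def fake_infer(steps, cfg_strength, odeint_method):
    return f"steps={steps}, cfg={cfg_strength}, solver={odeint_method}"

with gr.Blocks() as demo:
    steps = gr.Slider(minimum=10, maximum=100, value=32, step=1, label="Steps", interactive=True)
    cfg_strength = gr.Slider(minimum=1, maximum=10, value=4.0, step=0.5, label="CFG Strength", interactive=True)
    odeint_method = gr.Radio(["euler", "midpoint", "rk4", "implicit_adams"], label="ODE Solver", value="euler")
    out = gr.Textbox(label="Result")
    gr.Button("Generate").click(fn=fake_infer, inputs=[steps, cfg_strength, odeint_method], outputs=out)

if __name__ == "__main__":
    demo.launch()
```

The final hunk wires the real components into the existing click handler: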
```diff
@@ -387,7 +403,7 @@ with gr.Blocks(css=css) as demo:
 
     lyrics_btn.click(
         fn=infer_music,
-        inputs=[lrc, audio_prompt, text_prompt, current_prompt_type, seed, randomize_seed, steps, file_type],
+        inputs=[lrc, audio_prompt, text_prompt, current_prompt_type, seed, randomize_seed, steps, cfg_strength, file_type, odeint_method],
         outputs=audio_output
     )
 
```
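What the new slider actually controls: classifier-free guidance samples the model both with and without the style conditioning and extrapolates between the two predictions. The blend below is the textbook formulation, stated as an assumption; the exact combination inside `CFM.sample` is not shown in this diff.

```python
# Textbook classifier-free guidance blend (assumed formulation; the exact
# combination inside CFM.sample may differ).
import torch

def cfg_blend(cond_pred: torch.Tensor, uncond_pred: torch.Tensor, cfg_strength: float) -> torch.Tensor:
    # 0 -> purely unconditional, 1 -> purely conditional, >1 -> extrapolate
    # past the conditional prediction: stronger prompt adherence, though high
    # values can introduce artifacts.
    return uncond_pred + cfg_strength * (cond_pred - uncond_pred)

print(cfg_blend(torch.tensor([1.0]), torch.tensor([0.0]), 4.0))  # tensor([4.]) at the new UI default
```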
diffrhythm/infer/infer.py CHANGED

```diff
@@ -74,7 +74,7 @@ def decode_audio(latents, vae_model, chunked=False, overlap=32, chunk_size=128):
     y_final[:,:,t_start:t_end] = y_chunk[:,:,chunk_start:chunk_end]
     return y_final
 
-def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative_style_prompt, steps, sway_sampling_coef, start_time, file_type, vocal_flag):
+def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative_style_prompt, steps, cfg_strength, sway_sampling_coef, start_time, file_type, vocal_flag, odeint_method):
 
     with torch.inference_mode():
         generated, _ = cfm_model.sample(
@@ -84,10 +84,11 @@ def inference(cfm_model, vae_model, cond, text, duration, style_prompt, negative
             style_prompt=style_prompt,
             negative_style_prompt=negative_style_prompt,
             steps=steps,
-            cfg_strength=4.0,
+            cfg_strength=cfg_strength,
             sway_sampling_coef=sway_sampling_coef,
             start_time=start_time,
             vocal_flag=vocal_flag,
+            odeint_method=odeint_method,
         )
 
     generated = generated.to(torch.float32)
```
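Aside on the unchanged `decode_audio` context lines: the signature (`chunked=False, overlap=32, chunk_size=128`) and the `y_final[...] = y_chunk[...]` assignment indicate overlapped chunked decoding, where each window is decoded with extra context frames and only its interior is kept to avoid seam artifacts. Below is a simplified sketch under the assumption of a decoder that is 1:1 in time (the real VAE upsamples, so its index arithmetic differs).

```python
# Overlapped chunked decoding, simplified: decode (B, C, T) latents window
# by window and copy only each window's interior into the output.
import torch

def chunked_decode(latents, decode, chunk_size=128, overlap=32):
    B, C, T = latents.shape
    out = torch.zeros_like(latents)
    step = chunk_size - 2 * overlap          # interior frames kept per window
    start = 0
    while start < T:
        c0 = max(start - overlap, 0)         # window with `overlap` context on each side
        c1 = min(start + step + overlap, T)
        y = decode(latents[:, :, c0:c1])
        k0 = start - c0                      # skip the left context in the window's output
        k1 = k0 + min(step, T - start)
        out[:, :, start:start + (k1 - k0)] = y[:, :, k0:k1]
        start += step
    return out

x = torch.randn(1, 4, 500)
assert torch.allclose(chunked_decode(x, decode=lambda z: z), x)  # identity decoder round-trips
```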
diffrhythm/model/cfm.py CHANGED

```diff
@@ -114,9 +114,12 @@ class CFM(nn.Module):
         start_time=None,
         latent_pred_start_frame=0,
         latent_pred_end_frame=2048,
-        vocal_flag=False
+        vocal_flag=False,
+        odeint_method="euler"
     ):
         self.eval()
+
+        self.odeint_kwargs = dict(method=odeint_method)
 
         if next(self.parameters()).dtype == torch.float16:
             cond = cond.half()
```
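The stored `self.odeint_kwargs` presumably feeds torchdiffeq's `odeint`, whose `method` keyword selects the solver; the four options in the new radio match torchdiffeq's fixed-grid solver names. A self-contained toy run:

```python
# Toy demonstration of torchdiffeq's `method` kwarg: solve dy/dt = -y on
# [0, 1] with each solver offered by the new ODE Solver radio.
import torch
from torchdiffeq import odeint

def f(t, y):
    return -y

y0 = torch.tensor([1.0])
t = torch.linspace(0.0, 1.0, 33)  # 32 steps, matching the app's default step count

for method in ["euler", "midpoint", "rk4", "implicit_adams"]:
    y = odeint(f, y0, t, method=method)
    print(f"{method:>14}: {y[-1].item():.6f}")  # exact answer: exp(-1) ~ 0.367879
```

At a fixed step count these solvers trade cost for accuracy: euler makes one model evaluation per step, midpoint two, and rk4 four, so the solver choice changes generation time as well as output quality.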