import gradio as gr import torch from diffusers import AudioLDM2Pipeline import random # make Space compatible with CPU duplicates if torch.cuda.is_available(): device = "cuda" torch_dtype = torch.float16 else: device = "cpu" torch_dtype = torch.float32 # load the diffusers pipeline repo_id = "cvssp/audioldm2" pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch_dtype).to(device) # pipe.unet = torch.compile(pipe.unet) # set the generator for reproducibility generator = torch.Generator(device) def text2audio(text, negative_prompt, duration, guidance_scale, random_seed, n_candidates): if text is None: raise gr.Error("Please provide a text input.") waveforms = pipe( text, audio_length_in_s=duration, guidance_scale=guidance_scale, num_inference_steps=200, negative_prompt=negative_prompt, num_waveforms_per_prompt=n_candidates if n_candidates else 1, generator=generator.manual_seed(int(random_seed)), )["audios"] return gr.make_waveform((16000, waveforms[0]), bg_image="bg.png") # def selections_to_audio(descriptive_group, weather_group, nature_group, musical_group, human_group, duration, guidance_scale, random_seed, n_candidates): # combined_terms = [] # groups_to_combine = [descriptive_group, weather_group, nature_group, musical_group, human_group] # for group in groups_to_combine: # combined_terms.extend(group) # if len(combined_terms) == 0: # raise gr.Error("Please select at least one term in any group.") # text = " ".join(combined_terms) # waveforms = pipe( # text, # audio_length_in_s=duration, # guidance_scale=guidance_scale, # num_inference_steps=200, # negative_prompt="Low quality.", # num_waveforms_per_prompt=n_candidates if n_candidates else 1, # generator=generator.manual_seed(int(random_seed)), # )["audios"] # return gr.make_waveform((16000, waveforms[0]), bg_image="bg.png") def selections_to_audio(prompt_display, duration, guidance_scale, random_seed, n_candidates, randomize): if prompt_display is None or len(prompt_display) == 0: raise gr.Error("Please select at least one term in any group.") if randomize: random_seed = random.randint(0, 999999) waveforms = pipe( prompt_display, audio_length_in_s=duration, guidance_scale=guidance_scale, num_inference_steps=200, negative_prompt="Low quality.", num_waveforms_per_prompt=n_candidates if n_candidates else 1, generator=generator.manual_seed(int(random_seed)), )["audios"] return (gr.make_waveform((16000, waveforms[0]), bg_image="bg.png"), prompt_display) def update_prompt_display(descriptive_group, nature_group, water_scene_group, forest_scene_group, jungle_scene_group, beach_scene_group, swamp_scene_group, musical_group, temple_scene_group, gathering_scene_group, prompt_display): combined_terms = [] groups_to_combine = [ descriptive_group, nature_group, water_scene_group, forest_scene_group, jungle_scene_group, beach_scene_group, swamp_scene_group, musical_group, temple_scene_group, gathering_scene_group, ] for group in groups_to_combine: combined_terms.extend(group) if len(combined_terms) == 0: return prompt_display return " ".join(combined_terms) iface = gr.Blocks() # weather_choices = [ # "raindrops", # "snowfall", # "rainy", # "thunder", # "rainstorm", # "wind", # "foggy", # "fog", # "blizzard" # ] nature_choices = [ "prairie", "nature", "mountain", "countryside", "rattling", "midnight", "twilight", "dawn", "dusk", "noon", "afternoon", "morning", ] # human_choices = [ # "village", # "tavern", # "garden", # "park", # "gentle", # "bonfire", # "campfire", # "gathering", # "cooking", # "temple", # "church", # "camping", # "hum", # "creaks", # "marketplace", # "farm", # "barnyard" # ] descriptive_choices = [ "calm", "lively", "magic", "magical", "serene", "meandering", "floating", "peaceful", "gentle", "busy", ] musical_choices = [ "music", "ambient", "chorus", "singing bowls", "bells", "symphony", "orchestra", "piano", "strings", "violin", "brass", "trumpet", "flute", "woodwinds", "drums", "steel drums", "calypso", "reggae", "electronic", "dance", "classical", "jazzy", "jazz", "guitar", "synth", "synthesizer", "organ", "rock", "pop", ] water_scene_choices = [ "water", "stream", "river", "ocean", "waves", "spring", "bubbling", "flowing", "creek", "brook", "babbling brook", "rain", "raindrops", "rainy", "rainstorm", "snow", "snowfall", "blizzard", "thunder", "lightning strike", "wind", "gust", "gale", "foggy", "fog", "cloud", "cloudy", "mist", "misty", "haze", "hazy", ] forest_scene_choices = [ "forest", "chirping", "birds", "crickets", "owls", "crows", "warbler", "sparrow", "goldfinch", "blackbird", "blue jay", "squirrels", "chipmunks", "rabbits", "trees", "rustling", ] jungle_scene_choices = [ "jungle", "rain forest", "parrots", "monkeys", "lion", "elephant", "rumbling", ] beach_scene_choices = [ "beach", "coast", "costal", "seagulls", "ocean", "waves", ] swamp_scene_choices = [ "swamp ambience", "swamp", "marsh", "bog", "wetlands", "frogs", "toad", "alligators", "turtles", "ducks", "geese", "swans", "reeds", ] temple_scene_choices = [ "temple", "church", "church bells", "pipe organ", "chanting", "singing bowls", "incantation", ] gathering_scene_choices = [ "gathering", "village", "tavern", "coffee shop", "garden", "park", "bonfire", "campfire", "cooking", "camping", "hum", "creaks", "marketplace", "farm", "barnyard", "conversation", "chatter", "road", "highway", "automobile", "truck", "engine", "traffic", "city", "urban", "footsteps", "radio", "machinery", "factory", ] def append_to_prompt(original_prompt): token_to_append = random.choice(prompt_append_tokens) return original_prompt + " " + token_to_append def append_selected_values_to_prompt(original_prompt, selected_values): return original_prompt + " " + " ".join(selected_values) with iface: gr.HTML( """