import os

# Set temporary cache locations for Hugging Face Spaces *before* importing the
# libraries that read them (torch, huggingface_hub/transformers, numba, matplotlib).
os.environ["TORCH_HOME"] = "/tmp/torch"
os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
os.environ["MPLCONFIGDIR"] = "/tmp"
os.environ["XDG_CACHE_HOME"] = "/tmp"
os.environ["XDG_CONFIG_HOME"] = "/tmp"
os.environ["NUMBA_DISABLE_CACHE"] = "1"

os.makedirs("/tmp/torch", exist_ok=True)
os.makedirs("/tmp/huggingface", exist_ok=True)
os.makedirs("/tmp/flagged", exist_ok=True)

import uuid
import time

import nltk
import torch
import gradio as gr

# import mecab_patch
# import english_patch
from MeloTTS.melo.api import TTS
from openvoice import se_extractor
from openvoice.api import ToneColorConverter

# Download only the NLTK resources MeloTTS needs, rather than the full
# (multi-gigabyte) 'all' collection.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Output folder
output_dir = "/tmp/outputs"
os.makedirs(output_dir, exist_ok=True)
# Initialize tone converter
ckpt_converter = "checkpoint/converter"
# Device setting
device = "cuda" if torch.cuda.is_available() else "cpu"
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
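
# Full pipeline: MeloTTS synthesizes the text with a base English speaker, then the
# OpenVoice tone color converter re-styles that audio to match the uploaded reference.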
def clone_and_speak(text, speaker_wav):
    """Synthesize `text` with MeloTTS, then convert its tone to match `speaker_wav`."""
    if not speaker_wav:
        return "Please upload a reference .wav file."

    base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
    tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"

    # Use the uploaded speaker_wav as the reference to extract the target tone embedding
    ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=True)

    # Use the English speaker model with its default (first) base speaker
    model = TTS(language="EN", device=device)
    speaker_ids = model.hps.data.spk2id
    speaker_key = next(iter(speaker_ids.keys()))
    speaker_id = speaker_ids[speaker_key]
    speaker_key = speaker_key.lower().replace('_', '-')

    # Source tone embedding of the chosen MeloTTS base speaker
    source_se = torch.load(
        f'checkpoint/base_speakers/ses/{speaker_key}.pth', map_location=device
    )

    speed = 1.0

    # Workaround: MeloTTS may try to use MPS even when we are running on CPU
    if torch.backends.mps.is_available() and device == 'cpu':
        torch.backends.mps.is_available = lambda: False

    # Base synthesis with MeloTTS
    model.tts_to_file(text, speaker_id, tmp_melo_path, speed=speed)
final_output_path = f"{output_dir}/{base_name}_converted.wav"
# Run the tone conversion
tone_color_converter.convert(
audio_src_path=tmp_melo_path,
src_se=source_se,
tgt_se=ref_se,
output_path=final_output_path,
message="@HuggingFace",
)
return final_output_path
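
# Gradio UI: text plus a reference .wav in, the tone-converted .wav out.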
gr.Interface(
fn=clone_and_speak,
inputs=[
gr.Textbox(label="Enter Text"),
gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)")
],
outputs=gr.Audio(label="Synthesized Output"),
flagging_dir="/tmp/flagged",
title="Text to Voice using Melo TTS + OpenVoice",
description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
).launch()