import nltk

# Fetch the NLTK data packages at start-up, in this order.
# NOTE(review): the "all" collection presumably already contains the two
# packages requested after it, and is a very large download - confirm which
# resources the TTS front-end actually needs before trimming this list.
for _nltk_resource in ('all', 'averaged_perceptron_tagger', 'punkt'):
    nltk.download(_nltk_resource)
import os

# Export cache/config locations BEFORE importing torch, gradio, MeloTTS and
# OpenVoice.  Several libraries pulled in by those imports (e.g.
# huggingface_hub / transformers for HF_HOME, HF_HUB_CACHE and
# TRANSFORMERS_CACHE, matplotlib for MPLCONFIGDIR) resolve their cache paths
# when they are first imported, so assignments made after the imports would
# not be picked up by them.
# Everything points at /tmp (temporary cache locations for Hugging Face Spaces).
os.environ["NUMBA_DISABLE_CACHE"] = "1"
os.environ["TORCH_HOME"] = "/tmp/torch"
os.environ["HF_HOME"] = "/tmp/huggingface"
os.environ["HF_HUB_CACHE"] = "/tmp/huggingface"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/huggingface"
os.environ["MPLCONFIGDIR"] = "/tmp"
os.environ["XDG_CACHE_HOME"] = "/tmp"
os.environ["XDG_CONFIG_HOME"] = "/tmp"

# Create the directories referenced above, plus Gradio's flagging folder.
os.makedirs("/tmp/torch", exist_ok=True)
os.makedirs("/tmp/huggingface", exist_ok=True)
os.makedirs("/tmp/flagged", exist_ok=True)

import uuid  # noqa: E402  (imports intentionally follow the env setup)
import time  # noqa: E402

import torch  # noqa: E402
import gradio as gr  # noqa: E402

# Disabled imports kept for reference:
# import mecab_patch
# import english_patch
# from melo.api import TTS
# from meloTTS import english
from MeloTTS.melo.api import TTS  # noqa: E402
from openvoice import se_extractor  # noqa: E402
from openvoice.api import ToneColorConverter  # noqa: E402
# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Folder that receives the generated audio files.
output_dir = "/tmp/outputs"
os.makedirs(output_dir, exist_ok=True)

# Build the tone colour converter from its config file, then load its
# checkpoint weights from the same directory.
ckpt_converter = "checkpoint/converter"
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
# TTS models that have already been constructed, keyed by language code, so
# the model is built once per process instead of on every request.
_TTS_MODELS = {}


def _get_tts_model(language="EN"):
    """Return the TTS model for *language*, constructing it on first use."""
    model = _TTS_MODELS.get(language)
    if model is None:
        model = TTS(language=language, device=device)
        _TTS_MODELS[language] = model
    return model


def clone_and_speak(text, speaker_wav):
    """Synthesize *text* and convert it towards the voice in *speaker_wav*.

    Steps: extract the target embedding from the reference recording,
    synthesize the text with the English TTS model into a temporary wav, then
    run the tone colour converter from the base speaker's embedding to the
    reference embedding.

    Args:
        text: Text to synthesize.
        speaker_wav: File path of the uploaded reference recording.

    Returns:
        Path of the converted wav file inside ``output_dir``.

    Raises:
        gr.Error: If no reference recording or no text was supplied.  An
            exception is raised (instead of returning a message string)
            because the return value feeds an audio output component, which
            would treat a returned string as a file path.
    """
    if not speaker_wav:
        raise gr.Error("Please upload a reference .wav file.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}"
    tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav"
    final_output_path = f"{output_dir}/{base_name}_converted.wav"

    # Use speaker_wav as reference to extract the target style embedding.
    ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=True)

    # Use the English speaker model.
    model = _get_tts_model("EN")
    speaker_ids = model.hps.data.spk2id

    # NOTE(review): the original looped over every entry of spk2id, but the
    # extent of that loop could not be recovered from the source.  The first
    # speaker is used here, which matches the previously unused
    # `default_speaker_id` and a loop that returned on its first iteration.
    # Confirm this is the intended base voice.
    speaker_key = next(iter(speaker_ids.keys()))
    speaker_id = speaker_ids[speaker_key]
    ses_name = speaker_key.lower().replace('_', '-')
    source_se = torch.load(f'checkpoint/base_speakers/ses/{ses_name}.pth', map_location=device)

    # When running on CPU, make torch report MPS as unavailable before
    # synthesis (kept exactly as in the original code).
    if torch.backends.mps.is_available() and device == 'cpu':
        torch.backends.mps.is_available = lambda: False

    try:
        model.tts_to_file(text, speaker_id, tmp_melo_path, speed=1.0)
        # Run the tone conversion.
        tone_color_converter.convert(
            audio_src_path=tmp_melo_path,
            src_se=source_se,
            tgt_se=ref_se,
            output_path=final_output_path,
            message="@HuggingFace",
        )
    finally:
        # The intermediate synthesis is not needed once conversion has run
        # (or failed); remove it so the output folder does not fill up.
        if os.path.exists(tmp_melo_path):
            os.remove(tmp_melo_path)

    return final_output_path
# Web UI: a text box and a reference-audio upload feed clone_and_speak, whose
# result is shown in an audio output component.  The app starts serving as
# soon as this module is executed.
text_input = gr.Textbox(label="Enter Text")
reference_input = gr.Audio(type="filepath", label="Upload a Reference Voice (.wav)")
audio_output = gr.Audio(label="Synthesized Output")

demo = gr.Interface(
    fn=clone_and_speak,
    inputs=[text_input, reference_input],
    outputs=audio_output,
    flagging_dir="/tmp/flagged",
    title="Text to Voice using Melo TTS + OpenVoice",
    description="Use Melo TTS for base synthesis and OpenVoice to apply a reference speaker's tone.",
)
demo.launch()
# iface = gr.Interface( | |
# fn=clone_with_base_speaker, | |
# inputs=[ | |
# gr.Textbox(label="Input Text", placeholder="Enter text to synthesize..."), | |
# gr.Dropdown(choices=base_speaker_choices, label="Select Base Speaker"), | |
# ], | |
# outputs=gr.Audio(type="filepath", label="Cloned Voice Output"), | |
# title="Voice Cloning with OpenVoice Base Speakers", | |
# description="Choose a base speaker from OpenVoice and enter text to generate voice." | |
# ) | |
# iface.launch() | |
# import os | |
# import time | |
# import uuid | |
# import gradio as gr | |
# from TTS.api import TTS | |
# from openvoice import se_extractor | |
# from openvoice.api import ToneColorConverter | |
# # Import your local english.py logic | |
# from meloTTS import english | |
# # Paths | |
# device = "cuda" if os.system("nvidia-smi") == 0 else "cpu" | |
# output_dir = "outputs" | |
# os.makedirs(output_dir, exist_ok=True) | |
# # Load OpenVoice tone converter | |
# tone_color_converter = ToneColorConverter(f"{os.getcwd()}/checkpoints", device=device) | |
# tone_color_converter.load_model() | |
# def clone_and_speak(text, speaker_wav): | |
# if not speaker_wav: | |
# return "Please upload a reference .wav file." | |
# base_name = f"output_{int(time.time())}_{uuid.uuid4().hex[:6]}" | |
# tmp_melo_path = f"{output_dir}/{base_name}_tmp.wav" | |
# final_output_path = f"{output_dir}/{base_name}_converted.wav" | |
# # Use English speaker model | |
# model = TTS(language="EN", device=device) | |
# speaker_ids = model.hps.data.spk2id | |
# default_speaker_id = next(iter(speaker_ids.values())) | |
# # Generate base TTS voice | |
# model.tts_to_file(text, speaker_id=default_speaker_id, file_path=tmp_melo_path, speed=1.0) | |
# # Extract style embedding | |
# ref_se, _ = se_extractor.get_se(speaker_wav, tone_color_converter, vad=False) | |
# # Convert tone | |
# tone_color_converter.convert( | |
# audio_src_path=tmp_melo_path, | |
# src_se=ref_se, | |
# tgt_se=ref_se, | |
# output_path=final_output_path, | |
# message="@HuggingFace" | |
# ) | |
# return final_output_path | |
# # Gradio Interface | |
# demo = gr.Interface( | |
# fn=clone_and_speak, | |
# inputs=[ | |
# gr.Textbox(label="Text to Synthesize"), | |
# gr.Audio(label="Reference Voice (WAV)", type="filepath") | |
# ], | |
# outputs=gr.Audio(label="Cloned Voice Output"), | |
# title="Voice Cloner with MeloTTS + OpenVoice" | |
# ) | |
# if __name__ == "__main__": | |
# demo.launch() | |