This is a 4-bit AWQ-quantized version of Orpheus-3b FT. I recommend using lmdeploy since it's easy to install and very fast. Below is the code to load the model, convert a reference audio file into tokens for voice cloning, and generate speech.
Code to load the model:
## Install snac and lmdeploy with: pip install snac lmdeploy
from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig
from transformers import AutoTokenizer
from snac import SNAC

tp = 1 ## tensor parallelism; increase if you have multiple GPUs
cache_max_entry_count = 0.2 ## fraction of GPU memory reserved for the KV cache (context)
engine_config = TurbomindEngineConfig(model_format='awq', dtype='float16', cache_max_entry_count=cache_max_entry_count, tp=tp, quant_policy=8) ## quant_policy=8 enables 8-bit KV cache
pipe = pipeline("YaTharThShaRma999/orpheus_awq", backend_config=engine_config)
tokeniser = AutoTokenizer.from_pretrained("unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit")
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to('cuda:0')
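As an optional sanity check after loading, you can print the special tokens behind the ids that the rest of this card relies on (the encode/decode helpers below assume audio codes start at tokenizer id 128266, with 128257/128258 delimiting the audio stream):
## Optional: confirm the token-id layout assumed by the helpers below
print(tokeniser.convert_ids_to_tokens([128266, 128257, 128258]))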
Code to convert a reference audio file into SNAC tokens for voice cloning:
import librosa
import torch
import gc

reference_audio_path = "test.mp3" ## path to your reference audio file
reference_transcript = "" ## transcript of the reference audio
audio_array, sample_rate = librosa.load(reference_audio_path, sr=24000) ## snac_24khz expects 24 kHz audio
def tokenise_audio(waveform):
    waveform = torch.from_numpy(waveform).unsqueeze(0)
    waveform = waveform.to(dtype=torch.float32)
    waveform = waveform.unsqueeze(0).to('cuda:0')
    with torch.inference_mode():
        codes = snac_model.encode(waveform)
    ## Interleave the 3 SNAC codebooks into 7 tokens per frame
    ## (1 coarse + 2 medium + 4 fine codes), offset into the
    ## tokenizer's audio-token range starting at id 128266.
    all_codes = []
    for i in range(codes[0].shape[1]):
        all_codes.append(codes[0][0][i].item() + 128266)
        all_codes.append(codes[1][0][2*i].item() + 128266 + 4096)
        all_codes.append(codes[2][0][4*i].item() + 128266 + (2*4096))
        all_codes.append(codes[2][0][(4*i)+1].item() + 128266 + (3*4096))
        all_codes.append(codes[1][0][(2*i)+1].item() + 128266 + (4*4096))
        all_codes.append(codes[2][0][(4*i)+2].item() + 128266 + (5*4096))
        all_codes.append(codes[2][0][(4*i)+3].item() + 128266 + (6*4096))
    return all_codes
reference_codes = tokenise_audio(audio_array) ## the SNAC tokens for the reference audio
gc.collect()
torch.cuda.empty_cache()
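Since each SNAC frame contributes exactly 7 tokens (one coarse code, two medium codes at twice the frame rate, four fine codes at four times the rate), a quick length check catches encoding problems early; a small sanity check:
## The total token count must be a multiple of 7; the 24 kHz SNAC model
## produces roughly 12 coarse frames per second of audio.
assert len(reference_codes) % 7 == 0
print(f"{len(reference_codes)} tokens = {len(reference_codes) // 7} SNAC frames")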
Finally, generate speech and display it with IPython:
from lmdeploy import GenerationConfig
import gc
import torch

## The sampling params are heavily experimental; feel free to tweak them.
gen_config = GenerationConfig(top_p=0.7,
                              top_k=50,
                              temperature=0.2,
                              max_new_tokens=1024,
                              min_new_tokens=30,
                              stop_token_ids=[128009, 128001, 49158, 128258],
                              repetition_penalty=2.0,
                              skip_special_tokens=False,
                              do_sample=True,
                              min_p=0.6)
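This combination (very low temperature plus a strong repetition penalty and high min_p) is unusual, so if generations come out clipped or monotone, a tamer configuration is worth trying. The values below are illustrative guesses, not numbers from this card:
## Alternative, milder sampling settings (illustrative values only)
gen_config_alt = GenerationConfig(top_p=0.95,
                                  temperature=0.6,
                                  max_new_tokens=1024,
                                  stop_token_ids=[128009, 128001, 49158, 128258],
                                  repetition_penalty=1.1,
                                  skip_special_tokens=False,
                                  do_sample=True)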
prompt = reference_transcript + "<laugh> So um hey, like what's up??" ## put your text to speak here
voice_name = "zac" ## experimental; may be removed later
response2 = pipe([f"<custom_token_3><|begin_of_text|>{voice_name}: {prompt}<|eot_id|><custom_token_4><custom_token_5><custom_token_1>" + tokeniser.decode(reference_codes)], gen_config=gen_config)
gc.collect()
torch.cuda.empty_cache()
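Before decoding, it's worth checking why generation stopped: if it hit the max_new_tokens cap instead of a stop token, the audio will likely cut off mid-sentence. This assumes lmdeploy's Response exposes finish_reason, which recent versions do:
## 'stop' means a stop token was hit; 'length' means the 1024-token cap
## was reached, in which case raise max_new_tokens and regenerate.
print(response2[0].finish_reason)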
generated_ids = tokeniser.encode(response2[0].text, return_tensors='pt', add_special_tokens=False)
token_to_find = 128257 ## marks the start of the audio-token stream
token_to_remove = 128258 ## end-of-audio token, stripped before decoding
token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)
if len(token_indices[1]) > 0:
    last_occurrence_idx = token_indices[1][-1].item()
    cropped_tensor = generated_ids[:, last_occurrence_idx+1:]
else:
    cropped_tensor = generated_ids
processed_rows = []
for row in cropped_tensor:
    masked_row = row[row != token_to_remove]
    processed_rows.append(masked_row)

code_lists = []
for row in processed_rows:
    row_length = row.size(0)
    new_length = (row_length // 7) * 7 ## trim to a whole number of 7-token frames
    trimmed_row = row[:new_length]
    trimmed_row = [t.item() - 128266 for t in trimmed_row] ## undo the audio-token offset
    code_lists.append(trimmed_row)
def redistribute_codes(code_list):
    ## De-interleave the flat 7-token frames back into the 3 SNAC codebooks,
    ## undoing the per-position 4096 offsets applied in tokenise_audio.
    layer_1 = []
    layer_2 = []
    layer_3 = []
    for i in range(len(code_list) // 7):
        layer_1.append(code_list[7*i])
        layer_2.append(code_list[7*i+1] - 4096)
        layer_3.append(code_list[7*i+2] - (2*4096))
        layer_3.append(code_list[7*i+3] - (3*4096))
        layer_2.append(code_list[7*i+4] - (4*4096))
        layer_3.append(code_list[7*i+5] - (5*4096))
        layer_3.append(code_list[7*i+6] - (6*4096))
    codes = [torch.tensor(layer_1).unsqueeze(0).to('cuda:0'),
             torch.tensor(layer_2).unsqueeze(0).to('cuda:0'),
             torch.tensor(layer_3).unsqueeze(0).to('cuda:0')]
    audio_hat = snac_model.decode(codes)
    return audio_hat
from IPython.display import display, Audio

my_samples = []
for code_list in code_lists:
    samples = redistribute_codes(code_list)
    my_samples.append(samples)
    display(Audio(samples.detach().squeeze().to("cpu").numpy(), rate=24000))
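To keep the audio instead of (or in addition to) displaying it, you can write each clip to a WAV file before the cleanup below; a minimal sketch using the soundfile package (pip install soundfile, an extra dependency not used elsewhere in this card):
import soundfile as sf
for i, samples in enumerate(my_samples):
    ## SNAC decodes at 24 kHz, matching the rate used for display above
    sf.write(f"clip_{i}.wav", samples.detach().squeeze().to("cpu").numpy(), 24000)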
del my_samples, samples, code_lists, cropped_tensor, processed_rows
gc.collect()
torch.cuda.empty_cache()