This is a 4-bit AWQ quantized version of Orpheus-3b FT. I recommend serving it with lmdeploy since it's easy to install and very fast. Below is the code to load the model, process a reference audio file for voice cloning, and generate speech.

Code to load the model:

## Install snac and lmdeploy with pip install snac lmdeploy

from lmdeploy import pipeline, TurbomindEngineConfig, GenerationConfig
from transformers import AutoTokenizer
from snac import SNAC

tp = 1 ## tensor parallelism; increase if you have multiple GPUs
cache_max_entry_count = 0.2 ## fraction of GPU memory reserved for the KV cache

## model_format='awq' loads the AWQ weights; quant_policy=8 enables int8 KV cache
engine_config = TurbomindEngineConfig(model_format='awq', dtype='float16', cache_max_entry_count=cache_max_entry_count, tp=tp, quant_policy=8)
pipe = pipeline("YaTharThShaRma999/orpheus_awq", backend_config=engine_config)
tokeniser = AutoTokenizer.from_pretrained("unsloth/orpheus-3b-0.1-ft-unsloth-bnb-4bit")
snac_model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to('cuda:0')
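
Optionally, you can smoke-test the pipeline before setting up voice cloning. This is a minimal sketch, not part of the original flow: the prompt mirrors the cloning prompt used later minus the reference tokens, and the voice name and token budget are arbitrary. The raw output should be a stream of <custom_token_...> audio tokens; decoding them to audio is covered in the final section.

## Optional smoke test: confirm the model loads and emits audio tokens
test_cfg = GenerationConfig(max_new_tokens=64, skip_special_tokens=False)
test_out = pipe(["<custom_token_3><|begin_of_text|>zac: Hello!<|eot_id|><custom_token_4><custom_token_5><custom_token_1>"], gen_config=test_cfg)
print(test_out[0].text[:200]) ## expect a run of <custom_token_...> ids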

Code to convert a reference voice file into SNAC tokens for voice cloning:

import gc
import librosa
import torch
from IPython.display import Audio

my_wav_file_is = "test.mp3" ## path to your reference audio file

and_the_transcript_is = "" ## transcript of the reference audio

filename = my_wav_file_is

audio_array, sample_rate = librosa.load(filename, sr=24000) ## snac_24khz expects 24 kHz audio

def tokenise_audio(waveform):
  ## shape (1, 1, T), float32, on GPU, as SNAC's encoder expects
  waveform = torch.from_numpy(waveform).unsqueeze(0)
  waveform = waveform.to(dtype=torch.float32)
  waveform = waveform.unsqueeze(0).to('cuda:0')

  with torch.inference_mode():
    codes = snac_model.encode(waveform)

  ## Interleave SNAC's three codebooks into 7-token frames
  ## (1 coarse + 2 medium + 4 fine codes), offsetting each slot by
  ## 128266 + n*4096 so every slot maps to its own token-id range.
  all_codes = []
  for i in range(codes[0].shape[1]):
    all_codes.append(codes[0][0][i].item()+128266)
    all_codes.append(codes[1][0][2*i].item()+128266+4096)
    all_codes.append(codes[2][0][4*i].item()+128266+(2*4096))
    all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096))
    all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096))
    all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096))
    all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096))

  return all_codes

myts = tokenise_audio(audio_array) ## the snac tokens

gc.collect()
torch.cuda.empty_cache()
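
Optionally, sanity-check the token stream: by construction, tokenise_audio emits exactly 7 interleaved codes per SNAC frame, each offset into its own 4096-wide id range starting at 128266.

assert len(myts) % 7 == 0 ## 7 codes per frame
assert all(128266 <= t < 128266 + 7*4096 for t in myts) ## ids stay within the audio-token ranges
print(f"encoded {len(myts)//7} SNAC frames")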

Finally, generate speech and display it using IPython:

from lmdeploy import GenerationConfig
import gc
import torch

### Sampling params are heavily experimental; tune them for your voice and prompt.
gen_config = GenerationConfig(top_p=0.7,
                              top_k=50,
                              temperature=0.2,
                              max_new_tokens=1024,
                              min_new_tokens=30,
                              stop_token_ids=[128009, 128001, 49158, 128258],
                              repetition_penalty=2.0,
                              skip_special_tokens=False,
                              do_sample=True,
                              min_p=0.6)

prompt = and_the_transcript_is + "<laugh> So um hey, like what's up??" ## reference transcript followed by the new text to speak

voice_name = "zac" ## experimental; may or may not affect the output
## the decoded reference SNAC tokens are appended after the prompt to prime the voice
response2 = pipe([f"<custom_token_3><|begin_of_text|>{voice_name}: {prompt}<|eot_id|><custom_token_4><custom_token_5><custom_token_1>" + tokeniser.decode(myts)], gen_config=gen_config)
gc.collect()
torch.cuda.empty_cache()

generated_ids = tokeniser.encode(response2[0].text, return_tensors='pt', add_special_tokens=False) ## re-encode the generated text into token ids

token_to_find = 128257 ## audio tokens start after the last occurrence of this marker
token_to_remove = 128258 ## end-of-audio token, stripped from the stream

token_indices = (generated_ids == token_to_find).nonzero(as_tuple=True)

if len(token_indices[1]) > 0:
    last_occurrence_idx = token_indices[1][-1].item()
    cropped_tensor = generated_ids[:, last_occurrence_idx+1:]
else:
    cropped_tensor = generated_ids

processed_rows = []

for row in cropped_tensor:
    masked_row = row[row != token_to_remove]
    processed_rows.append(masked_row)

code_lists = []

for row in processed_rows:
    row_length = row.size(0)
    new_length = (row_length // 7) * 7 ## trim to a whole number of 7-token frames
    trimmed_row = row[:new_length]
    trimmed_row = [t.item() - 128266 for t in trimmed_row] ## remove the base offset
    code_lists.append(trimmed_row)


def redistribute_codes(code_list):
  ## Undo the 7-token interleave from tokenise_audio: split the flat stream
  ## back into SNAC's three codebook layers, removing each slot's 4096 offset
  layer_1 = []
  layer_2 = []
  layer_3 = []
  for i in range(len(code_list)//7):
    layer_1.append(code_list[7*i])
    layer_2.append(code_list[7*i+1]-4096)
    layer_3.append(code_list[7*i+2]-(2*4096))
    layer_3.append(code_list[7*i+3]-(3*4096))
    layer_2.append(code_list[7*i+4]-(4*4096))
    layer_3.append(code_list[7*i+5]-(5*4096))
    layer_3.append(code_list[7*i+6]-(6*4096))
  codes = [torch.tensor(layer_1).unsqueeze(0).to('cuda:0'),
           torch.tensor(layer_2).unsqueeze(0).to('cuda:0'),
           torch.tensor(layer_3).unsqueeze(0).to('cuda:0')]

  ## decode the three layers back into a 24 kHz waveform
  audio_hat = snac_model.decode(codes)
  return audio_hat

my_samples = []
for code_list in code_lists:
  samples = redistribute_codes(code_list)
  my_samples.append(samples)

from IPython.display import display, Audio

## play each generated clip inline (usually there is just one)
for samples in my_samples:
  display(Audio(samples.detach().squeeze().to("cpu").numpy(), rate=24000))
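
To keep the generated clip, you can also write it to disk. This sketch uses soundfile (pip install soundfile); the output filename is arbitrary.

import soundfile as sf
sf.write("output.wav", samples.detach().squeeze().to("cpu").numpy(), 24000) ## 24 kHz mono WAV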

del my_samples, samples, code_lists, cropped_tensor, processed_rows
gc.collect()
torch.cuda.empty_cache()