# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: MIT
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
# Based on https://github.com/NVIDIA/flowtron/blob/master/data.py
# Original license text:
###############################################################################
#
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
###############################################################################
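"""Dataset and collate utilities for mel-spectrogram TTS training.

Loads audio/transcript filelists and produces mel spectrograms, encoded text,
f0 (via pyin), voicing probabilities, average energy, and beta-binomial
attention priors, with optional LMDB caching for audio and priors.
"""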
import os
import argparse
import json
import numpy as np
import lmdb
import pickle as pkl
import torch
import torch.utils.data
from scipy.io.wavfile import read
from audio_processing import TacotronSTFT
from tts_text_processing.text_processing import TextProcessing
from scipy.stats import betabinom
from librosa import pyin
from common import update_params
from scipy.ndimage import distance_transform_edt as distance_transform
def beta_binomial_prior_distribution(phoneme_count, mel_count, scaling_factor=0.05):
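    """Beta-binomial alignment prior: for each of the `mel_count` frames,
    a Beta-Binomial(P - 1, a, b) pmf over the P phoneme positions, which
    concentrates mass near the diagonal of the alignment.
    Returns a (mel_count, phoneme_count) tensor."""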
P = phoneme_count
M = mel_count
x = np.arange(0, P)
mel_text_probs = []
for i in range(1, M + 1):
a, b = scaling_factor * i, scaling_factor * (M + 1 - i)
rv = betabinom(P - 1, a, b)
mel_i_prob = rv.pmf(x)
mel_text_probs.append(mel_i_prob)
return torch.tensor(np.array(mel_text_probs))
def load_wav_to_torch(full_path):
"""Loads wavdata into torch array"""
sampling_rate, data = read(full_path)
return torch.from_numpy(np.array(data)).float(), sampling_rate
class Data(torch.utils.data.Dataset):
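    """Loads (audio, text) pairs from filelists and produces training features:
    mel spectrogram, encoded text, and speaker id, plus optionally f0, voicing,
    average energy, and a beta-binomial attention prior (disk- or LMDB-cached)."""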
def __init__(
self,
datasets,
filter_length,
hop_length,
win_length,
sampling_rate,
n_mel_channels,
mel_fmin,
mel_fmax,
f0_min,
f0_max,
max_wav_value,
use_f0,
use_energy_avg,
use_log_f0,
use_scaled_energy,
symbol_set,
cleaner_names,
heteronyms_path,
phoneme_dict_path,
p_phoneme,
handle_phoneme="word",
handle_phoneme_ambiguous="ignore",
speaker_ids=None,
include_speakers=None,
n_frames=-1,
use_attn_prior_masking=True,
prepend_space_to_text=True,
append_space_to_text=True,
add_bos_eos_to_text=False,
betabinom_cache_path="",
betabinom_scaling_factor=0.05,
lmdb_cache_path="",
dur_min=None,
dur_max=None,
combine_speaker_and_emotion=False,
**kwargs,
):
self.combine_speaker_and_emotion = combine_speaker_and_emotion
self.max_wav_value = max_wav_value
self.audio_lmdb_dict = {} # dictionary of lmdbs for audio data
self.data = self.load_data(datasets)
self.distance_tx_unvoiced = False
if "distance_tx_unvoiced" in kwargs.keys():
self.distance_tx_unvoiced = kwargs["distance_tx_unvoiced"]
self.stft = TacotronSTFT(
filter_length=filter_length,
hop_length=hop_length,
win_length=win_length,
sampling_rate=sampling_rate,
n_mel_channels=n_mel_channels,
mel_fmin=mel_fmin,
mel_fmax=mel_fmax,
)
self.do_mel_scaling = kwargs.get("do_mel_scaling", True)
self.mel_noise_scale = kwargs.get("mel_noise_scale", 0.0)
self.filter_length = filter_length
self.hop_length = hop_length
self.win_length = win_length
self.mel_fmin = mel_fmin
self.mel_fmax = mel_fmax
self.f0_min = f0_min
self.f0_max = f0_max
self.use_f0 = use_f0
self.use_log_f0 = use_log_f0
self.use_energy_avg = use_energy_avg
self.use_scaled_energy = use_scaled_energy
self.sampling_rate = sampling_rate
self.tp = TextProcessing(
symbol_set,
cleaner_names,
heteronyms_path,
phoneme_dict_path,
p_phoneme=p_phoneme,
handle_phoneme=handle_phoneme,
handle_phoneme_ambiguous=handle_phoneme_ambiguous,
prepend_space_to_text=prepend_space_to_text,
append_space_to_text=append_space_to_text,
add_bos_eos_to_text=add_bos_eos_to_text,
)
self.dur_min = dur_min
self.dur_max = dur_max
if speaker_ids is None or speaker_ids == "":
self.speaker_ids = self.create_speaker_lookup_table(self.data)
else:
self.speaker_ids = speaker_ids
print("Number of files", len(self.data))
if include_speakers is not None:
for speaker_set, include in include_speakers:
self.filter_by_speakers_(speaker_set, include)
print("Number of files after speaker filtering", len(self.data))
if dur_min is not None and dur_max is not None:
self.filter_by_duration_(dur_min, dur_max)
print("Number of files after duration filtering", len(self.data))
self.use_attn_prior_masking = bool(use_attn_prior_masking)
self.prepend_space_to_text = bool(prepend_space_to_text)
self.append_space_to_text = bool(append_space_to_text)
self.betabinom_cache_path = betabinom_cache_path
self.betabinom_scaling_factor = betabinom_scaling_factor
self.lmdb_cache_path = lmdb_cache_path
if self.lmdb_cache_path != "":
self.cache_data_lmdb = lmdb.open(
self.lmdb_cache_path, readonly=True, max_readers=1024, lock=False
).begin()
        # make sure the caching path exists; attention priors and f0 are saved there
        if self.betabinom_cache_path != "":
            os.makedirs(self.betabinom_cache_path, exist_ok=True)
print("Dataloader initialized with no augmentations")
self.speaker_map = None
if "speaker_map" in kwargs:
self.speaker_map = kwargs["speaker_map"]
def load_data(self, datasets, split="|"):
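        """Reads each dataset's filelist (fields separated by `split`:
        audiopath|text|speaker[|emotion|duration]) into a list of example
        dicts. Datasets with a non-empty "lmdbpath" are read from LMDB
        instead of wav files on disk."""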
dataset = []
for dset_name, dset_dict in datasets.items():
folder_path = dset_dict["basedir"]
audiodir = dset_dict["audiodir"]
filename = dset_dict["filelist"]
audio_lmdb_key = None
if "lmdbpath" in dset_dict.keys() and len(dset_dict["lmdbpath"]) > 0:
self.audio_lmdb_dict[dset_name] = lmdb.open(
dset_dict["lmdbpath"], readonly=True, max_readers=256, lock=False
).begin()
audio_lmdb_key = dset_name
wav_folder_prefix = os.path.join(folder_path, audiodir)
filelist_path = os.path.join(folder_path, filename)
with open(filelist_path, encoding="utf-8") as f:
data = [line.strip().split(split) for line in f]
for d in data:
emotion = "other" if len(d) == 3 else d[3]
duration = -1 if len(d) == 3 else d[4]
dataset.append(
{
"audiopath": os.path.join(wav_folder_prefix, d[0]),
"text": d[1],
"speaker": d[2] + "-" + emotion
if self.combine_speaker_and_emotion
else d[2],
"emotion": emotion,
"duration": float(duration),
"lmdb_key": audio_lmdb_key,
}
)
return dataset
def filter_by_speakers_(self, speakers, include=True):
print("Include spaker {}: {}".format(speakers, include))
if include:
self.data = [x for x in self.data if x["speaker"] in speakers]
else:
self.data = [x for x in self.data if x["speaker"] not in speakers]
def filter_by_duration_(self, dur_min, dur_max):
self.data = [
x
for x in self.data
if x["duration"] == -1
or (x["duration"] >= dur_min and x["duration"] <= dur_max)
]
def create_speaker_lookup_table(self, data):
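        """Maps sorted unique speaker names to consecutive integer ids."""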
speaker_ids = np.sort(np.unique([x["speaker"] for x in data]))
d = {speaker_ids[i]: i for i in range(len(speaker_ids))}
print("Number of speakers:", len(d))
print("Speaker IDS", d)
return d
def f0_normalize(self, x):
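        """Maps voiced f0 values (>= f0_min) to log space; unvoiced frames stay 0."""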
if self.use_log_f0:
mask = x >= self.f0_min
x[mask] = torch.log(x[mask])
x[~mask] = 0.0
return x
def f0_denormalize(self, x):
if self.use_log_f0:
log_f0_min = np.log(self.f0_min)
mask = x >= log_f0_min
x[mask] = torch.exp(x[mask])
x[~mask] = 0.0
x[x <= 0.0] = 0.0
return x
def energy_avg_normalize(self, x):
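        """Affine rescale (x + 20) / 20 of the average mel energy; assumes the
        log-mel energy lies roughly in [-20, 0], mapping it to about [0, 1]."""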
if self.use_scaled_energy:
x = (x + 20.0) / 20.0
return x
def energy_avg_denormalize(self, x):
if self.use_scaled_energy:
x = x * 20.0 - 20.0
return x
def get_f0_pvoiced(
self,
audio,
sampling_rate=22050,
frame_length=1024,
hop_length=256,
f0_min=100,
f0_max=300,
):
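        """Estimates f0 with librosa's probabilistic YIN (pyin), zeroing
        unvoiced frames. Returns (f0, voiced_mask, p_voiced) as float tensors."""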
audio_norm = audio / self.max_wav_value
        # fmin/fmax/sr passed by keyword: they are keyword-only in librosa >= 0.10
        f0, voiced_mask, p_voiced = pyin(
            audio_norm,
            fmin=f0_min,
            fmax=f0_max,
            sr=sampling_rate,
            frame_length=frame_length,
            win_length=frame_length // 2,
            hop_length=hop_length,
        )
f0[~voiced_mask] = 0.0
f0 = torch.FloatTensor(f0)
p_voiced = torch.FloatTensor(p_voiced)
voiced_mask = torch.FloatTensor(voiced_mask)
return f0, voiced_mask, p_voiced
def get_energy_average(self, mel):
energy_avg = mel.mean(0)
energy_avg = self.energy_avg_normalize(energy_avg)
return energy_avg
def get_mel(self, audio):
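        """Computes a log-mel spectrogram (n_mel_channels x n_frames), with
        optional affine scaling and additive Gaussian noise augmentation."""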
audio_norm = audio / self.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
melspec = self.stft.mel_spectrogram(audio_norm)
melspec = torch.squeeze(melspec, 0)
if self.do_mel_scaling:
melspec = (melspec + 5.5) / 2
if self.mel_noise_scale > 0:
melspec += torch.randn_like(melspec) * self.mel_noise_scale
return melspec
def get_speaker_id(self, speaker):
if self.speaker_map is not None and speaker in self.speaker_map:
speaker = self.speaker_map[speaker]
return torch.LongTensor([self.speaker_ids[speaker]])
def get_text(self, text):
text = self.tp.encode_text(text)
text = torch.LongTensor(text)
return text
def get_attention_prior(self, n_tokens, n_frames):
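        """Returns the (n_frames, n_tokens) attention prior, loading it from
        the LMDB/disk cache when available and computing + saving it otherwise.
        Falls back to an all-ones prior when prior masking is disabled."""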
# cache the entire attn_prior by filename
if self.use_attn_prior_masking:
filename = "{}_{}".format(n_tokens, n_frames)
prior_path = os.path.join(self.betabinom_cache_path, filename)
prior_path += "_prior.pth"
if self.lmdb_cache_path != "":
attn_prior = pkl.loads(
self.cache_data_lmdb.get(prior_path.encode("ascii"))
)
elif os.path.exists(prior_path):
attn_prior = torch.load(prior_path)
else:
attn_prior = beta_binomial_prior_distribution(
n_tokens, n_frames, self.betabinom_scaling_factor
)
torch.save(attn_prior, prior_path)
else:
attn_prior = torch.ones(n_frames, n_tokens) # all ones baseline
return attn_prior
def __getitem__(self, index):
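        """Loads one example (from LMDB or wav), validates the sampling rate,
        and assembles the feature dict consumed by DataCollate."""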
data = self.data[index]
audiopath, text = data["audiopath"], data["text"]
speaker_id = data["speaker"]
if data["lmdb_key"] is not None:
data_dict = pkl.loads(
self.audio_lmdb_dict[data["lmdb_key"]].get(audiopath.encode("ascii"))
)
audio = data_dict["audio"]
sampling_rate = data_dict["sampling_rate"]
else:
audio, sampling_rate = load_wav_to_torch(audiopath)
if sampling_rate != self.sampling_rate:
raise ValueError(
"{} SR doesn't match target {} SR".format(
sampling_rate, self.sampling_rate
)
)
mel = self.get_mel(audio)
f0 = None
p_voiced = None
voiced_mask = None
if self.use_f0:
filename = "_".join(audiopath.split("/")[-3:])
f0_path = os.path.join(self.betabinom_cache_path, filename)
f0_path += "_f0_sr{}_fl{}_hl{}_f0min{}_f0max{}_log{}.pt".format(
self.sampling_rate,
self.filter_length,
self.hop_length,
self.f0_min,
self.f0_max,
self.use_log_f0,
)
dikt = None
if len(self.lmdb_cache_path) > 0:
dikt = pkl.loads(self.cache_data_lmdb.get(f0_path.encode("ascii")))
f0 = dikt["f0"]
p_voiced = dikt["p_voiced"]
voiced_mask = dikt["voiced_mask"]
elif os.path.exists(f0_path):
try:
dikt = torch.load(f0_path)
            except Exception:
print(f"f0 loading from {f0_path} is broken, recomputing.")
if dikt is not None:
f0 = dikt["f0"]
p_voiced = dikt["p_voiced"]
voiced_mask = dikt["voiced_mask"]
else:
f0, voiced_mask, p_voiced = self.get_f0_pvoiced(
audio.cpu().numpy(),
self.sampling_rate,
self.filter_length,
self.hop_length,
self.f0_min,
self.f0_max,
)
print("saving f0 to {}".format(f0_path))
torch.save(
{"f0": f0, "voiced_mask": voiced_mask, "p_voiced": p_voiced},
f0_path,
)
if f0 is None:
raise Exception("STOP, BROKEN F0 {}".format(audiopath))
f0 = self.f0_normalize(f0)
if self.distance_tx_unvoiced:
mask = f0 <= 0.0
            # log-distance from each unvoiced frame to the nearest voiced frame
            distance_map = np.log(distance_transform(mask.numpy()))
            distance_map[distance_map <= 0] = 0.0
            f0 = f0 - torch.from_numpy(distance_map).float()
energy_avg = None
if self.use_energy_avg:
energy_avg = self.get_energy_average(mel)
if self.use_scaled_energy and energy_avg.min() < 0.0:
print(audiopath, "has scaled energy avg smaller than 0")
speaker_id = self.get_speaker_id(speaker_id)
text_encoded = self.get_text(text)
attn_prior = self.get_attention_prior(text_encoded.shape[0], mel.shape[1])
if not self.use_attn_prior_masking:
attn_prior = None
return {
"mel": mel,
"speaker_id": speaker_id,
"text_encoded": text_encoded,
"audiopath": audiopath,
"attn_prior": attn_prior,
"f0": f0,
"p_voiced": p_voiced,
"voiced_mask": voiced_mask,
"energy_avg": energy_avg,
}
def __len__(self):
return len(self.data)
class DataCollate:
"""Zero-pads model inputs and targets given number of steps"""
def __init__(self, n_frames_per_step=1):
self.n_frames_per_step = n_frames_per_step
def __call__(self, batch):
"""Collate from normalized data"""
# Right zero-pad all one-hot text sequences to max input length
input_lengths, ids_sorted_decreasing = torch.sort(
torch.LongTensor([len(x["text_encoded"]) for x in batch]),
dim=0,
descending=True,
)
max_input_len = input_lengths[0]
text_padded = torch.LongTensor(len(batch), max_input_len)
text_padded.zero_()
for i in range(len(ids_sorted_decreasing)):
text = batch[ids_sorted_decreasing[i]]["text_encoded"]
text_padded[i, : text.size(0)] = text
# Right zero-pad mel-spec
num_mel_channels = batch[0]["mel"].size(0)
max_target_len = max([x["mel"].size(1) for x in batch])
# include mel padded, gate padded and speaker ids
mel_padded = torch.FloatTensor(len(batch), num_mel_channels, max_target_len)
mel_padded.zero_()
f0_padded = None
p_voiced_padded = None
voiced_mask_padded = None
energy_avg_padded = None
if batch[0]["f0"] is not None:
f0_padded = torch.FloatTensor(len(batch), max_target_len)
f0_padded.zero_()
if batch[0]["p_voiced"] is not None:
p_voiced_padded = torch.FloatTensor(len(batch), max_target_len)
p_voiced_padded.zero_()
if batch[0]["voiced_mask"] is not None:
voiced_mask_padded = torch.FloatTensor(len(batch), max_target_len)
voiced_mask_padded.zero_()
if batch[0]["energy_avg"] is not None:
energy_avg_padded = torch.FloatTensor(len(batch), max_target_len)
energy_avg_padded.zero_()
attn_prior_padded = torch.FloatTensor(len(batch), max_target_len, max_input_len)
attn_prior_padded.zero_()
output_lengths = torch.LongTensor(len(batch))
speaker_ids = torch.LongTensor(len(batch))
audiopaths = []
for i in range(len(ids_sorted_decreasing)):
mel = batch[ids_sorted_decreasing[i]]["mel"]
mel_padded[i, :, : mel.size(1)] = mel
if batch[ids_sorted_decreasing[i]]["f0"] is not None:
f0 = batch[ids_sorted_decreasing[i]]["f0"]
f0_padded[i, : len(f0)] = f0
if batch[ids_sorted_decreasing[i]]["voiced_mask"] is not None:
voiced_mask = batch[ids_sorted_decreasing[i]]["voiced_mask"]
voiced_mask_padded[i, : len(f0)] = voiced_mask
if batch[ids_sorted_decreasing[i]]["p_voiced"] is not None:
p_voiced = batch[ids_sorted_decreasing[i]]["p_voiced"]
p_voiced_padded[i, : len(f0)] = p_voiced
if batch[ids_sorted_decreasing[i]]["energy_avg"] is not None:
energy_avg = batch[ids_sorted_decreasing[i]]["energy_avg"]
energy_avg_padded[i, : len(energy_avg)] = energy_avg
output_lengths[i] = mel.size(1)
speaker_ids[i] = batch[ids_sorted_decreasing[i]]["speaker_id"]
audiopath = batch[ids_sorted_decreasing[i]]["audiopath"]
audiopaths.append(audiopath)
cur_attn_prior = batch[ids_sorted_decreasing[i]]["attn_prior"]
if cur_attn_prior is None:
attn_prior_padded = None
else:
attn_prior_padded[
i, : cur_attn_prior.size(0), : cur_attn_prior.size(1)
] = cur_attn_prior
return {
"mel": mel_padded,
"speaker_ids": speaker_ids,
"text": text_padded,
"input_lengths": input_lengths,
"output_lengths": output_lengths,
"audiopaths": audiopaths,
"attn_prior": attn_prior_padded,
"f0": f0_padded,
"p_voiced": p_voiced_padded,
"voiced_mask": voiced_mask_padded,
"energy_avg": energy_avg_padded,
}
# ===================================================================
# Smoke test: builds train/val datasets from a config and iterates
# them, exercising feature extraction (and populating the caches)
# ===================================================================
if __name__ == "__main__":
    # Parse the config so this can run standalone (no Sacred experiment framework)
parser = argparse.ArgumentParser()
parser.add_argument("-c", "--config", type=str, help="JSON file for configuration")
parser.add_argument("-p", "--params", nargs="+", default=[])
args = parser.parse_args()
args.rank = 0
    # Parse configs; module-level globals are simpler for this standalone script
with open(args.config) as f:
data = f.read()
config = json.loads(data)
update_params(config, args.params)
print(config)
data_config = config["data_config"]
ignore_keys = ["training_files", "validation_files"]
trainset = Data(
data_config["training_files"],
**dict((k, v) for k, v in data_config.items() if k not in ignore_keys),
)
valset = Data(
data_config["validation_files"],
**dict((k, v) for k, v in data_config.items() if k not in ignore_keys),
speaker_ids=trainset.speaker_ids,
)
collate_fn = DataCollate()
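    # Sketch of how the collate function would typically be wired into a
    # torch DataLoader (batch_size/num_workers below are illustrative,
    # not values taken from the config):
    # train_loader = torch.utils.data.DataLoader(
    #     trainset, batch_size=8, collate_fn=collate_fn,
    #     shuffle=True, num_workers=4, drop_last=True,
    # )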
for dataset in (trainset, valset):
for i, batch in enumerate(dataset):
out = batch
print("{}/{}".format(i, len(dataset)))