# emova_speech_tokenizer_hf / configuration_emova_speech_tokenizer.py
# Provenance: uploaded by KaiChen1998 ("Upload tokenizer", commit 02e65c1, verified).
# coding=utf-8
# Copyright 2024 The EMOVA team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" EMOVASpeechTokenizer model configuration """
import copy
from typing import List
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
U2S_STYLES = [
'gender-female_emotion-angry_speed-fast_pitch-high', 'gender-female_emotion-angry_speed-fast_pitch-low', 'gender-female_emotion-angry_speed-fast_pitch-normal',
'gender-female_emotion-angry_speed-normal_pitch-high', 'gender-female_emotion-angry_speed-normal_pitch-low', 'gender-female_emotion-angry_speed-normal_pitch-normal',
'gender-female_emotion-angry_speed-slow_pitch-high', 'gender-female_emotion-angry_speed-slow_pitch-low', 'gender-female_emotion-angry_speed-slow_pitch-normal',
'gender-female_emotion-disgusted_speed-fast_pitch-high', 'gender-female_emotion-disgusted_speed-fast_pitch-low', 'gender-female_emotion-disgusted_speed-fast_pitch-normal',
'gender-female_emotion-disgusted_speed-normal_pitch-high', 'gender-female_emotion-disgusted_speed-normal_pitch-low', 'gender-female_emotion-disgusted_speed-normal_pitch-normal',
'gender-female_emotion-disgusted_speed-slow_pitch-high', 'gender-female_emotion-disgusted_speed-slow_pitch-low', 'gender-female_emotion-disgusted_speed-slow_pitch-normal',
'gender-female_emotion-fearful_speed-fast_pitch-high', 'gender-female_emotion-fearful_speed-fast_pitch-low', 'gender-female_emotion-fearful_speed-fast_pitch-normal',
'gender-female_emotion-fearful_speed-normal_pitch-high', 'gender-female_emotion-fearful_speed-normal_pitch-low', 'gender-female_emotion-fearful_speed-normal_pitch-normal',
'gender-female_emotion-fearful_speed-slow_pitch-high', 'gender-female_emotion-fearful_speed-slow_pitch-low', 'gender-female_emotion-fearful_speed-slow_pitch-normal',
'gender-female_emotion-happy_speed-fast_pitch-high', 'gender-female_emotion-happy_speed-fast_pitch-low', 'gender-female_emotion-happy_speed-fast_pitch-normal',
'gender-female_emotion-happy_speed-normal_pitch-high', 'gender-female_emotion-happy_speed-normal_pitch-low', 'gender-female_emotion-happy_speed-normal_pitch-normal',
'gender-female_emotion-happy_speed-slow_pitch-high', 'gender-female_emotion-happy_speed-slow_pitch-low', 'gender-female_emotion-happy_speed-slow_pitch-normal',
'gender-female_emotion-neutral_speed-fast_pitch-high', 'gender-female_emotion-neutral_speed-fast_pitch-low', 'gender-female_emotion-neutral_speed-fast_pitch-normal',
'gender-female_emotion-neutral_speed-normal_pitch-high', 'gender-female_emotion-neutral_speed-normal_pitch-low', 'gender-female_emotion-neutral_speed-normal_pitch-normal',
'gender-female_emotion-neutral_speed-slow_pitch-high', 'gender-female_emotion-neutral_speed-slow_pitch-low', 'gender-female_emotion-neutral_speed-slow_pitch-normal',
'gender-female_emotion-sad_speed-fast_pitch-high', 'gender-female_emotion-sad_speed-fast_pitch-low', 'gender-female_emotion-sad_speed-fast_pitch-normal',
'gender-female_emotion-sad_speed-normal_pitch-high', 'gender-female_emotion-sad_speed-normal_pitch-low', 'gender-female_emotion-sad_speed-normal_pitch-normal',
'gender-female_emotion-sad_speed-slow_pitch-high', 'gender-female_emotion-sad_speed-slow_pitch-low', 'gender-female_emotion-sad_speed-slow_pitch-normal',
'gender-female_emotion-surprised_speed-fast_pitch-high', 'gender-female_emotion-surprised_speed-fast_pitch-low', 'gender-female_emotion-surprised_speed-fast_pitch-normal',
'gender-female_emotion-surprised_speed-normal_pitch-high', 'gender-female_emotion-surprised_speed-normal_pitch-low', 'gender-female_emotion-surprised_speed-normal_pitch-normal',
'gender-female_emotion-surprised_speed-slow_pitch-high', 'gender-female_emotion-surprised_speed-slow_pitch-low', 'gender-female_emotion-surprised_speed-slow_pitch-normal',
'gender-male_emotion-angry_speed-fast_pitch-high', 'gender-male_emotion-angry_speed-fast_pitch-low', 'gender-male_emotion-angry_speed-fast_pitch-normal',
'gender-male_emotion-angry_speed-normal_pitch-high', 'gender-male_emotion-angry_speed-normal_pitch-low', 'gender-male_emotion-angry_speed-normal_pitch-normal',
'gender-male_emotion-angry_speed-slow_pitch-high', 'gender-male_emotion-angry_speed-slow_pitch-low', 'gender-male_emotion-angry_speed-slow_pitch-normal',
'gender-male_emotion-disgusted_speed-fast_pitch-high', 'gender-male_emotion-disgusted_speed-fast_pitch-low', 'gender-male_emotion-disgusted_speed-fast_pitch-normal',
'gender-male_emotion-disgusted_speed-normal_pitch-high', 'gender-male_emotion-disgusted_speed-normal_pitch-low', 'gender-male_emotion-disgusted_speed-normal_pitch-normal',
'gender-male_emotion-disgusted_speed-slow_pitch-high', 'gender-male_emotion-disgusted_speed-slow_pitch-low', 'gender-male_emotion-disgusted_speed-slow_pitch-normal',
'gender-male_emotion-fearful_speed-fast_pitch-high', 'gender-male_emotion-fearful_speed-fast_pitch-low', 'gender-male_emotion-fearful_speed-fast_pitch-normal',
'gender-male_emotion-fearful_speed-normal_pitch-high', 'gender-male_emotion-fearful_speed-normal_pitch-low', 'gender-male_emotion-fearful_speed-normal_pitch-normal',
'gender-male_emotion-fearful_speed-slow_pitch-high', 'gender-male_emotion-fearful_speed-slow_pitch-low', 'gender-male_emotion-fearful_speed-slow_pitch-normal',
'gender-male_emotion-happy_speed-fast_pitch-high', 'gender-male_emotion-happy_speed-fast_pitch-low', 'gender-male_emotion-happy_speed-fast_pitch-normal',
'gender-male_emotion-happy_speed-normal_pitch-high', 'gender-male_emotion-happy_speed-normal_pitch-low', 'gender-male_emotion-happy_speed-normal_pitch-normal',
'gender-male_emotion-happy_speed-slow_pitch-high', 'gender-male_emotion-happy_speed-slow_pitch-low', 'gender-male_emotion-happy_speed-slow_pitch-normal',
'gender-male_emotion-neutral_speed-fast_pitch-high', 'gender-male_emotion-neutral_speed-fast_pitch-low', 'gender-male_emotion-neutral_speed-fast_pitch-normal',
'gender-male_emotion-neutral_speed-normal_pitch-high', 'gender-male_emotion-neutral_speed-normal_pitch-low', 'gender-male_emotion-neutral_speed-normal_pitch-normal',
'gender-male_emotion-neutral_speed-slow_pitch-high', 'gender-male_emotion-neutral_speed-slow_pitch-low', 'gender-male_emotion-neutral_speed-slow_pitch-normal',
'gender-male_emotion-sad_speed-fast_pitch-high', 'gender-male_emotion-sad_speed-fast_pitch-low', 'gender-male_emotion-sad_speed-fast_pitch-normal',
'gender-male_emotion-sad_speed-normal_pitch-high', 'gender-male_emotion-sad_speed-normal_pitch-low', 'gender-male_emotion-sad_speed-normal_pitch-normal',
'gender-male_emotion-sad_speed-slow_pitch-high', 'gender-male_emotion-sad_speed-slow_pitch-low', 'gender-male_emotion-sad_speed-slow_pitch-normal',
'gender-male_emotion-surprised_speed-fast_pitch-high', 'gender-male_emotion-surprised_speed-fast_pitch-low', 'gender-male_emotion-surprised_speed-fast_pitch-normal',
'gender-male_emotion-surprised_speed-normal_pitch-high', 'gender-male_emotion-surprised_speed-normal_pitch-low', 'gender-male_emotion-surprised_speed-normal_pitch-normal',
'gender-male_emotion-surprised_speed-slow_pitch-high', 'gender-male_emotion-surprised_speed-slow_pitch-low', 'gender-male_emotion-surprised_speed-slow_pitch-normal'
]
class EMOVASpeechTokenizerConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`EMOVASpeechTokenizer`]. It is used to instantiate
    a EMOVASpeechTokenizer model especially designed for training the EMOVA (https://arxiv.org/abs/2409.18042)
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a configuration to the speech tokenizer model presented in EMOVA paper.
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        s2u_unit_type (`str`, defaults to `40ms_multilingual_8888`):
            Unit type to specify model configurations for the speech-to-unit (S2U) encoder. Detailed configs will be found accordingly.
        u2s_unit_type (`str`, defaults to `40ms_multilingual_8888_xujing_cosyvoice_FT`):
            Unit type to specify model configurations for the unit-to-speech (U2S) decoder. Detailed configs will be found accordingly.
        u2s_num_styles (`int`, defaults to 126):
            Number of rows in the style embedding matrix. Expected to match ``len(U2S_STYLES)``.
        u2s_dim_styles (`int`, defaults to 256):
            Dimension of each style embedding vector.
    ```python
    >>> from transformers import EMOVASpeechTokenizerConfig, EMOVASpeechTokenizer
    >>> # Initializing a EMOVA speech tokenizer configuration
    >>> configuration = EMOVASpeechTokenizerConfig()
    >>> # Initializing a model from the EMOVA speech tokenizer configuration
    >>> model = EMOVASpeechTokenizer(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
    model_type = "EMOVASpeechTokenizer"

    def __init__(
        self,
        s2u_unit_type="40ms_multilingual_8888",
        u2s_unit_type="40ms_multilingual_8888_xujing_cosyvoice_FT",
        u2s_num_styles=126,
        u2s_dim_styles=256,
        **kwargs,
    ):
        super().__init__(**kwargs)
        # Unit-type identifiers used downstream to select the concrete S2U
        # encoder / U2S decoder configurations.
        self.s2u_unit_type = s2u_unit_type
        self.u2s_unit_type = u2s_unit_type
        # Style embedding matrix shape: (u2s_num_styles, u2s_dim_styles).
        self.u2s_num_styles = u2s_num_styles
        self.u2s_dim_styles = u2s_dim_styles
        # The style -> index mapping is always built from the canonical
        # U2S_STYLES list; warn if the declared embedding-row count diverges
        # from it, since indices produced here would then be out of range
        # (or leave unused rows) in the style embedding matrix.
        if u2s_num_styles != len(U2S_STYLES):
            logger.warning(
                "u2s_num_styles (%d) does not match len(U2S_STYLES) (%d); "
                "style indices may be inconsistent with the style embedding matrix.",
                u2s_num_styles,
                len(U2S_STYLES),
            )
        self.u2s_style2idx = {style: idx for idx, style in enumerate(U2S_STYLES)}