# coding=utf-8
# Copyright 2024 The EMOVA team and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" EMOVASpeechTokenizer model configuration """

import copy
import itertools
from typing import List

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)

# Values of the four attributes that make up a U2S style name. The order of
# the values inside each tuple, and the nesting order of the tuples (gender
# outermost, pitch innermost), determine the position of every style in
# `U2S_STYLES`, and that position is the index stored in `u2s_style2idx`.
# Reordering anything here changes the style -> index mapping.
_U2S_GENDERS = ("female", "male")
_U2S_EMOTIONS = ("angry", "disgusted", "fearful", "happy", "neutral", "sad", "surprised")
_U2S_SPEEDS = ("fast", "normal", "slow")
_U2S_PITCHES = ("high", "low", "normal")

# All 2 * 7 * 3 * 3 = 126 style names, e.g.
# 'gender-female_emotion-angry_speed-fast_pitch-high'.
U2S_STYLES: List[str] = [
    f"gender-{gender}_emotion-{emotion}_speed-{speed}_pitch-{pitch}"
    for gender, emotion, speed, pitch in itertools.product(
        _U2S_GENDERS, _U2S_EMOTIONS, _U2S_SPEEDS, _U2S_PITCHES
    )
]


class EMOVASpeechTokenizerConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`EMOVASpeechTokenizer`]. It is used to instantiate
    a EMOVASpeechTokenizer model especially designed for training the EMOVA (https://arxiv.org/abs/2409.18042) model
    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a configuration to the speech tokenizer model presented in EMOVA paper.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        s2u_unit_type (`str`, defaults to `40ms_multilingual_8888`):
            Unit type to specify model configurations for the speech-to-unit (S2U) encoder. Detailed configs will be
            found accordingly.
        u2s_unit_type (`str`, defaults to `40ms_multilingual_8888_xujing_cosyvoice_FT`):
            Unit type to specify model configurations for the unit-to-speech (U2S) decoder. Detailed configs will be
            found accordingly.
        u2s_num_styles, u2s_dim_styles (`int`, defaults to 126 and 256):
            Size of the style embedding matrix.

    Besides the arguments above, every instance also carries `u2s_style2idx`, a `dict` mapping each style name in
    `U2S_STYLES` to its position in that list. It is always rebuilt from `U2S_STYLES` and cannot be set through the
    constructor.

    ```python
    >>> from transformers import EMOVASpeechTokenizerConfig, EMOVASpeechTokenizer

    >>> # Initializing a EMOVA speech tokenizer configuration
    >>> configuration = EMOVASpeechTokenizerConfig()

    >>> # Initializing a model from the EMOVA speech tokenizer configuration
    >>> model = EMOVASpeechTokenizer(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "EMOVASpeechTokenizer"

    def __init__(
        self,
        s2u_unit_type: str = "40ms_multilingual_8888",
        u2s_unit_type: str = "40ms_multilingual_8888_xujing_cosyvoice_FT",
        u2s_num_styles: int = 126,
        u2s_dim_styles: int = 256,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)

        self.s2u_unit_type = s2u_unit_type
        self.u2s_unit_type = u2s_unit_type
        self.u2s_num_styles = u2s_num_styles
        self.u2s_dim_styles = u2s_dim_styles

        # Assigned after `super().__init__` so the mapping always reflects
        # `U2S_STYLES`, even if a `u2s_style2idx` entry arrived through
        # `kwargs`.
        self.u2s_style2idx = {style: index for index, style in enumerate(U2S_STYLES)}

        # NOTE(review): presumably the style embedding has `u2s_num_styles`
        # rows, in which case a smaller value leaves some indices of
        # `u2s_style2idx` without a row -- confirm against the model code.
        # Only a warning, so existing callers keep working.
        if u2s_num_styles < len(self.u2s_style2idx):
            logger.warning(
                "u2s_num_styles (%d) is smaller than the number of known U2S styles (%d).",
                u2s_num_styles,
                len(self.u2s_style2idx),
            )