---
# Universal Metrics Configuration for Versa
# This file contains the configuration for various universal metrics used in
# speech quality assessment.
#
# The file is a single YAML sequence: each `- name: <metric>` entry enables one
# metric (or metric family), and the remaining keys of that entry are the
# options given for it. The `# -- key: description` lines above an entry list
# the score keys that entry is documented to produce.

# visqol metric
# -- visqol: Virtual Speech Quality Objective Listener (ViSQOL) score
- name: visqol
  model: default

# Language identification with ESPnet-OWSM model
# More model_tag can be from the ESPnet huggingface https://huggingface.co/espnet .
# The default model is `espnet/owsm_v3.1_ebf`.
# -- lid: the nbest language tag
- name: lid
  model_tag: default
  nbest: 1

# nomad (reference-based) metric
# -- nomad: nomad reference-based model
- name: nomad
  model_cache: versa_cache/nomad_pt-models

# srmr related metrics
# -- srmr: speech-to-reverberation modulation energy ratio
- name: srmr
  n_cochlear_filters: 23
  low_freq: 125
  min_cf: 4
  max_cf: 128
  fast: true
  norm: false

# Emotion similarity calculated based on emo2vec
# -- emo2vec_similarity: the emotion similarity with emo2vec
- name: emo2vec_similarity

# noresqa related metrics
# -- noresqa: non-matching reference based speech quality assessment
- name: noresqa
  metric_type: 1  # 0: NORESQA-score, 1: NORESQA-MOS

# pysepm related metrics
# -- pysepm_fwsegsnr: frequency-weighted segmental SNR
# -- pysepm_llr: log likelihood ratio
# -- pysepm_wss: weighted spectral slope
# -- pysepm_cd: cepstral distance objective speech quality measure
# -- pysepm_Csig, pysepm_Cbak, pysepm_Covl: composite objective speech quality
# -- pysepm_csii_high, pysepm_csii_mid, pysepm_csii_low: coherence and speech
#    intelligibility index
# -- pysepm_ncm: normalized-covariance measure
- name: pysepm

# nisqa score for speech quality assessment
# -- nisqa_mos_pred: NISQA MOS prediction
# -- nisqa_noi_pred: NISQA noise prediction
# -- nisqa_dis_pred: NISQA distortion prediction
# -- nisqa_col_pred: NISQA color prediction
# -- nisqa_loud_pred: NISQA loudness prediction
# NOTE(jiatong): pretrain model can be downloaded with `./tools/setup_nisqa.sh`
- name: nisqa
  nisqa_model_path: ./tools/NISQA/weights/nisqa.tar

# discrete speech metrics
# -- speech_bert: speech bert score
# -- speech_bleu: speech bleu score
# -- speech_token_distance: speech token distance score
- name: discrete_speech

# mcd f0 related metrics
# -- mcd: mel cepstral distortion
# -- f0_corr: f0 correlation
# -- f0_rmse: f0 root mean square error
- name: mcd_f0
  f0min: 40
  f0max: 800
  mcep_shift: 5
  mcep_fftl: 1024
  mcep_dim: 39
  mcep_alpha: 0.466
  seq_mismatch_tolerance: 0.1
  power_threshold: -20
  dtw: false

# An overall model on MOS-bench from Sheet toolkit
# -- sheet_ssqa: the mos prediction from sheet_ssqa
- name: sheet_ssqa

# pesq related metrics
# -- pesq: perceptual evaluation of speech quality
- name: pesq

# stoi related metrics
# -- stoi: short-time objective intelligibility
- name: stoi

# pseudo subjective metrics
# -- utmos: UT-MOS score
# -- dnsmos: DNS-MOS score
# -- plcmos: PLC-MOS score
# -- aecmos: AEC-MOS score
# NOTE: `predictor_types` below enables utmos, dnsmos, plcmos, singmos and
# utmosv2; aecmos is documented above but is not enabled in this list.
# Every enabled predictor is configured with a 16 kHz `fs`.
- name: pseudo_mos
  predictor_types: ["utmos", "dnsmos", "plcmos", "singmos", "utmosv2"]
  predictor_args:
    utmos:
      fs: 16000
    dnsmos:
      fs: 16000
    plcmos:
      fs: 16000
    singmos:
      fs: 16000
    utmosv2:
      fs: 16000

# Word error rate with OpenAI-Whisper model
# -- whisper_wer: word error rate of openai-whisper
- name: whisper_wer
  model_tag: default
  beam_size: 1
  text_cleaner: whisper_basic

# scoreq (reference-based) metric
# -- scoreq_ref: scoreq reference-based model
# `data_domain` must match the value used by scoreq_nr below (`natural`).
- name: scoreq_ref
  data_domain: natural
  model_cache: versa_cache/scoreq_pt-models

# scoreq (non-reference-based) metric
# -- scoreq_nr: scoreq non-reference-based model
- name: scoreq_nr
  data_domain: natural
  model_cache: versa_cache/scoreq_pt-models

# Speech Enhancement-based Metrics
# model tag can be any ESPnet-SE huggingface repo
# -- se_si_snr: the SI-SNR from a reference speech enhancement model
- name: se_snr
  model_tag: default

# PAM: Prompting Audio-Language Models for Audio Quality Assessment
# https://github.com/soham97/PAM/tree/main
- name: pam
  repro: true
  cache_dir: versa_cache/pam
  io: soundfile
  # TEXT ENCODER CONFIG
  text_model: 'gpt2'
  text_len: 77
  transformer_embed_dim: 768
  freeze_text_encoder_weights: true
  # AUDIO ENCODER CONFIG
  audioenc_name: 'HTSAT'
  out_emb: 768
  sampling_rate: 44100
  duration: 7
  fmin: 50
  fmax: 8000  # 14000
  n_fft: 1024  # 1028
  hop_size: 320
  mel_bins: 64
  window_size: 1024
  # PROJECTION SPACE CONFIG
  d_proj: 1024
  temperature: 0.003
  # TRAINING AND EVALUATION CONFIG
  num_classes: 527
  batch_size: 1024
  demo: false

# Speaking rate calculation
# -- speaking_rate: correct matching words/character counts
- name: speaking_rate
  model_tag: default
  beam_size: 1
  text_cleaner: whisper_basic

# Audiobox Aesthetics (Unified automatic quality assessment for speech, music,
# and sound.)
- name: audiobox_aesthetics
  batch_size: 1
  cache_dir: versa_cache/audiobox

# ASR-match calculation
# -- asr_match_error_rate: correct matching words/character counts
- name: asr_match
  model_tag: default
  beam_size: 1
  text_cleaner: whisper_basic

# speaker related metrics
# -- spk_similarity: speaker cosine similarity
- name: speaker
  model_tag: default

# asvspoof related metrics
# -- asvspoof_score: evaluate how likely the generated speech is to be
#    classified as spoofed by a deepfake classifier
- name: asvspoof_score

# signal related metrics
# -- sir: signal to interference ratio
# -- sar: signal to artifact ratio
# -- sdr: signal to distortion ratio
# -- ci-sdr: convolutive transfer function invariant signal to distortion ratio
# -- si-snri: scale-invariant signal to noise ratio improvement
- name: signal_metric