tanettech committed
Commit 0907799 · 1 Parent(s): 8616c67

Add Gradio app files
.gitignore ADDED
@@ -0,0 +1,3 @@
+ venv_py311/
+ __pycache__/
+ *.pyc
README.md CHANGED
@@ -1,14 +1,12 @@
 ---
- title: Elvis Voice Assistant Demo
- emoji: 💬
- colorFrom: yellow
- colorTo: purple
+ title: Voice Assistant Demo
+ emoji: 📊
+ colorFrom: blue
+ colorTo: pink
  sdk: gradio
- sdk_version: 5.0.1
+ sdk_version: 4.43.0
  app_file: app.py
  pinned: false
- license: mit
- short_description: Espnet Recipe
  ---
 
- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,64 +1,986 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
 
4
- """
5
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
6
- """
7
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
8
 
 
 
 
9
 
10
- def respond(
11
- message,
12
- history: list[tuple[str, str]],
13
- system_message,
14
- max_tokens,
15
- temperature,
16
- top_p,
17
  ):
18
- messages = [{"role": "system", "content": system_message}]
19
-
20
- for val in history:
21
- if val[0]:
22
- messages.append({"role": "user", "content": val[0]})
23
- if val[1]:
24
- messages.append({"role": "assistant", "content": val[1]})
25
-
26
- messages.append({"role": "user", "content": message})
27
-
28
- response = ""
29
-
30
- for message in client.chat_completion(
31
- messages,
32
- max_tokens=max_tokens,
33
- stream=True,
34
- temperature=temperature,
35
- top_p=top_p,
36
- ):
37
- token = message.choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
-
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
- demo = gr.ChatInterface(
47
- respond,
48
- additional_inputs=[
49
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
50
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
51
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
52
- gr.Slider(
53
- minimum=0.1,
54
- maximum=1.0,
55
- value=0.95,
56
- step=0.05,
57
- label="Top-p (nucleus sampling)",
58
- ),
59
- ],
60
  )
61
62
 
63
- if __name__ == "__main__":
64
- demo.launch()
 
1
+ try:
2
+ import versa
3
+ except ImportError:
4
+ from subprocess import call
5
+ with open('versa.sh', 'rb') as file:
6
+ script = file.read()
7
+ rc = call(script, shell=True)
8
+
9
+ import os
10
+ import shutil
11
+ import time
12
+ from typing import Generator, Optional, Tuple
13
+
14
  import gradio as gr
15
+ import nltk
16
+ import numpy as np
17
+ import torch
18
+ from huggingface_hub import HfApi
19
+ from pyscripts.utils.dialog_eval.ASR_WER import handle_espnet_ASR_WER
20
+ from pyscripts.utils.dialog_eval.human_feedback import (
21
+ natural_vote1_last_response,
22
+ natural_vote2_last_response,
23
+ natural_vote3_last_response,
24
+ natural_vote4_last_response,
25
+ relevant_vote1_last_response,
26
+ relevant_vote2_last_response,
27
+ relevant_vote3_last_response,
28
+ relevant_vote4_last_response,
29
+ )
30
+ from pyscripts.utils.dialog_eval.LLM_Metrics import (
31
+ DialoGPT_perplexity,
32
+ bert_score,
33
+ perplexity,
34
+ vert,
35
+ )
36
+ from pyscripts.utils.dialog_eval.TTS_intelligibility import (
37
+ handle_espnet_TTS_intelligibility,
38
+ )
39
+ from pyscripts.utils.dialog_eval.TTS_speech_quality import TTS_psuedomos
40
+
41
+ from espnet2.sds.espnet_model import ESPnetSDSModelInterface
42
+
43
+ # ------------------------
44
+ # Hyperparameters
45
+ # ------------------------
46
+
47
+ access_token = os.environ.get("HF_TOKEN")
48
+ ASR_name="pyf98/owsm_ctc_v3.1_1B"
49
+ LLM_name="meta-llama/Llama-3.2-1B-Instruct"
50
+ TTS_name="kan-bayashi/ljspeech_vits"
51
+ ASR_options="pyf98/owsm_ctc_v3.1_1B,espnet/owsm_ctc_v3.2_ft_1B,espnet/owsm_v3.1_ebf,librispeech_asr,whisper-large".split(",")
52
+ LLM_options="meta-llama/Llama-3.2-1B-Instruct,HuggingFaceTB/SmolLM2-1.7B-Instruct".split(",")
53
+ TTS_options="kan-bayashi/ljspeech_vits,kan-bayashi/libritts_xvector_vits,kan-bayashi/vctk_multi_spk_vits,ChatTTS".split(",")
54
+ Eval_options="Latency,TTS Intelligibility,TTS Speech Quality,ASR WER,Text Dialog Metrics".split(",")
55
+ upload_to_hub=None
56
+ dialogue_model = ESPnetSDSModelInterface(
57
+ ASR_name, LLM_name, TTS_name, "Cascaded", access_token
58
+ )
59
+ ASR_curr_name=None
60
+ LLM_curr_name=None
61
+ TTS_curr_name=None
62
+
63
+ latency_ASR = 0.0
64
+ latency_LM = 0.0
65
+ latency_TTS = 0.0
66
 
67
+ text_str = ""
68
+ asr_output_str = ""
69
+ vad_output = None
70
+ audio_output = None
71
+ audio_output1 = None
72
+ LLM_response_arr = []
73
+ total_response_arr = []
74
+ callback = gr.CSVLogger()
75
+ start_record_time = None
76
+ enable_btn = gr.Button(interactive=True, visible=True)
77
 
78
+ # ------------------------
79
+ # Function Definitions
80
+ # ------------------------
81
 
82
+ def handle_eval_selection(
83
+ option: str,
84
+ TTS_audio_output: str,
85
+ LLM_Output: str,
86
+ ASR_audio_output: str,
87
+ ASR_transcript: str,
 
88
  ):
89
+ """
90
+ Handles the evaluation of a selected metric based on
91
+ user input and provided outputs.
92
+
93
+ This function evaluates different aspects of a
94
+ cascaded conversational AI pipeline, such as:
95
+ Latency, TTS intelligibility, TTS speech quality,
96
+ ASR WER, and text dialog metrics.
97
+ It is designed to integrate with Gradio via
98
+ multiple yield statements,
99
+ allowing updates to be displayed in real time.
100
+
101
+ Parameters:
102
+ ----------
103
+ option : str
104
+ The evaluation metric selected by the user.
105
+ Supported options include:
106
+ - "Latency"
107
+ - "TTS Intelligibility"
108
+ - "TTS Speech Quality"
109
+ - "ASR WER"
110
+ - "Text Dialog Metrics"
111
+ TTS_audio_output : np.ndarray
112
+ The audio output generated by the TTS module for evaluation.
113
+ LLM_Output : str
114
+ The text output generated by the LLM module for evaluation.
115
+ ASR_audio_output : np.ndarray
116
+ The audio input/output used for ASR evaluation.
117
+ ASR_transcript : str
118
+ The transcript generated by the ASR module for evaluation.
119
+
120
+ Returns:
121
+ -------
122
+ str
123
+ A string representation of the evaluation results.
124
+ The specific result depends on the selected evaluation metric:
125
+ - "Latency": Latencies of ASR, LLM, and TTS modules.
126
+ - "TTS Intelligibility": A range of scores indicating how intelligible
127
+ the TTS audio output is based on different reference ASR models.
128
+ - "TTS Speech Quality": A range of scores representing the
129
+ speech quality of the TTS audio output.
130
+ - "ASR WER": The Word Error Rate (WER) of the ASR output
131
+ based on different judge ASR models.
132
+ - "Text Dialog Metrics": A combination of perplexity,
133
+ diversity metrics, and relevance scores for the dialog.
134
+
135
+ Raises:
136
+ ------
137
+ ValueError
138
+ If the `option` parameter does not match any supported evaluation metric.
139
+
140
+ Example:
141
+ -------
142
+ >>> result = handle_eval_selection(
143
+ option="Latency",
144
+ TTS_audio_output=audio_array,
145
+ LLM_Output="Generated response",
146
+ ASR_audio_output=audio_input,
147
+ ASR_transcript="Expected transcript"
148
+ )
149
+ >>> print(result)
150
+ "ASR Latency: 0.14
151
+ LLM Latency: 0.42
152
+ TTS Latency: 0.21"
153
+ """
154
+ global LLM_response_arr
155
+ global total_response_arr
156
+ yield (option, gr.Textbox(visible=True))
157
+ if option == "Latency":
158
+ text = (
159
+ f"ASR Latency: {latency_ASR:.2f}\n"
160
+ f"LLM Latency: {latency_LM:.2f}\n"
161
+ f"TTS Latency: {latency_TTS:.2f}"
162
+ )
163
+ yield (None, text)
164
+ elif option == "TTS Intelligibility":
165
+ yield (None, handle_espnet_TTS_intelligibility(TTS_audio_output, LLM_Output))
166
+ elif option == "TTS Speech Quality":
167
+ yield (None, TTS_psuedomos(TTS_audio_output))
168
+ elif option == "ASR WER":
169
+ yield (None, handle_espnet_ASR_WER(ASR_audio_output, ASR_transcript))
170
+ elif option == "Text Dialog Metrics":
171
+ yield (
172
+ None,
173
+ perplexity(LLM_Output.replace("\n", " "))
174
+ + vert(LLM_response_arr)
175
+ + bert_score(total_response_arr)
176
+ + DialoGPT_perplexity(
177
+ ASR_transcript.replace("\n", " "), LLM_Output.replace("\n", " ")
178
+ ),
179
+ )
180
+ elif option is None:
181
+ return
182
+ else:
183
+ raise ValueError(f"Unknown option: {option}")
184
+
185
+
186
+ def handle_eval_selection_E2E(
187
+ option: str,
188
+ TTS_audio_output: str,
189
+ LLM_Output: str,
190
+ ):
191
+ """
192
+ Handles the evaluation of a selected metric based on user input
193
+ and provided outputs.
194
+
195
+ This function evaluates different aspects of an E2E
196
+ conversational AI model, such as:
197
+ Latency, TTS intelligibility, TTS speech quality, and
198
+ text dialog metrics.
199
+ It is designed to integrate with Gradio via
200
+ multiple yield statements,
201
+ allowing updates to be displayed in real time.
202
+
203
+ Parameters:
204
+ ----------
205
+ option : str
206
+ The evaluation metric selected by the user.
207
+ Supported options include:
208
+ - "Latency"
209
+ - "TTS Intelligibility"
210
+ - "TTS Speech Quality"
211
+ - "Text Dialog Metrics"
212
+ TTS_audio_output : np.ndarray
213
+ The audio output generated by the TTS module for evaluation.
214
+ LLM_Output : str
215
+ The text output generated by the LLM module for evaluation.
216
+
217
+ Returns:
218
+ -------
219
+ str
220
+ A string representation of the evaluation results.
221
+ The specific result depends on the selected evaluation metric:
222
+ - "Latency": Latency of the entire system.
223
+ - "TTS Intelligibility": A range of scores indicating how intelligible the
224
+ TTS audio output is based on different reference ASR models.
225
+ - "TTS Speech Quality": A range of scores representing the
226
+ speech quality of the TTS audio output.
227
+ - "Text Dialog Metrics": A combination of perplexity and
228
+ diversity metrics for the dialog.
229
+
230
+ Raises:
231
+ ------
232
+ ValueError
233
+ If the `option` parameter does not match any supported evaluation metric.
234
+
235
+ Example:
236
+ -------
237
+ >>> result = handle_eval_selection(
238
+ option="Latency",
239
+ TTS_audio_output=audio_array,
240
+ LLM_Output="Generated response",
241
+ )
242
+ >>> print(result)
243
+ "Total Latency: 2.34"
244
+ """
245
+ global LLM_response_arr
246
+ global total_response_arr
247
+ yield (option, gr.Textbox(visible=True))
248
+ if option == "Latency":
249
+ text = f"Total Latency: {latency_TTS:.2f}"
250
+ yield (None, text)
251
+ elif option == "TTS Intelligibility":
252
+ yield (None, handle_espnet_TTS_intelligibility(TTS_audio_output, LLM_Output))
253
+ elif option == "TTS Speech Quality":
254
+ yield (None, TTS_psuedomos(TTS_audio_output))
255
+ elif option == "Text Dialog Metrics":
256
+ yield (None, perplexity(LLM_Output.replace("\n", " ")) + vert(LLM_response_arr))
257
+ elif option is None:
258
+ return
259
+ else:
260
+ raise ValueError(f"Unknown option: {option}")
261
+
262
+
263
+ def start_warmup():
264
+ """
265
+ Initializes and warms up the dialogue and evaluation model.
266
+
267
+ This function is designed to ensure that all
268
+ components of the dialogue model are pre-loaded
269
+ and ready for execution, avoiding delays during runtime.
270
+ """
271
+ global dialogue_model
272
+ global ASR_options
273
+ global LLM_options
274
+ global TTS_options
275
+ global ASR_name
276
+ global LLM_name
277
+ global TTS_name
278
+ remove=0
279
+ for opt_count in range(len(ASR_options)):
280
+ opt_count-=remove
281
+ if opt_count>=len(ASR_options):
282
+ break
283
+ print(opt_count)
284
+ print(ASR_options)
285
+ opt = ASR_options[opt_count]
286
+ try:
287
+ for _ in dialogue_model.handle_ASR_selection(opt):
288
+ continue
289
+ except Exception:
290
+ print("Removing " + opt + " from ASR options since it cannot be loaded.")
291
+ ASR_options = ASR_options[:opt_count] + ASR_options[(opt_count + 1) :]
292
+ remove+=1
293
+ if opt == ASR_name:
294
+ ASR_name = ASR_options[0]
295
+ for opt_count in range(len(LLM_options)):
296
+ opt = LLM_options[opt_count]
297
+ try:
298
+ for _ in dialogue_model.handle_LLM_selection(opt):
299
+ continue
300
+ except Exception:
301
+ print("Removing " + opt + " from LLM options since it cannot be loaded.")
302
+ LLM_options = LLM_options[:opt_count] + LLM_options[(opt_count + 1) :]
303
+ if opt == LLM_name:
304
+ LLM_name = LLM_options[0]
305
+ for opt_count in range(len(TTS_options)):
306
+ opt = TTS_options[opt_count]
307
+ try:
308
+ for _ in dialogue_model.handle_TTS_selection(opt):
309
+ continue
310
+ except Exception:
311
+ print("Removing " + opt + " from TTS options since it cannot be loaded.")
312
+ TTS_options = TTS_options[:opt_count] + TTS_options[(opt_count + 1) :]
313
+ if opt == TTS_name:
314
+ TTS_name = TTS_options[0]
315
+ dialogue_model.handle_E2E_selection()
316
+ dialogue_model.client = None
317
+ for _ in dialogue_model.handle_TTS_selection(TTS_name):
318
+ continue
319
+ for _ in dialogue_model.handle_ASR_selection(ASR_name):
320
+ continue
321
+ for _ in dialogue_model.handle_LLM_selection(LLM_name):
322
+ continue
323
+ dummy_input = (
324
+ torch.randn(
325
+ (3000),
326
+ dtype=getattr(torch, "float16"),
327
+ device="cpu",
328
+ )
329
+ .cpu()
330
+ .numpy()
331
+ )
332
+ dummy_text = "This is dummy text"
333
+ for opt in Eval_options:
334
+ handle_eval_selection(opt, dummy_input, dummy_text, dummy_input, dummy_text)
335
+
336
+
337
+ def flash_buttons():
338
+ """
339
+ Enables human feedback buttons after displaying system output.
340
+ """
341
+ btn_updates = (enable_btn,) * 8
342
+ yield (
343
+ "",
344
+ "",
345
+ ) + btn_updates
346
+
347
+
348
+ def transcribe(
349
+ stream: np.ndarray,
350
+ new_chunk: Tuple[int, np.ndarray],
351
+ TTS_option: str,
352
+ ASR_option: str,
353
+ LLM_option: str,
354
+ type_option: str,
355
+ input_text: str,
356
+ ):
357
+ """
358
+ Processes and transcribes an audio stream in real-time.
359
+
360
+ This function handles the transcription of audio input
361
+ and its transformation through a cascaded
362
+ or E2E conversational AI system.
363
+ It dynamically updates the transcription, text generation,
364
+ and synthesized speech output, while managing global states and latencies.
365
+
366
+ Args:
367
+ stream: The current audio stream buffer.
368
+ `None` if the stream is being reset (e.g., after user refresh).
369
+ new_chunk: A tuple containing:
370
+ - `sr`: Sample rate of the new audio chunk.
371
+ - `y`: New audio data chunk.
372
+ TTS_option: Selected TTS model option.
373
+ ASR_option: Selected ASR model option.
374
+ LLM_option: Selected LLM model option.
375
+ type_option: Type of system ("Cascaded" or "E2E").
376
+
377
+ Yields:
378
+ Tuple[Optional[np.ndarray], Optional[str], Optional[str],
379
+ Optional[Tuple[int, np.ndarray]], Optional[Tuple[int, np.ndarray]]]:
380
+ A tuple containing:
381
+ - Updated stream buffer.
382
+ - ASR output text.
383
+ - Generated LLM output text.
384
+ - Audio output as a tuple of sample rate and audio waveform.
385
+ - User input audio as a tuple of sample rate and audio waveform.
386
+
387
+ Notes:
388
+ - Resets the session if the transcription exceeds 5 minutes.
389
+ - Updates the Gradio interface elements dynamically.
390
+ - Manages latencies.
391
+ """
392
+ sr, y = new_chunk
393
+ global text_str
394
+ global chat
395
+ global user_role
396
+ global audio_output
397
+ global audio_output1
398
+ global vad_output
399
+ global asr_output_str
400
+ global start_record_time
401
+ global sids
402
+ global spembs
403
+ global latency_ASR
404
+ global latency_LM
405
+ global latency_TTS
406
+ global LLM_response_arr
407
+ global total_response_arr
408
+ if stream is None:
409
+ # Handle user refresh
410
+ for (
411
+ _,
412
+ _,
413
+ _,
414
+ _,
415
+ asr_output_box,
416
+ text_box,
417
+ audio_box,
418
+ _,
419
+ _,
420
+ ) in dialogue_model.handle_type_selection(
421
+ type_option, TTS_option, ASR_option, LLM_option
422
+ ):
423
+ gr.Info("The models are being reloaded due to a browser refresh.")
424
+ yield (stream, asr_output_box, text_box, audio_box, gr.Audio(visible=False))
425
+ stream = y
426
+ text_str = ""
427
+ audio_output = None
428
+ audio_output1 = None
429
+ else:
430
+ stream = np.concatenate((stream, y))
431
+ # import pdb;pdb.set_trace()
432
+ dialogue_model.chat.init_chat(
433
+ {
434
+ "role": "system",
435
+ "content": (
436
+ input_text
437
+ ),
438
+ }
439
+ )
440
+ (
441
+ asr_output_str,
442
+ text_str,
443
+ audio_output,
444
+ audio_output1,
445
+ latency_ASR,
446
+ latency_LM,
447
+ latency_TTS,
448
+ stream,
449
+ change,
450
+ ) = dialogue_model(
451
+ y,
452
+ sr,
453
+ stream,
454
+ asr_output_str,
455
+ text_str,
456
+ audio_output,
457
+ audio_output1,
458
+ latency_ASR,
459
+ latency_LM,
460
+ latency_TTS,
461
+ )
462
+ text_str1 = text_str
463
+ if change:
464
+ print("Output changed")
465
+ if asr_output_str != "":
466
+ total_response_arr.append(asr_output_str.replace("\n", " "))
467
+ LLM_response_arr.append(text_str.replace("\n", " "))
468
+ total_response_arr.append(text_str.replace("\n", " "))
469
+ if (text_str != "") and (start_record_time is None):
470
+ start_record_time = time.time()
471
+ elif start_record_time is not None:
472
+ current_record_time = time.time()
473
+ if current_record_time - start_record_time > 300:
474
+ gr.Info(
475
+ "Conversations are limited to 5 minutes. "
476
+ "The session will restart in approximately 60 seconds. "
477
+ "Please wait for the demo to reset. "
478
+ "Close this message once you have read it.",
479
+ duration=None,
480
+ )
481
+ yield stream, gr.Textbox(visible=False), gr.Textbox(
482
+ visible=False
483
+ ), gr.Audio(visible=False), gr.Audio(visible=False)
484
+ if upload_to_hub is not None:
485
+ api.upload_folder(
486
+ folder_path="flagged_data_points",
487
+ path_in_repo="checkpoint_" + str(start_record_time),
488
+ repo_id=upload_to_hub,
489
+ repo_type="dataset",
490
+ token=access_token,
491
+ )
492
+ dialogue_model.chat.buffer = []
493
+ text_str = ""
494
+ audio_output = None
495
+ audio_output1 = None
496
+ asr_output_str = ""
497
+ start_record_time = None
498
+ LLM_response_arr = []
499
+ total_response_arr = []
500
+ shutil.rmtree("flagged_data_points")
501
+ os.mkdir("flagged_data_points")
502
+ yield (stream, asr_output_str, text_str1, audio_output, audio_output1)
503
+ yield stream, gr.Textbox(visible=True), gr.Textbox(visible=True), gr.Audio(
504
+ visible=True
505
+ ), gr.Audio(visible=False)
506
+
507
+ yield (stream, asr_output_str, text_str1, audio_output, audio_output1)
508
+
509
+
510
+ # ------------------------
511
+ # Executable Script
512
+ # ------------------------
513
+ api = HfApi()
514
+ nltk.download("averaged_perceptron_tagger_eng")
515
+ start_warmup()
516
+ default_instruct=(
517
+ "You are a helpful and friendly AI "
518
+ "assistant. "
519
+ "You are polite, respectful, and aim to "
520
+ "provide concise and complete responses of "
521
+ "less than 15 words."
522
  )
523
+ import pandas as pd
524
+ examples = pd.DataFrame([
525
+ ["General Purpose Conversation", default_instruct],
526
+ ["Translation", "You are a translator. Translate user text into English."],
527
+ ["General Purpose Conversation with Disfluencies", "Please reply to the user with a lot of filler words like ummm, so"],
528
+ ["Summarization", "You are a summarizer. Summarize the user's utterance."]
529
+ ], columns=["Task", "LLM Prompt"])
530
+ with gr.Blocks(
531
+ title="E2E Spoken Dialog System",
532
+ ) as demo:
533
+ with gr.Row():
534
+ gr.Markdown(
535
+ """
536
+ ## ESPnet-SDS
537
+ Welcome to our unified web interface for various cascaded and
538
+ E2E spoken dialogue systems built using ESPnet-SDS toolkit,
539
+ supporting real-time automated evaluation metrics, and
540
+ human-in-the-loop feedback collection.
541
+
542
+ For more details on how to use the app, refer to the [README]
543
+ (https://github.com/siddhu001/espnet/tree/sds_demo_recipe/egs2/TEMPLATE/sds1#how-to-use).
544
+ """
545
+ )
546
+ with gr.Row():
547
+ with gr.Column(scale=1):
548
+ user_audio = gr.Audio(
549
+ sources=["microphone"],
550
+ streaming=True,
551
+ waveform_options=gr.WaveformOptions(sample_rate=16000),
552
+ )
553
+ input_text=gr.Textbox(
554
+ label="LLM prompt",
555
+ visible=True,
556
+ interactive=True,
557
+ value=default_instruct
558
+ )
559
+ with gr.Row():
560
+ type_radio = gr.Radio(
561
+ choices=["Cascaded", "E2E"],
562
+ label="Choose type of Spoken Dialog:",
563
+ value="Cascaded",
564
+ )
565
+ with gr.Row():
566
+ ASR_radio = gr.Radio(
567
+ choices=ASR_options,
568
+ label="Choose ASR:",
569
+ value=ASR_name,
570
+ )
571
+ with gr.Row():
572
+ LLM_radio = gr.Radio(
573
+ choices=LLM_options,
574
+ label="Choose LLM:",
575
+ value=LLM_name,
576
+ )
577
+ with gr.Row():
578
+ radio = gr.Radio(
579
+ choices=TTS_options,
580
+ label="Choose TTS:",
581
+ value=TTS_name,
582
+ )
583
+ with gr.Row():
584
+ E2Eradio = gr.Radio(
585
+ choices=["mini-omni"],
586
+ label="Choose E2E model:",
587
+ value="mini-omni",
588
+ visible=False,
589
+ )
590
+ with gr.Row():
591
+ feedback_btn = gr.Button(
592
+ value=(
593
+ "Please provide your feedback "
594
+ "after each system response below."
595
+ ),
596
+ visible=True,
597
+ interactive=False,
598
+ elem_id="button",
599
+ )
600
+ with gr.Row():
601
+ natural_btn1 = gr.Button(
602
+ value="Very Natural", visible=False, interactive=False, scale=1
603
+ )
604
+ natural_btn2 = gr.Button(
605
+ value="Somewhat Awkward", visible=False, interactive=False, scale=1
606
+ )
607
+ natural_btn3 = gr.Button(
608
+ value="Very Awkward", visible=False, interactive=False, scale=1
609
+ )
610
+ natural_btn4 = gr.Button(
611
+ value="Unnatural", visible=False, interactive=False, scale=1
612
+ )
613
+ with gr.Row():
614
+ relevant_btn1 = gr.Button(
615
+ value="Highly Relevant", visible=False, interactive=False, scale=1
616
+ )
617
+ relevant_btn2 = gr.Button(
618
+ value="Partially Relevant",
619
+ visible=False,
620
+ interactive=False,
621
+ scale=1,
622
+ )
623
+ relevant_btn3 = gr.Button(
624
+ value="Slightly Irrelevant",
625
+ visible=False,
626
+ interactive=False,
627
+ scale=1,
628
+ )
629
+ relevant_btn4 = gr.Button(
630
+ value="Completely Irrelevant",
631
+ visible=False,
632
+ interactive=False,
633
+ scale=1,
634
+ )
635
+ with gr.Column(scale=1):
636
+ output_audio = gr.Audio(label="Output", autoplay=True, visible=True, interactive=False)
637
+ output_audio1 = gr.Audio(label="Output1", autoplay=False, visible=False, interactive=False)
638
+ output_asr_text = gr.Textbox(label="ASR output", interactive=False)
639
+ output_text = gr.Textbox(label="LLM output", interactive=False)
640
+ eval_radio = gr.Radio(
641
+ choices=[
642
+ "Latency",
643
+ "TTS Intelligibility",
644
+ "TTS Speech Quality",
645
+ "ASR WER",
646
+ "Text Dialog Metrics",
647
+ ],
648
+ label="Choose Evaluation metrics:",
649
+ )
650
+ eval_radio_E2E = gr.Radio(
651
+ choices=[
652
+ "Latency",
653
+ "TTS Intelligibility",
654
+ "TTS Speech Quality",
655
+ "Text Dialog Metrics",
656
+ ],
657
+ label="Choose Evaluation metrics:",
658
+ visible=False,
659
+ )
660
+ output_eval_text = gr.Textbox(label="Evaluation Results")
661
+ state = gr.State()
662
+ gr.Markdown("### Example Prompts & Responses")
663
+ gr.DataFrame(value=examples, headers=["Task", "LLM Prompt"], interactive=False)
664
+ with gr.Row():
665
+ privacy_text = gr.Textbox(
666
+ label="Privacy Notice",
667
+ interactive=False,
668
+ value=(
669
+ "By using this demo, you acknowledge that "
670
+ "interactions with this dialog system are collected "
671
+ "for research and improvement purposes. The data "
672
+ "will only be used to enhance the performance and "
673
+ "understanding of the system. If you have any "
674
+ "concerns about data collection, please discontinue "
675
+ "use."
676
+ ),
677
+ )
678
 
679
+ btn_list = [
680
+ natural_btn1,
681
+ natural_btn2,
682
+ natural_btn3,
683
+ natural_btn4,
684
+ relevant_btn1,
685
+ relevant_btn2,
686
+ relevant_btn3,
687
+ relevant_btn4,
688
+ ]
689
+ natural_btn_list = [
690
+ natural_btn1,
691
+ natural_btn2,
692
+ natural_btn3,
693
+ natural_btn4,
694
+ ]
695
+ relevant_btn_list = [
696
+ relevant_btn1,
697
+ relevant_btn2,
698
+ relevant_btn3,
699
+ relevant_btn4,
700
+ ]
701
+ natural_response = gr.Textbox(
702
+ label="natural_response", visible=False, interactive=False
703
+ )
704
+ diversity_response = gr.Textbox(
705
+ label="diversity_response", visible=False, interactive=False
706
+ )
707
+ ip_address = gr.Textbox(label="ip_address", visible=False, interactive=False)
708
+ callback.setup(
709
+ [
710
+ user_audio,
711
+ output_asr_text,
712
+ output_text,
713
+ output_audio,
714
+ output_audio1,
715
+ type_radio,
716
+ ASR_radio,
717
+ LLM_radio,
718
+ radio,
719
+ E2Eradio,
720
+ natural_response,
721
+ diversity_response,
722
+ ip_address,
723
+ ],
724
+ "flagged_data_points",
725
+ )
726
+ user_audio.stream(
727
+ transcribe,
728
+ inputs=[state, user_audio, radio, ASR_radio, LLM_radio, type_radio, input_text],
729
+ outputs=[state, output_asr_text, output_text, output_audio, output_audio1],
730
+ ).then(
731
+ lambda *args: callback.flag(list(args)), [user_audio], None, preprocess=False
732
+ )
733
+ radio.change(
734
+ fn=dialogue_model.handle_TTS_selection,
735
+ inputs=[radio],
736
+ outputs=[output_asr_text, output_text, output_audio],
737
+ )
738
+ LLM_radio.change(
739
+ fn=dialogue_model.handle_LLM_selection,
740
+ inputs=[LLM_radio],
741
+ outputs=[output_asr_text, output_text, output_audio],
742
+ )
743
+ ASR_radio.change(
744
+ fn=dialogue_model.handle_ASR_selection,
745
+ inputs=[ASR_radio],
746
+ outputs=[output_asr_text, output_text, output_audio],
747
+ )
748
+ eval_radio.change(
749
+ fn=handle_eval_selection,
750
+ inputs=[eval_radio, output_audio, output_text, output_audio1, output_asr_text],
751
+ outputs=[eval_radio, output_eval_text],
752
+ )
753
+ eval_radio_E2E.change(
754
+ fn=handle_eval_selection_E2E,
755
+ inputs=[eval_radio_E2E, output_audio, output_text],
756
+ outputs=[eval_radio_E2E, output_eval_text],
757
+ )
758
+ type_radio.change(
759
+ fn=dialogue_model.handle_type_selection,
760
+ inputs=[type_radio, radio, ASR_radio, LLM_radio],
761
+ outputs=[
762
+ radio,
763
+ ASR_radio,
764
+ LLM_radio,
765
+ E2Eradio,
766
+ output_asr_text,
767
+ output_text,
768
+ output_audio,
769
+ eval_radio,
770
+ eval_radio_E2E,
771
+ ],
772
+ )
773
+ output_audio.play(
774
+ flash_buttons, [], [natural_response, diversity_response] + btn_list
775
+ ).then(
776
+ lambda *args: callback.flag(list(args)),
777
+ [
778
+ user_audio,
779
+ output_asr_text,
780
+ output_text,
781
+ output_audio,
782
+ output_audio1,
783
+ type_radio,
784
+ ASR_radio,
785
+ LLM_radio,
786
+ radio,
787
+ E2Eradio,
788
+ ],
789
+ None,
790
+ preprocess=False,
791
+ )
792
+ natural_btn1.click(
793
+ natural_vote1_last_response,
794
+ [],
795
+ [natural_response, ip_address] + natural_btn_list,
796
+ ).then(
797
+ lambda *args: callback.flag(list(args)),
798
+ [
799
+ user_audio,
800
+ output_asr_text,
801
+ output_text,
802
+ output_audio,
803
+ output_audio1,
804
+ type_radio,
805
+ ASR_radio,
806
+ LLM_radio,
807
+ radio,
808
+ E2Eradio,
809
+ natural_response,
810
+ diversity_response,
811
+ ip_address,
812
+ ],
813
+ None,
814
+ preprocess=False,
815
+ )
816
+ natural_btn2.click(
817
+ natural_vote2_last_response,
818
+ [],
819
+ [natural_response, ip_address] + natural_btn_list,
820
+ ).then(
821
+ lambda *args: callback.flag(list(args)),
822
+ [
823
+ user_audio,
824
+ output_asr_text,
825
+ output_text,
826
+ output_audio,
827
+ output_audio1,
828
+ type_radio,
829
+ ASR_radio,
830
+ LLM_radio,
831
+ radio,
832
+ E2Eradio,
833
+ natural_response,
834
+ diversity_response,
835
+ ip_address,
836
+ ],
837
+ None,
838
+ preprocess=False,
839
+ )
840
+ natural_btn3.click(
841
+ natural_vote3_last_response,
842
+ [],
843
+ [natural_response, ip_address] + natural_btn_list,
844
+ ).then(
845
+ lambda *args: callback.flag(list(args)),
846
+ [
847
+ user_audio,
848
+ output_asr_text,
849
+ output_text,
850
+ output_audio,
851
+ output_audio1,
852
+ type_radio,
853
+ ASR_radio,
854
+ LLM_radio,
855
+ radio,
856
+ E2Eradio,
857
+ natural_response,
858
+ diversity_response,
859
+ ip_address,
860
+ ],
861
+ None,
862
+ preprocess=False,
863
+ )
864
+ natural_btn4.click(
865
+ natural_vote4_last_response,
866
+ [],
867
+ [natural_response, ip_address] + natural_btn_list,
868
+ ).then(
869
+ lambda *args: callback.flag(list(args)),
870
+ [
871
+ user_audio,
872
+ output_asr_text,
873
+ output_text,
874
+ output_audio,
875
+ output_audio1,
876
+ type_radio,
877
+ ASR_radio,
878
+ LLM_radio,
879
+ radio,
880
+ E2Eradio,
881
+ natural_response,
882
+ diversity_response,
883
+ ip_address,
884
+ ],
885
+ None,
886
+ preprocess=False,
887
+ )
888
+ relevant_btn1.click(
889
+ relevant_vote1_last_response,
890
+ [],
891
+ [diversity_response, ip_address] + relevant_btn_list,
892
+ ).then(
893
+ lambda *args: callback.flag(list(args)),
894
+ [
895
+ user_audio,
896
+ output_asr_text,
897
+ output_text,
898
+ output_audio,
899
+ output_audio1,
900
+ type_radio,
901
+ ASR_radio,
902
+ LLM_radio,
903
+ radio,
904
+ E2Eradio,
905
+ natural_response,
906
+ diversity_response,
907
+ ip_address,
908
+ ],
909
+ None,
910
+ preprocess=False,
911
+ )
912
+ relevant_btn2.click(
913
+ relevant_vote2_last_response,
914
+ [],
915
+ [diversity_response, ip_address] + relevant_btn_list,
916
+ ).then(
917
+ lambda *args: callback.flag(list(args)),
918
+ [
919
+ user_audio,
920
+ output_asr_text,
921
+ output_text,
922
+ output_audio,
923
+ output_audio1,
924
+ type_radio,
925
+ ASR_radio,
926
+ LLM_radio,
927
+ radio,
928
+ E2Eradio,
929
+ natural_response,
930
+ diversity_response,
931
+ ip_address,
932
+ ],
933
+ None,
934
+ preprocess=False,
935
+ )
936
+ relevant_btn3.click(
937
+ relevant_vote3_last_response,
938
+ [],
939
+ [diversity_response, ip_address] + relevant_btn_list,
940
+ ).then(
941
+ lambda *args: callback.flag(list(args)),
942
+ [
943
+ user_audio,
944
+ output_asr_text,
945
+ output_text,
946
+ output_audio,
947
+ output_audio1,
948
+ type_radio,
949
+ ASR_radio,
950
+ LLM_radio,
951
+ radio,
952
+ E2Eradio,
953
+ natural_response,
954
+ diversity_response,
955
+ ip_address,
956
+ ],
957
+ None,
958
+ preprocess=False,
959
+ )
960
+ relevant_btn4.click(
961
+ relevant_vote4_last_response,
962
+ [],
963
+ [diversity_response, ip_address] + relevant_btn_list,
964
+ ).then(
965
+ lambda *args: callback.flag(list(args)),
966
+ [
967
+ user_audio,
968
+ output_asr_text,
969
+ output_text,
970
+ output_audio,
971
+ output_audio1,
972
+ type_radio,
973
+ ASR_radio,
974
+ LLM_radio,
975
+ radio,
976
+ E2Eradio,
977
+ natural_response,
978
+ diversity_response,
979
+ ip_address,
980
+ ],
981
+ None,
982
+ preprocess=False,
983
+ )
984
+ demo.queue(max_size=10, default_concurrency_limit=1)
985
+ demo.launch(share=True)
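The last two lines of the new app.py are what keep the Space responsive under load: `demo.queue(max_size=10, default_concurrency_limit=1)` caps the request queue at ten entries and lets each event process only one request at a time, so the single set of ASR/LLM/TTS models is never hit concurrently, and `launch(share=True)` exposes a public link. Below is a minimal sketch of the same queue/launch pattern on a toy Blocks app; `toy_demo` and the echo handler are illustrative only and not part of this commit.

import gradio as gr

# Toy app reusing the queue/launch pattern from app.py above.
with gr.Blocks() as toy_demo:
    box = gr.Textbox(label="echo")
    # Echo the submitted text back into the same textbox.
    box.submit(lambda s: s, inputs=box, outputs=box)

# max_size caps queued requests; default_concurrency_limit=1 processes one event at a time.
toy_demo.queue(max_size=10, default_concurrency_limit=1)
toy_demo.launch()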
pyscripts/utils/dialog_eval/ASR_WER.py ADDED
@@ -0,0 +1,165 @@
1
+ from typing import Tuple
2
+
3
+ import numpy as np
4
+
5
+ from espnet2.sds.utils.utils import int2float
6
+
7
+
8
+ def handle_espnet_ASR_WER(
9
+ ASR_audio_output: Tuple[int, np.ndarray], ASR_transcript: str
10
+ ) -> str:
11
+ """
12
+ Compute and return Word Error Rate (WER) and Character Error Rate (CER) metrics
13
+ for multiple judge ASR systems (ESPnet, OWSM, Whisper) using the Versa library.
14
+
15
+ This function performs the following:
16
+ 1. Imports necessary metrics and setup functions from Versa.
17
+ 2. Prepares configuration arguments for each ASR system (ESPnet, OWSM, Whisper).
18
+ 3. Runs the Levenshtein-based WER/CER calculations.
19
+ 4. Returns a formatted string summarizing WER and CER
20
+ results for reference produced by each ASR system.
21
+
22
+ Args:
23
+ ASR_audio_output (tuple):
24
+ A tuple where:
25
+ - The first element is the frame rate.
26
+ - The second element is the audio signal (NumPy array).
27
+ ASR_transcript (str):
28
+ The transcript produced by the ASR model in the cascaded
29
+ conversational AI pipeline.
30
+
31
+ Returns:
32
+ str:
33
+ A formatted string showing the WER and CER percentages
34
+ for ESPnet, OWSM, and Whisper. Example output:
35
+
36
+ "ESPnet WER: 10.50
37
+ ESPnet CER: 7.20
38
+ OWSM WER: 11.30
39
+ OWSM CER: 8.00
40
+ Whisper WER: 9.25
41
+ Whisper CER: 6.50"
42
+
43
+ Raises:
44
+ ImportError:
45
+ If Versa is not installed or cannot be imported.
46
+
47
+ Example:
48
+ >>> asr_audio_output = (16000, audio_array)
49
+ >>> asr_transcript = "This is the ASR transcript."
50
+ >>> result = handle_espnet_ASR_WER(asr_audio_output, asr_transcript)
51
+ >>> print(result)
52
+ "ESPnet WER: 10.50
53
+ ESPnet CER: 7.20
54
+ OWSM WER: 11.30
55
+ OWSM CER: 8.00
56
+ Whisper WER: 9.25
57
+ Whisper CER: 6.50"
58
+ """
59
+ try:
60
+ from versa import (
61
+ espnet_levenshtein_metric,
62
+ espnet_wer_setup,
63
+ owsm_levenshtein_metric,
64
+ owsm_wer_setup,
65
+ whisper_levenshtein_metric,
66
+ whisper_wer_setup,
67
+ )
68
+ except Exception as e:
69
+ print("Error: Versa is not properly installed.")
70
+ raise e
71
+ score_modules_espnet = {
72
+ "module": espnet_levenshtein_metric,
73
+ "args": espnet_wer_setup(
74
+ model_tag="default",
75
+ beam_size=1,
76
+ text_cleaner="whisper_en",
77
+ use_gpu=True,
78
+ ),
79
+ }
80
+ dict1 = score_modules_espnet["module"](
81
+ score_modules_espnet["args"],
82
+ int2float(ASR_audio_output[1]),
83
+ ASR_transcript,
84
+ ASR_audio_output[0],
85
+ )
86
+ espnet_wer = (
87
+ dict1["espnet_wer_delete"]
88
+ + dict1["espnet_wer_insert"]
89
+ + dict1["espnet_wer_replace"]
90
+ ) / (
91
+ dict1["espnet_wer_insert"]
92
+ + dict1["espnet_wer_replace"]
93
+ + dict1["espnet_wer_equal"]
94
+ )
95
+ espnet_cer = (
96
+ dict1["espnet_cer_delete"]
97
+ + dict1["espnet_cer_insert"]
98
+ + dict1["espnet_cer_replace"]
99
+ ) / (
100
+ dict1["espnet_cer_insert"]
101
+ + dict1["espnet_cer_replace"]
102
+ + dict1["espnet_cer_equal"]
103
+ )
104
+ score_modules_owsm = {
105
+ "module": owsm_levenshtein_metric,
106
+ "args": owsm_wer_setup(
107
+ model_tag="default",
108
+ beam_size=1,
109
+ text_cleaner="whisper_en",
110
+ use_gpu=True,
111
+ ),
112
+ }
113
+ dict1 = score_modules_owsm["module"](
114
+ score_modules_owsm["args"],
115
+ int2float(ASR_audio_output[1]),
116
+ ASR_transcript,
117
+ ASR_audio_output[0],
118
+ )
119
+ owsm_wer = (
120
+ dict1["owsm_wer_delete"] + dict1["owsm_wer_insert"] + dict1["owsm_wer_replace"]
121
+ ) / (dict1["owsm_wer_insert"] + dict1["owsm_wer_replace"] + dict1["owsm_wer_equal"])
122
+ owsm_cer = (
123
+ dict1["owsm_cer_delete"] + dict1["owsm_cer_insert"] + dict1["owsm_cer_replace"]
124
+ ) / (dict1["owsm_cer_insert"] + dict1["owsm_cer_replace"] + dict1["owsm_cer_equal"])
125
+ score_modules_whisper = {
126
+ "module": whisper_levenshtein_metric,
127
+ "args": whisper_wer_setup(
128
+ model_tag="default",
129
+ beam_size=1,
130
+ text_cleaner="whisper_en",
131
+ use_gpu=True,
132
+ ),
133
+ }
134
+ dict1 = score_modules_whisper["module"](
135
+ score_modules_whisper["args"],
136
+ int2float(ASR_audio_output[1]),
137
+ ASR_transcript,
138
+ ASR_audio_output[0],
139
+ )
140
+ whisper_wer = (
141
+ dict1["whisper_wer_delete"]
142
+ + dict1["whisper_wer_insert"]
143
+ + dict1["whisper_wer_replace"]
144
+ ) / (
145
+ dict1["whisper_wer_insert"]
146
+ + dict1["whisper_wer_replace"]
147
+ + dict1["whisper_wer_equal"]
148
+ )
149
+ whisper_cer = (
150
+ dict1["whisper_cer_delete"]
151
+ + dict1["whisper_cer_insert"]
152
+ + dict1["whisper_cer_replace"]
153
+ ) / (
154
+ dict1["whisper_cer_insert"]
155
+ + dict1["whisper_cer_replace"]
156
+ + dict1["whisper_cer_equal"]
157
+ )
158
+ return (
159
+ f"ESPnet WER: {espnet_wer*100:.2f}\n"
160
+ f"ESPnet CER: {espnet_cer*100:.2f}\n"
161
+ f"OWSM WER: {owsm_wer*100:.2f}\n"
162
+ f"OWSM CER: {owsm_cer*100:.2f}\n"
163
+ f"Whisper WER: {whisper_wer*100:.2f}\n"
164
+ f"Whisper CER: {whisper_cer*100:.2f}"
165
+ )
pyscripts/utils/dialog_eval/LLM_Metrics.py ADDED
@@ -0,0 +1,245 @@
1
+ from multiprocessing import Pool
2
+ from typing import List
3
+
4
+ import numpy as np
5
+ import torch
6
+ from pyscripts.utils.dialog_eval.vert import (
7
+ get_auto_bleu2_geometric,
8
+ get_self_bleu2_geometric,
9
+ run_f,
10
+ )
11
+ from scipy.stats import gmean
12
+ from sklearn.metrics.pairwise import cosine_similarity
13
+ from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
14
+
15
+
16
+ def perplexity(LLM_Output: str, model_id: str = "gpt2") -> str:
17
+ """
18
+ Compute the perplexity of the given text using a specified model from the
19
+ `evaluate` library (default: GPT-2).
20
+
21
+ Args:
22
+ LLM_Output (str):
23
+ The text (string) for which perplexity is to be computed.
24
+ model_id (str, optional):
25
+ The identifier of the model to use for computing
26
+ perplexity. Defaults to "gpt2".
27
+
28
+ Returns:
29
+ str:
30
+ A formatted string showing the perplexity of the
31
+ provided text(s), for example:
32
+ "Perplexity: 45.23\n"
33
+
34
+ Raises:
35
+ ImportError:
36
+ If the `evaluate` library is not installed or cannot be imported.
37
+
38
+ Example:
39
+ >>> text = "Hello world, this is a test."
40
+ >>> result = perplexity(text, model_id="gpt2")
41
+ >>> print(result)
42
+ "Perplexity: 27.34\n"
43
+ """
44
+ try:
45
+ import evaluate
46
+ except Exception as e:
47
+ print("Error: evaluate is not properly installed.")
48
+ raise e
49
+ perplexity = evaluate.load("perplexity", module_type="metric")
50
+ results = perplexity.compute(model_id=model_id, predictions=[LLM_Output])
51
+ return f"Perplexity: {results['mean_perplexity']:.2f}\n"
52
+
53
+
54
+ def vert(LLM_response_arr: List[str]) -> str:
55
+ """
56
+ Calculate and return Self BLEU-2, Auto BLEU-2 and VERT-2
57
+ metrics for a list of LLM responses.
58
+
59
+ Args:
60
+ LLM_response_arr (List[str]):
61
+ A list of responses (strings) generated by the language
62
+ model acting as text dialog response generator.
63
+
64
+ Returns:
65
+ str:
66
+ A formatted string that includes each computed metric and the final
67
+ VERT value, for example:
68
+
69
+ "Self-BLEU2-geometric: 42.13
70
+ Auto-BLEU2-geometric: 38.94
71
+ VERT: 40.5
72
+ "
73
+
74
+ Example:
75
+ >>> # Suppose we have the following LLM responses:
76
+ >>> responses = ["Hello world", "Foo bar", "Lorem ipsum dolor sit amet"]
77
+ >>> result = vert(responses)
78
+ >>> print(result)
79
+ "Self-BLEU2-geometric: 42.13
80
+ Auto-BLEU2-geometric: 38.94
81
+ VERT: 40.5
82
+ "
83
+ """
84
+ terms = [x.strip().split() for x in LLM_response_arr]
85
+
86
+ tasks = [
87
+ ("Self-BLEU2-geometric", get_self_bleu2_geometric),
88
+ ("Auto-BLEU2-geometric", get_auto_bleu2_geometric),
89
+ ]
90
+ n_processes = min(16, len(tasks))
91
+ with Pool(n_processes) as pool:
92
+ metrics = pool.map(run_f, [(t[1], terms) for t in tasks])
93
+ metric_arr = []
94
+ str1 = ""
95
+ for (metric_name, _), metric in zip(tasks, metrics):
96
+ metric, sem = np.mean(metric), np.std(metric) / np.sqrt(len(metric))
97
+
98
+ metric, sem = [round(100 * x, 2) for x in [metric, sem]]
99
+ metric_arr.append(metric)
100
+
101
+ str1 += f"{metric_name}: {metric}\n"
102
+ str1 += f"VERT: {round(gmean(metric_arr), 2)}\n"
103
+ return str1
104
+
105
+
106
+ def bert_score(
107
+ total_response_arr: List[str], bert_model_name: str = "bert-base-uncased"
108
+ ) -> str:
109
+ """
110
+ Compute a cosine similarity score between the concatenated
111
+ context (all but the last element)
112
+ and the final response (last element) using a BERT-based model.
113
+ This serves as a simplified
114
+ measure of how closely the response aligns with the preceding context semantically.
115
+
116
+ Args:
117
+ total_response_arr (List[str]):
118
+ A list of strings. The last element represents the response,
119
+ while all other elements
120
+ are treated as the context.
121
+ bert_model_name (str, optional):
122
+ The name or path of the BERT model to use (from the Hugging Face Model Hub).
123
+ Defaults to "bert-base-uncased".
124
+
125
+ Returns:
126
+ str:
127
+ A string containing the cosine similarity
128
+ (as a percentage) followed by a newline.
129
+ For example:
130
+ "Cosine Similarity: 85.67\n"
131
+
132
+ Example:
133
+ >>> total_responses = [
134
+ ... "User: Hi, how are you?",
135
+ ... "Assistant: I'm good! How can I help you today?",
136
+ ... "User: Can you tell me a joke?",
137
+ ... "Assistant: Sure! Here's one: Why did the chicken join a band?"
138
+ ... ]
139
+ >>> result = bert_score(total_responses, bert_model_name="bert-base-uncased")
140
+ >>> print(result)
141
+ "Cosine Similarity: 75.89\n"
142
+ """
143
+
144
+ def cosine_similarity_context_response(context, response, model, tokenizer):
145
+ # Tokenize and encode both context and response
146
+ context_inputs = tokenizer(context, return_tensors="pt", truncation=True)
147
+ response_inputs = tokenizer(response, return_tensors="pt", truncation=True)
148
+ for k in context_inputs:
149
+ context_inputs[k] = context_inputs[k].cuda()
150
+ for k in response_inputs:
151
+ response_inputs[k] = response_inputs[k].cuda()
152
+
153
+ # Get embeddings from the model
154
+ with torch.no_grad():
155
+ context_embedding = model(**context_inputs).last_hidden_state.mean(dim=1)
156
+ response_embedding = model(**response_inputs).last_hidden_state.mean(dim=1)
157
+
158
+ # Compute cosine similarity
159
+ similarity = cosine_similarity(
160
+ context_embedding.cpu().numpy(), response_embedding.cpu().numpy()
161
+ )
162
+ return similarity[0][0]
163
+
164
+ bert_model = AutoModel.from_pretrained(bert_model_name).cuda()
165
+ bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
166
+ similarity = cosine_similarity_context_response(
167
+ " ".join(total_response_arr[:-1]),
168
+ total_response_arr[-1],
169
+ bert_model,
170
+ bert_tokenizer,
171
+ )
172
+ return f"Cosine Similarity: {similarity*100:.2f}" + "\n"
173
+
174
+
175
+ def DialoGPT_perplexity(
176
+ user_utterance: str,
177
+ response: str,
178
+ dialog_model_name: str = "microsoft/DialoGPT-medium",
179
+ ) -> str:
180
+ """
181
+ Compute the perplexity of a response given a user utterance using a pre-trained
182
+ DialoGPT model. The function loads DialoGPT (medium by default)
183
+ from the Hugging Face Model Hub, then calculates the perplexity
184
+ for the
185
+ (context + response) sequence.
186
+
187
+ Args:
188
+ user_utterance (str):
189
+ The user utterance preceding the model's response.
190
+ response (str):
191
+ The generated response whose perplexity needs to be evaluated.
192
+
193
+ Returns:
194
+ str:
195
+ A formatted string containing the DialoGPT perplexity score. For example:
196
+ "DialoGPT Perplexity: 25.67\n"
197
+
198
+ Example:
199
+ >>> user_text = "Hi, how are you today?"
200
+ >>> system_response = "I'm good, thank you! How can I help you?"
201
+ >>> result = DialoGPT_perplexity(user_text, system_response)
202
+ >>> print(result)
203
+ "DialoGPT Perplexity: 31.45\n"
204
+ """
205
+
206
+ def evaluate_response_with_dialoGPT(context, response, model, tokenizer):
207
+ """
208
+ Evaluate the appropriateness of a response based on the
209
+ given context using DialoGPT.
210
+
211
+ Args:
212
+ context (str): The dialogue context (previous conversation).
213
+ response (str): The generated response to evaluate.
214
+ model: Pre-trained DialoGPT model.
215
+ tokenizer: Corresponding tokenizer for the DialoGPT model.
216
+
217
+ Returns:
218
+ float: Perplexity score of the response given the context.
219
+ """
220
+ model.eval()
221
+
222
+ # Combine context and response as input
223
+ input_text = context + tokenizer.eos_token + response + tokenizer.eos_token
224
+ inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
225
+ inputs["input_ids"] = inputs["input_ids"].cuda()
226
+ inputs["attention_mask"] = inputs["attention_mask"].cuda()
227
+ # import pdb;pdb.set_trace()
228
+
229
+ # Compute model outputs and loss
230
+ with torch.no_grad():
231
+ outputs = model(**inputs, labels=inputs["input_ids"].cuda())
232
+ loss = outputs.loss
233
+
234
+ # Calculate perplexity
235
+ perplexity = torch.exp(loss)
236
+ return perplexity.cpu().item()
237
+
238
+ # Load DialoGPT model and tokenizer
239
+ model_name = dialog_model_name
240
+ model = AutoModelForCausalLM.from_pretrained(model_name).cuda()
241
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
242
+ perplexity = evaluate_response_with_dialoGPT(
243
+ user_utterance, response, model, tokenizer
244
+ )
245
+ return f"DialoGPT Perplexity: {perplexity:.2f}" + "\n"
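For reference, the VERT value reported by `vert()` in this file is simply the geometric mean of the Self-BLEU2 and Auto-BLEU2 scores after they have been scaled to percentages. A minimal sketch of that final step, using made-up metric values consistent with the docstring example:

from scipy.stats import gmean

# Illustrative numbers only: Self-BLEU2-geometric and Auto-BLEU2-geometric,
# already rounded and scaled to percentages as done inside vert().
metric_arr = [42.13, 38.94]
print(f"VERT: {round(gmean(metric_arr), 2)}")  # prints "VERT: 40.5"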
pyscripts/utils/dialog_eval/TTS_intelligibility.py ADDED
@@ -0,0 +1,169 @@
1
+ from typing import Tuple
2
+
3
+ import numpy as np
4
+
5
+ from espnet2.sds.utils.utils import int2float
6
+
7
+
8
+ def handle_espnet_TTS_intelligibility(
9
+ TTS_audio_output: Tuple[int, np.ndarray], LLM_Output: str
10
+ ) -> str:
11
+ """
12
+ Compute and return Word Error Rate (WER) and Character Error Rate (CER) metrics
13
+ for multiple ASR systems (ESPnet, OWSM, Whisper) using the Versa library.
14
+
15
+ This function:
16
+ 1. Imports the necessary metrics and setup functions from Versa.
17
+ 2. Prepares configuration arguments for each ASR system (ESPnet, OWSM, Whisper).
18
+ 3. Runs the Levenshtein-based WER/CER calculations on the provided TTS audio.
19
+ 4. Returns a formatted string summarizing WER and CER results
20
+ for hypotheses produced
21
+ by each ASR system when transcribing the TTS audio, using
22
+ the LLM output as the reference text.
23
+
24
+ Args:
25
+ TTS_audio_output (Tuple[int, np.ndarray]):
26
+ A tuple consisting of:
27
+ - The first element (int): the frame rate of the audio.
28
+ - The second element (np.ndarray):
29
+ the audio signal (e.g., a NumPy array).
30
+ LLM_Output (str):
31
+ The reference text generated by the LLM, which serves as the ground truth
32
+ for evaluating the TTS audio.
33
+
34
+ Returns:
35
+ str:
36
+ A formatted string showing the WER and CER percentages
37
+ for ESPnet, OWSM, and Whisper.
38
+ Example:
39
+
40
+ ESPnet WER: 10.50
41
+ ESPnet CER: 7.20
42
+ OWSM WER: 11.30
43
+ OWSM CER: 8.00
44
+ Whisper WER: 9.25
45
+ Whisper CER: 6.50
46
+
47
+ Raises:
48
+ ImportError:
49
+ If the Versa library is not installed or cannot be imported.
50
+
51
+ Example:
52
+ >>> tts_audio_output = (16000, audio_array)
53
+ >>> llm_output = "This is the reference text for evaluation."
54
+ >>> result = handle_espnet_TTS_intelligibility(tts_audio_output, llm_output)
55
+ >>> print(result)
56
+ ESPnet WER: 10.50
57
+ ESPnet CER: 7.20
58
+ OWSM WER: 11.30
59
+ OWSM CER: 8.00
60
+ Whisper WER: 9.25
61
+ Whisper CER: 6.50
62
+ """
63
+ try:
64
+ from versa import (
65
+ espnet_levenshtein_metric,
66
+ espnet_wer_setup,
67
+ owsm_levenshtein_metric,
68
+ owsm_wer_setup,
69
+ whisper_levenshtein_metric,
70
+ whisper_wer_setup,
71
+ )
72
+ except Exception as e:
73
+ print("Error: Versa is not properly installed.")
74
+ raise e
75
+ score_modules_espnet = {
76
+ "module": espnet_levenshtein_metric,
77
+ "args": espnet_wer_setup(
78
+ model_tag="default",
79
+ beam_size=1,
80
+ text_cleaner="whisper_en",
81
+ use_gpu=True,
82
+ ),
83
+ }
84
+ dict1 = score_modules_espnet["module"](
85
+ score_modules_espnet["args"],
86
+ int2float(TTS_audio_output[1]),
87
+ LLM_Output,
88
+ TTS_audio_output[0],
89
+ )
90
+ espnet_wer = (
91
+ dict1["espnet_wer_delete"]
92
+ + dict1["espnet_wer_insert"]
93
+ + dict1["espnet_wer_replace"]
94
+ ) / (
95
+ dict1["espnet_wer_delete"]
96
+ + dict1["espnet_wer_replace"]
97
+ + dict1["espnet_wer_equal"]
98
+ )
99
+ espnet_cer = (
100
+ dict1["espnet_cer_delete"]
101
+ + dict1["espnet_cer_insert"]
102
+ + dict1["espnet_cer_replace"]
103
+ ) / (
104
+ dict1["espnet_cer_delete"]
105
+ + dict1["espnet_cer_replace"]
106
+ + dict1["espnet_cer_equal"]
107
+ )
108
+ score_modules_owsm = {
109
+ "module": owsm_levenshtein_metric,
110
+ "args": owsm_wer_setup(
111
+ model_tag="default",
112
+ beam_size=1,
113
+ text_cleaner="whisper_en",
114
+ use_gpu=True,
115
+ ),
116
+ }
117
+ dict1 = score_modules_owsm["module"](
118
+ score_modules_owsm["args"],
119
+ int2float(TTS_audio_output[1]),
120
+ LLM_Output,
121
+ TTS_audio_output[0],
122
+ )
123
+ owsm_wer = (
124
+ dict1["owsm_wer_delete"] + dict1["owsm_wer_insert"] + dict1["owsm_wer_replace"]
125
+ ) / (dict1["owsm_wer_delete"] + dict1["owsm_wer_replace"] + dict1["owsm_wer_equal"])
126
+ owsm_cer = (
127
+ dict1["owsm_cer_delete"] + dict1["owsm_cer_insert"] + dict1["owsm_cer_replace"]
128
+ ) / (dict1["owsm_cer_delete"] + dict1["owsm_cer_replace"] + dict1["owsm_cer_equal"])
129
+ score_modules_whisper = {
130
+ "module": whisper_levenshtein_metric,
131
+ "args": whisper_wer_setup(
132
+ model_tag="default",
133
+ beam_size=1,
134
+ text_cleaner="whisper_en",
135
+ use_gpu=True,
136
+ ),
137
+ }
138
+ dict1 = score_modules_whisper["module"](
139
+ score_modules_whisper["args"],
140
+ int2float(TTS_audio_output[1]),
141
+ LLM_Output,
142
+ TTS_audio_output[0],
143
+ )
144
+ whisper_wer = (
145
+ dict1["whisper_wer_delete"]
146
+ + dict1["whisper_wer_insert"]
147
+ + dict1["whisper_wer_replace"]
148
+ ) / (
149
+ dict1["whisper_wer_delete"]
150
+ + dict1["whisper_wer_replace"]
151
+ + dict1["whisper_wer_equal"]
152
+ )
153
+ whisper_cer = (
154
+ dict1["whisper_cer_delete"]
155
+ + dict1["whisper_cer_insert"]
156
+ + dict1["whisper_cer_replace"]
157
+ ) / (
158
+ dict1["whisper_cer_delete"]
159
+ + dict1["whisper_cer_replace"]
160
+ + dict1["whisper_cer_equal"]
161
+ )
162
+ return (
163
+ f"ESPnet WER: {espnet_wer*100:.2f}\n"
164
+ f"ESPnet CER: {espnet_cer*100:.2f}\n"
165
+ f"OWSM WER: {owsm_wer*100:.2f}\n"
166
+ f"OWSM CER: {owsm_cer*100:.2f}\n"
167
+ f"Whisper WER: {whisper_wer*100:.2f}\n"
168
+ f"Whisper CER: {whisper_cer*100:.2f}"
169
+ )
pyscripts/utils/dialog_eval/TTS_speech_quality.py ADDED
@@ -0,0 +1,98 @@
1
+ from typing import Tuple
2
+
3
+ import numpy as np
4
+
5
+ from espnet2.sds.utils.utils import int2float
6
+
7
+
8
+ def TTS_psuedomos(TTS_audio_output: Tuple[int, np.ndarray]) -> str:
9
+ """
10
+ Compute and return speech quality metrics
11
+ for the given synthesized audio output
12
+ using the Versa library.
13
+
14
+ Args:
15
+ TTS_audio_output (Tuple[int, np.ndarray]):
16
+ A tuple containing:
17
+ - The first element (int): The frame rate of the audio.
18
+ - The second element (np.ndarray): The audio signal,
19
+ typically a NumPy array.
20
+
21
+ Returns:
22
+ str:
23
+ A formatted string containing each metric name
24
+ and its corresponding score, for example:
25
+
26
+ utmos: 3.54
27
+ dnsmos: 3.47
28
+ plcmos: 3.62
29
+ sheet_ssqa: 4.03
30
+
31
+ Raises:
32
+ ImportError:
33
+ If the Versa library is not installed or cannot be imported.
34
+
35
+ Example:
36
+ >>> tts_audio_output = (16000, audio_array)
37
+ >>> result = TTS_psuedomos(tts_audio_output)
38
+ >>> print(result)
39
+ utmos: 3.54
40
+ dnsmos: 3.47
41
+ plcmos: 3.62
42
+ sheet_ssqa: 4.03
43
+ """
44
+ try:
45
+ from versa import (
46
+ pseudo_mos_metric,
47
+ pseudo_mos_setup,
48
+ sheet_ssqa,
49
+ sheet_ssqa_setup,
50
+ )
51
+ except Exception as e:
52
+ print("Error: Versa is not properly installed.")
53
+ raise e
54
+
55
+ predictor_dict, predictor_fs = pseudo_mos_setup(
56
+ use_gpu=True,
57
+ predictor_types=["utmos", "dnsmos", "plcmos"],
58
+ predictor_args={
59
+ "utmos": {"fs": 16000},
60
+ "dnsmos": {"fs": 16000},
61
+ "plcmos": {"fs": 16000},
62
+ },
63
+ )
64
+ score_modules = {
65
+ "module": pseudo_mos_metric,
66
+ "args": {
67
+ "predictor_dict": predictor_dict,
68
+ "predictor_fs": predictor_fs,
69
+ "use_gpu": True,
70
+ },
71
+ }
72
+ dict1 = score_modules["module"](
73
+ int2float(TTS_audio_output[1]),
74
+ TTS_audio_output[0],
75
+ **score_modules["args"],
76
+ )
77
+ str1 = ""
78
+ for k in dict1:
79
+ str1 = str1 + f"{k}: {dict1[k]:.2f}\n"
80
+ sheet_model = sheet_ssqa_setup(
81
+ model_tag="default",
82
+ model_path=None,
83
+ model_config=None,
84
+ use_gpu=True,
85
+ )
86
+ score_modules = {
87
+ "module": sheet_ssqa,
88
+ "args": {"model": sheet_model, "use_gpu": True},
89
+ }
90
+ dict1 = score_modules["module"](
91
+ score_modules["args"]["model"],
92
+ int2float(TTS_audio_output[1]),
93
+ TTS_audio_output[0],
94
+ use_gpu=score_modules["args"]["use_gpu"],
95
+ )
96
+ for k in dict1:
97
+ str1 = str1 + f"{k}: {dict1[k]:.2f}\n"
98
+ return str1
pyscripts/utils/dialog_eval/human_feedback.py ADDED
@@ -0,0 +1,242 @@
1
+ import gradio as gr
2
+
3
+ disable_btn = gr.Button(interactive=False, visible=False)
4
+
5
+
6
+ def get_ip(request: gr.Request) -> str:
7
+ """
8
+ Retrieve the IP address from an incoming HTTP request.
9
+
10
+ Args:
11
+ request (gr.Request):
12
+ The incoming HTTP request from which the IP address will be extracted.
13
+
14
+ Returns:
15
+ str:
16
+ The IP address as a string.
17
+ """
18
+ if "cf-connecting-ip" in request.headers:
19
+ ip = request.headers["cf-connecting-ip"]
20
+ elif "x-forwarded-for" in request.headers:
21
+ ip = request.headers["x-forwarded-for"]
22
+ if "," in ip:
23
+ ip = ip.split(",")[0]
24
+ else:
25
+ ip = request.client.host
26
+ return ip
27
+
28
+
29
+ def natural_vote1_last_response(request: gr.Request):
30
+ """
31
+ Handle a user vote for naturalness as "Very Natural".
32
+
33
+
34
+ Args:
35
+ request (gr.Request):
36
+ The Gradio request object providing access to HTTP headers and metadata.
37
+
38
+ Returns:
39
+ tuple:
40
+ A tuple containing:
41
+ ("Very Natural", <ip_address>, (disable_btn,) * 4)
42
+
43
+ - "Very Natural": The selected vote or label.
44
+ - <ip_address>: The IP address of the client retrieved from the request.
45
+ - disable_btn: An object repeated four times,
46
+ to disable natural vote buttons.
47
+ """
48
+ ip_address1 = get_ip(request)
49
+ print(f"Very Natural (voted). ip: {ip_address1}")
50
+ return (
51
+ "Very Natural",
52
+ ip_address1,
53
+ ) + (disable_btn,) * 4
54
+
55
+
56
+ def natural_vote2_last_response(request: gr.Request):
57
+ """
58
+ Handle a user vote for naturalness as "Somewhat Awkward".
59
+
60
+
61
+ Args:
62
+ request (gr.Request):
63
+ The Gradio request object providing access to HTTP headers and metadata.
64
+
65
+ Returns:
66
+ tuple:
67
+ A tuple containing:
68
+ ("Somewhat Awkward", <ip_address>, (disable_btn,) * 4)
69
+
70
+ - "Somewhat Awkward": The selected vote or label.
71
+ - <ip_address>: The IP address of the client retrieved from the request.
72
+ - disable_btn: An object repeated four times,
73
+ to disable natural vote buttons.
74
+ """
75
+ ip_address1 = get_ip(request)
76
+ print(f"Somewhat Awkward (voted). ip: {ip_address1}")
77
+ return (
78
+ "Somewhat Awkward",
79
+ ip_address1,
80
+ ) + (disable_btn,) * 4
81
+
82
+
83
+ def natural_vote3_last_response(request: gr.Request):
84
+ """
85
+ Handle a user vote for naturalness as "Very Awkward".
86
+
87
+
88
+ Args:
89
+ request (gr.Request):
90
+ The Gradio request object providing access to HTTP headers and metadata.
91
+
92
+ Returns:
93
+ tuple:
94
+ A tuple containing:
95
+ ("Very Awkward", <ip_address>, (disable_btn,) * 4)
96
+
97
+ - "Very Awkward": The selected vote or label.
98
+ - <ip_address>: The IP address of the client retrieved from the request.
99
+ - disable_btn: An object repeated four times,
100
+ to disable natural vote buttons.
101
+ """
102
+ ip_address1 = get_ip(request)
103
+ print(f"Very Awkward (voted). ip: {ip_address1}")
104
+ return (
105
+ "Very Awkward",
106
+ ip_address1,
107
+ ) + (disable_btn,) * 4
108
+
109
+
110
+ def natural_vote4_last_response(request: gr.Request):
111
+ """
112
+ Handle a user vote for naturalness as "Unnatural".
113
+
114
+
115
+ Args:
116
+ request (gr.Request):
117
+ The Gradio request object providing access to HTTP headers and metadata.
118
+
119
+ Returns:
120
+ tuple:
121
+ A tuple containing:
122
+ ("Unnatural", <ip_address>, (disable_btn,) * 4)
123
+
124
+ - "Unnatural": The selected vote or label.
125
+ - <ip_address>: The IP address of the client retrieved from the request.
126
+ - disable_btn: An object repeated four times,
127
+ to disable natural vote buttons.
128
+ """
129
+ ip_address1 = get_ip(request)
130
+ print(f"Unnatural (voted). ip: {ip_address1}")
131
+ return (
132
+ "Unnatural",
133
+ ip_address1,
134
+ ) + (disable_btn,) * 4
135
+
136
+
137
+ def relevant_vote1_last_response(request: gr.Request):
138
+ """
139
+ Handle a user vote for relevance as "Highly Relevant".
140
+
141
+
142
+ Args:
143
+ request (gr.Request):
144
+ The Gradio request object providing access to HTTP headers and metadata.
145
+
146
+ Returns:
147
+ tuple:
148
+ A tuple containing:
149
+ ("Highly Relevant", <ip_address>, (disable_btn,) * 4)
150
+
151
+ - "Highly Relevant": The selected vote or label.
152
+ - <ip_address>: The IP address of the client retrieved from the request.
153
+ - disable_btn: An object repeated four times,
154
+ to disable relevance vote buttons.
155
+ """
156
+ ip_address1 = get_ip(request)
157
+ print(f"Highly Relevant (voted). ip: {ip_address1}")
158
+ return (
159
+ "Highly Relevant",
160
+ ip_address1,
161
+ ) + (disable_btn,) * 4
162
+
163
+
164
+ def relevant_vote2_last_response(request: gr.Request):
165
+ """
166
+ Handle a user vote for relevance as "Partially Relevant".
167
+
168
+
169
+ Args:
170
+ request (gr.Request):
171
+ The Gradio request object providing access to HTTP headers and metadata.
172
+
173
+ Returns:
174
+ tuple:
175
+ A tuple containing:
176
+ ("Partially Relevant", <ip_address>, (disable_btn,) * 4)
177
+
178
+ - "Partially Relevant": The selected vote or label.
179
+ - <ip_address>: The IP address of the client retrieved from the request.
180
+ - disable_btn: An object repeated four times,
181
+ to disable relevance vote buttons.
182
+ """
183
+ ip_address1 = get_ip(request)
184
+ print(f"Partially Relevant (voted). ip: {ip_address1}")
185
+ return (
186
+ "Partially Relevant",
187
+ ip_address1,
188
+ ) + (disable_btn,) * 4
189
+
190
+
191
+ def relevant_vote3_last_response(request: gr.Request):
192
+ """
193
+ Handle a user vote for relevance as "Slightly Irrelevant".
194
+
195
+
196
+ Args:
197
+ request (gr.Request):
198
+ The Gradio request object providing access to HTTP headers and metadata.
199
+
200
+ Returns:
201
+ tuple:
202
+ A tuple containing:
203
+ ("Slightly Irrelevant", <ip_address>, (disable_btn,) * 4)
204
+
205
+ - "Slightly Irrelevant": The selected vote or label.
206
+ - <ip_address>: The IP address of the client retrieved from the request.
207
+ - disable_btn: An object repeated four times,
208
+ to disable relevance vote buttons.
209
+ """
210
+ ip_address1 = get_ip(request)
211
+ print(f"Slightly Irrelevant (voted). ip: {ip_address1}")
212
+ return (
213
+ "Slightly Irrelevant",
214
+ ip_address1,
215
+ ) + (disable_btn,) * 4
216
+
217
+
218
+ def relevant_vote4_last_response(request: gr.Request):
219
+ """
220
+ Handle a user vote for relevance as "Completely Irrelevant".
221
+
222
+
223
+ Args:
224
+ request (gr.Request):
225
+ The Gradio request object providing access to HTTP headers and metadata.
226
+
227
+ Returns:
228
+ tuple:
229
+ A tuple containing:
230
+ ("Completely Irrelevant", <ip_address>, (disable_btn,) * 4)
231
+
232
+ - "Completely Irrelevant": The selected vote or label.
233
+ - <ip_address>: The IP address of the client retrieved from the request.
234
+ - disable_btn: An object repeated four times,
235
+ to disable relevance vote buttons.
236
+ """
237
+ ip_address1 = get_ip(request)
238
+ print(f"Completely Irrelevant (voted). ip: {ip_address1}")
239
+ return (
240
+ "Completely Irrelevant",
241
+ ip_address1,
242
+ ) + (disable_btn,) * 4
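A hedged wiring sketch (not part of the commit) showing how one of these callbacks could be attached to Gradio buttons; the component names are illustrative assumptions, not taken from app.py. Each callback returns the vote label, the voter IP, and four disabled-button updates, so the outputs list must hold exactly those six components; the gr.Request argument is injected automatically by Gradio.

import gradio as gr
from pyscripts.utils.dialog_eval.human_feedback import natural_vote1_last_response

with gr.Blocks() as demo:
    natural_label = gr.Textbox(label="Naturalness vote")    # hypothetical component
    voter_ip = gr.Textbox(label="Voter IP", visible=False)  # hypothetical component
    natural_btns = [
        gr.Button(text)
        for text in ["Very Natural", "Somewhat Awkward", "Very Awkward", "Unnatural"]
    ]
    # Clicking the first button records a "Very Natural" vote and disables all four buttons.
    natural_btns[0].click(
        natural_vote1_last_response,
        outputs=[natural_label, voter_ip] + natural_btns,
    )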
pyscripts/utils/dialog_eval/vert.py ADDED
@@ -0,0 +1,299 @@
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ #
3
+ # This source code is licensed under the MIT license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import math
7
+ import sys
8
+ import warnings
9
+ from collections import Counter
10
+ from fractions import Fraction
11
+
12
+ import nltk
13
+ import numpy as np
14
+ from nltk.translate.bleu_score import (
15
+ SmoothingFunction,
16
+ brevity_penalty,
17
+ closest_ref_length,
18
+ modified_precision,
19
+ )
20
+
21
+
22
+ def corpus_bleu(
23
+ list_of_references,
24
+ hypotheses,
25
+ weights=(0.25, 0.25, 0.25, 0.25),
26
+ smoothing_function=None,
27
+ auto_reweigh=False,
28
+ averaging_mode="geometric",
29
+ no_length_penalty=False,
30
+ ):
31
+ """
32
+ Calculate a single corpus-level BLEU score (aka. system-level BLEU) for all
33
+ the hypotheses and their respective references.
34
+
35
+ Instead of averaging the sentence-level BLEU scores (i.e. macro-average
36
+ precision), the original BLEU metric (Papineni et al. 2002) accounts for
37
+ the micro-average precision (i.e. summing the numerators and denominators
38
+ for each hypothesis-reference(s) pair before the division).
39
+
40
+ >>> hyp1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which',
41
+ ... 'ensures', 'that', 'the', 'military', 'always',
42
+ ... 'obeys', 'the', 'commands', 'of', 'the', 'party']
43
+ >>> ref1a = ['It', 'is', 'a', 'guide', 'to', 'action', 'that',
44
+ ... 'ensures', 'that', 'the', 'military', 'will', 'forever',
45
+ ... 'heed', 'Party', 'commands']
46
+ >>> ref1b = ['It', 'is', 'the', 'guiding', 'principle', 'which',
47
+ ... 'guarantees', 'the', 'military', 'forces', 'always',
48
+ ... 'being', 'under', 'the', 'command', 'of', 'the', 'Party']
49
+ >>> ref1c = ['It', 'is', 'the', 'practical', 'guide', 'for', 'the',
50
+ ... 'army', 'always', 'to', 'heed', 'the', 'directions',
51
+ ... 'of', 'the', 'party']
52
+
53
+ >>> hyp2 = ['he', 'read', 'the', 'book', 'because', 'he', 'was',
54
+ ... 'interested', 'in', 'world', 'history']
55
+ >>> ref2a = ['he', 'was', 'interested', 'in', 'world', 'history',
56
+ ... 'because', 'he', 'read', 'the', 'book']
57
+
58
+ >>> list_of_references = [[ref1a, ref1b, ref1c], [ref2a]]
59
+ >>> hypotheses = [hyp1, hyp2]
60
+ >>> corpus_bleu(list_of_references, hypotheses) # doctest: +ELLIPSIS
61
+ 0.5920...
62
+
63
+ The example below shows that corpus_bleu() is different from averaging
64
+ sentence_bleu() for hypotheses
65
+
66
+ >>> score1 = sentence_bleu([ref1a, ref1b, ref1c], hyp1)
67
+ >>> score2 = sentence_bleu([ref2a], hyp2)
68
+ >>> (score1 + score2) / 2 # doctest: +ELLIPSIS
69
+ 0.6223...
70
+
71
+ :param list_of_references: a corpus of lists of reference
72
+ sentences, w.r.t. hypotheses
73
+ :type list_of_references: list(list(list(str)))
74
+ :param hypotheses: a list of hypothesis sentences
75
+ :type hypotheses: list(list(str))
76
+ :param weights: weights for unigrams, bigrams, trigrams and so on
77
+ :type weights: list(float)
78
+ :param smoothing_function:
79
+ :type smoothing_function: SmoothingFunction
80
+ :param auto_reweigh: Option to re-normalize the weights uniformly.
81
+ :type auto_reweigh: bool
82
+ :return: The corpus-level BLEU score.
83
+ :rtype: float
84
+ """
85
+ # Before proceeding to compute BLEU, perform sanity checks.
86
+
87
+ p_numerators = Counter() # Key = ngram order, and value = no. of ngram matches.
88
+ p_denominators = Counter()  # Key = ngram order, and value = no. of ngrams in the hypothesis.
89
+ hyp_lengths, ref_lengths = 0, 0
90
+
91
+ assert len(list_of_references) == len(hypotheses), (
92
+ "The number of hypotheses and their reference(s) should be the " "same "
93
+ )
94
+
95
+ # Iterate through each hypothesis and their corresponding references.
96
+ for references, hypothesis in zip(list_of_references, hypotheses):
97
+ # For each order of ngram, calculate the numerator and
98
+ # denominator for the corpus-level modified precision.
99
+ for i, _ in enumerate(weights, start=1):
100
+ p_i = modified_precision(references, hypothesis, i)
101
+ p_numerators[i] += p_i.numerator
102
+ p_denominators[i] += p_i.denominator
103
+
104
+ # Calculate the hypothesis length and the closest reference length.
105
+ # Adds them to the corpus-level hypothesis and reference counts.
106
+ hyp_len = len(hypothesis)
107
+ hyp_lengths += hyp_len
108
+ ref_lengths += closest_ref_length(references, hyp_len)
109
+
110
+ # Calculate corpus-level brevity penalty.
111
+ if no_length_penalty and averaging_mode == "geometric":
112
+ bp = 1.0
113
+ elif no_length_penalty and averaging_mode == "arithmetic":
114
+ bp = 0.0
115
+ else:
116
+ assert not no_length_penalty
117
+ assert (
118
+ averaging_mode != "arithmetic"
119
+ ), "Not sure how to apply length penalty in arithmetic mode"
120
+ bp = brevity_penalty(ref_lengths, hyp_lengths)
121
+
122
+ # Uniformly re-weighting based on maximum hypothesis lengths if largest
123
+ # order of n-grams < 4 and weights is set at default.
124
+ if auto_reweigh:
125
+ if hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
126
+ weights = (1 / hyp_lengths,) * hyp_lengths
127
+
128
+ # Collects the various precision values for the different ngram orders.
129
+ p_n = [
130
+ Fraction(p_numerators[i], p_denominators[i], _normalize=False)
131
+ for i, _ in enumerate(weights, start=1)
132
+ ]
133
+
134
+ # Returns 0 if there's no matching n-grams
135
+ # We only need to check for p_numerators[1] == 0, since if there's
136
+ # no unigrams, there won't be any higher order ngrams.
137
+ if p_numerators[1] == 0:
138
+ return 0
139
+
140
+ # If there's no smoothing, use method0 from the SmoothingFunction class.
141
+ if not smoothing_function:
142
+ smoothing_function = SmoothingFunction().method0
143
+ # Smoothen the modified precision.
144
+ # Note: smoothing_function() may convert values into floats;
145
+ # it tries to retain the Fraction object as much as the
146
+ # smoothing method allows.
147
+ p_n = smoothing_function(
148
+ p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
149
+ )
150
+
151
+ if averaging_mode == "geometric":
152
+ s = (w_i * math.log(p_i) for w_i, p_i in zip(weights, p_n))
153
+ s = bp * math.exp(math.fsum(s))
154
+ elif averaging_mode == "arithmetic":
155
+ s = (w_i * p_i for w_i, p_i in zip(weights, p_n))
156
+ s = math.fsum(s)
157
+
158
+ return s
159
+
160
+
161
+ def sentence_bleu(
162
+ references,
163
+ hypothesis,
164
+ weights=(0.25, 0.25, 0.25, 0.25),
165
+ smoothing_function=None,
166
+ auto_reweigh=False,
167
+ averaging_mode="geometric",
168
+ no_length_penalty=False,
169
+ ):
170
+ return corpus_bleu(
171
+ [references],
172
+ [hypothesis],
173
+ weights,
174
+ smoothing_function,
175
+ auto_reweigh,
176
+ averaging_mode,
177
+ no_length_penalty,
178
+ )
179
+
180
+
181
+ def get_target_sequences(manifest, ground_truth, to_take=1000):
182
+ import json
183
+ import pathlib
184
+
185
+ with open(ground_truth, "r") as fin:
186
+ original_continuations = json.loads(fin.read())
187
+
188
+ sequence2length = [(k, v[0]) for k, v in original_continuations.items()]
189
+ assert all(float(v) >= 6.0 for (_, v) in sequence2length) # 6 seconds
190
+
191
+ sequence2length.sort(key=lambda x: x[1])
192
+ to_take_sequences = set(v[0] for v in sequence2length[:to_take])
193
+ to_take_ids = []
194
+
195
+ with open(manifest, "r") as f:
196
+ f.readline()
197
+
198
+ for i, line in enumerate(f.readlines()):
199
+ seq_id = line.split()[0]
200
+ seq_id = pathlib.Path(seq_id).name.split("__")[0]
201
+
202
+ if seq_id in to_take_sequences:
203
+ to_take_ids.append(i)
204
+
205
+ print(f"Took {len(to_take_ids)} ids")
206
+ return set(to_take_ids)
207
+
208
+
209
+ def get_self_bleu(utterances, averaging_mode, weights):
210
+ self_bleu = []
211
+
212
+ for i in range(len(utterances)):
213
+ hypo = utterances[i]
214
+ rest = utterances[:i] + utterances[i + 1 :]
215
+
216
+ self_bleu.append(
217
+ sentence_bleu(
218
+ rest,
219
+ hypo,
220
+ weights,
221
+ no_length_penalty=True,
222
+ averaging_mode=averaging_mode,
223
+ )
224
+ )
225
+
226
+ return self_bleu
227
+
228
+
229
+ def get_self_bleu2_arithmetic(utterances):
230
+ weights = (0.5, 0.5) # equal weight for unigrams and bigrams
231
+ return get_self_bleu(utterances, averaging_mode="arithmetic", weights=weights)
232
+
233
+
234
+ def get_self_bleu2_geometric(utterances):
235
+ weights = (0.5, 0.5)
236
+ return get_self_bleu(utterances, averaging_mode="geometric", weights=weights)
237
+
238
+
239
+ def get_auto_bleu2_arithmetic(utterances):
240
+ weights = (0.5, 0.5)
241
+ return [auto_bleu(u, mean_mode="arithmetic", weights=weights) for u in utterances]
242
+
243
+
244
+ def get_auto_bleu2_geometric(utterances):
245
+ weights = (0.5, 0.5)
246
+ return [auto_bleu(u, mean_mode="geometric", weights=weights) for u in utterances]
247
+
248
+
249
+ def get_auto_bleu3_geometric(utterances):
250
+ weights = (1.0 / 3, 1.0 / 3, 1.0 / 3)
251
+ return [auto_bleu(u, mean_mode="geometric", weights=weights) for u in utterances]
252
+
253
+
254
+ def get_auto_bleu3_arithmetic(utterances):
255
+ weights = (1.0 / 3, 1.0 / 3, 1.0 / 3)
256
+ return [auto_bleu(u, mean_mode="arithmetic", weights=weights) for u in utterances]
257
+
258
+
259
+ def get_self_bleu3_arithmetic(utterances):
260
+ weights = (1.0 / 3, 1.0 / 3, 1.0 / 3)
261
+ return get_self_bleu(utterances, averaging_mode="arithmetic", weights=weights)
262
+
263
+
264
+ def get_self_bleu3_geometric(utterances):
265
+ weights = (1.0 / 3, 1.0 / 3, 1.0 / 3)
266
+ return get_self_bleu(utterances, averaging_mode="geometric", weights=weights)
267
+
268
+
269
+ def auto_bleu(sentence, weights, mean_mode="arithmetic"):
270
+ if len(sentence) <= 1:
271
+ return 0
272
+
273
+ N = len(weights)
274
+
275
+ bleu_n = np.zeros([N])
276
+ for n in range(N):
277
+ targ_ngrams = list(nltk.ngrams(sentence, n + 1))
278
+ for p in range(len(targ_ngrams)):
279
+ left = sentence[:p]
280
+ right = sentence[(p + n + 1) :]
281
+ rest_ngrams = list(nltk.ngrams(left, n + 1)) + list(
282
+ nltk.ngrams(right, n + 1)
283
+ )
284
+ # compute the nb of matching ngrams
285
+ bleu_n[n] += targ_ngrams[p] in rest_ngrams
286
+ bleu_n[n] /= len(targ_ngrams) # average them to get a proportion
287
+
288
+ weights = np.array(weights)
289
+ if mean_mode == "arithmetic":
290
+ return (bleu_n * weights).sum()
291
+ elif mean_mode == "geometric":
292
+ return (bleu_n**weights).prod()
293
+ else:
294
+ raise ValueError(f"Unknown aggregation mode {mean_mode}")
295
+
296
+
297
+ def run_f(task_params):
298
+ f, terms = task_params
299
+ return f(terms)
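A short usage sketch (not part of the commit): scoring the lexical diversity of a toy set of tokenized utterances with the self-BLEU and auto-BLEU helpers above; both return per-utterance lists, so averaging is left to the caller. Note that corpus_bleu passes the private _normalize=False keyword to fractions.Fraction, which appears to have been removed in Python 3.12, so this module likely needs an older interpreter.

from pyscripts.utils.dialog_eval.vert import (
    get_auto_bleu2_geometric,
    get_self_bleu2_geometric,
)

utterances = [
    "the cat sat on the mat".split(),
    "a dog sat on the rug".split(),
    "the cat sat on the mat again".split(),
]
# Lower self-BLEU / auto-BLEU indicates more diverse, less repetitive outputs.
self_bleu2 = sum(get_self_bleu2_geometric(utterances)) / len(utterances)
auto_bleu2 = sum(get_auto_bleu2_geometric(utterances)) / len(utterances)
print(f"self-BLEU-2: {self_bleu2:.3f}  auto-BLEU-2: {auto_bleu2:.3f}")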
requirements.txt CHANGED
@@ -1 +1,18 @@
1
- huggingface_hub==0.25.2
1
+ typeguard==2.13.3
2
+ espnet @ git+https://github.com/siddhu001/espnet@sds_demo_recipe
3
+ espnet_model_zoo
4
+ huggingface_hub==0.23.2
5
+ transformers[sentencepiece]
6
+ sentencepiece
7
+ datasets
8
+ torch==2.5.1
9
+ torchaudio==2.5.1
10
+ librosa
11
+ sounddevice==0.5.0
12
+ webrtcvad-wheels
13
+ webrtcvad==2.0.10
14
+ ChatTTS
15
+ evaluate
16
+ snac==1.2.0
17
+ litgpt==0.4.3
18
+ gradio==4.43.0
temp_repo DELETED
@@ -1 +0,0 @@
1
- Subproject commit 8f631305c9bcacda1d4cfd24a9fcef74f518081e
versa.sh ADDED
@@ -0,0 +1,5 @@
1
+ git clone https://github.com/shinjiwlab/versa.git
2
+ cd versa
3
+ git checkout 64bf6fe22fbc8d43068afdf4e715864a18577735
4
+ pip install .
5
+ cd ..
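A hedged note (not part of the commit): this script pins Versa to a specific commit and presumably needs to be run once in the Space's environment before launch, since TTS_speech_quality.py imports versa lazily and raises if it is missing.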