import streamlit as st
import os
import importlib
import sys
from langchain_openai import OpenAI
import re
import json
from algos.PWS import *
from utils.util import *
from nodes.Worker import *
from prompts import fewshots
# Load API keys
# with open(os.path.join('./keys/', 'openai.key'), 'r') as f:
#     os.environ["OPENAI_API_KEY"] = f.read().strip()
# with open(os.path.join('./keys/', 'serpapi.key'), 'r') as f:
#     os.environ["SERPAPI_API_KEY"] = f.read().strip()

def reload_modules():
    """Reload all relevant modules"""
    importlib.reload(sys.modules['nodes.Worker'])
    importlib.reload(sys.modules['algos.PWS'])
    importlib.reload(sys.modules['utils.util'])
    importlib.reload(sys.modules['prompts.fewshots'])
    importlib.reload(sys.modules['prompts.solver'])
    return "✅ Modules reloaded successfully!"

def process(tools, model, input_text):
    # Use the study-abroad fewshot for study-related questions
    if any(word in input_text.lower() for word in
           ["study", "student", "university", "college", "school", "abroad", "học", "trường", "du học", "học bổng",
            "gpa", "ielts", "tcf", "delf", "scholarship"]):
        # Study-abroad queries need the LLM plus at least one web search tool
        print(tools)
        assert "LLM" in tools and ("Google" in tools or "Duckduckgo" in tools)
        method = PWS_Base(planner_model=model, solver_model=model,
                          fewshot=fewshots.STUDY_ABROAD_PWS, available_tools=tools)
    else:
        method = PWS_Base(planner_model=model, solver_model=model,
                          fewshot=fewshots.TRIVIAQA_PWS, available_tools=tools)
    response = method.run(input_text)
    # Extract the planner log (everything after the original question)
    plan = response["planner_log"].split(input_text)[1].strip('\n')
    # Extract the full solver log without truncating at "Now begin to solve the task"
    solve = response["solver_log"].split(input_text)[1].strip('\n')
    # Get the complete output
    output = response["output"]
    return plan, solve, output
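
# Usage sketch (illustrative only -- the question below is made up):
#     plan, solve, output = process(["Google", "LLM"], "gpt-3.5-turbo",
#                                   "Which scholarships require an IELTS score?")
# This assumes PWS_Base.run(...) returns a dict with "planner_log", "solver_log"
# and "output" keys, and that both logs contain the original question exactly
# once, so the prompt prefix can be split off before display.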

def evaluate(response, plan, solve):
    """
    Evaluate whether the response is based on evidence or contains hallucinations.

    Args:
        response: The assistant's full response
        plan: The planning process
        solve: The solving process with evidence

    Returns:
        Dictionary with reasoning, summary and evaluation status
    """
    # Initialize the OpenAI client
    llm = OpenAI(temperature=0)
    # Extract only evidence paragraphs from solve
    evidence_blocks = []
    for block in solve.split("\n\n"):
        if "Evidence:" in block:
            evidence_part = block.split("Evidence:", 1)[1].strip()
            if evidence_part:
                evidence_blocks.append(evidence_part)
    # Combine evidence sources
    evidence = "\n\n".join(evidence_blocks)
    if not evidence:
        evidence = solve  # Fall back to the entire solve text if no evidence was found
    # Create the evaluation prompt
    prompt = f"""
    Evaluate whether the following response is factually supported by the provided evidence.
    Response to evaluate:
    {response}
    Evidence:
    {evidence}
    Provide your evaluation in this format:
    REASONING: Detailed analysis comparing the response against the evidence
    SUMMARY: Brief summary of the evaluation
    VERDICT: [SUPPORTED/PARTIALLY SUPPORTED/UNSUPPORTED] - Choose one verdict
    """
    try:
        result_text = llm.invoke(prompt).strip()
        # Parse the structured output
        reasoning = ""
        summary = ""
        verdict = "UNSUPPORTED"
        if "REASONING:" in result_text:
            parts = result_text.split("REASONING:", 1)
            remainder = parts[1]
            if "SUMMARY:" in remainder:
                reasoning, remainder = remainder.split("SUMMARY:", 1)
                if "VERDICT:" in remainder:
                    summary, verdict = remainder.split("VERDICT:", 1)
        reasoning = reasoning.strip()
        summary = summary.strip()
        verdict = verdict.strip()
        # Determine the verdict category. Note that "SUPPORTED" is a substring of
        # both "PARTIALLY SUPPORTED" and "UNSUPPORTED", so those are ruled out first.
        verdict_category = "unsupported"
        if "SUPPORTED" in verdict and "PARTIALLY" not in verdict and "UNSUPPORTED" not in verdict:
            verdict_category = "supported"
        elif "PARTIALLY" in verdict:
            verdict_category = "partially_supported"
        return {
            "reasoning": reasoning,
            "summary": summary,
            "verdict": verdict,
            "verdict_category": verdict_category
        }
    except Exception as e:
        return {
            "reasoning": f"Error during evaluation: {str(e)}",
            "summary": "Could not complete evaluation",
            "verdict": "EVALUATION FAILED",
            "verdict_category": "error"
        }
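
# Illustrative shape of evaluate()'s return value (values are made up):
#     {"reasoning": "The response matches the cited evidence ...",
#      "summary": "All claims are backed by the retrieved evidence.",
#      "verdict": "SUPPORTED",
#      "verdict_category": "supported"}
# The UI below maps verdict_category to a colour (green / orange / red) when
# rendering the verdict header.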

# Main app
st.set_page_config(page_title="ReWOO Demo", layout="wide")
st.title("ReWOO Demo 🤖")
st.markdown("""
Demonstrating our recent work -- ReWOO: Decoupling Reasoning from Observations for Efficient Augmented Language Models.
Note that this demo is only a conceptual impression of our work; we use a zero-shot setup and do not optimize the runtime.
""")
# Initialize session state
if 'messages' not in st.session_state:
    st.session_state.messages = []
# Sidebar
with st.sidebar:
    st.header("Configuration")

    # Tools selection
    tools = st.multiselect(
        "Select Tools",
        options=['Wikipedia', 'Google', 'LLM', 'WolframAlpha', 'Calculator', 'Duckduckgo'],
        default=['Duckduckgo', 'LLM']
    )

    # Model selection
    model = st.selectbox(
        "Select Model",
        options=["text-davinci-003", "gpt-3.5-turbo"],
        index=1
    )

    # Refresh modules button
    if st.button("🔄 Refresh Modules"):
        status = reload_modules()
        st.success(status)

    # Examples section
    st.header("Examples")
if st.button("Example 1: American Callan Pinckney's system"): | |
example_text = "American Callan Pinckney's eponymously named system became a best-selling (1980s-2000s) book/video franchise in what genre?" | |
st.session_state.messages.append({"role": "user", "content": example_text}) | |
with st.spinner('Processing...'): | |
plan, solve, output = process(["Wikipedia", "LLM"], "gpt-3.5-turbo", example_text) | |
st.session_state.messages.append({"role": "assistant", "content": output, "plan": plan, "solve": solve}) | |
if st.button("Example 2: ReWOO paper"): | |
example_text = "What is the recent paper ReWOO: Decoupling Reasoning from Observations for Efficient Augmented Language Models about?" | |
st.session_state.messages.append({"role": "user", "content": example_text}) | |
with st.spinner('Processing...'): | |
plan, solve, output = process(["Google", "LLM"], "gpt-3.5-turbo", example_text) | |
st.session_state.messages.append({"role": "assistant", "content": output, "plan": plan, "solve": solve}) | |
if st.button("Example 3: Car acceleration"): | |
example_text = "the car can accelerate from 0 to 27.8 m/s in a time of 3.85 seconds. Determine the acceleration of this car in m/s/s." | |
st.session_state.messages.append({"role": "user", "content": example_text}) | |
with st.spinner('Processing...'): | |
plan, solve, output = process(["Calculator", "WolframAlpha"], "gpt-3.5-turbo", example_text) | |
st.session_state.messages.append({"role": "assistant", "content": output, "plan": plan, "solve": solve}) | |
# Display chat history
for i, message in enumerate(st.session_state.messages):
    if message["role"] == "user":
        st.chat_message("user").write(message["content"])
    else:
        with st.chat_message("assistant"):
            st.write(message["content"])
            with st.expander("Show reasoning process"):
                st.subheader("Planner")
                st.text(message["plan"])
                st.subheader("Solver")
                st.text(message["solve"])
                # Add evaluate button in the expander
                if "evaluation_results" not in message:
                    if st.button("🔍 Evaluate", key=f"eval_btn_{i}", type="secondary"):
                        with st.spinner("Evaluating response..."):
                            results = evaluate(message["content"], message["plan"], message["solve"])
                            st.session_state.messages[i]["evaluation_results"] = results
                            st.rerun()
            if "evaluation_results" in message:
                # Show evaluation in a separate expander (Streamlit does not allow
                # nested expanders, so this sits next to the reasoning expander)
                with st.expander("Evaluation Results"):
                    results = message["evaluation_results"]
                    # Display verdict with color
                    verdict_color = "red"
                    if results["verdict_category"] == "supported":
                        verdict_color = "green"
                    elif results["verdict_category"] == "partially_supported":
                        verdict_color = "orange"
                    st.markdown(f"<h4 style='color: {verdict_color};'>{results['verdict']}</h4>",
                                unsafe_allow_html=True)
                    st.subheader("Summary")
                    st.write(results["summary"])
                    st.subheader("Detailed Analysis")
                    st.write(results["reasoning"])
# User input
if prompt := st.chat_input("Ask something..."):
    st.session_state.messages.append({"role": "user", "content": prompt})
    st.chat_message("user").write(prompt)

    with st.chat_message("assistant"):
        with st.spinner('Researching...'):
            plan, solve, output = process(tools, model, prompt)
            st.write(output)

        with st.expander("Show research process"):
            st.subheader("Planner")
            st.text(plan)
            st.subheader("Solver")
            st.text(solve)
        # Add evaluate button for the current response (kept outside the research
        # expander, since Streamlit does not allow nested expanders)
        if st.button("🔍 Evaluate", key="eval_current", type="secondary"):
            with st.spinner("Evaluating response..."):
                results = evaluate(output, plan, solve)
            # Show evaluation in an expander
            with st.expander("Evaluation Results"):
                # Display verdict with color
                verdict_color = "red"
                if results["verdict_category"] == "supported":
                    verdict_color = "green"
                elif results["verdict_category"] == "partially_supported":
                    verdict_color = "orange"
                st.markdown(f"<h4 style='color: {verdict_color};'>{results['verdict']}</h4>",
                            unsafe_allow_html=True)
                st.subheader("Summary")
                st.write(results["summary"])
                st.subheader("Detailed Analysis")
                st.write(results["reasoning"])
            # Store evaluation results on the matching chat-history entry
            for i in range(len(st.session_state.messages)):
                if st.session_state.messages[i]["role"] == "assistant" and \
                        st.session_state.messages[i]["content"] == output:
                    st.session_state.messages[i]["evaluation_results"] = results
                    break

    # Add assistant response to chat history
    st.session_state.messages.append({"role": "assistant", "content": output, "plan": plan, "solve": solve})
# Clear chat button
if st.sidebar.button("Clear Chat"):
    st.session_state.messages = []
    st.rerun()