import streamlit as st
import os
import importlib
import sys
from langchain_openai import OpenAI
import re
import json
from algos.PWS import *
from utils.util import *
from nodes.Worker import *
from prompts import fewshots
# Load API keys
# with open(os.path.join('./keys/', 'openai.key'), 'r') as f:
# os.environ["OPENAI_API_KEY"] = f.read().strip()
# with open(os.path.join('./keys/', 'serpapi.key'), 'r') as f:
# os.environ["SERPAPI_API_KEY"] = f.read().strip()
def reload_modules():
"""Reload all relevant modules"""
importlib.reload(sys.modules['nodes.Worker'])
importlib.reload(sys.modules['algos.PWS'])
importlib.reload(sys.modules['utils.util'])
importlib.reload(sys.modules['prompts.fewshots'])
importlib.reload(sys.modules['prompts.solver'])
return "βœ… Modules reloaded successfully!"
def process(tools, model, input_text):
# Use study abroad fewshot for study-related questions
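    # The keyword list mixes English and Vietnamese terms ("học" = study,
    # "trường" = school, "du học" = study abroad, "học bổng" = scholarship)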
if any(word in input_text.lower() for word in
["study", "student", "university", "college", "school", "abroad", "học", "trường", "du học", "học bα»•ng",
"gpa", "ielts", "tcf", "delf", "scholarship"]):
        # Ensure the LLM plus a search tool (Google or Duckduckgo) are selected
        print(tools)
        assert "LLM" in tools and ("Google" in tools or "Duckduckgo" in tools)
method = PWS_Base(planner_model=model, solver_model=model,
fewshot=fewshots.STUDY_ABROAD_PWS, available_tools=tools)
else:
method = PWS_Base(planner_model=model, solver_model=model,
fewshot=fewshots.TRIVIAQA_PWS, available_tools=tools)
response = method.run(input_text)
# Extract planner log
plan = response["planner_log"].split(input_text)[1].strip('\n')
# Extract full solver log without truncating at "Now begin to solve the task"
solve = response["solver_log"].split(input_text)[1].strip('\n')
# Get the complete output
output = response["output"]
return plan, solve, output
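# Example usage (hypothetical query; assumes OPENAI_API_KEY is set):
#   plan, solve, output = process(["Google", "LLM"], "gpt-3.5-turbo",
#                                 "What IELTS score do I need to study abroad?")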
def evaluate(response, plan, solve):
"""
Evaluate whether the response is based on evidence or contains hallucinations.
Args:
response: The assistant's full response
plan: The planning process
solve: The solving process with evidence
Returns:
Dictionary with reasoning, summary and evaluation status
"""
# Initialize OpenAI client
llm = OpenAI(temperature=0)
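    # temperature=0 keeps the LLM judge as deterministic as possible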
# Extract only evidence paragraphs from solve
evidence_blocks = []
for block in solve.split("\n\n"):
if "Evidence:" in block:
evidence_part = block.split("Evidence:", 1)[1].strip()
if evidence_part:
evidence_blocks.append(evidence_part)
# Combine evidence sources
evidence = "\n\n".join(evidence_blocks)
if not evidence:
evidence = solve # Fallback to using entire solve text if no evidence found
# Create prompt for evaluation
prompt = f"""
Evaluate whether the following response is factually supported by the provided evidence.
Response to evaluate:
{response}
Evidence:
{evidence}
Provide your evaluation in this format:
REASONING: Detailed analysis comparing the response against the evidence
SUMMARY: Brief summary of the evaluation
VERDICT: [SUPPORTED/PARTIALLY SUPPORTED/UNSUPPORTED] - Choose one verdict
"""
try:
result_text = llm.invoke(prompt).strip()
# Parse the structured output
reasoning = ""
summary = ""
verdict = "UNSUPPORTED"
if "REASONING:" in result_text:
parts = result_text.split("REASONING:", 1)
remainder = parts[1]
if "SUMMARY:" in remainder:
reasoning, remainder = remainder.split("SUMMARY:", 1)
if "VERDICT:" in remainder:
summary, verdict = remainder.split("VERDICT:", 1)
reasoning = reasoning.strip()
summary = summary.strip()
verdict = verdict.strip()
# Determine verdict category
verdict_category = "unsupported"
if "SUPPORTED" in verdict and not "PARTIALLY" in verdict and not "UNSUPPORTED" in verdict:
verdict_category = "supported"
elif "PARTIALLY" in verdict:
verdict_category = "partially_supported"
return {
"reasoning": reasoning,
"summary": summary,
"verdict": verdict,
"verdict_category": verdict_category
}
except Exception as e:
return {
"reasoning": f"Error during evaluation: {str(e)}",
"summary": "Could not complete evaluation",
"verdict": "EVALUATION FAILED",
"verdict_category": "error"
}
# Main app
st.set_page_config(page_title="ReWOO Demo", layout="wide")
st.title("ReWOO Demo πŸ€—")
st.markdown("""
Demonstrating our recent work -- ReWOO: Decoupling Reasoning from Observations for Efficient Augmented Language Models.
Note that this demo is only a conceptual impression of our work; we use a zero-shot setup and do not optimize the run time.
""")
# Initialize session state
if 'messages' not in st.session_state:
st.session_state.messages = []
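# Each assistant message also carries its "plan" and "solve" logs so the
# reasoning expander (and any evaluation) can be re-rendered from history.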
# Sidebar
with st.sidebar:
st.header("Configuration")
# Tools selection
tools = st.multiselect(
"Select Tools",
options=['Wikipedia', 'Google', 'LLM', 'WolframAlpha', 'Calculator', 'Duckduckgo'],
default=['Duckduckgo', 'LLM']
)
# Model selection
model = st.selectbox(
"Select Model",
options=["text-davinci-003", "gpt-3.5-turbo"],
index=1
)
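    # NOTE: "text-davinci-003" has been retired by OpenAI; "gpt-3.5-turbo"
    # (the default here) is the option that still works.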
# Refresh modules button
if st.button("πŸ”„ Refresh Modules"):
status = reload_modules()
st.success(status)
# Examples section
st.header("Examples")
if st.button("Example 1: American Callan Pinckney's system"):
example_text = "American Callan Pinckney's eponymously named system became a best-selling (1980s-2000s) book/video franchise in what genre?"
st.session_state.messages.append({"role": "user", "content": example_text})
with st.spinner('Processing...'):
plan, solve, output = process(["Wikipedia", "LLM"], "gpt-3.5-turbo", example_text)
st.session_state.messages.append({"role": "assistant", "content": output, "plan": plan, "solve": solve})
if st.button("Example 2: ReWOO paper"):
example_text = "What is the recent paper ReWOO: Decoupling Reasoning from Observations for Efficient Augmented Language Models about?"
st.session_state.messages.append({"role": "user", "content": example_text})
with st.spinner('Processing...'):
plan, solve, output = process(["Google", "LLM"], "gpt-3.5-turbo", example_text)
st.session_state.messages.append({"role": "assistant", "content": output, "plan": plan, "solve": solve})
if st.button("Example 3: Car acceleration"):
example_text = "the car can accelerate from 0 to 27.8 m/s in a time of 3.85 seconds. Determine the acceleration of this car in m/s/s."
st.session_state.messages.append({"role": "user", "content": example_text})
with st.spinner('Processing...'):
plan, solve, output = process(["Calculator", "WolframAlpha"], "gpt-3.5-turbo", example_text)
st.session_state.messages.append({"role": "assistant", "content": output, "plan": plan, "solve": solve})
# Display chat history
for i, message in enumerate(st.session_state.messages):
if message["role"] == "user":
st.chat_message("user").write(message["content"])
else:
with st.chat_message("assistant"):
st.write(message["content"])
with st.expander("Show reasoning process"):
st.subheader("Planner")
st.text(message["plan"])
st.subheader("Solver")
st.text(message["solve"])
# Add evaluate button in the expander
if "evaluation_results" not in message:
if st.button("πŸ” Evaluate", key=f"eval_btn_{i}", type="secondary"):
with st.spinner("Evaluating response..."):
results = evaluate(message["content"], message["plan"], message["solve"])
st.session_state.messages[i]["evaluation_results"] = results
st.rerun()
else:
# Show evaluation in an expander
with st.expander("Evaluation Results"):
results = message["evaluation_results"]
# Display verdict with color
verdict_color = "red"
if results["verdict_category"] == "supported":
verdict_color = "green"
elif results["verdict_category"] == "partially_supported":
verdict_color = "orange"
st.markdown(f"<h4 style='color: {verdict_color};'>{results['verdict']}</h4>",
unsafe_allow_html=True)
st.subheader("Summary")
st.write(results["summary"])
st.subheader("Detailed Analysis")
st.write(results["reasoning"])
# User input
if prompt := st.chat_input("Ask something..."):
st.session_state.messages.append({"role": "user", "content": prompt})
st.chat_message("user").write(prompt)
with st.chat_message("assistant"):
with st.spinner('Researching...'):
plan, solve, output = process(tools, model, prompt)
st.write(output)
with st.expander("Show research process"):
st.subheader("Planner")
st.text(plan)
st.subheader("Solver")
st.text(solve)
# Add evaluate button in expander for current response
if st.button("πŸ” Evaluate", key="eval_current", type="secondary"):
with st.spinner("Evaluating response..."):
results = evaluate(output, plan, solve)
# Show evaluation in an expander
with st.expander("Evaluation Results"):
# Display verdict with color
verdict_color = "red"
if results["verdict_category"] == "supported":
verdict_color = "green"
elif results["verdict_category"] == "partially_supported":
verdict_color = "orange"
st.markdown(f"<h4 style='color: {verdict_color};'>{results['verdict']}</h4>",
unsafe_allow_html=True)
st.subheader("Summary")
st.write(results["summary"])
st.subheader("Detailed Analysis")
st.write(results["reasoning"])
# Store evaluation results
for i in range(len(st.session_state.messages)):
if st.session_state.messages[i]["role"] == "assistant" and \
st.session_state.messages[i]["content"] == output:
st.session_state.messages[i]["evaluation_results"] = results
break
# Add assistant response to chat history
st.session_state.messages.append({"role": "assistant", "content": output, "plan": plan, "solve": solve})
# Clear chat button
if st.sidebar.button("Clear Chat"):
st.session_state.messages = []
st.rerun()