import streamlit as st
import os
import importlib
import sys
from langchain_openai import OpenAI
import re
import json
from algos.PWS import *
from utils.util import *
from nodes.Worker import *
from prompts import fewshots
# Load API keys
# with open(os.path.join('./keys/', 'openai.key'), 'r') as f:
# os.environ["OPENAI_API_KEY"] = f.read().strip()
# with open(os.path.join('./keys/', 'serpapi.key'), 'r') as f:
# os.environ["SERPAPI_API_KEY"] = f.read().strip()
def reload_modules():
"""Reload all relevant modules"""
importlib.reload(sys.modules['nodes.Worker'])
importlib.reload(sys.modules['algos.PWS'])
importlib.reload(sys.modules['utils.util'])
importlib.reload(sys.modules['prompts.fewshots'])
importlib.reload(sys.modules['prompts.solver'])
return "βœ… Modules reloaded successfully!"
def process(tools, model, input_text):
# Use study abroad fewshot for study-related questions
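    # The keyword list mixes English and Vietnamese terms ("học" = study,
    # "trường" = school, "du học" = study abroad, "học bổng" = scholarship)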
if any(word in input_text.lower() for word in
["study", "student", "university", "college", "school", "abroad", "học", "trường", "du học", "học bα»•ng",
"gpa", "ielts", "tcf", "delf", "scholarship"]):
        # Ensure the LLM plus a search tool (Google or Duckduckgo) are selected
        print(tools)
        assert "LLM" in tools and ("Google" in tools or "Duckduckgo" in tools)
method = PWS_Base(planner_model=model, solver_model=model,
fewshot=fewshots.STUDY_ABROAD_PWS, available_tools=tools)
else:
method = PWS_Base(planner_model=model, solver_model=model,
fewshot=fewshots.TRIVIAQA_PWS, available_tools=tools)
response = method.run(input_text)
# Extract planner log
plan = response["planner_log"].split(input_text)[1].strip('\n')
# Extract full solver log without truncating at "Now begin to solve the task"
solve = response["solver_log"].split(input_text)[1].strip('\n')
# Get the complete output
output = response["output"]
return plan, solve, output
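# Example usage (hypothetical query; assumes OPENAI_API_KEY is set):
#   plan, solve, output = process(["Google", "LLM"], "gpt-3.5-turbo",
#                                 "What IELTS score do I need to study abroad?")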
def evaluate(response, plan, solve):
"""
Evaluate whether the response is based on evidence or contains hallucinations.
Args:
response: The assistant's full response
plan: The planning process
solve: The solving process with evidence
Returns:
Dictionary with reasoning, summary and evaluation status
"""
# Initialize OpenAI client
llm = OpenAI(temperature=0)
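    # temperature=0 keeps the LLM judge as deterministic as possible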
# Extract only evidence paragraphs from solve
evidence_blocks = []
for block in solve.split("\n\n"):
if "Evidence:" in block:
evidence_part = block.split("Evidence:", 1)[1].strip()
if evidence_part:
evidence_blocks.append(evidence_part)
# Combine evidence sources
evidence = "\n\n".join(evidence_blocks)
if not evidence:
evidence = solve # Fallback to using entire solve text if no evidence found
# Create prompt for evaluation
prompt = f"""
Evaluate whether the following response is factually supported by the provided evidence.
Response to evaluate:
{response}
Evidence:
{evidence}
Provide your evaluation in this format:
REASONING: Detailed analysis comparing the response against the evidence
SUMMARY: Brief summary of the evaluation
VERDICT: [SUPPORTED/PARTIALLY SUPPORTED/UNSUPPORTED] - Choose one verdict
"""
try:
result_text = llm.invoke(prompt).strip()
# Parse the structured output
reasoning = ""
summary = ""
verdict = "UNSUPPORTED"
if "REASONING:" in result_text:
parts = result_text.split("REASONING:", 1)
remainder = parts[1]
if "SUMMARY:" in remainder:
reasoning, remainder = remainder.split("SUMMARY:", 1)
if "VERDICT:" in remainder:
summary, verdict = remainder.split("VERDICT:", 1)
reasoning = reasoning.strip()
summary = summary.strip()
verdict = verdict.strip()
# Determine verdict category
verdict_category = "unsupported"
if "SUPPORTED" in verdict and not "PARTIALLY" in verdict and not "UNSUPPORTED" in verdict:
verdict_category = "supported"
elif "PARTIALLY" in verdict:
verdict_category = "partially_supported"
return {
"reasoning": reasoning,
"summary": summary,
"verdict": verdict,
"verdict_category": verdict_category
}
except Exception as e:
return {
"reasoning": f"Error during evaluation: {str(e)}",
"summary": "Could not complete evaluation",
"verdict": "EVALUATION FAILED",
"verdict_category": "error"
}
# Main app
st.set_page_config(page_title="ReWOO Demo", layout="wide")
st.title("ReWOO Demo πŸ€—")
st.markdown("""
Demonstrating our recent work -- ReWOO: Decoupling Reasoning from Observations for Efficient Augmented Language Models.
Note that this demo is only a conceptual impression of our work; we use a zero-shot setup and do not optimize the run time.
""")
# Initialize session state
if 'messages' not in st.session_state:
st.session_state.messages = []
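# Each assistant message also carries its "plan" and "solve" logs so the
# reasoning expander (and any evaluation) can be re-rendered from history.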
# Sidebar
with st.sidebar:
st.header("Configuration")
# Tools selection
tools = st.multiselect(
"Select Tools",
options=['Wikipedia', 'Google', 'LLM', 'WolframAlpha', 'Calculator', 'Duckduckgo'],
default=['Duckduckgo', 'LLM']
)
# Model selection
model = st.selectbox(
"Select Model",
options=["text-davinci-003", "gpt-3.5-turbo"],
index=1
)
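    # NOTE: "text-davinci-003" has been retired by OpenAI; "gpt-3.5-turbo"
    # (the default here) is the option that still works.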
# Refresh modules button
if st.button("πŸ”„ Refresh Modules"):
status = reload_modules()
st.success(status)
# Examples section
st.header("Examples")
if st.button("Example 1: American Callan Pinckney's system"):
example_text = "American Callan Pinckney's eponymously named system became a best-selling (1980s-2000s) book/video franchise in what genre?"
st.session_state.messages.append({"role": "user", "content": example_text})
with st.spinner('Processing...'):
plan, solve, output = process(["Wikipedia", "LLM"], "gpt-3.5-turbo", example_text)
st.session_state.messages.append({"role": "assistant", "content": output, "plan": plan, "solve": solve})
if st.button("Example 2: ReWOO paper"):
example_text = "What is the recent paper ReWOO: Decoupling Reasoning from Observations for Efficient Augmented Language Models about?"
st.session_state.messages.append({"role": "user", "content": example_text})
with st.spinner('Processing...'):
plan, solve, output = process(["Google", "LLM"], "gpt-3.5-turbo", example_text)
st.session_state.messages.append({"role": "assistant", "content": output, "plan": plan, "solve": solve})
if st.button("Example 3: Car acceleration"):
example_text = "the car can accelerate from 0 to 27.8 m/s in a time of 3.85 seconds. Determine the acceleration of this car in m/s/s."
st.session_state.messages.append({"role": "user", "content": example_text})
with st.spinner('Processing...'):
plan, solve, output = process(["Calculator", "WolframAlpha"], "gpt-3.5-turbo", example_text)
st.session_state.messages.append({"role": "assistant", "content": output, "plan": plan, "solve": solve})
# Display chat history
for i, message in enumerate(st.session_state.messages):
if message["role"] == "user":
st.chat_message("user").write(message["content"])
else:
with st.chat_message("assistant"):
st.write(message["content"])
with st.expander("Show reasoning process"):
st.subheader("Planner")
st.text(message["plan"])
st.subheader("Solver")
st.text(message["solve"])
# Add evaluate button in the expander
if "evaluation_results" not in message:
if st.button("πŸ” Evaluate", key=f"eval_btn_{i}", type="secondary"):
with st.spinner("Evaluating response..."):
results = evaluate(message["content"], message["plan"], message["solve"])
st.session_state.messages[i]["evaluation_results"] = results
st.rerun()
else:
# Show evaluation in an expander
with st.expander("Evaluation Results"):
results = message["evaluation_results"]
# Display verdict with color
verdict_color = "red"
if results["verdict_category"] == "supported":
verdict_color = "green"
elif results["verdict_category"] == "partially_supported":
verdict_color = "orange"
st.markdown(f"<h4 style='color: {verdict_color};'>{results['verdict']}</h4>",
unsafe_allow_html=True)
st.subheader("Summary")
st.write(results["summary"])
st.subheader("Detailed Analysis")
st.write(results["reasoning"])
# User input
if prompt := st.chat_input("Ask something..."):
st.session_state.messages.append({"role": "user", "content": prompt})
st.chat_message("user").write(prompt)
with st.chat_message("assistant"):
with st.spinner('Researching...'):
plan, solve, output = process(tools, model, prompt)
st.write(output)
with st.expander("Show research process"):
st.subheader("Planner")
st.text(plan)
st.subheader("Solver")
st.text(solve)
# Add evaluate button in expander for current response
if st.button("πŸ” Evaluate", key="eval_current", type="secondary"):
with st.spinner("Evaluating response..."):
results = evaluate(output, plan, solve)
# Show evaluation in an expander
with st.expander("Evaluation Results"):
# Display verdict with color
verdict_color = "red"
if results["verdict_category"] == "supported":
verdict_color = "green"
elif results["verdict_category"] == "partially_supported":
verdict_color = "orange"
st.markdown(f"<h4 style='color: {verdict_color};'>{results['verdict']}</h4>",
unsafe_allow_html=True)
st.subheader("Summary")
st.write(results["summary"])
st.subheader("Detailed Analysis")
st.write(results["reasoning"])
# Store evaluation results
for i in range(len(st.session_state.messages)):
if st.session_state.messages[i]["role"] == "assistant" and \
st.session_state.messages[i]["content"] == output:
st.session_state.messages[i]["evaluation_results"] = results
break
# Add assistant response to chat history
st.session_state.messages.append({"role": "assistant", "content": output, "plan": plan, "solve": solve})
# Clear chat button
if st.sidebar.button("Clear Chat"):
st.session_state.messages = []
st.rerun()