Update app.py
Browse files
app.py
CHANGED
@@ -2,72 +2,316 @@ import os
|
|
2 |
import gradio as gr
|
3 |
import requests
|
4 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
5 |
|
|
|
6 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
7 |
|
8 |
-
|
9 |
-
CodeAgent,
|
10 |
-
tool,
|
11 |
-
DuckDuckGoSearchTool,
|
12 |
-
PythonInterpreterTool,
|
13 |
-
FinalAnswerTool,
|
14 |
-
)
|
15 |
-
from smolagents.llms.anthropic_api import AnthropicModel
|
16 |
-
|
17 |
-
# Прокладка над DuckDuckGoSearchTool, возвращает "" при ошибках
|
18 |
-
@tool
|
19 |
-
def web_search(query: str) -> str:
|
20 |
-
"""
|
21 |
-
Performs a web search using DuckDuckGo.
|
22 |
-
Args:
|
23 |
-
query: The search query string.
|
24 |
-
Returns:
|
25 |
-
The raw text results, or empty string on failure.
|
26 |
-
"""
|
27 |
-
try:
|
28 |
-
return DuckDuckGoSearchTool()(query=query)
|
29 |
-
except Exception:
|
30 |
-
return ""
|
31 |
-
|
32 |
-
class BasicAgent:
|
33 |
def __init__(self):
|
34 |
-
print("Initializing
|
35 |
-
#
|
36 |
-
claude_key = os.environ
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
-
def
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
|
65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
|
70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
"""
|
72 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
73 |
and displays the results.
|
@@ -128,7 +372,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
128 |
print(f"Skipping item with missing task_id or question: {item}")
|
129 |
continue
|
130 |
try:
|
131 |
-
submitted_answer = agent(question_text)
|
132 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
133 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
134 |
except Exception as e:
|
@@ -190,19 +434,14 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
190 |
|
191 |
# --- Build Gradio Interface using Blocks ---
|
192 |
with gr.Blocks() as demo:
|
193 |
-
gr.Markdown("#
|
194 |
gr.Markdown(
|
195 |
"""
|
196 |
**Instructions:**
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
---
|
203 |
-
**Disclaimers:**
|
204 |
-
Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
|
205 |
-
This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
|
206 |
"""
|
207 |
)
|
208 |
|
@@ -211,7 +450,6 @@ with gr.Blocks() as demo:
|
|
211 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
212 |
|
213 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
214 |
-
# Removed max_rows=10 from DataFrame constructor
|
215 |
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
216 |
|
217 |
run_button.click(
|
@@ -240,5 +478,5 @@ if __name__ == "__main__":
|
|
240 |
|
241 |
print("-"*(60 + len(" App Starting ")) + "\n")
|
242 |
|
243 |
-
print("Launching Gradio Interface for
|
244 |
demo.launch(debug=True, share=False)
|
|
|
2 |
import gradio as gr
|
3 |
import requests
|
4 |
import pandas as pd
|
5 |
+
import json
|
6 |
+
import re
|
7 |
+
import base64
|
8 |
+
from typing import Optional, Dict, List, Any
|
9 |
+
import anthropic
|
10 |
|
11 |
+
# API URL для GAIA
|
12 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
13 |
|
14 |
+
class GAIAAgent:
    """Agent that answers GAIA benchmark questions using Anthropic's Claude.

    Each question is optionally augmented with DuckDuckGo Instant Answer
    search results and the content of the task's attached file (image or
    text) before being sent to Claude, which is instructed to reply with a
    bare, exact-match-formatted answer.
    """

    def __init__(self):
        print("Initializing GAIA Agent powered by Claude...")
        # The Claude API key must be supplied via the environment.
        self.claude_key = os.environ.get("ANTHROPIC_API_KEY")
        if not self.claude_key:
            raise ValueError("ANTHROPIC_API_KEY not found in environment variables")

        # Anthropic client used for all model calls.
        self.client = anthropic.Anthropic(api_key=self.claude_key)

        # GAIA scoring API base URL.
        self.api_url = DEFAULT_API_URL

        # Caches: search results keyed by query, fetched files keyed by task_id.
        self.search_cache = {}
        self.file_cache = {}

        # System prompt steering Claude toward GAIA's exact-match answer format.
        self.system_prompt = """
You are an AI assistant specially designed to answer questions from the GAIA benchmark with exceptional accuracy.
The GAIA benchmark evaluates AI's ability to perform real-world tasks that require reasoning, web browsing, and tool use.

Your goal is to provide the EXACT answer in the format requested by each question. GAIA uses exact matching for evaluation.

Guidelines for GAIA answers:
1. Provide ONLY the final answer, with NO explanations, reasoning, or additional text
2. Format is critical - follow the instructions in the question precisely
3. For comma-separated lists, provide "item1, item2, item3" with no quotes or extra punctuation
4. For numeric answers, provide just the number without units unless specifically requested
5. Maintain exact capitalization and spacing as requested in the question
6. If asked to order items, follow the requested ordering precisely

Examples of correct formatting:
- If asked for fruits in alphabetical order: "apples, bananas, oranges"
- If asked for a single word: "photosynthesis"
- If asked for a number: "42"
- If asked for a date in MM/DD/YY format: "05/04/25"

Remember, your score depends on exact matching against the reference answer.
"""

    def search_web(self, query: str) -> str:
        """Search the web via the DuckDuckGo Instant Answer API, with caching.

        Returns a newline-joined summary of the abstract and up to five
        related topics. On failure returns "" so callers can safely skip
        injecting results into the model prompt.
        """
        if query in self.search_cache:
            print(f"Using cached search results for: {query}")
            return self.search_cache[query]

        print(f"Performing web search for: {query}")
        try:
            # DuckDuckGo Instant Answer API (no API key required).
            response = requests.get(
                "https://api.duckduckgo.com/",
                params={"q": query, "format": "json"},
                timeout=10
            )
            data = response.json()

            # Collect results from the relevant response fields.
            results = []
            if data.get("AbstractText"):
                results.append(f"Abstract: {data['AbstractText']}")
            if data.get("RelatedTopics"):
                topics = data.get("RelatedTopics", [])
                for i, topic in enumerate(topics[:5]):  # cap at 5 results
                    if isinstance(topic, dict) and topic.get("Text"):
                        results.append(f"Related Topic {i+1}: {topic['Text']}")

            result_text = "\n\n".join(results) if results else "No results found"

            # Cache and return the results (failures are deliberately not cached
            # so a transient error does not poison later lookups).
            self.search_cache[query] = result_text
            return result_text
        except Exception as e:
            # FIX: return "" instead of an error string so that a failed
            # search is not injected into the prompt as fake search results.
            print(f"Web search error: {e}")
            return ""

    def fetch_file(self, task_id: str) -> Optional[Dict[str, Any]]:
        """Fetch and classify the file attached to a task, with caching.

        Returns a dict with keys "content", "content_type", "size", "type"
        (image/pdf/text/binary) plus "base64" for images or "text" for
        decodable text files, or None when no file could be retrieved.
        """
        if task_id in self.file_cache:
            print(f"Using cached file for task: {task_id}")
            return self.file_cache[task_id]

        print(f"Fetching file for task: {task_id}")
        try:
            response = requests.get(f"{self.api_url}/files/{task_id}", timeout=15)

            if response.status_code == 200:
                file_content = response.content
                file_info = {
                    "content": file_content,
                    "content_type": response.headers.get("Content-Type", ""),
                    "size": len(file_content)
                }

                # Classify the file by Content-Type and post-process accordingly.
                content_type = file_info["content_type"].lower()

                if "image" in content_type:
                    # Images are base64-encoded for Claude's vision input.
                    file_info["base64"] = base64.b64encode(file_content).decode('utf-8')
                    file_info["type"] = "image"
                    print(f"Processed image file ({file_info['size']} bytes)")
                elif "pdf" in content_type:
                    # PDFs are only flagged; their content is not extracted here.
                    file_info["type"] = "pdf"
                    print(f"Detected PDF file ({file_info['size']} bytes)")
                elif "text" in content_type or "json" in content_type or "csv" in content_type:
                    # Text-like files: try to decode as UTF-8.
                    try:
                        file_info["text"] = file_content.decode('utf-8')
                        file_info["type"] = "text"
                        print(f"Processed text file ({file_info['size']} bytes)")
                    except UnicodeDecodeError:
                        file_info["type"] = "binary"
                        print(f"Could not decode text file ({file_info['size']} bytes)")
                else:
                    file_info["type"] = "binary"
                    print(f"Detected binary file ({file_info['size']} bytes, {content_type})")

                # Cache the processed file.
                self.file_cache[task_id] = file_info
                return file_info
            else:
                print(f"Failed to fetch file, status code: {response.status_code}")
                print(f"Response: {response.text[:1000]}")
                return None
        except Exception as e:
            print(f"Error fetching file: {e}")
            return None

    def extract_answer(self, response_text: str) -> str:
        """Extract just the final answer from Claude's response."""
        # Drop obvious lead-in phrases ("the answer is", "final answer", ...).
        cleaned = re.sub(r'^(final answer|the answer is|answer|Here\'s the answer|response):?\s*', '', response_text, flags=re.IGNORECASE)

        # Drop trailing explanations.
        cleaned = re.sub(r'\n.*?explain.*?$', '', cleaned, flags=re.IGNORECASE | re.DOTALL)

        # For multi-line output, keep only the first line when it looks like a
        # complete answer on its own.
        lines = cleaned.strip().split('\n')
        if len(lines) > 1:
            first_line = lines[0].strip()
            if len(first_line) > 5 and not first_line.startswith('I ') and not first_line.startswith('The '):
                return first_line

        # Strip one pair of surrounding double quotes, if present.
        cleaned = cleaned.strip()
        if cleaned.startswith('"') and cleaned.endswith('"'):
            cleaned = cleaned[1:-1]

        return cleaned.strip()

    def process_question(self, question: str, task_id: Optional[str] = None) -> Dict[str, Any]:
        """Analyze a question: detect math/list hints, an explicit format
        requirement, and any attached task file, for use when prompting Claude."""
        question_info = {
            "original": question,
            "task_id": task_id,
            "has_file": False,
            "file_info": None,
            "contains_math": bool(re.search(r'calculate|compute|sum|average|mean|median|formula|equation', question, re.IGNORECASE)),
            "requires_list": bool(re.search(r'list|order|sequence|rank|items|elements|values', question, re.IGNORECASE)),
            "format_requirements": None
        }

        # Capture an explicit format hint if the question states one.
        format_match = re.search(r'(format|in the format|formatted as|as a|in) ([^\.]+)', question, re.IGNORECASE)
        if format_match:
            question_info["format_requirements"] = format_match.group(2).strip()

        # Attach the task file, fetching it exactly once.
        # FIX: the original called self.fetch_file(task_id) twice.
        if task_id:
            file_info = self.fetch_file(task_id)
            if file_info:
                question_info["has_file"] = True
                question_info["file_info"] = file_info

        return question_info

    def __call__(self, question: str, task_id: Optional[str] = None) -> str:
        """Main entry point: process a question and return a clean answer string."""
        if task_id is None:
            # Try to pull a task_id out of the question text, if present.
            match = re.search(r'task[\s_-]?id:?\s*(\w+)', question, re.IGNORECASE)
            if match:
                task_id = match.group(1)

        print(f"Processing question for task_id: {task_id}")
        print(f"Question: {question[:100]}...")

        # Analyze the question (format hints, attached file, etc.).
        question_info = self.process_question(question, task_id)

        try:
            # Build the message for Claude.
            messages = []

            # Base user content: the question plus formatting reminders.
            user_content = [{
                "type": "text",
                "text": f"""
Question from GAIA benchmark: {question}

Remember:
1. Provide ONLY the final answer
2. Format exactly as requested
3. No explanations or reasoning
"""
            }]

            # Append web search results when the search produced anything.
            web_results = self.search_web(question)
            if web_results:
                user_content.append({
                    "type": "text",
                    "text": f"""
Web search results related to this question:

{web_results}
"""
                })

            # Attach the task file, if one exists.
            if question_info["has_file"] and question_info["file_info"]:
                file_info = question_info["file_info"]

                if file_info["type"] == "image":
                    # Images go to Claude as base64 content blocks.
                    user_content.append({
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": file_info["content_type"],
                            "data": file_info["base64"]
                        }
                    })

                    user_content.append({
                        "type": "text",
                        "text": "The above image is part of the question. Please analyze it carefully."
                    })
                elif file_info["type"] == "text" and "text" in file_info:
                    # Inline text-file content, truncated to 4000 chars to stay
                    # within token limits. FIX: the truncation note used to sit
                    # INSIDE the f-string and was therefore sent to the model.
                    user_content.append({
                        "type": "text",
                        "text": f"""
The question includes a text file with the following content:

{file_info["text"][:4000]}
"""
                    })

            # Spell out any detected format requirement.
            if question_info["format_requirements"]:
                user_content.append({
                    "type": "text",
                    "text": f"""
Important format requirement: {question_info["format_requirements"]}
Make sure your answer follows this format EXACTLY.
"""
                })

            messages.append({
                "role": "user",
                "content": user_content
            })

            # Query Claude.
            response = self.client.messages.create(
                model="claude-3-5-sonnet-20241022",
                system=self.system_prompt,
                messages=messages,
                temperature=0.1,  # low temperature for precise answers
                max_tokens=4096
            )

            # Extract the model's text reply.
            raw_answer = response.content[0].text.strip()

            # Strip boilerplate so only the bare answer is submitted.
            clean_answer = self.extract_answer(raw_answer)

            print(f"Raw answer: {raw_answer}")
            print(f"Clean answer: {clean_answer}")

            return clean_answer
        except Exception as e:
            print(f"Error in agent: {e}")
            import traceback
            traceback.print_exc()
            return f"Error processing question: {str(e)}"
|
307 |
|
308 |
+
|
309 |
+
# Expose the agent under the name BasicAgent for compatibility with the rest
# of the app, which instantiates BasicAgent().
class BasicAgent(GAIAAgent):
    pass
|
312 |
+
|
313 |
+
|
314 |
+
def run_and_submit_all(profile: gr.OAuthProfile | None):
|
315 |
"""
|
316 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
317 |
and displays the results.
|
|
|
372 |
print(f"Skipping item with missing task_id or question: {item}")
|
373 |
continue
|
374 |
try:
|
375 |
+
submitted_answer = agent(question_text, task_id)
|
376 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
377 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
378 |
except Exception as e:
|
|
|
434 |
|
435 |
# --- Build Gradio Interface using Blocks ---
|
436 |
with gr.Blocks() as demo:
|
437 |
+
gr.Markdown("# GAIA Benchmark Agent Evaluation")
|
438 |
gr.Markdown(
|
439 |
"""
|
440 |
**Instructions:**
|
441 |
+
1. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
|
442 |
+
2. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
|
443 |
+
|
444 |
+
This agent uses Claude 3.5 Sonnet to solve GAIA benchmark tasks.
|
|
|
|
|
|
|
|
|
|
|
445 |
"""
|
446 |
)
|
447 |
|
|
|
450 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
451 |
|
452 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
|
|
453 |
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
454 |
|
455 |
run_button.click(
|
|
|
478 |
|
479 |
print("-"*(60 + len(" App Starting ")) + "\n")
|
480 |
|
481 |
+
print("Launching Gradio Interface for GAIA Agent Evaluation...")
|
482 |
demo.launch(debug=True, share=False)
|