Alexis-alexis commited on
Commit
f18df04
·
verified ·
1 Parent(s): b88b30b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +308 -70
app.py CHANGED
@@ -2,72 +2,316 @@ import os
2
  import gradio as gr
3
  import requests
4
  import pandas as pd
 
 
 
 
 
5
 
 
6
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
7
 
8
- from smolagents import (
9
- CodeAgent,
10
- tool,
11
- DuckDuckGoSearchTool,
12
- PythonInterpreterTool,
13
- FinalAnswerTool,
14
- )
15
- from smolagents.llms.anthropic_api import AnthropicModel
16
-
17
- # Прокладка над DuckDuckGoSearchTool, возвращает "" при ошибках
18
- @tool
19
- def web_search(query: str) -> str:
20
- """
21
- Performs a web search using DuckDuckGo.
22
- Args:
23
- query: The search query string.
24
- Returns:
25
- The raw text results, or empty string on failure.
26
- """
27
- try:
28
- return DuckDuckGoSearchTool()(query=query)
29
- except Exception:
30
- return ""
31
-
32
- class BasicAgent:
33
  def __init__(self):
34
- print("Initializing smart CodeAgent…")
35
- # 1) Подхватываем Claude-токен из Secrets
36
- claude_key = os.environ["ANTHROPIC_API_KEY"]
37
- # 2) Инициализируем модель Anthropic/Claude
38
- self.model = AnthropicModel(
39
- api_key=claude_key,
40
- model="claude-3-5-sonnet-20241022", # или "claude-3.5-sonnet-instruct", смотрите что у вас есть
41
- temperature=0.3,
42
- max_tokens=2048,
43
- )
44
- # 3) Собираем наш агент
45
- self.agent = CodeAgent(
46
- model=self.model,
47
- tools=[
48
- web_search, # web-поиск
49
- PythonInterpreterTool(), # Python-исполнение
50
- FinalAnswerTool(), # финализация
51
- ],
52
- max_steps=6,
53
- verbosity_level=2,
54
- name="GAIAAgent",
55
- description="Agent that solves GAIA tasks",
56
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
 
58
- def __call__(self, question: str) -> str:
59
- print(f"Agent received question: {question[:80]}…")
60
- # достаточно передать строку
61
- result = self.agent.run(question)
62
- print(f"Agent result: {result}")
63
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
- # остальной код без изменений
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
- # — дальше ваш run_and_submit_all и Gradio UI без изменений —
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
- def run_and_submit_all( profile: gr.OAuthProfile | None):
 
 
 
 
 
 
71
  """
72
  Fetches all questions, runs the BasicAgent on them, submits all answers,
73
  and displays the results.
@@ -128,7 +372,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
128
  print(f"Skipping item with missing task_id or question: {item}")
129
  continue
130
  try:
131
- submitted_answer = agent(question_text)
132
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
133
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
134
  except Exception as e:
@@ -190,19 +434,14 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
190
 
191
  # --- Build Gradio Interface using Blocks ---
192
  with gr.Blocks() as demo:
193
- gr.Markdown("# Basic Agent Evaluation Runner")
194
  gr.Markdown(
195
  """
196
  **Instructions:**
197
-
198
- 1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
199
- 2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
200
- 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
201
-
202
- ---
203
- **Disclaimers:**
204
- Once you click the "Submit" button, it can take quite some time (this is the time for the agent to go through all the questions).
205
- This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance, to address the delay of the submit button, a solution could be to cache the answers and submit them in a separate action, or even to answer the questions asynchronously.
206
  """
207
  )
208
 
@@ -211,7 +450,6 @@ with gr.Blocks() as demo:
211
  run_button = gr.Button("Run Evaluation & Submit All Answers")
212
 
213
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
214
- # Removed max_rows=10 from DataFrame constructor
215
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
216
 
217
  run_button.click(
@@ -240,5 +478,5 @@ if __name__ == "__main__":
240
 
241
  print("-"*(60 + len(" App Starting ")) + "\n")
242
 
243
- print("Launching Gradio Interface for Basic Agent Evaluation...")
244
  demo.launch(debug=True, share=False)
 
2
  import gradio as gr
3
  import requests
4
  import pandas as pd
5
+ import json
6
+ import re
7
+ import base64
8
+ from typing import Optional, Dict, List, Any
9
+ import anthropic
10
 
11
+ # API URL для GAIA
12
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
13
 
14
class GAIAAgent:
    """Agent that answers GAIA benchmark questions using Anthropic's Claude.

    Each question is augmented with DuckDuckGo Instant Answer search results
    and, when present, the task's attached file (image or text), then sent to
    Claude with a system prompt that demands a bare, exactly formatted answer
    suitable for GAIA's exact-match scoring.
    """

    def __init__(self):
        print("Initializing GAIA Agent powered by Claude...")
        # The Claude API key must come from the environment (e.g. a Space secret).
        self.claude_key = os.environ.get("ANTHROPIC_API_KEY")
        if not self.claude_key:
            raise ValueError("ANTHROPIC_API_KEY not found in environment variables")

        # Anthropic client used for all model calls.
        self.client = anthropic.Anthropic(api_key=self.claude_key)

        # Base URL of the GAIA scoring API (also serves task files).
        self.api_url = DEFAULT_API_URL

        # In-memory caches so repeated questions don't repeat searches/downloads.
        self.search_cache = {}
        self.file_cache = {}

        # System prompt steering Claude toward bare, exactly formatted answers.
        self.system_prompt = """
You are an AI assistant specially designed to answer questions from the GAIA benchmark with exceptional accuracy.
The GAIA benchmark evaluates AI's ability to perform real-world tasks that require reasoning, web browsing, and tool use.

Your goal is to provide the EXACT answer in the format requested by each question. GAIA uses exact matching for evaluation.

Guidelines for GAIA answers:
1. Provide ONLY the final answer, with NO explanations, reasoning, or additional text
2. Format is critical - follow the instructions in the question precisely
3. For comma-separated lists, provide "item1, item2, item3" with no quotes or extra punctuation
4. For numeric answers, provide just the number without units unless specifically requested
5. Maintain exact capitalization and spacing as requested in the question
6. If asked to order items, follow the requested ordering precisely

Examples of correct formatting:
- If asked for fruits in alphabetical order: "apples, bananas, oranges"
- If asked for a single word: "photosynthesis"
- If asked for a number: "42"
- If asked for a date in MM/DD/YY format: "05/04/25"

Remember, your score depends on exact matching against the reference answer.
"""

    def search_web(self, query: str) -> str:
        """Search the web via the DuckDuckGo Instant Answer API, with caching.

        Returns a human-readable digest of the abstract and up to five related
        topics, "No results found" when the API returns nothing, or a
        "Web search failed: ..." string on any error (never raises).
        """
        if query in self.search_cache:
            print(f"Using cached search results for: {query}")
            return self.search_cache[query]

        print(f"Performing web search for: {query}")
        try:
            # DuckDuckGo Instant Answer API (no API key required).
            response = requests.get(
                "https://api.duckduckgo.com/",
                params={"q": query, "format": "json"},
                timeout=10
            )
            data = response.json()

            # Collect results from the relevant response fields.
            results = []
            if data.get("AbstractText"):
                results.append(f"Abstract: {data['AbstractText']}")
            if data.get("RelatedTopics"):
                topics = data.get("RelatedTopics", [])
                for i, topic in enumerate(topics[:5]):  # cap at 5 topics
                    if isinstance(topic, dict) and topic.get("Text"):
                        results.append(f"Related Topic {i+1}: {topic['Text']}")

            result_text = "\n\n".join(results) if results else "No results found"

            # Cache and return the digest.
            self.search_cache[query] = result_text
            return result_text
        except Exception as e:
            print(f"Web search error: {e}")
            return f"Web search failed: {str(e)}"

    def fetch_file(self, task_id: str) -> Optional[Dict[str, Any]]:
        """Fetch and classify the file attached to a task, with caching.

        Returns a dict with keys ``content``, ``content_type``, ``size`` and
        ``type`` ("image"/"pdf"/"text"/"binary"); images additionally carry a
        ``base64`` field and text files a decoded ``text`` field. Returns
        ``None`` when the task has no file or the request fails.
        """
        if task_id in self.file_cache:
            print(f"Using cached file for task: {task_id}")
            return self.file_cache[task_id]

        print(f"Fetching file for task: {task_id}")
        try:
            response = requests.get(f"{self.api_url}/files/{task_id}", timeout=15)

            if response.status_code == 200:
                file_content = response.content
                file_info = {
                    "content": file_content,
                    "content_type": response.headers.get("Content-Type", ""),
                    "size": len(file_content)
                }

                # Classify by MIME type and pre-process accordingly.
                content_type = file_info["content_type"].lower()

                if "image" in content_type:
                    # Base64-encode so the image can go into a Claude message.
                    file_info["base64"] = base64.b64encode(file_content).decode('utf-8')
                    file_info["type"] = "image"
                    print(f"Processed image file ({file_info['size']} bytes)")
                elif "pdf" in content_type:
                    # PDFs are only flagged; no text extraction is attempted here.
                    file_info["type"] = "pdf"
                    print(f"Detected PDF file ({file_info['size']} bytes)")
                elif "text" in content_type or "json" in content_type or "csv" in content_type:
                    # Text-like files: try to decode as UTF-8.
                    try:
                        file_info["text"] = file_content.decode('utf-8')
                        file_info["type"] = "text"
                        print(f"Processed text file ({file_info['size']} bytes)")
                    except UnicodeDecodeError:
                        file_info["type"] = "binary"
                        print(f"Could not decode text file ({file_info['size']} bytes)")
                else:
                    file_info["type"] = "binary"
                    print(f"Detected binary file ({file_info['size']} bytes, {content_type})")

                # Cache the processed file.
                self.file_cache[task_id] = file_info
                return file_info
            else:
                print(f"Failed to fetch file, status code: {response.status_code}")
                print(f"Response: {response.text[:1000]}")
                return None
        except Exception as e:
            print(f"Error fetching file: {e}")
            return None

    def extract_answer(self, response_text: str) -> str:
        """Strip boilerplate from Claude's reply, keeping only the bare answer."""
        # Drop obvious lead-in phrases ("The answer is:", "Final answer:", ...).
        cleaned = re.sub(r'^(final answer|the answer is|answer|Here\'s the answer|response):?\s*', '', response_text, flags=re.IGNORECASE)

        # Drop trailing explanation paragraphs.
        cleaned = re.sub(r'\n.*?explain.*?$', '', cleaned, flags=re.IGNORECASE | re.DOTALL)

        # For multi-line replies, keep just the first line when it looks like a
        # complete answer on its own (heuristic: not a sentence opener).
        lines = cleaned.strip().split('\n')
        if len(lines) > 1:
            first_line = lines[0].strip()
            if len(first_line) > 5 and not first_line.startswith('I ') and not first_line.startswith('The '):
                return first_line

        # Strip one pair of surrounding double quotes, if present.
        cleaned = cleaned.strip()
        if cleaned.startswith('"') and cleaned.endswith('"'):
            cleaned = cleaned[1:-1]

        return cleaned.strip()

    def process_question(self, question: str, task_id: Optional[str] = None) -> Dict[str, Any]:
        """Analyze a question: detect math/list intent, format hints, and any task file."""
        question_info = {
            "original": question,
            "task_id": task_id,
            "has_file": False,
            "file_info": None,
            "contains_math": bool(re.search(r'calculate|compute|sum|average|mean|median|formula|equation', question, re.IGNORECASE)),
            "requires_list": bool(re.search(r'list|order|sequence|rank|items|elements|values', question, re.IGNORECASE)),
            "format_requirements": None
        }

        # Extract an explicit output-format requirement, if the question states one.
        format_match = re.search(r'(format|in the format|formatted as|as a|in) ([^\.]+)', question, re.IGNORECASE)
        if format_match:
            question_info["format_requirements"] = format_match.group(2).strip()

        # Fetch the task file once and reuse the result (the original fetched twice).
        if task_id:
            file_info = self.fetch_file(task_id)
            if file_info:
                question_info["has_file"] = True
                question_info["file_info"] = file_info

        return question_info

    def __call__(self, question: str, task_id: Optional[str] = None) -> str:
        """Answer a question end-to-end; returns the cleaned answer string.

        On any failure, returns an "Error processing question: ..." string
        rather than raising, so batch runs keep going.
        """
        if task_id is None:
            # Fall back to a task id embedded in the question text, if any.
            match = re.search(r'task[\s_-]?id:?\s*(\w+)', question, re.IGNORECASE)
            if match:
                task_id = match.group(1)

        print(f"Processing question for task_id: {task_id}")
        print(f"Question: {question[:100]}...")

        # Analyze the question (format hints, attached file, ...).
        question_info = self.process_question(question, task_id)

        try:
            # Build the Claude message content block by block.
            messages = []

            user_content = [{
                "type": "text",
                "text": f"""
Question from GAIA benchmark: {question}

Remember:
1. Provide ONLY the final answer
2. Format exactly as requested
3. No explanations or reasoning
"""
            }]

            # Attach web search results, when the search produced anything.
            web_results = self.search_web(question)
            if web_results:
                user_content.append({
                    "type": "text",
                    "text": f"""
Web search results related to this question:

{web_results}
"""
                })

            # Attach the task file, when one exists.
            if question_info["has_file"] and question_info["file_info"]:
                file_info = question_info["file_info"]

                if file_info["type"] == "image":
                    # Images go in as a base64 content block.
                    # NOTE(review): media_type may include a charset suffix
                    # (e.g. "image/png; charset=..."); confirm the API accepts it.
                    user_content.append({
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": file_info["content_type"],
                            "data": file_info["base64"]
                        }
                    })

                    user_content.append({
                        "type": "text",
                        "text": "The above image is part of the question. Please analyze it carefully."
                    })
                elif file_info["type"] == "text" and "text" in file_info:
                    # Truncate to stay within token limits. (The original embedded
                    # this note INSIDE the f-string, leaking it into the prompt.)
                    file_text = file_info['text'][:4000]
                    user_content.append({
                        "type": "text",
                        "text": f"""
The question includes a text file with the following content:

{file_text}
"""
                    })

            # Restate the format requirement, when one was detected.
            if question_info["format_requirements"]:
                format_req = question_info['format_requirements']
                user_content.append({
                    "type": "text",
                    "text": f"""
Important format requirement: {format_req}
Make sure your answer follows this format EXACTLY.
"""
                })

            messages.append({
                "role": "user",
                "content": user_content
            })

            # Ask Claude; low temperature for deterministic, exact answers.
            response = self.client.messages.create(
                model="claude-3-5-sonnet-20241022",
                system=self.system_prompt,
                messages=messages,
                temperature=0.1,
                max_tokens=4096
            )

            # Extract and clean the model's reply.
            raw_answer = response.content[0].text.strip()
            clean_answer = self.extract_answer(raw_answer)

            print(f"Raw answer: {raw_answer}")
            print(f"Clean answer: {clean_answer}")

            return clean_answer
        except Exception as e:
            print(f"Error in agent: {e}")
            import traceback
            traceback.print_exc()
            return f"Error processing question: {str(e)}"
307
 
308
+
309
# Expose the agent under the BasicAgent name for compatibility with the rest
# of the app (run_and_submit_all instantiates BasicAgent).
class BasicAgent(GAIAAgent):
    """Backward-compatible alias for :class:`GAIAAgent`."""
312
+
313
+
314
+ def run_and_submit_all(profile: gr.OAuthProfile | None):
315
  """
316
  Fetches all questions, runs the BasicAgent on them, submits all answers,
317
  and displays the results.
 
372
  print(f"Skipping item with missing task_id or question: {item}")
373
  continue
374
  try:
375
+ submitted_answer = agent(question_text, task_id)
376
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
377
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
378
  except Exception as e:
 
434
 
435
  # --- Build Gradio Interface using Blocks ---
436
  with gr.Blocks() as demo:
437
+ gr.Markdown("# GAIA Benchmark Agent Evaluation")
438
  gr.Markdown(
439
  """
440
  **Instructions:**
441
+ 1. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
442
+ 2. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
443
+
444
+ This agent uses Claude 3.5 Sonnet to solve GAIA benchmark tasks.
 
 
 
 
 
445
  """
446
  )
447
 
 
450
  run_button = gr.Button("Run Evaluation & Submit All Answers")
451
 
452
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
 
453
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
454
 
455
  run_button.click(
 
478
 
479
  print("-"*(60 + len(" App Starting ")) + "\n")
480
 
481
+ print("Launching Gradio Interface for GAIA Agent Evaluation...")
482
  demo.launch(debug=True, share=False)