Update app.py
Browse files
app.py
CHANGED
@@ -2,72 +2,316 @@ import os
|
|
2 |
import gradio as gr
|
3 |
import requests
|
4 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
5 |
|
|
|
6 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
7 |
|
8 |
-
|
9 |
-
CodeAgent,
|
10 |
-
tool,
|
11 |
-
DuckDuckGoSearchTool,
|
12 |
-
PythonInterpreterTool,
|
13 |
-
FinalAnswerTool,
|
14 |
-
)
|
15 |
-
from smolagents.llms.anthropic_api import AnthropicModel
|
16 |
-
|
17 |
-
# Прокладка над DuckDuckGoSearchTool, возвращает "" при ошибках
|
18 |
-
@tool
|
19 |
-
def web_search(query: str) -> str:
|
20 |
-
"""
|
21 |
-
Performs a web search using DuckDuckGo.
|
22 |
-
Args:
|
23 |
-
query: The search query string.
|
24 |
-
Returns:
|
25 |
-
The raw text results, or empty string on failure.
|
26 |
-
"""
|
27 |
-
try:
|
28 |
-
return DuckDuckGoSearchTool()(query=query)
|
29 |
-
except Exception:
|
30 |
-
return ""
|
31 |
-
|
32 |
-
class BasicAgent:
|
33 |
def __init__(self):
|
34 |
-
print("Initializing
|
35 |
-
#
|
36 |
-
claude_key = os.environ
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
|
54 |
-
|
55 |
-
|
56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
57 |
|
58 |
-
def
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
64 |
|
65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
|
67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
69 |
|
70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
"""
|
72 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
73 |
and displays the results.
|
@@ -128,7 +372,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
128 |
print(f"Skipping item with missing task_id or question: {item}")
|
129 |
continue
|
130 |
try:
|
131 |
-
submitted_answer = agent(question_text)
|
132 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
133 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
134 |
except Exception as e:
|
@@ -190,19 +434,14 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
190 |
|
191 |
# --- Build Gradio Interface using Blocks ---
|
192 |
with gr.Blocks() as demo:
|
193 |
-
gr.Markdown("#
|
194 |
gr.Markdown(
|
195 |
"""
|
196 |
**Instructions:**
|
197 |
-
|
198 |
-
|
199 |
-
|
200 |
-
|
201 |
-
|
202 |
-
---
|
203 |
-
**Disclaimers:**
|
204 |
-
Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
|
205 |
-
This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
|
206 |
"""
|
207 |
)
|
208 |
|
@@ -211,7 +450,6 @@ with gr.Blocks() as demo:
|
|
211 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
212 |
|
213 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
214 |
-
# Removed max_rows=10 from DataFrame constructor
|
215 |
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
216 |
|
217 |
run_button.click(
|
@@ -240,5 +478,5 @@ if __name__ == "__main__":
|
|
240 |
|
241 |
print("-"*(60 + len(" App Starting ")) + "\n")
|
242 |
|
243 |
-
print("Launching Gradio Interface for
|
244 |
demo.launch(debug=True, share=False)
|
|
|
2 |
import gradio as gr
|
3 |
import requests
|
4 |
import pandas as pd
|
5 |
+
import json
|
6 |
+
import re
|
7 |
+
import base64
|
8 |
+
from typing import Optional, Dict, List, Any
|
9 |
+
import anthropic
|
10 |
|
11 |
+
# API URL для GAIA
|
12 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
13 |
|
14 |
+
class GAIAAgent:
    """Agent that answers GAIA benchmark questions using Anthropic's Claude.

    Each question is optionally augmented with DuckDuckGo Instant Answer
    search results and the content of the task's attached file (image or
    text) before being sent to Claude, which is instructed to reply with a
    bare, exact-match-formatted answer.
    """

    def __init__(self):
        print("Initializing GAIA Agent powered by Claude...")
        # The Claude API key must be supplied via the environment.
        self.claude_key = os.environ.get("ANTHROPIC_API_KEY")
        if not self.claude_key:
            raise ValueError("ANTHROPIC_API_KEY not found in environment variables")

        # Anthropic client used for all model calls.
        self.client = anthropic.Anthropic(api_key=self.claude_key)

        # GAIA scoring API base URL.
        self.api_url = DEFAULT_API_URL

        # Caches: search results keyed by query, fetched files keyed by task_id.
        self.search_cache = {}
        self.file_cache = {}

        # System prompt steering Claude toward GAIA's exact-match answer format.
        self.system_prompt = """
You are an AI assistant specially designed to answer questions from the GAIA benchmark with exceptional accuracy.
The GAIA benchmark evaluates AI's ability to perform real-world tasks that require reasoning, web browsing, and tool use.

Your goal is to provide the EXACT answer in the format requested by each question. GAIA uses exact matching for evaluation.

Guidelines for GAIA answers:
1. Provide ONLY the final answer, with NO explanations, reasoning, or additional text
2. Format is critical - follow the instructions in the question precisely
3. For comma-separated lists, provide "item1, item2, item3" with no quotes or extra punctuation
4. For numeric answers, provide just the number without units unless specifically requested
5. Maintain exact capitalization and spacing as requested in the question
6. If asked to order items, follow the requested ordering precisely

Examples of correct formatting:
- If asked for fruits in alphabetical order: "apples, bananas, oranges"
- If asked for a single word: "photosynthesis"
- If asked for a number: "42"
- If asked for a date in MM/DD/YY format: "05/04/25"

Remember, your score depends on exact matching against the reference answer.
"""

    def search_web(self, query: str) -> str:
        """Search the web via the DuckDuckGo Instant Answer API, with caching.

        Returns a newline-joined summary of the abstract and up to five
        related topics. On failure returns "" so callers can safely skip
        injecting results into the model prompt.
        """
        if query in self.search_cache:
            print(f"Using cached search results for: {query}")
            return self.search_cache[query]

        print(f"Performing web search for: {query}")
        try:
            # DuckDuckGo Instant Answer API (no API key required).
            response = requests.get(
                "https://api.duckduckgo.com/",
                params={"q": query, "format": "json"},
                timeout=10
            )
            data = response.json()

            # Collect results from the relevant response fields.
            results = []
            if data.get("AbstractText"):
                results.append(f"Abstract: {data['AbstractText']}")
            if data.get("RelatedTopics"):
                topics = data.get("RelatedTopics", [])
                for i, topic in enumerate(topics[:5]):  # cap at 5 results
                    if isinstance(topic, dict) and topic.get("Text"):
                        results.append(f"Related Topic {i+1}: {topic['Text']}")

            result_text = "\n\n".join(results) if results else "No results found"

            # Cache and return the results (failures are deliberately not cached
            # so a transient error does not poison later lookups).
            self.search_cache[query] = result_text
            return result_text
        except Exception as e:
            # FIX: return "" instead of an error string so that a failed
            # search is not injected into the prompt as fake search results.
            print(f"Web search error: {e}")
            return ""

    def fetch_file(self, task_id: str) -> Optional[Dict[str, Any]]:
        """Fetch and classify the file attached to a task, with caching.

        Returns a dict with keys "content", "content_type", "size", "type"
        (image/pdf/text/binary) plus "base64" for images or "text" for
        decodable text files, or None when no file could be retrieved.
        """
        if task_id in self.file_cache:
            print(f"Using cached file for task: {task_id}")
            return self.file_cache[task_id]

        print(f"Fetching file for task: {task_id}")
        try:
            response = requests.get(f"{self.api_url}/files/{task_id}", timeout=15)

            if response.status_code == 200:
                file_content = response.content
                file_info = {
                    "content": file_content,
                    "content_type": response.headers.get("Content-Type", ""),
                    "size": len(file_content)
                }

                # Classify the file by Content-Type and post-process accordingly.
                content_type = file_info["content_type"].lower()

                if "image" in content_type:
                    # Images are base64-encoded for Claude's vision input.
                    file_info["base64"] = base64.b64encode(file_content).decode('utf-8')
                    file_info["type"] = "image"
                    print(f"Processed image file ({file_info['size']} bytes)")
                elif "pdf" in content_type:
                    # PDFs are only flagged; their content is not extracted here.
                    file_info["type"] = "pdf"
                    print(f"Detected PDF file ({file_info['size']} bytes)")
                elif "text" in content_type or "json" in content_type or "csv" in content_type:
                    # Text-like files: try to decode as UTF-8.
                    try:
                        file_info["text"] = file_content.decode('utf-8')
                        file_info["type"] = "text"
                        print(f"Processed text file ({file_info['size']} bytes)")
                    except UnicodeDecodeError:
                        file_info["type"] = "binary"
                        print(f"Could not decode text file ({file_info['size']} bytes)")
                else:
                    file_info["type"] = "binary"
                    print(f"Detected binary file ({file_info['size']} bytes, {content_type})")

                # Cache the processed file.
                self.file_cache[task_id] = file_info
                return file_info
            else:
                print(f"Failed to fetch file, status code: {response.status_code}")
                print(f"Response: {response.text[:1000]}")
                return None
        except Exception as e:
            print(f"Error fetching file: {e}")
            return None

    def extract_answer(self, response_text: str) -> str:
        """Extract just the final answer from Claude's response."""
        # Drop obvious lead-in phrases ("the answer is", "final answer", ...).
        cleaned = re.sub(r'^(final answer|the answer is|answer|Here\'s the answer|response):?\s*', '', response_text, flags=re.IGNORECASE)

        # Drop trailing explanations.
        cleaned = re.sub(r'\n.*?explain.*?$', '', cleaned, flags=re.IGNORECASE | re.DOTALL)

        # For multi-line output, keep only the first line when it looks like a
        # complete answer on its own.
        lines = cleaned.strip().split('\n')
        if len(lines) > 1:
            first_line = lines[0].strip()
            if len(first_line) > 5 and not first_line.startswith('I ') and not first_line.startswith('The '):
                return first_line

        # Strip one pair of surrounding double quotes, if present.
        cleaned = cleaned.strip()
        if cleaned.startswith('"') and cleaned.endswith('"'):
            cleaned = cleaned[1:-1]

        return cleaned.strip()

    def process_question(self, question: str, task_id: Optional[str] = None) -> Dict[str, Any]:
        """Analyze a question: detect math/list hints, an explicit format
        requirement, and any attached task file, for use when prompting Claude."""
        question_info = {
            "original": question,
            "task_id": task_id,
            "has_file": False,
            "file_info": None,
            "contains_math": bool(re.search(r'calculate|compute|sum|average|mean|median|formula|equation', question, re.IGNORECASE)),
            "requires_list": bool(re.search(r'list|order|sequence|rank|items|elements|values', question, re.IGNORECASE)),
            "format_requirements": None
        }

        # Capture an explicit format hint if the question states one.
        format_match = re.search(r'(format|in the format|formatted as|as a|in) ([^\.]+)', question, re.IGNORECASE)
        if format_match:
            question_info["format_requirements"] = format_match.group(2).strip()

        # Attach the task file, fetching it exactly once.
        # FIX: the original called self.fetch_file(task_id) twice.
        if task_id:
            file_info = self.fetch_file(task_id)
            if file_info:
                question_info["has_file"] = True
                question_info["file_info"] = file_info

        return question_info

    def __call__(self, question: str, task_id: Optional[str] = None) -> str:
        """Main entry point: process a question and return a clean answer string."""
        if task_id is None:
            # Try to pull a task_id out of the question text, if present.
            match = re.search(r'task[\s_-]?id:?\s*(\w+)', question, re.IGNORECASE)
            if match:
                task_id = match.group(1)

        print(f"Processing question for task_id: {task_id}")
        print(f"Question: {question[:100]}...")

        # Analyze the question (format hints, attached file, etc.).
        question_info = self.process_question(question, task_id)

        try:
            # Build the message for Claude.
            messages = []

            # Base user content: the question plus formatting reminders.
            user_content = [{
                "type": "text",
                "text": f"""
Question from GAIA benchmark: {question}

Remember:
1. Provide ONLY the final answer
2. Format exactly as requested
3. No explanations or reasoning
"""
            }]

            # Append web search results when the search produced anything.
            web_results = self.search_web(question)
            if web_results:
                user_content.append({
                    "type": "text",
                    "text": f"""
Web search results related to this question:

{web_results}
"""
                })

            # Attach the task file, if one exists.
            if question_info["has_file"] and question_info["file_info"]:
                file_info = question_info["file_info"]

                if file_info["type"] == "image":
                    # Images go to Claude as base64 content blocks.
                    user_content.append({
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": file_info["content_type"],
                            "data": file_info["base64"]
                        }
                    })

                    user_content.append({
                        "type": "text",
                        "text": "The above image is part of the question. Please analyze it carefully."
                    })
                elif file_info["type"] == "text" and "text" in file_info:
                    # Inline text-file content, truncated to 4000 chars to stay
                    # within token limits. FIX: the truncation note used to sit
                    # INSIDE the f-string and was therefore sent to the model.
                    user_content.append({
                        "type": "text",
                        "text": f"""
The question includes a text file with the following content:

{file_info["text"][:4000]}
"""
                    })

            # Spell out any detected format requirement.
            if question_info["format_requirements"]:
                user_content.append({
                    "type": "text",
                    "text": f"""
Important format requirement: {question_info["format_requirements"]}
Make sure your answer follows this format EXACTLY.
"""
                })

            messages.append({
                "role": "user",
                "content": user_content
            })

            # Query Claude.
            response = self.client.messages.create(
                model="claude-3-5-sonnet-20241022",
                system=self.system_prompt,
                messages=messages,
                temperature=0.1,  # low temperature for precise answers
                max_tokens=4096
            )

            # Extract the model's text reply.
            raw_answer = response.content[0].text.strip()

            # Strip boilerplate so only the bare answer is submitted.
            clean_answer = self.extract_answer(raw_answer)

            print(f"Raw answer: {raw_answer}")
            print(f"Clean answer: {clean_answer}")

            return clean_answer
        except Exception as e:
            print(f"Error in agent: {e}")
            import traceback
            traceback.print_exc()
            return f"Error processing question: {str(e)}"
|
307 |
|
308 |
+
|
309 |
+
# Expose the agent under the name BasicAgent for compatibility with the rest
# of the app, which instantiates BasicAgent().
class BasicAgent(GAIAAgent):
    pass
|
312 |
+
|
313 |
+
|
314 |
+
def run_and_submit_all(profile: gr.OAuthProfile | None):
|
315 |
"""
|
316 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
317 |
and displays the results.
|
|
|
372 |
print(f"Skipping item with missing task_id or question: {item}")
|
373 |
continue
|
374 |
try:
|
375 |
+
submitted_answer = agent(question_text, task_id)
|
376 |
answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
|
377 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
378 |
except Exception as e:
|
|
|
434 |
|
435 |
# --- Build Gradio Interface using Blocks ---
|
436 |
with gr.Blocks() as demo:
|
437 |
+
gr.Markdown("# GAIA Benchmark Agent Evaluation")
|
438 |
gr.Markdown(
|
439 |
"""
|
440 |
**Instructions:**
|
441 |
+
1. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
|
442 |
+
2. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
|
443 |
+
|
444 |
+
This agent uses Claude 3.5 Sonnet to solve GAIA benchmark tasks.
|
|
|
|
|
|
|
|
|
|
|
445 |
"""
|
446 |
)
|
447 |
|
|
|
450 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
451 |
|
452 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
|
|
453 |
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
454 |
|
455 |
run_button.click(
|
|
|
478 |
|
479 |
print("-"*(60 + len(" App Starting ")) + "\n")
|
480 |
|
481 |
+
print("Launching Gradio Interface for GAIA Agent Evaluation...")
|
482 |
demo.launch(debug=True, share=False)
|