Commit 539dfc6 (1 parent: 81917a3)
Initial commit for Final Assignment Agent

Files changed:
- .gitignore (+1 -0)
- README.md (+31 -0)
- agent_open_search.py (+230 -0)
- app.py (+15 -9)
- requirements.txt (+142 -2)
- scripts/__pycache__/cookies.cpython-312.pyc (binary)
- scripts/__pycache__/mdconvert.cpython-312.pyc (binary)
- scripts/__pycache__/text_inspector_tool.cpython-312.pyc (binary)
- scripts/__pycache__/text_web_browser.cpython-312.pyc (binary)
- scripts/__pycache__/visual_qa.cpython-312.pyc (binary)
- scripts/cookies.py (+715 -0)
- scripts/gaia_scorer.py (+124 -0)
- scripts/mdconvert.py (+1004 -0)
- scripts/reformulator.py (+86 -0)
- scripts/run_agents.py (+87 -0)
- scripts/text_inspector_tool.py (+124 -0)
- scripts/text_web_browser.py (+567 -0)
- scripts/visual_qa.py (+313 -0)
.gitignore (ADDED)
@@ -0,0 +1 @@
+.env
README.md (CHANGED)
@@ -12,4 +12,35 @@ hf_oauth: true
 hf_oauth_expiration_minutes: 480
 ---
 
+# Web Search Agent
+
+## Description
+
+This project is a wrapper around the Open Search agent from the smolagents library, with some minor modifications. It creates a base agent that can search the web and answer questions by navigating through web content, analyzing pages, and extracting relevant information.
+
+The core idea is to establish a foundation agent that can be expanded with additional tools to improve performance on specific tasks. By adding specialized tools, the agent can be customized to handle various domains and use cases more effectively.
+
+## Author
+
+Created by: Guillermo Izquierdo
+
+## Features
+
+- Web search using SerpAPI
+- Page navigation and content analysis
+- Text search within pages
+- Support for various document formats (PDF, DOCX, etc.)
+- Visual content analysis
+- Modular architecture for easy extension
+
+## Extending the Agent
+
+The agent is designed to be easily extended with new tools. Look for the commented sections in `agent_open_search.py` that indicate where to add custom tools:
+
+```python
+# ===== ADD YOUR CUSTOM TOOLS HERE =====
+```
+
+By adding specialized tools, you can enhance the agent's capabilities for specific domains or tasks.
+
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
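The README points at the `# ===== ADD YOUR CUSTOM TOOLS HERE =====` marker in `agent_open_search.py`. As a minimal sketch of what could go there, assuming the standard smolagents `@tool` decorator (the `character_count` tool below is a made-up example, not part of the commit):

```python
from smolagents import tool


@tool
def character_count(text: str) -> int:
    """Returns the number of characters in a piece of text.

    Args:
        text: The text to measure.
    """
    return len(text)

# In agent_open_search.py this tool would then be appended to the WEB_TOOLS
# list, right below the "ADD YOUR CUSTOM TOOLS HERE" marker.
```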
agent_open_search.py (ADDED)
@@ -0,0 +1,230 @@

"""
OpenSearchAgent - A web search agent for answering questions using the internet

This module implements a powerful agent that can search the web, navigate pages,
and analyze content to answer complex questions. It uses the smolagents library
to create a hierarchical agent system with a manager agent and a web browser agent.

The agent can:
- Search the web using Google (via SerpAPI)
- Visit and navigate web pages
- Find and analyze text content
- Process PDF files and other document formats
- Visualize content when needed

Environment variables required:
- SERPAPI_API_KEY: API key for SerpAPI (for web search)
- OPENAI_API_KEY: API key for OpenAI (for the language model)
- HF_TOKEN: Hugging Face token (for accessing HF resources)
"""

import os
import threading

from dotenv import load_dotenv
from huggingface_hub import login
from scripts.text_inspector_tool import TextInspectorTool
from scripts.text_web_browser import (
    ArchiveSearchTool,
    FinderTool,
    FindNextTool,
    PageDownTool,
    PageUpTool,
    SimpleTextBrowser,
    VisitTool,
)
from scripts.visual_qa import visualizer

from smolagents import (
    CodeAgent,
    GoogleSearchTool,
    # InferenceClientModel,  # Uncomment if you want to use InferenceClientModel
    LiteLLMModel,
    ToolCallingAgent,
    OpenAIServerModel,
)


# Load environment variables and authenticate with Hugging Face
load_dotenv(override=True)
login(os.getenv("HF_TOKEN"))

# Global configurations for the agent
custom_role_conversions = {"tool-call": "assistant", "tool-response": "user"}

# User agent string for web requests to avoid being blocked by websites
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"

# Browser configuration for web navigation
BROWSER_CONFIG = {
    "viewport_size": 1024 * 5,  # Large viewport for capturing more content
    "downloads_folder": "downloads_folder",  # Where to store downloaded files
    "request_kwargs": {
        "headers": {"User-Agent": user_agent},
        "timeout": 300,  # Generous timeout for slow websites
    },
    "serpapi_key": os.getenv("SERPAPI_API_KEY"),  # API key for web search
}

# Create downloads folder if it doesn't exist
os.makedirs(f"./{BROWSER_CONFIG['downloads_folder']}", exist_ok=True)


class OpenSearchAgent:
    """
    A portable agent that can search the web and answer questions.
    This class encapsulates the functionality of the web search agent.
    """

    def __init__(self, model_id="o1"):
        """
        Initialize the OpenSearchAgent with the specified model.

        Args:
            model_id (str): The model ID to use for the agent. Default is "o1".
                Other options include "gpt-4o", "claude-3-opus", etc.
        """
        self.model_id = model_id
        self.agent = self._create_agent()

    def _create_agent(self):
        """
        Create and configure the agent with the appropriate tools and models.

        This is where you can customize the agent by adding new tools or
        changing the configuration of existing ones.

        Returns:
            CodeAgent: The configured agent ready to answer questions.
        """
        # Configure the model parameters
        model_params = {
            "model_id": self.model_id,
            "custom_role_conversions": custom_role_conversions,
            "max_completion_tokens": 8192,
        }
        if self.model_id == "o1":
            model_params["reasoning_effort"] = "high"
        print(f"Using model parameters: {model_params}")

        # Initialize the model
        # You can switch between different model providers here
        # model = LiteLLMModel(**model_params)  # For using LiteLLM
        model = OpenAIServerModel(model_id="gpt-4o")  # For using OpenAI directly

        # Configure text browser and tools
        text_limit = 100000  # Maximum text length to process
        browser = SimpleTextBrowser(**BROWSER_CONFIG)

        # ===== TOOL CONFIGURATION =====
        # This is where you can add new tools to enhance the agent's capabilities
        WEB_TOOLS = [
            GoogleSearchTool(provider="serpapi"),  # Web search tool
            VisitTool(browser),  # Visit URLs
            PageUpTool(browser),  # Navigate up in a page
            PageDownTool(browser),  # Navigate down in a page
            FinderTool(browser),  # Find text in a page
            FindNextTool(browser),  # Find next occurrence of text
            ArchiveSearchTool(browser),  # Search web archives
            TextInspectorTool(model, text_limit),  # Analyze text content

            # ===== ADD YOUR CUSTOM TOOLS HERE =====
            # Example:
            # CustomTool(),  # Your custom tool implementation
            # ImageAnalysisTool(),  # Tool for analyzing images
            # DataExtractionTool(),  # Tool for extracting structured data
        ]

        # Create the web browser agent that handles web interactions
        text_webbrowser_agent = ToolCallingAgent(
            model=model,
            tools=WEB_TOOLS,
            max_steps=20,  # Maximum steps before stopping
            verbosity_level=2,  # Level of logging detail
            planning_interval=4,  # How often to re-plan
            name="search_agent",
            description="""A team member that will search the internet to answer your question.
    Ask him for all your questions that require browsing the web.
    Provide him as much context as possible, in particular if you need to search on a specific timeframe!
    And don't hesitate to provide him with a complex search task, like finding a difference between two webpages.
    Your request must be a real sentence, not a google search! Like "Find me this information (...)" rather than a few keywords.
    """,
            provide_run_summary=True,  # Provide summary of actions taken
        )

        # Add additional instructions to the web browser agent
        text_webbrowser_agent.prompt_templates["managed_agent"]["task"] += """You can navigate to .txt online files.
    If a non-html page is in another format, especially .pdf or a Youtube video, use tool 'inspect_file_as_text' to inspect it.
    Additionally, if after some searching you find out that you need more information to answer the question, you can use `final_answer` with your request for clarification as argument to request for more information."""

        # ===== MANAGER AGENT CONFIGURATION =====
        # Create the manager agent that oversees the web browser agent
        # You can add more managed agents here for different specialized tasks
        manager_agent = CodeAgent(
            model=model,
            tools=[
                visualizer,  # Tool for visualization tasks
                TextInspectorTool(model, text_limit),  # Text analysis tool

                # ===== ADD YOUR CUSTOM MANAGER TOOLS HERE =====
                # Example:
                # DataAnalysisTool(),  # Tool for analyzing data
                # ReportGeneratorTool(),  # Tool for generating reports
            ],
            max_steps=12,  # Maximum steps before stopping
            verbosity_level=2,  # Level of logging detail
            additional_authorized_imports=["*"],  # Allow all imports
            planning_interval=4,  # How often to re-plan
            managed_agents=[
                text_webbrowser_agent,  # The web browser agent

                # ===== ADD YOUR CUSTOM MANAGED AGENTS HERE =====
                # Example:
                # data_analysis_agent,  # An agent specialized in data analysis
                # image_processing_agent,  # An agent specialized in image processing
            ],
        )

        return manager_agent

    def __call__(self, question: str) -> str:
        """
        Run the agent on the given question.

        Args:
            question (str): The question to answer.

        Returns:
            str: The agent's answer to the question.
        """
        print(f"OpenSearchAgent received question: {question[:50]}...")
        answer = self.agent.run(question)

        # Convert answer to string to ensure it's subscriptable
        answer_str = str(answer)
        print(f"OpenSearchAgent found answer: {answer_str[:100]}...")

        return answer_str


def main():
    """
    Example usage of the OpenSearchAgent.

    This function demonstrates how to create and use the OpenSearchAgent.
    You can modify the question or model_id to test different configurations.
    """
    # Define your question here
    question = "How many studio albums did Mercedes Sosa release before 2007?"

    # Create the agent
    agent = OpenSearchAgent(model_id="o1")

    # Run the agent
    answer = agent(question)

    print(f"Got this answer: {answer}")


if __name__ == "__main__":
    main()
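One detail worth noting in `_create_agent`: `model_params` is built from `self.model_id` (including `reasoning_effort` for "o1"), but the model actually instantiated is a hard-coded `OpenAIServerModel(model_id="gpt-4o")`, so the constructor's `model_id` only affects the printed parameters. If the intent is to honor the constructor argument, the parameters could be forwarded as sketched below (`build_model` is a hypothetical helper, not part of the commit; it assumes `OpenAIServerModel` accepts the same keyword arguments the commented-out `LiteLLMModel` call was written for):

```python
# build_model.py -- hypothetical sketch, not part of the commit
from smolagents import LiteLLMModel, OpenAIServerModel  # noqa: F401

custom_role_conversions = {"tool-call": "assistant", "tool-response": "user"}


def build_model(model_id: str = "o1"):
    """Mirrors the parameter handling in OpenSearchAgent._create_agent,
    but actually forwards model_params to the model class."""
    model_params = {
        "model_id": model_id,
        "custom_role_conversions": custom_role_conversions,
        "max_completion_tokens": 8192,
    }
    if model_id == "o1":
        model_params["reasoning_effort"] = "high"
    # Assumption: extra kwargs are forwarded to the completion call; the same
    # dict could instead be routed through LiteLLMModel(**model_params).
    return OpenAIServerModel(**model_params)


if __name__ == "__main__":
    model = build_model("gpt-4o")
    print(type(model).__name__)
```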
app.py (CHANGED)
@@ -3,25 +3,31 @@ import gradio as gr
 import requests
 import inspect
 import pandas as pd
+from agent_open_search import OpenSearchAgent
 
 # (Keep Constants as is)
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
-# ---
-
-class BasicAgent:
+# --- Agent Definition ---
+class Agent:
     def __init__(self):
-        print("   [removed line truncated in this view]
+        print("Initializing OpenSearchAgent for web search...")
+        self.search_agent = OpenSearchAgent(model_id="gpt-4o")
+
     def __call__(self, question: str) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
-   [lines 18-20 removed; their content is not captured in this view]
+        try:
+            answer = self.search_agent(question)
+            print(f"Agent returning answer (first 100 chars): {answer[:100]}...")
+            return answer
+        except Exception as e:
+            print(f"Error in agent: {str(e)}")
+            return f"I encountered an error while searching for an answer: {str(e)}"
 
 def run_and_submit_all( profile: gr.OAuthProfile | None):
     """
-    Fetches all questions, runs the   [removed line truncated in this view]
+    Fetches all questions, runs the Agent on them, submits all answers,
     and displays the results.
     """
     # --- Determine HF Space Runtime URL and Repo URL ---

@@ -40,7 +46,7 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
 
     # 1. Instantiate Agent ( modify this part to create your agent)
     try:
-        agent =   [removed line truncated in this view]
+        agent = Agent()
     except Exception as e:
         print(f"Error instantiating agent: {e}")
         return f"Error initializing agent: {e}", None
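Before deploying the Space, the new `Agent` wrapper can be exercised locally with a small driver like the following (a hypothetical script, not part of the commit; it assumes SERPAPI_API_KEY, OPENAI_API_KEY and HF_TOKEN are available in the environment, since `agent_open_search` loads and uses them at import time):

```python
# smoke_test.py -- hypothetical local check, not part of the commit
from app import Agent

if __name__ == "__main__":
    agent = Agent()  # wraps OpenSearchAgent(model_id="gpt-4o")
    print(agent("How many studio albums did Mercedes Sosa release before 2007?"))
```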
requirements.txt (CHANGED)
@@ -1,2 +1,142 @@
(the two previous requirement lines were removed; their content is not captured in this view)

aiofiles==24.1.0
aiohappyeyeballs==2.6.1
aiohttp==3.11.18
aiosignal==1.3.2
annotated-types==0.7.0
anthropic==0.50.0
anyio==4.9.0
attrs==25.3.0
beautifulsoup4==4.13.4
bio==1.7.1
biopython==1.85
biothings-client==0.4.1
certifi==2025.1.31
cffi==1.17.1
charset-normalizer==3.4.1
chess==1.11.2
click==8.1.8
cobble==0.1.4
colorama==0.4.6
cryptography==44.0.2
datasets==3.5.1
defusedxml==0.7.1
dill==0.3.8
distro==1.9.0
duckduckgo-search==8.0.1
et-xmlfile==2.0.0
fastapi==0.115.12
ffmpy==0.5.0
filelock==3.18.0
frozenlist==1.6.0
fsspec==2025.3.0
google-search-results==2.4.2
gprofiler-official==1.0.0
gradio==5.28.0
gradio-client==1.10.0
groovy==0.1.2
h11==0.16.0
helium==5.1.1
httpcore==1.0.9
httpx==0.28.1
huggingface-hub==0.30.2
idna==3.10
importlib-metadata==8.6.1
jinja2==3.1.6
jiter==0.9.0
joblib==1.4.2
jsonschema==4.23.0
jsonschema-specifications==2025.4.1
litellm==1.67.2
lxml==5.4.0
mammoth==1.9.0
markdown-it-py==3.0.0
markdownify==1.1.0
markupsafe==3.0.2
mdurl==0.1.2
mpmath==1.3.0
multidict==6.4.3
multiprocess==0.70.16
mygene==3.2.2
networkx==3.4.2
numexpr==2.10.2
numpy==2.2.5
openai==1.76.0
openpyxl==3.1.5
orjson==3.10.18
outcome==1.3.0.post0
packaging==25.0
pandas==2.2.3
pathvalidate==3.2.3
pdfminer==20191125
pdfminer-six==20250416
pillow==11.2.1
platformdirs==4.3.7
pooch==1.8.2
primp==0.15.0
propcache==0.3.1
pubchempy==1.0.4
puremagic==1.28
pyarrow==20.0.0
pycparser==2.22
pycryptodome==3.22.0
pydantic==2.11.3
pydantic-core==2.33.1
pydub==0.25.1
pygments==2.19.1
pypdf==5.4.0
pypdf2==3.0.1
pysocks==1.7.1
python-dateutil==2.9.0.post0
python-dotenv==1.1.0
python-multipart==0.0.20
python-pptx==1.0.2
pytz==2025.2
pyyaml==6.0.2
referencing==0.36.2
regex==2024.11.6
requests==2.32.3
rich==14.0.0
rpds-py==0.24.0
ruff==0.11.8
safehttpx==0.1.6
safetensors==0.5.3
scikit-learn==1.6.1
scipy==1.15.2
selenium==4.31.0
semantic-version==2.10.0
serpapi==0.1.5
setuptools==80.0.1
shellingham==1.5.4
six==1.17.0
smolagents==1.14.0
sniffio==1.3.1
sortedcontainers==2.4.0
soupsieve==2.7
speechrecognition==3.14.2
starlette==0.46.2
sympy==1.14.0
threadpoolctl==3.6.0
tiktoken==0.9.0
tokenizers==0.21.1
tomlkit==0.13.2
torch==2.7.0
torchvision==0.22.0
tqdm==4.67.1
transformers==4.51.3
trio==0.30.0
trio-websocket==0.12.2
typer==0.15.3
typing-extensions==4.13.2
typing-inspection==0.4.0
tzdata==2025.2
urllib3==2.4.0
uvicorn==0.34.2
websocket-client==1.8.0
websockets==15.0.1
wsproto==1.2.0
xlrd==2.0.1
xlsxwriter==3.2.3
xxhash==3.5.0
yarl==1.20.0
youtube-transcript-api==1.0.3
zipp==3.21.0
scripts/__pycache__/cookies.cpython-312.pyc (ADDED) - binary file (11.2 kB)
scripts/__pycache__/mdconvert.cpython-312.pyc (ADDED) - binary file (43.6 kB)
scripts/__pycache__/text_inspector_tool.cpython-312.pyc (ADDED) - binary file (4.66 kB)
scripts/__pycache__/text_web_browser.cpython-312.pyc (ADDED) - binary file (29.8 kB)
scripts/__pycache__/visual_qa.cpython-312.pyc (ADDED) - binary file (7.71 kB)
scripts/cookies.py (ADDED)
@@ -0,0 +1,715 @@
(condensed view: the file hard-codes 715 lines of browser cookie data; every entry uses the same dictionary layout as the first one shown below, and the remaining entries are listed by name only)

from requests.cookies import RequestsCookieJar


COOKIES_LIST = [
    {
        "domain": ".youtube.com",
        "expirationDate": 1718884961,
        "hostOnly": False,
        "httpOnly": False,
        "name": "ST-xuwub9",
        "path": "/",
        "sameSite": None,
        "secure": False,
        "session": False,
        "storeId": None,
        "value": "session_logininfo=AFmmF2swRAIgf4gadACOuWOcipI1anW-dakEjtidNLkufnOC8uml7EECIDh2YisqWELDBJPTGUysCucJ3I0wjXxYjVHro1LHrdW0%3AQUQ3MjNmd2Jiajl3OWZYRnpFNnZlWWV5ZGJWZ0hpcmp4LVVPU280bk4zOS03Z0ozZG9fOFhWZ0dXaVo3NG1wTEg1b3hGaG10TFBlaFBnTlJfbER5bEp0aFhoNS1OLVhYNFRZT2F6ajgzOFpDbGhlUjZpMWRETlFFRjFfTTRiM0RnNTROSkdmMTFMVjFic1VuZ2trbGp4aktDa0JJUC1BWDh3",
    },
    # ... 18 further .youtube.com entries with the same fields:
    # __Secure-YEC, __Secure-3PSID, SIDCC, SID, __Secure-1PSIDTS, _ga_M0180HEFCY,
    # SAPISID, __Secure-1PSIDCC, SSID, __Secure-1PAPISID, __Secure-1PSID,
    # __Secure-3PAPISID, __Secure-3PSIDCC, __Secure-3PSIDTS, APISID, HSID,
    # LOGIN_INFO, PREF
]

# researchgate.net cookies: isInstIp, __eoi, ptc, euconsent-v2-didomi, _gat,
# _pbjs_userid_consent_data, __gads, __cf_bm, __gpi, _cfuvid, _ga,
# _ga_4P31SJ70EJ, _gid, did, didomi_token, hasPdpNext,
# ph_phc_ma1XTQyee96N1GML6qUTgLQRiDifnRcE9STiHTZ0CfZ_posthog, sid
COOKIES_LIST += [
    # ... entries in the same format ...
]

# github.com cookies: _gh_sess, _octo, logged_in, preferred_color_mode, tz
COOKIES_LIST += [
    # ... entries in the same format ...
]

# web.archive.org cookies (path /web/20201123221659/http://orcid.org/):
# _gat, _gid, _ga, _hjid, _hjFirstSeen
COOKIES_LIST += [
    # ... entries in the same format ...
]

# orcid.org cookies: AWSELBCORS, _ga_9R61FWK9H5, _ga, AWSELB,
# OptanonAlertBoxClosed, OptanonConsent, XSRF-TOKEN
COOKIES_LIST += [
    # ... entries in the same format ...
]

# Create a RequestsCookieJar instance
COOKIES = RequestsCookieJar()

# Add cookies to the jar
for cookie in COOKIES_LIST:
    COOKIES.set(cookie["name"], cookie["value"], domain=cookie["domain"], path=cookie["path"])
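The `COOKIES` jar built at the end of the file is a standard `requests` cookie jar, so it can be attached to any HTTP request made by the browsing tools; a minimal sketch of that usage (the URL is only an illustration, and the import assumes the script is run from the repository root):

```python
import requests

from scripts.cookies import COOKIES

# requests picks the cookies whose domain and path match the target URL.
response = requests.get(
    "https://web.archive.org/web/20201123221659/http://orcid.org/",
    cookies=COOKIES,
    timeout=30,
)
print(response.status_code)
```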
scripts/gaia_scorer.py (ADDED)
@@ -0,0 +1,124 @@

import re
import string
import warnings


def normalize_number_str(number_str: str) -> float:
    # we replace these common units and commas to allow
    # conversion to float
    for char in ["$", "%", ","]:
        number_str = number_str.replace(char, "")
    try:
        return float(number_str)
    except ValueError:
        print(f"String {number_str} cannot be normalized to number str.")
        return float("inf")


def split_string(
    s: str,
    char_list: list[str] = [",", ";"],
) -> list[str]:
    pattern = f"[{''.join(char_list)}]"
    return re.split(pattern, s)


def is_float(element: any) -> bool:
    try:
        float(element)
        return True
    except ValueError:
        return False


def question_scorer(
    model_answer: str,
    ground_truth: str,
) -> bool:
    # if gt is a number
    if is_float(ground_truth):
        normalized_answer = normalize_number_str(str(model_answer))
        return normalized_answer == float(ground_truth)

    # if gt is a list
    elif any(char in ground_truth for char in [",", ";"]):
        # question with the fish: normalization removes punct

        gt_elems = split_string(ground_truth)
        ma_elems = split_string(model_answer)

        # check length is the same
        if len(gt_elems) != len(ma_elems):
            warnings.warn("Answer lists have different lengths, returning False.", UserWarning)
            return False

        # compare each element as float or str
        comparisons = []
        for ma_elem, gt_elem in zip(ma_elems, gt_elems):
            if is_float(gt_elem):
                normalized_ma_elem = normalize_number_str(ma_elem)
                comparisons.append(normalized_ma_elem == float(gt_elem))
            else:
                # we do not remove punct since comparisons can include punct
                comparisons.append(
                    normalize_str(ma_elem, remove_punct=False) == normalize_str(gt_elem, remove_punct=False)
                )
        return all(comparisons)

    # if gt is a str
    else:
        return normalize_str(model_answer) == normalize_str(ground_truth)


def check_prediction_contains_answer_letters_in_order(prediction, true_answer):
    prediction = prediction.lower()
    true_answer = true_answer.lower()
    if len(prediction) > len(true_answer) * 3:
        return False
    i = 0
    for letter in true_answer:
        if letter in prediction[i:]:
            i += prediction[i:].index(letter)
        else:
            return False
    return True


def check_close_call(prediction, true_answer, is_correct):
    if is_correct:
        return True
    else:
        if is_float(true_answer):
            return is_correct
        else:
            if (
                check_prediction_contains_answer_letters_in_order(str(prediction), str(true_answer))
                and len(str(true_answer)) * 0.5 <= len(str(prediction)) <= len(str(true_answer)) * 2
            ):
                print(f"Close call: {prediction} vs {true_answer}")
                return True
            else:
                return False


def normalize_str(input_str, remove_punct=True) -> str:
    """
    Normalize a string by:
    - Removing all white spaces
    - Optionally removing punctuation (if remove_punct is True)
    - Converting to lowercase
    Parameters:
    - input_str: str, the string to normalize
    - remove_punct: bool, whether to remove punctuation (default: True)
    Returns:
    - str, the normalized string
    """
    # Remove all white spaces. Required e.g for seagull vs. sea gull
    no_spaces = re.sub(r"\s", "", input_str)

    # Remove punctuation, if specified.
    if remove_punct:
        translator = str.maketrans("", "", string.punctuation)
        return no_spaces.lower().translate(translator)
    else:
        return no_spaces.lower()
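`question_scorer` is the entry point for comparing a model answer against a GAIA ground truth; a few illustrative calls covering its three branches (the example answers below are made up):

```python
from scripts.gaia_scorer import question_scorer

# Numeric ground truth: "$", "%" and "," are stripped before comparing as floats.
print(question_scorer("$1,234", "1234"))                      # True
# List ground truth: split on "," / ";" and compared element by element.
print(question_scorer("red, green, blue", "red,green,blue"))  # True
# String ground truth: whitespace and punctuation removed, lowercased.
print(question_scorer("Sea gull", "seagull"))                 # True
```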
scripts/mdconvert.py
ADDED
@@ -0,0 +1,1004 @@
1 |
+
# This is copied from Magentic-one's great repo: https://github.com/microsoft/autogen/blob/v0.4.4/python/packages/autogen-magentic-one/src/autogen_magentic_one/markdown_browser/mdconvert.py
|
2 |
+
# Thanks to Microsoft researchers for open-sourcing this!
|
3 |
+
# type: ignore
|
4 |
+
import base64
|
5 |
+
import copy
|
6 |
+
import html
|
7 |
+
import json
|
8 |
+
import mimetypes
|
9 |
+
import os
|
10 |
+
import re
|
11 |
+
import shutil
|
12 |
+
import subprocess
|
13 |
+
import sys
|
14 |
+
import tempfile
|
15 |
+
import traceback
|
16 |
+
import zipfile
|
17 |
+
from typing import Any
|
18 |
+
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
|
19 |
+
|
20 |
+
import mammoth
|
21 |
+
import markdownify
|
22 |
+
import pandas as pd
|
23 |
+
import pdfminer
|
24 |
+
import pdfminer.high_level
|
25 |
+
import pptx
|
26 |
+
|
27 |
+
# File-format detection
|
28 |
+
import puremagic
|
29 |
+
import pydub
|
30 |
+
import requests
|
31 |
+
import speech_recognition as sr
|
32 |
+
from bs4 import BeautifulSoup
|
33 |
+
from youtube_transcript_api import YouTubeTranscriptApi
|
34 |
+
from youtube_transcript_api.formatters import SRTFormatter
|
35 |
+
|
36 |
+
|
37 |
+
class _CustomMarkdownify(markdownify.MarkdownConverter):
|
38 |
+
"""
|
39 |
+
A custom version of markdownify's MarkdownConverter. Changes include:
|
40 |
+
|
41 |
+
- Altering the default heading style to use '#', '##', etc.
|
42 |
+
- Removing javascript hyperlinks.
|
43 |
+
- Truncating images with large data:uri sources.
|
44 |
+
- Ensuring URIs are properly escaped, and do not conflict with Markdown syntax
|
45 |
+
"""
|
46 |
+
|
47 |
+
def __init__(self, **options: Any):
|
48 |
+
options["heading_style"] = options.get("heading_style", markdownify.ATX)
|
49 |
+
# Explicitly cast options to the expected type if necessary
|
50 |
+
super().__init__(**options)
|
51 |
+
|
52 |
+
def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
|
53 |
+
"""Same as usual, but be sure to start with a new line"""
|
54 |
+
if not convert_as_inline:
|
55 |
+
if not re.search(r"^\n", text):
|
56 |
+
return "\n" + super().convert_hn(n, el, text, convert_as_inline) # type: ignore
|
57 |
+
|
58 |
+
return super().convert_hn(n, el, text, convert_as_inline) # type: ignore
|
59 |
+
|
60 |
+
def convert_a(self, el: Any, text: str, convert_as_inline: bool):
|
61 |
+
"""Same as usual converter, but removes Javascript links and escapes URIs."""
|
62 |
+
prefix, suffix, text = markdownify.chomp(text) # type: ignore
|
63 |
+
if not text:
|
64 |
+
return ""
|
65 |
+
href = el.get("href")
|
66 |
+
title = el.get("title")
|
67 |
+
|
68 |
+
# Escape URIs and skip non-http or file schemes
|
69 |
+
if href:
|
70 |
+
try:
|
71 |
+
parsed_url = urlparse(href) # type: ignore
|
72 |
+
if parsed_url.scheme and parsed_url.scheme.lower() not in ["http", "https", "file"]: # type: ignore
|
73 |
+
return "%s%s%s" % (prefix, text, suffix)
|
74 |
+
href = urlunparse(parsed_url._replace(path=quote(unquote(parsed_url.path)))) # type: ignore
|
75 |
+
except ValueError: # It's not clear if this ever gets thrown
|
76 |
+
return "%s%s%s" % (prefix, text, suffix)
|
77 |
+
|
78 |
+
# For the replacement see #29: text nodes underscores are escaped
|
79 |
+
if (
|
80 |
+
self.options["autolinks"]
|
81 |
+
and text.replace(r"\_", "_") == href
|
82 |
+
and not title
|
83 |
+
and not self.options["default_title"]
|
84 |
+
):
|
85 |
+
# Shortcut syntax
|
86 |
+
return "<%s>" % href
|
87 |
+
if self.options["default_title"] and not title:
|
88 |
+
title = href
|
89 |
+
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
90 |
+
return "%s[%s](%s%s)%s" % (prefix, text, href, title_part, suffix) if href else text
|
91 |
+
|
92 |
+
def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
|
93 |
+
"""Same as usual converter, but removes data URIs"""
|
94 |
+
|
95 |
+
alt = el.attrs.get("alt", None) or ""
|
96 |
+
src = el.attrs.get("src", None) or ""
|
97 |
+
title = el.attrs.get("title", None) or ""
|
98 |
+
title_part = ' "%s"' % title.replace('"', r"\"") if title else ""
|
99 |
+
if convert_as_inline and el.parent.name not in self.options["keep_inline_images_in"]:
|
100 |
+
return alt
|
101 |
+
|
102 |
+
# Remove dataURIs
|
103 |
+
if src.startswith("data:"):
|
104 |
+
src = src.split(",")[0] + "..."
|
105 |
+
|
106 |
+
return "" % (alt, src, title_part)
|
107 |
+
|
108 |
+
def convert_soup(self, soup: Any) -> str:
|
109 |
+
return super().convert_soup(soup) # type: ignore
|
110 |
+
|
111 |
+
|
112 |
+
class DocumentConverterResult:
|
113 |
+
"""The result of converting a document to text."""
|
114 |
+
|
115 |
+
def __init__(self, title: str | None = None, text_content: str = ""):
|
116 |
+
self.title: str | None = title
|
117 |
+
self.text_content: str = text_content
|
118 |
+
|
119 |
+
|
120 |
+
class DocumentConverter:
|
121 |
+
"""Abstract superclass of all DocumentConverters."""
|
122 |
+
|
123 |
+
def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
|
124 |
+
raise NotImplementedError()
|
125 |
+
|
126 |
+
|
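Every converter below follows this same small contract: return `None` to decline a file, or a `DocumentConverterResult` to accept it. That makes new formats easy to bolt on. The sketch below is a hypothetical converter (not part of this commit) for `.log` files, registered via `MarkdownConverter.register_page_converter` defined further down in this file:

```python
class LogFileConverter(DocumentConverter):
    """Illustrative only: treat .log files as plain text with a fixed title."""

    def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
        # Decline anything that is not a .log file.
        if kwargs.get("file_extension", "").lower() != ".log":
            return None
        with open(local_path, "rt", encoding="utf-8") as fh:
            return DocumentConverterResult(title="Log file", text_content=fh.read())


# Converters registered later are tried first, so a specific converter like
# this one should be registered after the generic ones:
#     converter = MarkdownConverter()
#     converter.register_page_converter(LogFileConverter())
```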
127 |
+
class PlainTextConverter(DocumentConverter):
|
128 |
+
"""Anything with content type text/plain"""
|
129 |
+
|
130 |
+
def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
|
131 |
+
# Guess the content type from any file extension that might be around
|
132 |
+
content_type, _ = mimetypes.guess_type("__placeholder" + kwargs.get("file_extension", ""))
|
133 |
+
|
134 |
+
# Only accept text files
|
135 |
+
if content_type is None:
|
136 |
+
return None
|
137 |
+
# elif "text/" not in content_type.lower():
|
138 |
+
# return None
|
139 |
+
|
140 |
+
text_content = ""
|
141 |
+
with open(local_path, "rt", encoding="utf-8") as fh:
|
142 |
+
text_content = fh.read()
|
143 |
+
return DocumentConverterResult(
|
144 |
+
title=None,
|
145 |
+
text_content=text_content,
|
146 |
+
)
|
147 |
+
|
148 |
+
|
149 |
+
class HtmlConverter(DocumentConverter):
|
150 |
+
"""Anything with content type text/html"""
|
151 |
+
|
152 |
+
def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
|
153 |
+
# Bail if not html
|
154 |
+
extension = kwargs.get("file_extension", "")
|
155 |
+
if extension.lower() not in [".html", ".htm"]:
|
156 |
+
return None
|
157 |
+
|
158 |
+
result = None
|
159 |
+
with open(local_path, "rt", encoding="utf-8") as fh:
|
160 |
+
result = self._convert(fh.read())
|
161 |
+
|
162 |
+
return result
|
163 |
+
|
164 |
+
def _convert(self, html_content: str) -> None | DocumentConverterResult:
|
165 |
+
"""Helper function that converts and HTML string."""
|
166 |
+
|
167 |
+
# Parse the string
|
168 |
+
soup = BeautifulSoup(html_content, "html.parser")
|
169 |
+
|
170 |
+
# Remove javascript and style blocks
|
171 |
+
for script in soup(["script", "style"]):
|
172 |
+
script.extract()
|
173 |
+
|
174 |
+
# Print only the main content
|
175 |
+
body_elm = soup.find("body")
|
176 |
+
webpage_text = ""
|
177 |
+
if body_elm:
|
178 |
+
webpage_text = _CustomMarkdownify().convert_soup(body_elm)
|
179 |
+
else:
|
180 |
+
webpage_text = _CustomMarkdownify().convert_soup(soup)
|
181 |
+
|
182 |
+
assert isinstance(webpage_text, str)
|
183 |
+
|
184 |
+
return DocumentConverterResult(
|
185 |
+
title=None if soup.title is None else soup.title.string, text_content=webpage_text
|
186 |
+
)
|
187 |
+
|
188 |
+
|
189 |
+
class WikipediaConverter(DocumentConverter):
|
190 |
+
"""Handle Wikipedia pages separately, focusing only on the main document content."""
|
191 |
+
|
192 |
+
def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
|
193 |
+
# Bail if not Wikipedia
|
194 |
+
extension = kwargs.get("file_extension", "")
|
195 |
+
if extension.lower() not in [".html", ".htm"]:
|
196 |
+
return None
|
197 |
+
url = kwargs.get("url", "")
|
198 |
+
if not re.search(r"^https?:\/\/[a-zA-Z]{2,3}\.wikipedia.org\/", url):
|
199 |
+
return None
|
200 |
+
|
201 |
+
# Parse the file
|
202 |
+
soup = None
|
203 |
+
with open(local_path, "rt", encoding="utf-8") as fh:
|
204 |
+
soup = BeautifulSoup(fh.read(), "html.parser")
|
205 |
+
|
206 |
+
# Remove javascript and style blocks
|
207 |
+
for script in soup(["script", "style"]):
|
208 |
+
script.extract()
|
209 |
+
|
210 |
+
# Print only the main content
|
211 |
+
body_elm = soup.find("div", {"id": "mw-content-text"})
|
212 |
+
title_elm = soup.find("span", {"class": "mw-page-title-main"})
|
213 |
+
|
214 |
+
webpage_text = ""
|
215 |
+
main_title = None if soup.title is None else soup.title.string
|
216 |
+
|
217 |
+
if body_elm:
|
218 |
+
# What's the title
|
219 |
+
if title_elm and len(title_elm) > 0:
|
220 |
+
main_title = title_elm.string # type: ignore
|
221 |
+
assert isinstance(main_title, str)
|
222 |
+
|
223 |
+
# Convert the page
|
224 |
+
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(body_elm)
|
225 |
+
else:
|
226 |
+
webpage_text = _CustomMarkdownify().convert_soup(soup)
|
227 |
+
|
228 |
+
return DocumentConverterResult(
|
229 |
+
title=main_title,
|
230 |
+
text_content=webpage_text,
|
231 |
+
)
|
232 |
+
|
233 |
+
|
234 |
+
class YouTubeConverter(DocumentConverter):
|
235 |
+
"""Handle YouTube specially, focusing on the video title, description, and transcript."""
|
236 |
+
|
237 |
+
def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
|
238 |
+
# Bail if not YouTube
|
239 |
+
extension = kwargs.get("file_extension", "")
|
240 |
+
if extension.lower() not in [".html", ".htm"]:
|
241 |
+
return None
|
242 |
+
url = kwargs.get("url", "")
|
243 |
+
if not url.startswith("https://www.youtube.com/watch?"):
|
244 |
+
return None
|
245 |
+
|
246 |
+
# Parse the file
|
247 |
+
soup = None
|
248 |
+
with open(local_path, "rt", encoding="utf-8") as fh:
|
249 |
+
soup = BeautifulSoup(fh.read(), "html.parser")
|
250 |
+
|
251 |
+
# Read the meta tags
|
252 |
+
assert soup.title is not None and soup.title.string is not None
|
253 |
+
metadata: dict[str, str] = {"title": soup.title.string}
|
254 |
+
for meta in soup(["meta"]):
|
255 |
+
for a in meta.attrs:
|
256 |
+
if a in ["itemprop", "property", "name"]:
|
257 |
+
metadata[meta[a]] = meta.get("content", "")
|
258 |
+
break
|
259 |
+
|
260 |
+
# We can also try to read the full description. This is more prone to breaking, since it reaches into the page implementation
|
261 |
+
try:
|
262 |
+
for script in soup(["script"]):
|
263 |
+
content = script.text
|
264 |
+
if "ytInitialData" in content:
|
265 |
+
lines = re.split(r"\r?\n", content)
|
266 |
+
obj_start = lines[0].find("{")
|
267 |
+
obj_end = lines[0].rfind("}")
|
268 |
+
if obj_start >= 0 and obj_end >= 0:
|
269 |
+
data = json.loads(lines[0][obj_start : obj_end + 1])
|
270 |
+
attrdesc = self._findKey(data, "attributedDescriptionBodyText") # type: ignore
|
271 |
+
if attrdesc:
|
272 |
+
metadata["description"] = str(attrdesc["content"])
|
273 |
+
break
|
274 |
+
except Exception:
|
275 |
+
pass
|
276 |
+
|
277 |
+
# Start preparing the page
|
278 |
+
webpage_text = "# YouTube\n"
|
279 |
+
|
280 |
+
title = self._get(metadata, ["title", "og:title", "name"]) # type: ignore
|
281 |
+
assert isinstance(title, str)
|
282 |
+
|
283 |
+
if title:
|
284 |
+
webpage_text += f"\n## {title}\n"
|
285 |
+
|
286 |
+
stats = ""
|
287 |
+
views = self._get(metadata, ["interactionCount"]) # type: ignore
|
288 |
+
if views:
|
289 |
+
stats += f"- **Views:** {views}\n"
|
290 |
+
|
291 |
+
keywords = self._get(metadata, ["keywords"]) # type: ignore
|
292 |
+
if keywords:
|
293 |
+
stats += f"- **Keywords:** {keywords}\n"
|
294 |
+
|
295 |
+
runtime = self._get(metadata, ["duration"]) # type: ignore
|
296 |
+
if runtime:
|
297 |
+
stats += f"- **Runtime:** {runtime}\n"
|
298 |
+
|
299 |
+
if len(stats) > 0:
|
300 |
+
webpage_text += f"\n### Video Metadata\n{stats}\n"
|
301 |
+
|
302 |
+
description = self._get(metadata, ["description", "og:description"]) # type: ignore
|
303 |
+
if description:
|
304 |
+
webpage_text += f"\n### Description\n{description}\n"
|
305 |
+
|
306 |
+
transcript_text = ""
|
307 |
+
parsed_url = urlparse(url) # type: ignore
|
308 |
+
params = parse_qs(parsed_url.query) # type: ignore
|
309 |
+
if "v" in params:
|
310 |
+
assert isinstance(params["v"][0], str)
|
311 |
+
video_id = str(params["v"][0])
|
312 |
+
try:
|
313 |
+
# Must be a single transcript.
|
314 |
+
transcript = YouTubeTranscriptApi.get_transcript(video_id) # type: ignore
|
315 |
+
# transcript_text = " ".join([part["text"] for part in transcript]) # type: ignore
|
316 |
+
# Alternative formatting:
|
317 |
+
transcript_text = SRTFormatter().format_transcript(transcript)
|
318 |
+
except Exception:
|
319 |
+
pass
|
320 |
+
if transcript_text:
|
321 |
+
webpage_text += f"\n### Transcript\n{transcript_text}\n"
|
322 |
+
|
323 |
+
title = title if title else soup.title.string
|
324 |
+
assert isinstance(title, str)
|
325 |
+
|
326 |
+
return DocumentConverterResult(
|
327 |
+
title=title,
|
328 |
+
text_content=webpage_text,
|
329 |
+
)
|
330 |
+
|
331 |
+
def _get(self, metadata: dict[str, str], keys: list[str], default: str | None = None) -> str | None:
|
332 |
+
for k in keys:
|
333 |
+
if k in metadata:
|
334 |
+
return metadata[k]
|
335 |
+
return default
|
336 |
+
|
337 |
+
def _findKey(self, json: Any, key: str) -> str | None: # TODO: Fix json type
|
338 |
+
if isinstance(json, list):
|
339 |
+
for elm in json:
|
340 |
+
ret = self._findKey(elm, key)
|
341 |
+
if ret is not None:
|
342 |
+
return ret
|
343 |
+
elif isinstance(json, dict):
|
344 |
+
for k in json:
|
345 |
+
if k == key:
|
346 |
+
return json[k]
|
347 |
+
else:
|
348 |
+
ret = self._findKey(json[k], key)
|
349 |
+
if ret is not None:
|
350 |
+
return ret
|
351 |
+
return None
|
352 |
+
|
353 |
+
|
354 |
+
class PdfConverter(DocumentConverter):
|
355 |
+
"""
|
356 |
+
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
|
357 |
+
"""
|
358 |
+
|
359 |
+
def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
|
360 |
+
# Bail if not a PDF
|
361 |
+
extension = kwargs.get("file_extension", "")
|
362 |
+
if extension.lower() != ".pdf":
|
363 |
+
return None
|
364 |
+
|
365 |
+
return DocumentConverterResult(
|
366 |
+
title=None,
|
367 |
+
text_content=pdfminer.high_level.extract_text(local_path),
|
368 |
+
)
|
369 |
+
|
370 |
+
|
371 |
+
class DocxConverter(HtmlConverter):
|
372 |
+
"""
|
373 |
+
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
|
374 |
+
"""
|
375 |
+
|
376 |
+
def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
|
377 |
+
# Bail if not a DOCX
|
378 |
+
extension = kwargs.get("file_extension", "")
|
379 |
+
if extension.lower() != ".docx":
|
380 |
+
return None
|
381 |
+
|
382 |
+
result = None
|
383 |
+
with open(local_path, "rb") as docx_file:
|
384 |
+
result = mammoth.convert_to_html(docx_file)
|
385 |
+
html_content = result.value
|
386 |
+
result = self._convert(html_content)
|
387 |
+
|
388 |
+
return result
|
389 |
+
|
390 |
+
|
391 |
+
class XlsxConverter(HtmlConverter):
|
392 |
+
"""
|
393 |
+
Converts XLSX files to Markdown, with each sheet presented as a separate Markdown table.
|
394 |
+
"""
|
395 |
+
|
396 |
+
def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
|
397 |
+
# Bail if not an XLSX
|
398 |
+
extension = kwargs.get("file_extension", "")
|
399 |
+
if extension.lower() not in [".xlsx", ".xls"]:
|
400 |
+
return None
|
401 |
+
|
402 |
+
sheets = pd.read_excel(local_path, sheet_name=None)
|
403 |
+
md_content = ""
|
404 |
+
for s in sheets:
|
405 |
+
md_content += f"## {s}\n"
|
406 |
+
html_content = sheets[s].to_html(index=False)
|
407 |
+
md_content += self._convert(html_content).text_content.strip() + "\n\n"
|
408 |
+
|
409 |
+
return DocumentConverterResult(
|
410 |
+
title=None,
|
411 |
+
text_content=md_content.strip(),
|
412 |
+
)
|
413 |
+
|
414 |
+
|
415 |
+
class PptxConverter(HtmlConverter):
|
416 |
+
"""
|
417 |
+
Converts PPTX files to Markdown. Supports headings, tables, and images with alt text.
|
418 |
+
"""
|
419 |
+
|
420 |
+
def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
|
421 |
+
# Bail if not a PPTX
|
422 |
+
extension = kwargs.get("file_extension", "")
|
423 |
+
if extension.lower() != ".pptx":
|
424 |
+
return None
|
425 |
+
|
426 |
+
md_content = ""
|
427 |
+
|
428 |
+
presentation = pptx.Presentation(local_path)
|
429 |
+
slide_num = 0
|
430 |
+
for slide in presentation.slides:
|
431 |
+
slide_num += 1
|
432 |
+
|
433 |
+
md_content += f"\n\n<!-- Slide number: {slide_num} -->\n"
|
434 |
+
|
435 |
+
title = slide.shapes.title
|
436 |
+
for shape in slide.shapes:
|
437 |
+
# Pictures
|
438 |
+
if self._is_picture(shape):
|
439 |
+
# https://github.com/scanny/python-pptx/pull/512#issuecomment-1713100069
|
440 |
+
alt_text = ""
|
441 |
+
try:
|
442 |
+
alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")
|
443 |
+
except Exception:
|
444 |
+
pass
|
445 |
+
|
446 |
+
# A placeholder name
|
447 |
+
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
448 |
+
md_content += "\n\n"
|
449 |
+
|
450 |
+
# Tables
|
451 |
+
if self._is_table(shape):
|
452 |
+
html_table = "<html><body><table>"
|
453 |
+
first_row = True
|
454 |
+
for row in shape.table.rows:
|
455 |
+
html_table += "<tr>"
|
456 |
+
for cell in row.cells:
|
457 |
+
if first_row:
|
458 |
+
html_table += "<th>" + html.escape(cell.text) + "</th>"
|
459 |
+
else:
|
460 |
+
html_table += "<td>" + html.escape(cell.text) + "</td>"
|
461 |
+
html_table += "</tr>"
|
462 |
+
first_row = False
|
463 |
+
html_table += "</table></body></html>"
|
464 |
+
md_content += "\n" + self._convert(html_table).text_content.strip() + "\n"
|
465 |
+
|
466 |
+
# Text areas
|
467 |
+
elif shape.has_text_frame:
|
468 |
+
if shape == title:
|
469 |
+
md_content += "# " + shape.text.lstrip() + "\n"
|
470 |
+
else:
|
471 |
+
md_content += shape.text + "\n"
|
472 |
+
|
473 |
+
md_content = md_content.strip()
|
474 |
+
|
475 |
+
if slide.has_notes_slide:
|
476 |
+
md_content += "\n\n### Notes:\n"
|
477 |
+
notes_frame = slide.notes_slide.notes_text_frame
|
478 |
+
if notes_frame is not None:
|
479 |
+
md_content += notes_frame.text
|
480 |
+
md_content = md_content.strip()
|
481 |
+
|
482 |
+
return DocumentConverterResult(
|
483 |
+
title=None,
|
484 |
+
text_content=md_content.strip(),
|
485 |
+
)
|
486 |
+
|
487 |
+
def _is_picture(self, shape):
|
488 |
+
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PICTURE:
|
489 |
+
return True
|
490 |
+
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.PLACEHOLDER:
|
491 |
+
if hasattr(shape, "image"):
|
492 |
+
return True
|
493 |
+
return False
|
494 |
+
|
495 |
+
def _is_table(self, shape):
|
496 |
+
if shape.shape_type == pptx.enum.shapes.MSO_SHAPE_TYPE.TABLE:
|
497 |
+
return True
|
498 |
+
return False
|
499 |
+
|
500 |
+
|
501 |
+
class MediaConverter(DocumentConverter):
|
502 |
+
"""
|
503 |
+
Abstract class for multi-modal media (e.g., images and audio)
|
504 |
+
"""
|
505 |
+
|
506 |
+
def _get_metadata(self, local_path):
|
507 |
+
exiftool = shutil.which("exiftool")
|
508 |
+
if not exiftool:
|
509 |
+
return None
|
510 |
+
else:
|
511 |
+
try:
|
512 |
+
result = subprocess.run([exiftool, "-json", local_path], capture_output=True, text=True).stdout
|
513 |
+
return json.loads(result)[0]
|
514 |
+
except Exception:
|
515 |
+
return None
|
516 |
+
|
517 |
+
|
518 |
+
class WavConverter(MediaConverter):
|
519 |
+
"""
|
520 |
+
Converts WAV files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` is installed).
|
521 |
+
"""
|
522 |
+
|
523 |
+
def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
|
524 |
+
# Bail if not a WAV
|
525 |
+
extension = kwargs.get("file_extension", "")
|
526 |
+
if extension.lower() != ".wav":
|
527 |
+
return None
|
528 |
+
|
529 |
+
md_content = ""
|
530 |
+
|
531 |
+
# Add metadata
|
532 |
+
metadata = self._get_metadata(local_path)
|
533 |
+
if metadata:
|
534 |
+
for f in [
|
535 |
+
"Title",
|
536 |
+
"Artist",
|
537 |
+
"Author",
|
538 |
+
"Band",
|
539 |
+
"Album",
|
540 |
+
"Genre",
|
541 |
+
"Track",
|
542 |
+
"DateTimeOriginal",
|
543 |
+
"CreateDate",
|
544 |
+
"Duration",
|
545 |
+
]:
|
546 |
+
if f in metadata:
|
547 |
+
md_content += f"{f}: {metadata[f]}\n"
|
548 |
+
|
549 |
+
# Transcribe
|
550 |
+
try:
|
551 |
+
transcript = self._transcribe_audio(local_path)
|
552 |
+
md_content += "\n\n### Audio Transcript:\n" + ("[No speech detected]" if transcript == "" else transcript)
|
553 |
+
except Exception:
|
554 |
+
md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
|
555 |
+
|
556 |
+
return DocumentConverterResult(
|
557 |
+
title=None,
|
558 |
+
text_content=md_content.strip(),
|
559 |
+
)
|
560 |
+
|
561 |
+
def _transcribe_audio(self, local_path) -> str:
|
562 |
+
recognizer = sr.Recognizer()
|
563 |
+
with sr.AudioFile(local_path) as source:
|
564 |
+
audio = recognizer.record(source)
|
565 |
+
return recognizer.recognize_google(audio).strip()
|
566 |
+
|
567 |
+
|
568 |
+
class Mp3Converter(WavConverter):
|
569 |
+
"""
|
570 |
+
Converts MP3 and M4A files to markdown via extraction of metadata (if `exiftool` is installed), and speech transcription (if `speech_recognition` AND `pydub` are installed).
|
571 |
+
"""
|
572 |
+
|
573 |
+
def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
|
574 |
+
# Bail if not an MP3 or M4A
|
575 |
+
extension = kwargs.get("file_extension", "")
|
576 |
+
if extension.lower() not in [".mp3", ".m4a"]:
|
577 |
+
return None
|
578 |
+
|
579 |
+
md_content = ""
|
580 |
+
|
581 |
+
# Add metadata
|
582 |
+
metadata = self._get_metadata(local_path)
|
583 |
+
if metadata:
|
584 |
+
for f in [
|
585 |
+
"Title",
|
586 |
+
"Artist",
|
587 |
+
"Author",
|
588 |
+
"Band",
|
589 |
+
"Album",
|
590 |
+
"Genre",
|
591 |
+
"Track",
|
592 |
+
"DateTimeOriginal",
|
593 |
+
"CreateDate",
|
594 |
+
"Duration",
|
595 |
+
]:
|
596 |
+
if f in metadata:
|
597 |
+
md_content += f"{f}: {metadata[f]}\n"
|
598 |
+
|
599 |
+
# Transcribe
|
600 |
+
handle, temp_path = tempfile.mkstemp(suffix=".wav")
|
601 |
+
os.close(handle)
|
602 |
+
try:
|
603 |
+
if extension.lower() == ".mp3":
|
604 |
+
sound = pydub.AudioSegment.from_mp3(local_path)
|
605 |
+
else:
|
606 |
+
sound = pydub.AudioSegment.from_file(local_path, format="m4a")
|
607 |
+
sound.export(temp_path, format="wav")
|
608 |
+
|
609 |
+
_args = dict()
|
610 |
+
_args.update(kwargs)
|
611 |
+
_args["file_extension"] = ".wav"
|
612 |
+
|
613 |
+
try:
|
614 |
+
transcript = super()._transcribe_audio(temp_path).strip()
|
615 |
+
md_content += "\n\n### Audio Transcript:\n" + (
|
616 |
+
"[No speech detected]" if transcript == "" else transcript
|
617 |
+
)
|
618 |
+
except Exception:
|
619 |
+
md_content += "\n\n### Audio Transcript:\nError. Could not transcribe this audio."
|
620 |
+
|
621 |
+
finally:
|
622 |
+
os.unlink(temp_path)
|
623 |
+
|
624 |
+
# Return the result
|
625 |
+
return DocumentConverterResult(
|
626 |
+
title=None,
|
627 |
+
text_content=md_content.strip(),
|
628 |
+
)
|
629 |
+
|
630 |
+
|
631 |
+
class ZipConverter(DocumentConverter):
|
632 |
+
"""
|
633 |
+
Extracts ZIP files to a permanent local directory and returns a listing of extracted files.
|
634 |
+
"""
|
635 |
+
|
636 |
+
def __init__(self, extract_dir: str = "downloads"):
|
637 |
+
"""
|
638 |
+
Initialize with path to extraction directory.
|
639 |
+
|
640 |
+
Args:
|
641 |
+
extract_dir: The directory where files will be extracted. Defaults to "downloads"
|
642 |
+
"""
|
643 |
+
self.extract_dir = extract_dir
|
644 |
+
# Create the extraction directory if it doesn't exist
|
645 |
+
os.makedirs(self.extract_dir, exist_ok=True)
|
646 |
+
|
647 |
+
def convert(self, local_path: str, **kwargs: Any) -> None | DocumentConverterResult:
|
648 |
+
# Bail if not a ZIP file
|
649 |
+
extension = kwargs.get("file_extension", "")
|
650 |
+
if extension.lower() != ".zip":
|
651 |
+
return None
|
652 |
+
|
653 |
+
# Verify it's actually a ZIP file
|
654 |
+
if not zipfile.is_zipfile(local_path):
|
655 |
+
return None
|
656 |
+
|
657 |
+
# Extract all files and build list
|
658 |
+
extracted_files = []
|
659 |
+
with zipfile.ZipFile(local_path, "r") as zip_ref:
|
660 |
+
# Extract all files
|
661 |
+
zip_ref.extractall(self.extract_dir)
|
662 |
+
# Get list of all files
|
663 |
+
for file_path in zip_ref.namelist():
|
664 |
+
# Skip directories
|
665 |
+
if not file_path.endswith("/"):
|
666 |
+
extracted_files.append(self.extract_dir + "/" + file_path)
|
667 |
+
|
668 |
+
# Sort files for consistent output
|
669 |
+
extracted_files.sort()
|
670 |
+
|
671 |
+
# Build the markdown content
|
672 |
+
md_content = "Downloaded the following files:\n"
|
673 |
+
for file in extracted_files:
|
674 |
+
md_content += f"* {file}\n"
|
675 |
+
|
676 |
+
return DocumentConverterResult(title="Extracted Files", text_content=md_content.strip())
|
677 |
+
|
678 |
+
|
679 |
+
class ImageConverter(MediaConverter):
|
680 |
+
"""
|
681 |
+
Converts images to markdown via extraction of metadata (if `exiftool` is installed), OCR (if `easyocr` is installed), and description via a multimodal LLM (if an mlm_client is configured).
|
682 |
+
"""
|
683 |
+
|
684 |
+
def convert(self, local_path, **kwargs) -> None | DocumentConverterResult:
|
685 |
+
# Bail if not an image
|
686 |
+
extension = kwargs.get("file_extension", "")
|
687 |
+
if extension.lower() not in [".jpg", ".jpeg", ".png"]:
|
688 |
+
return None
|
689 |
+
|
690 |
+
md_content = ""
|
691 |
+
|
692 |
+
# Add metadata
|
693 |
+
metadata = self._get_metadata(local_path)
|
694 |
+
if metadata:
|
695 |
+
for f in [
|
696 |
+
"ImageSize",
|
697 |
+
"Title",
|
698 |
+
"Caption",
|
699 |
+
"Description",
|
700 |
+
"Keywords",
|
701 |
+
"Artist",
|
702 |
+
"Author",
|
703 |
+
"DateTimeOriginal",
|
704 |
+
"CreateDate",
|
705 |
+
"GPSPosition",
|
706 |
+
]:
|
707 |
+
if f in metadata:
|
708 |
+
md_content += f"{f}: {metadata[f]}\n"
|
709 |
+
|
710 |
+
# Try describing the image with GPTV
|
711 |
+
mlm_client = kwargs.get("mlm_client")
|
712 |
+
mlm_model = kwargs.get("mlm_model")
|
713 |
+
if mlm_client is not None and mlm_model is not None:
|
714 |
+
md_content += (
|
715 |
+
"\n# Description:\n"
|
716 |
+
+ self._get_mlm_description(
|
717 |
+
local_path, extension, mlm_client, mlm_model, prompt=kwargs.get("mlm_prompt")
|
718 |
+
).strip()
|
719 |
+
+ "\n"
|
720 |
+
)
|
721 |
+
|
722 |
+
return DocumentConverterResult(
|
723 |
+
title=None,
|
724 |
+
text_content=md_content,
|
725 |
+
)
|
726 |
+
|
727 |
+
def _get_mlm_description(self, local_path, extension, client, model, prompt=None):
|
728 |
+
if prompt is None or prompt.strip() == "":
|
729 |
+
prompt = "Write a detailed caption for this image."
|
730 |
+
|
731 |
+
sys.stderr.write(f"MLM Prompt:\n{prompt}\n")
|
732 |
+
|
733 |
+
data_uri = ""
|
734 |
+
with open(local_path, "rb") as image_file:
|
735 |
+
content_type, encoding = mimetypes.guess_type("_dummy" + extension)
|
736 |
+
if content_type is None:
|
737 |
+
content_type = "image/jpeg"
|
738 |
+
image_base64 = base64.b64encode(image_file.read()).decode("utf-8")
|
739 |
+
data_uri = f"data:{content_type};base64,{image_base64}"
|
740 |
+
|
741 |
+
messages = [
|
742 |
+
{
|
743 |
+
"role": "user",
|
744 |
+
"content": [
|
745 |
+
{"type": "text", "text": prompt},
|
746 |
+
{
|
747 |
+
"type": "image_url",
|
748 |
+
"image_url": {
|
749 |
+
"url": data_uri,
|
750 |
+
},
|
751 |
+
},
|
752 |
+
],
|
753 |
+
}
|
754 |
+
]
|
755 |
+
|
756 |
+
response = client.chat.completions.create(model=model, messages=messages)
|
757 |
+
return response.choices[0].message.content
|
758 |
+
|
759 |
+
|
760 |
+
class FileConversionException(Exception):
|
761 |
+
pass
|
762 |
+
|
763 |
+
|
764 |
+
class UnsupportedFormatException(Exception):
|
765 |
+
pass
|
766 |
+
|
767 |
+
|
768 |
+
class MarkdownConverter:
|
769 |
+
"""(In preview) An extremely simple text-based document reader, suitable for LLM use.
|
770 |
+
This reader will convert common file-types or webpages to Markdown."""
|
771 |
+
|
772 |
+
def __init__(
|
773 |
+
self,
|
774 |
+
requests_session: requests.Session | None = None,
|
775 |
+
mlm_client: Any | None = None,
|
776 |
+
mlm_model: Any | None = None,
|
777 |
+
):
|
778 |
+
if requests_session is None:
|
779 |
+
self._requests_session = requests.Session()
|
780 |
+
else:
|
781 |
+
self._requests_session = requests_session
|
782 |
+
|
783 |
+
self._mlm_client = mlm_client
|
784 |
+
self._mlm_model = mlm_model
|
785 |
+
|
786 |
+
self._page_converters: list[DocumentConverter] = []
|
787 |
+
|
788 |
+
# Register converters for successful browsing operations
|
789 |
+
# Later registrations are tried first / take higher priority than earlier registrations
|
790 |
+
# To this end, the most specific converters should appear below the most generic converters
|
791 |
+
self.register_page_converter(PlainTextConverter())
|
792 |
+
self.register_page_converter(HtmlConverter())
|
793 |
+
self.register_page_converter(WikipediaConverter())
|
794 |
+
self.register_page_converter(YouTubeConverter())
|
795 |
+
self.register_page_converter(DocxConverter())
|
796 |
+
self.register_page_converter(XlsxConverter())
|
797 |
+
self.register_page_converter(PptxConverter())
|
798 |
+
self.register_page_converter(WavConverter())
|
799 |
+
self.register_page_converter(Mp3Converter())
|
800 |
+
self.register_page_converter(ImageConverter())
|
801 |
+
self.register_page_converter(ZipConverter())
|
802 |
+
self.register_page_converter(PdfConverter())
|
803 |
+
|
804 |
+
def convert(
|
805 |
+
self, source: str | requests.Response, **kwargs: Any
|
806 |
+
) -> DocumentConverterResult: # TODO: deal with kwargs
|
807 |
+
"""
|
808 |
+
Args:
|
809 |
+
- source: can be a string representing a path or URL, or a requests.Response object
|
810 |
+
- extension: specifies the file extension to use when interpreting the file. If None, infer from source (path, uri, content-type, etc.)
|
811 |
+
"""
|
812 |
+
|
813 |
+
# Local path or url
|
814 |
+
if isinstance(source, str):
|
815 |
+
if source.startswith("http://") or source.startswith("https://") or source.startswith("file://"):
|
816 |
+
return self.convert_url(source, **kwargs)
|
817 |
+
else:
|
818 |
+
return self.convert_local(source, **kwargs)
|
819 |
+
# Request response
|
820 |
+
elif isinstance(source, requests.Response):
|
821 |
+
return self.convert_response(source, **kwargs)
|
822 |
+
|
823 |
+
def convert_local(self, path: str, **kwargs: Any) -> DocumentConverterResult: # TODO: deal with kwargs
|
824 |
+
# Prepare a list of extensions to try (in order of priority)
|
825 |
+
ext = kwargs.get("file_extension")
|
826 |
+
extensions = [ext] if ext is not None else []
|
827 |
+
|
828 |
+
# Get extension alternatives from the path and puremagic
|
829 |
+
base, ext = os.path.splitext(path)
|
830 |
+
self._append_ext(extensions, ext)
|
831 |
+
self._append_ext(extensions, self._guess_ext_magic(path))
|
832 |
+
|
833 |
+
# Convert
|
834 |
+
return self._convert(path, extensions, **kwargs)
|
835 |
+
|
836 |
+
# TODO what should stream's type be?
|
837 |
+
def convert_stream(self, stream: Any, **kwargs: Any) -> DocumentConverterResult: # TODO: deal with kwargs
|
838 |
+
# Prepare a list of extensions to try (in order of priority)
|
839 |
+
ext = kwargs.get("file_extension")
|
840 |
+
extensions = [ext] if ext is not None else []
|
841 |
+
|
842 |
+
# Save the file locally to a temporary file. It will be deleted before this method exits
|
843 |
+
handle, temp_path = tempfile.mkstemp()
|
844 |
+
fh = os.fdopen(handle, "wb")
|
845 |
+
result = None
|
846 |
+
try:
|
847 |
+
# Write to the temporary file
|
848 |
+
content = stream.read()
|
849 |
+
if isinstance(content, str):
|
850 |
+
fh.write(content.encode("utf-8"))
|
851 |
+
else:
|
852 |
+
fh.write(content)
|
853 |
+
fh.close()
|
854 |
+
|
855 |
+
# Use puremagic to check for more extension options
|
856 |
+
self._append_ext(extensions, self._guess_ext_magic(temp_path))
|
857 |
+
|
858 |
+
# Convert
|
859 |
+
result = self._convert(temp_path, extensions, **kwargs)
|
860 |
+
# Clean up
|
861 |
+
finally:
|
862 |
+
try:
|
863 |
+
fh.close()
|
864 |
+
except Exception:
|
865 |
+
pass
|
866 |
+
os.unlink(temp_path)
|
867 |
+
|
868 |
+
return result
|
869 |
+
|
870 |
+
def convert_url(self, url: str, **kwargs: Any) -> DocumentConverterResult: # TODO: fix kwargs type
|
871 |
+
# Send a HTTP request to the URL
|
872 |
+
user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
|
873 |
+
response = self._requests_session.get(url, stream=True, headers={"User-Agent": user_agent})
|
874 |
+
response.raise_for_status()
|
875 |
+
return self.convert_response(response, **kwargs)
|
876 |
+
|
877 |
+
def convert_response(
|
878 |
+
self, response: requests.Response, **kwargs: Any
|
879 |
+
) -> DocumentConverterResult: # TODO fix kwargs type
|
880 |
+
# Prepare a list of extensions to try (in order of priority)
|
881 |
+
ext = kwargs.get("file_extension")
|
882 |
+
extensions = [ext] if ext is not None else []
|
883 |
+
|
884 |
+
# Guess from the mimetype
|
885 |
+
content_type = response.headers.get("content-type", "").split(";")[0]
|
886 |
+
self._append_ext(extensions, mimetypes.guess_extension(content_type))
|
887 |
+
|
888 |
+
# Read the content disposition if there is one
|
889 |
+
content_disposition = response.headers.get("content-disposition", "")
|
890 |
+
m = re.search(r"filename=([^;]+)", content_disposition)
|
891 |
+
if m:
|
892 |
+
base, ext = os.path.splitext(m.group(1).strip("\"'"))
|
893 |
+
self._append_ext(extensions, ext)
|
894 |
+
|
895 |
+
# Read the extension from the path
|
896 |
+
base, ext = os.path.splitext(urlparse(response.url).path)
|
897 |
+
self._append_ext(extensions, ext)
|
898 |
+
|
899 |
+
# Save the file locally to a temporary file. It will be deleted before this method exits
|
900 |
+
handle, temp_path = tempfile.mkstemp()
|
901 |
+
fh = os.fdopen(handle, "wb")
|
902 |
+
result = None
|
903 |
+
try:
|
904 |
+
# Download the file
|
905 |
+
for chunk in response.iter_content(chunk_size=512):
|
906 |
+
fh.write(chunk)
|
907 |
+
fh.close()
|
908 |
+
|
909 |
+
# Use puremagic to check for more extension options
|
910 |
+
self._append_ext(extensions, self._guess_ext_magic(temp_path))
|
911 |
+
|
912 |
+
# Convert
|
913 |
+
result = self._convert(temp_path, extensions, url=response.url)
|
914 |
+
except Exception as e:
|
915 |
+
print(f"Error in converting: {e}")
|
916 |
+
|
917 |
+
# Clean up
|
918 |
+
finally:
|
919 |
+
try:
|
920 |
+
fh.close()
|
921 |
+
except Exception:
|
922 |
+
pass
|
923 |
+
os.unlink(temp_path)
|
924 |
+
|
925 |
+
return result
|
926 |
+
|
927 |
+
def _convert(self, local_path: str, extensions: list[str | None], **kwargs) -> DocumentConverterResult:
|
928 |
+
error_trace = ""
|
929 |
+
for ext in extensions + [None]: # Try last with no extension
|
930 |
+
for converter in self._page_converters:
|
931 |
+
_kwargs = copy.deepcopy(kwargs)
|
932 |
+
|
933 |
+
# Overwrite file_extension appropriately
|
934 |
+
if ext is None:
|
935 |
+
if "file_extension" in _kwargs:
|
936 |
+
del _kwargs["file_extension"]
|
937 |
+
else:
|
938 |
+
_kwargs.update({"file_extension": ext})
|
939 |
+
|
940 |
+
# Copy any additional global options
|
941 |
+
if "mlm_client" not in _kwargs and self._mlm_client is not None:
|
942 |
+
_kwargs["mlm_client"] = self._mlm_client
|
943 |
+
|
944 |
+
if "mlm_model" not in _kwargs and self._mlm_model is not None:
|
945 |
+
_kwargs["mlm_model"] = self._mlm_model
|
946 |
+
|
947 |
+
# Initialize res to None before try-except block
|
948 |
+
res = None
|
949 |
+
# If we hit an error log it and keep trying
|
950 |
+
try:
|
951 |
+
res = converter.convert(local_path, **_kwargs)
|
952 |
+
except Exception:
|
953 |
+
error_trace = ("\n\n" + traceback.format_exc()).strip()
|
954 |
+
|
955 |
+
if res is not None:
|
956 |
+
# Normalize the content
|
957 |
+
res.text_content = "\n".join([line.rstrip() for line in re.split(r"\r?\n", res.text_content)])
|
958 |
+
res.text_content = re.sub(r"\n{3,}", "\n\n", res.text_content)
|
959 |
+
|
960 |
+
# Todo
|
961 |
+
return res
|
962 |
+
|
963 |
+
# If we got this far without success, report any exceptions
|
964 |
+
if len(error_trace) > 0:
|
965 |
+
raise FileConversionException(
|
966 |
+
f"Could not convert '{local_path}' to Markdown. File type was recognized as {extensions}. While converting the file, the following error was encountered:\n\n{error_trace}"
|
967 |
+
)
|
968 |
+
|
969 |
+
# Nothing can handle it!
|
970 |
+
raise UnsupportedFormatException(
|
971 |
+
f"Could not convert '{local_path}' to Markdown. The formats {extensions} are not supported."
|
972 |
+
)
|
973 |
+
|
974 |
+
def _append_ext(self, extensions, ext):
|
975 |
+
"""Append a unique non-None, non-empty extension to a list of extensions."""
|
976 |
+
if ext is None:
|
977 |
+
return
|
978 |
+
ext = ext.strip()
|
979 |
+
if ext == "":
|
980 |
+
return
|
981 |
+
# if ext not in extensions:
|
982 |
+
if True:
|
983 |
+
extensions.append(ext)
|
984 |
+
|
985 |
+
def _guess_ext_magic(self, path):
|
986 |
+
"""Use puremagic (a Python implementation of libmagic) to guess a file's extension based on the first few bytes."""
|
987 |
+
# Use puremagic to guess
|
988 |
+
try:
|
989 |
+
guesses = puremagic.magic_file(path)
|
990 |
+
if len(guesses) > 0:
|
991 |
+
ext = guesses[0].extension.strip()
|
992 |
+
if len(ext) > 0:
|
993 |
+
return ext
|
994 |
+
except FileNotFoundError:
|
995 |
+
pass
|
996 |
+
except IsADirectoryError:
|
997 |
+
pass
|
998 |
+
except PermissionError:
|
999 |
+
pass
|
1000 |
+
return None
|
1001 |
+
|
1002 |
+
def register_page_converter(self, converter: DocumentConverter) -> None:
|
1003 |
+
"""Register a page text converter."""
|
1004 |
+
self._page_converters.insert(0, converter)
|
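A short usage sketch for the converter added above. `MarkdownConverter`, `convert`, and `DocumentConverterResult` are as defined in this file; the file path and URL are placeholders, and the `scripts.mdconvert` import path assumes this commit's layout:

```python
from scripts.mdconvert import MarkdownConverter

converter = MarkdownConverter()

# Local file: the extension and puremagic's signature guess decide which
# registered converter handles it (most specific converters are tried first).
result = converter.convert("example.pdf")          # placeholder path
print(result.title)
print(result.text_content[:500])

# URL: the Content-Type header, Content-Disposition filename, and URL path
# are all used as extension hints before conversion.
page = converter.convert("https://en.wikipedia.org/wiki/Markdown")
print(page.title)
```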
scripts/reformulator.py
ADDED
@@ -0,0 +1,86 @@
1 |
+
# Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
|
2 |
+
# https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
|
3 |
+
import copy
|
4 |
+
|
5 |
+
from smolagents.models import MessageRole, Model
|
6 |
+
|
7 |
+
|
8 |
+
def prepare_response(original_task: str, inner_messages, reformulation_model: Model) -> str:
|
9 |
+
messages = [
|
10 |
+
{
|
11 |
+
"role": MessageRole.SYSTEM,
|
12 |
+
"content": [
|
13 |
+
{
|
14 |
+
"type": "text",
|
15 |
+
"text": f"""Earlier you were asked the following:
|
16 |
+
|
17 |
+
{original_task}
|
18 |
+
|
19 |
+
Your team then worked diligently to address that request. Read below a transcript of that conversation:""",
|
20 |
+
}
|
21 |
+
],
|
22 |
+
}
|
23 |
+
]
|
24 |
+
|
25 |
+
# The first message just repeats the question, so remove it
|
26 |
+
# if len(inner_messages) > 1:
|
27 |
+
# del inner_messages[0]
|
28 |
+
|
29 |
+
# copy them to this context
|
30 |
+
try:
|
31 |
+
for message in inner_messages:
|
32 |
+
if not message.get("content"):
|
33 |
+
continue
|
34 |
+
message = copy.deepcopy(message)
|
35 |
+
message["role"] = MessageRole.USER
|
36 |
+
messages.append(message)
|
37 |
+
except Exception:
|
38 |
+
messages += [{"role": MessageRole.ASSISTANT, "content": str(inner_messages)}]
|
39 |
+
|
40 |
+
# ask for the final answer
|
41 |
+
messages.append(
|
42 |
+
{
|
43 |
+
"role": MessageRole.USER,
|
44 |
+
"content": [
|
45 |
+
{
|
46 |
+
"type": "text",
|
47 |
+
"text": f"""
|
48 |
+
Read the above conversation and output a FINAL ANSWER to the question. The question is repeated here for convenience:
|
49 |
+
|
50 |
+
{original_task}
|
51 |
+
|
52 |
+
To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER]
|
53 |
+
Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
|
54 |
+
ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
|
55 |
+
If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and DO NOT INCLUDE UNITS such as $ or USD or percent signs unless specified otherwise.
|
56 |
+
If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
|
57 |
+
If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
|
58 |
+
If you are unable to determine the final answer, output 'FINAL ANSWER: Unable to determine'
|
59 |
+
""",
|
60 |
+
}
|
61 |
+
],
|
62 |
+
}
|
63 |
+
)
|
64 |
+
|
65 |
+
response = reformulation_model(messages).content
|
66 |
+
|
67 |
+
final_answer = response.split("FINAL ANSWER: ")[-1].strip()
|
68 |
+
print("> Reformulated answer: ", final_answer)
|
69 |
+
|
70 |
+
# if "unable to determine" in final_answer.lower():
|
71 |
+
# messages.append({"role": MessageRole.ASSISTANT, "content": response })
|
72 |
+
# messages.append({"role": MessageRole.USER, "content": [{"type": "text", "text": """
|
73 |
+
# I understand that a definitive answer could not be determined. Please make a well-informed EDUCATED GUESS based on the conversation.
|
74 |
+
|
75 |
+
# To output the educated guess, use the following template: EDUCATED GUESS: [YOUR EDUCATED GUESS]
|
76 |
+
# Your EDUCATED GUESS should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. DO NOT OUTPUT 'I don't know', 'Unable to determine', etc.
|
77 |
+
# ADDITIONALLY, your EDUCATED GUESS MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
|
78 |
+
# If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
|
79 |
+
# If you are asked for a string, don't use articles or abbreviations (e.g. cit for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
|
80 |
+
# If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
|
81 |
+
# """.strip()}]})
|
82 |
+
|
83 |
+
# response = model(messages).content
|
84 |
+
# print("\n>>>Making an educated guess.\n", response)
|
85 |
+
# final_answer = response.split("EDUCATED GUESS: ")[-1].strip()
|
86 |
+
return final_answer
|
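A usage sketch for `prepare_response`: it takes the original task, the transcript of an agent run, and a smolagents `Model`, and returns the text after the final `FINAL ANSWER:` marker. The model id below is a placeholder, and the method used to export the agent's transcript (`write_memory_to_messages` here) may differ between smolagents versions:

```python
from smolagents import CodeAgent, LiteLLMModel

from scripts.reformulator import prepare_response

model = LiteLLMModel(model_id="gpt-4o")        # placeholder model id
agent = CodeAgent(tools=[], model=model)

question = "What is the capital of France?"    # placeholder task
agent.run(question)

# Condense the whole run into a single, strictly formatted final answer.
final_answer = prepare_response(
    original_task=question,
    inner_messages=agent.write_memory_to_messages(),
    reformulation_model=model,
)
print(final_answer)
```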
scripts/run_agents.py
ADDED
@@ -0,0 +1,87 @@
1 |
+
import json
|
2 |
+
import os
|
3 |
+
import shutil
|
4 |
+
import textwrap
|
5 |
+
from pathlib import Path
|
6 |
+
|
7 |
+
# import tqdm.asyncio
|
8 |
+
from smolagents.utils import AgentError
|
9 |
+
|
10 |
+
|
11 |
+
def serialize_agent_error(obj):
|
12 |
+
if isinstance(obj, AgentError):
|
13 |
+
return {"error_type": obj.__class__.__name__, "message": obj.message}
|
14 |
+
else:
|
15 |
+
return str(obj)
|
16 |
+
|
17 |
+
|
18 |
+
def get_image_description(file_name: str, question: str, visual_inspection_tool) -> str:
|
19 |
+
prompt = f"""Write a caption of 5 sentences for this image. Pay special attention to any details that might be useful for someone answering the following question:
|
20 |
+
{question}. But do not try to answer the question directly!
|
21 |
+
Do not add any information that is not present in the image."""
|
22 |
+
return visual_inspection_tool(image_path=file_name, question=prompt)
|
23 |
+
|
24 |
+
|
25 |
+
def get_document_description(file_path: str, question: str, document_inspection_tool) -> str:
|
26 |
+
prompt = f"""Write a caption of 5 sentences for this document. Pay special attention to any details that might be useful for someone answering the following question:
|
27 |
+
{question}. But do not try to answer the question directly!
|
28 |
+
Do not add any information that is not present in the document."""
|
29 |
+
return document_inspection_tool.forward_initial_exam_mode(file_path=file_path, question=prompt)
|
30 |
+
|
31 |
+
|
32 |
+
def get_single_file_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool):
|
33 |
+
file_extension = file_path.split(".")[-1]
|
34 |
+
if file_extension in ["png", "jpg", "jpeg"]:
|
35 |
+
file_description = f" - Attached image: {file_path}"
|
36 |
+
file_description += (
|
37 |
+
f"\n -> Image description: {get_image_description(file_path, question, visual_inspection_tool)}"
|
38 |
+
)
|
39 |
+
return file_description
|
40 |
+
elif file_extension in ["pdf", "xls", "xlsx", "docx", "doc", "xml"]:
|
41 |
+
file_description = f" - Attached document: {file_path}"
|
42 |
+
image_path = file_path.split(".")[0] + ".png"
|
43 |
+
if os.path.exists(image_path):
|
44 |
+
description = get_image_description(image_path, question, visual_inspection_tool)
|
45 |
+
else:
|
46 |
+
description = get_document_description(file_path, question, document_inspection_tool)
|
47 |
+
file_description += f"\n -> File description: {description}"
|
48 |
+
return file_description
|
49 |
+
elif file_extension in ["mp3", "m4a", "wav"]:
|
50 |
+
return f" - Attached audio: {file_path}"
|
51 |
+
else:
|
52 |
+
return f" - Attached file: {file_path}"
|
53 |
+
|
54 |
+
|
55 |
+
def get_zip_description(file_path: str, question: str, visual_inspection_tool, document_inspection_tool):
|
56 |
+
folder_path = file_path.replace(".zip", "")
|
57 |
+
os.makedirs(folder_path, exist_ok=True)
|
58 |
+
shutil.unpack_archive(file_path, folder_path)
|
59 |
+
|
60 |
+
prompt_use_files = ""
|
61 |
+
for root, dirs, files in os.walk(folder_path):
|
62 |
+
for file in files:
|
63 |
+
file_path = os.path.join(root, file)
|
64 |
+
prompt_use_files += "\n" + textwrap.indent(
|
65 |
+
get_single_file_description(file_path, question, visual_inspection_tool, document_inspection_tool),
|
66 |
+
prefix=" ",
|
67 |
+
)
|
68 |
+
return prompt_use_files
|
69 |
+
|
70 |
+
|
71 |
+
def get_tasks_to_run(data, total: int, base_filename: Path, tasks_ids: list[int]):
|
72 |
+
f = base_filename.parent / f"{base_filename.stem}_answers.jsonl"
|
73 |
+
done = set()
|
74 |
+
if f.exists():
|
75 |
+
with open(f, encoding="utf-8") as fh:
|
76 |
+
done = {json.loads(line)["task_id"] for line in fh if line.strip()}
|
77 |
+
|
78 |
+
tasks = []
|
79 |
+
for i in range(total):
|
80 |
+
task_id = int(data[i]["task_id"])
|
81 |
+
if task_id not in done:
|
82 |
+
if tasks_ids is not None:
|
83 |
+
if task_id in tasks_ids:
|
84 |
+
tasks.append(data[i])
|
85 |
+
else:
|
86 |
+
tasks.append(data[i])
|
87 |
+
return tasks
|
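A sketch of how `get_tasks_to_run` supports resumable batch runs: task ids already present in the `<stem>_answers.jsonl` file next to `base_filename` are skipped. The task records and file names below are placeholders:

```python
from pathlib import Path

from scripts.run_agents import get_tasks_to_run

# Each record needs a "task_id" convertible to int; other fields pass through untouched.
data = [
    {"task_id": "1", "question": "placeholder question 1"},
    {"task_id": "2", "question": "placeholder question 2"},
]

base = Path("output/validation/my_run.jsonl")
# Anything already answered in output/validation/my_run_answers.jsonl is skipped,
# so an interrupted run can simply be restarted.
todo = get_tasks_to_run(data, total=len(data), base_filename=base, tasks_ids=None)

for task in todo:
    print("would run:", task["task_id"])
```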
scripts/text_inspector_tool.py
ADDED
@@ -0,0 +1,124 @@
1 |
+
from smolagents import Tool
|
2 |
+
from smolagents.models import Model
|
3 |
+
|
4 |
+
|
5 |
+
class TextInspectorTool(Tool):
|
6 |
+
name = "inspect_file_as_text"
|
7 |
+
description = """
|
8 |
+
You cannot load files yourself: instead call this tool to read a file as markdown text and ask questions about it.
|
9 |
+
This tool handles the following file extensions: [".html", ".htm", ".xlsx", ".pptx", ".wav", ".mp3", ".m4a", ".flac", ".pdf", ".docx"], and all other types of text files. IT DOES NOT HANDLE IMAGES."""
|
10 |
+
|
11 |
+
inputs = {
|
12 |
+
"file_path": {
|
13 |
+
"description": "The path to the file you want to read as text. Must be a '.something' file, like '.pdf'. If it is an image, use the visualizer tool instead! DO NOT use this tool for an HTML webpage: use the web_search tool instead!",
|
14 |
+
"type": "string",
|
15 |
+
},
|
16 |
+
"question": {
|
17 |
+
"description": "[Optional]: Your question, as a natural language sentence. Provide as much context as possible. Do not pass this parameter if you just want to directly return the content of the file.",
|
18 |
+
"type": "string",
|
19 |
+
"nullable": True,
|
20 |
+
},
|
21 |
+
}
|
22 |
+
output_type = "string"
|
23 |
+
|
24 |
+
def __init__(self, model: Model = None, text_limit: int = 100000):
|
25 |
+
super().__init__()
|
26 |
+
self.model = model
|
27 |
+
self.text_limit = text_limit
|
28 |
+
from .mdconvert import MarkdownConverter
|
29 |
+
|
30 |
+
self.md_converter = MarkdownConverter()
|
31 |
+
|
32 |
+
def forward_initial_exam_mode(self, file_path, question):
|
33 |
+
from smolagents.models import MessageRole
|
34 |
+
|
35 |
+
result = self.md_converter.convert(file_path)
|
36 |
+
|
37 |
+
if file_path[-4:] in [".png", ".jpg"]:
|
38 |
+
raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!")
|
39 |
+
|
40 |
+
if ".zip" in file_path:
|
41 |
+
return result.text_content
|
42 |
+
|
43 |
+
if not question:
|
44 |
+
return result.text_content
|
45 |
+
|
46 |
+
if len(result.text_content) < 4000:
|
47 |
+
return "Document content: " + result.text_content
|
48 |
+
|
49 |
+
messages = [
|
50 |
+
{
|
51 |
+
"role": MessageRole.SYSTEM,
|
52 |
+
"content": [
|
53 |
+
{
|
54 |
+
"type": "text",
|
55 |
+
"text": "Here is a file:\n### "
|
56 |
+
+ str(result.title)
|
57 |
+
+ "\n\n"
|
58 |
+
+ result.text_content[: self.text_limit],
|
59 |
+
}
|
60 |
+
],
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"role": MessageRole.USER,
|
64 |
+
"content": [
|
65 |
+
{
|
66 |
+
"type": "text",
|
67 |
+
"text": "Now please write a short, 5 sentence caption for this document, that could help someone asking this question: "
|
68 |
+
+ question
|
69 |
+
+ "\n\nDon't answer the question yourself! Just provide useful notes on the document",
|
70 |
+
}
|
71 |
+
],
|
72 |
+
},
|
73 |
+
]
|
74 |
+
return self.model(messages).content
|
75 |
+
|
76 |
+
def forward(self, file_path, question: str | None = None) -> str:
|
77 |
+
from smolagents.models import MessageRole
|
78 |
+
|
79 |
+
result = self.md_converter.convert(file_path)
|
80 |
+
|
81 |
+
if file_path[-4:] in [".png", ".jpg"]:
|
82 |
+
raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!")
|
83 |
+
|
84 |
+
if ".zip" in file_path:
|
85 |
+
return result.text_content
|
86 |
+
|
87 |
+
if not question:
|
88 |
+
return result.text_content
|
89 |
+
|
90 |
+
messages = [
|
91 |
+
{
|
92 |
+
"role": MessageRole.SYSTEM,
|
93 |
+
"content": [
|
94 |
+
{
|
95 |
+
"type": "text",
|
96 |
+
"text": "You will have to write a short caption for this file, then answer this question:"
|
97 |
+
+ question,
|
98 |
+
}
|
99 |
+
],
|
100 |
+
},
|
101 |
+
{
|
102 |
+
"role": MessageRole.USER,
|
103 |
+
"content": [
|
104 |
+
{
|
105 |
+
"type": "text",
|
106 |
+
"text": "Here is the complete file:\n### "
|
107 |
+
+ str(result.title)
|
108 |
+
+ "\n\n"
|
109 |
+
+ result.text_content[: self.text_limit],
|
110 |
+
}
|
111 |
+
],
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"role": MessageRole.USER,
|
115 |
+
"content": [
|
116 |
+
{
|
117 |
+
"type": "text",
|
118 |
+
"text": "Now answer the question below. Use these three headings: '1. Short answer', '2. Extremely detailed answer', '3. Additional Context on the document and question asked'."
|
119 |
+
+ question,
|
120 |
+
}
|
121 |
+
],
|
122 |
+
},
|
123 |
+
]
|
124 |
+
return self.model(messages).content
|
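A minimal sketch of calling the tool directly, outside an agent. The model id and file path are placeholders; with no `question` the tool returns the raw Markdown conversion, and with a question it asks the model for a structured answer:

```python
from smolagents import LiteLLMModel

from scripts.text_inspector_tool import TextInspectorTool

model = LiteLLMModel(model_id="gpt-4o")                       # placeholder model id
inspector = TextInspectorTool(model=model, text_limit=100000)

# Raw conversion of the file to Markdown text.
text = inspector.forward("report.pdf")                        # placeholder path
print(text[:300])

# Caption-plus-answer mode: the file content and the question go to the model.
answer = inspector.forward("report.pdf", question="What is the main conclusion?")
print(answer)
```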
scripts/text_web_browser.py
ADDED
@@ -0,0 +1,567 @@
# Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
# https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
import mimetypes
import os
import pathlib
import re
import time
import uuid
from typing import Any
from urllib.parse import unquote, urljoin, urlparse

import pathvalidate
import requests
from serpapi import GoogleSearch

from smolagents import Tool

from .cookies import COOKIES
from .mdconvert import FileConversionException, MarkdownConverter, UnsupportedFormatException


class SimpleTextBrowser:
    """(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use."""

    def __init__(
        self,
        start_page: str | None = None,
        viewport_size: int | None = 1024 * 8,
        downloads_folder: str | None = None,
        serpapi_key: str | None = None,
        request_kwargs: dict[str, Any] | None = None,
    ):
        self.start_page: str = start_page if start_page else "about:blank"
        self.viewport_size = viewport_size  # Applies only to the standard uri types
        self.downloads_folder = downloads_folder
        self.history: list[tuple[str, float]] = list()
        self.page_title: str | None = None
        self.viewport_current_page = 0
        self.viewport_pages: list[tuple[int, int]] = list()
        self.set_address(self.start_page)
        self.serpapi_key = serpapi_key
        # Fall back to an empty dict so the shared cookies can always be attached
        self.request_kwargs = request_kwargs if request_kwargs is not None else {}
        self.request_kwargs["cookies"] = COOKIES
        self._mdconvert = MarkdownConverter()
        self._page_content: str = ""

        self._find_on_page_query: str | None = None
        self._find_on_page_last_result: int | None = None  # Location of the last result

    @property
    def address(self) -> str:
        """Return the address of the current page."""
        return self.history[-1][0]

    def set_address(self, uri_or_path: str, filter_year: int | None = None) -> None:
        # TODO: Handle anchors
        self.history.append((uri_or_path, time.time()))

        # Handle special URIs
        if uri_or_path == "about:blank":
            self._set_page_content("")
        elif uri_or_path.startswith("google:"):
            self._serpapi_search(uri_or_path[len("google:") :].strip(), filter_year=filter_year)
        else:
            if (
                not uri_or_path.startswith("http:")
                and not uri_or_path.startswith("https:")
                and not uri_or_path.startswith("file:")
            ):
                if len(self.history) > 1:
                    prior_address = self.history[-2][0]
                    uri_or_path = urljoin(prior_address, uri_or_path)
                    # Update the address with the fully-qualified path
                    self.history[-1] = (uri_or_path, self.history[-1][1])
            self._fetch_page(uri_or_path)

        self.viewport_current_page = 0
        self.find_on_page_query = None
        self.find_on_page_viewport = None

    @property
    def viewport(self) -> str:
        """Return the content of the current viewport."""
        bounds = self.viewport_pages[self.viewport_current_page]
        return self.page_content[bounds[0] : bounds[1]]

    @property
    def page_content(self) -> str:
        """Return the full contents of the current page."""
        return self._page_content

    def _set_page_content(self, content: str) -> None:
        """Sets the text content of the current page."""
        self._page_content = content
        self._split_pages()
        if self.viewport_current_page >= len(self.viewport_pages):
            self.viewport_current_page = len(self.viewport_pages) - 1

    def page_down(self) -> None:
        self.viewport_current_page = min(self.viewport_current_page + 1, len(self.viewport_pages) - 1)

    def page_up(self) -> None:
        self.viewport_current_page = max(self.viewport_current_page - 1, 0)

    def find_on_page(self, query: str) -> str | None:
        """Searches for the query from the current viewport forward, looping back to the start if necessary."""

        # Did we get here via a previous find_on_page search with the same query?
        # If so, map to find_next
        if query == self._find_on_page_query and self.viewport_current_page == self._find_on_page_last_result:
            return self.find_next()

        # OK, it's a new search; start from the current viewport
        self._find_on_page_query = query
        viewport_match = self._find_next_viewport(query, self.viewport_current_page)
        if viewport_match is None:
            self._find_on_page_last_result = None
            return None
        else:
            self.viewport_current_page = viewport_match
            self._find_on_page_last_result = viewport_match
            return self.viewport

    def find_next(self) -> str | None:
        """Scroll to the next viewport that matches the query"""

        if self._find_on_page_query is None:
            return None

        starting_viewport = self._find_on_page_last_result
        if starting_viewport is None:
            starting_viewport = 0
        else:
            starting_viewport += 1
            if starting_viewport >= len(self.viewport_pages):
                starting_viewport = 0

        viewport_match = self._find_next_viewport(self._find_on_page_query, starting_viewport)
        if viewport_match is None:
            self._find_on_page_last_result = None
            return None
        else:
            self.viewport_current_page = viewport_match
            self._find_on_page_last_result = viewport_match
            return self.viewport

    def _find_next_viewport(self, query: str, starting_viewport: int) -> int | None:
        """Search for matches starting from the given viewport, looping back to the beginning when the end is reached."""

        if query is None:
            return None

        # Normalize the query, and convert to a regular expression
        nquery = re.sub(r"\*", "__STAR__", query)
        nquery = " " + (" ".join(re.split(r"\W+", nquery))).strip() + " "
        nquery = nquery.replace(" __STAR__ ", "__STAR__ ")  # Merge isolated stars with prior word
        nquery = nquery.replace("__STAR__", ".*").lower()

        if nquery.strip() == "":
            return None

        idxs = list()
        idxs.extend(range(starting_viewport, len(self.viewport_pages)))
        idxs.extend(range(0, starting_viewport))

        for i in idxs:
            bounds = self.viewport_pages[i]
            content = self.page_content[bounds[0] : bounds[1]]

            # TODO: Remove markdown links and images
            ncontent = " " + (" ".join(re.split(r"\W+", content))).strip().lower() + " "
            if re.search(nquery, ncontent):
                return i

        return None

    def visit_page(self, path_or_uri: str, filter_year: int | None = None) -> str:
        """Update the address, visit the page, and return the content of the viewport."""
        self.set_address(path_or_uri, filter_year=filter_year)
        return self.viewport

    def _split_pages(self) -> None:
        # Do not split search results
        if self.address.startswith("google:"):
            self.viewport_pages = [(0, len(self._page_content))]
            return

        # Handle empty pages
        if len(self._page_content) == 0:
            self.viewport_pages = [(0, 0)]
            return

        # Break the viewport into pages
        self.viewport_pages = []
        start_idx = 0
        while start_idx < len(self._page_content):
            end_idx = min(start_idx + self.viewport_size, len(self._page_content))  # type: ignore[operator]
            # Adjust to end on a space
            while end_idx < len(self._page_content) and self._page_content[end_idx - 1] not in [" ", "\t", "\r", "\n"]:
                end_idx += 1
            self.viewport_pages.append((start_idx, end_idx))
            start_idx = end_idx

    def _serpapi_search(self, query: str, filter_year: int | None = None) -> None:
        if self.serpapi_key is None:
            raise ValueError("Missing SerpAPI key.")

        params = {
            "engine": "google",
            "q": query,
            "api_key": self.serpapi_key,
        }
        if filter_year is not None:
            params["tbs"] = f"cdr:1,cd_min:01/01/{filter_year},cd_max:12/31/{filter_year}"

        search = GoogleSearch(params)
        results = search.get_dict()
        self.page_title = f"{query} - Search"
        if "organic_results" not in results.keys():
            raise Exception(f"No results found for query: '{query}'. Use a less specific query.")
        if len(results["organic_results"]) == 0:
            year_filter_message = f" with filter year={filter_year}" if filter_year is not None else ""
            self._set_page_content(
                f"No results found for '{query}'{year_filter_message}. Try with a more general query, or remove the year filter."
            )
            return

        def _prev_visit(url):
            for i in range(len(self.history) - 1, -1, -1):
                if self.history[i][0] == url:
                    return f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
            return ""

        web_snippets: list[str] = list()
        idx = 0
        if "organic_results" in results:
            for page in results["organic_results"]:
                idx += 1
                date_published = ""
                if "date" in page:
                    date_published = "\nDate published: " + page["date"]

                source = ""
                if "source" in page:
                    source = "\nSource: " + page["source"]

                snippet = ""
                if "snippet" in page:
                    snippet = "\n" + page["snippet"]

                redacted_version = f"{idx}. [{page['title']}]({page['link']}){date_published}{source}\n{_prev_visit(page['link'])}{snippet}"

                redacted_version = redacted_version.replace("Your browser can't play this video.", "")
                web_snippets.append(redacted_version)

        content = (
            f"A Google search for '{query}' found {len(web_snippets)} results:\n\n## Web Results\n"
            + "\n\n".join(web_snippets)
        )

        self._set_page_content(content)

    def _fetch_page(self, url: str) -> None:
        download_path = ""
        try:
            if url.startswith("file://"):
                download_path = os.path.normcase(os.path.normpath(unquote(url[7:])))
                res = self._mdconvert.convert_local(download_path)
                self.page_title = res.title
                self._set_page_content(res.text_content)
            else:
                # Prepare the request parameters
                request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {}
                request_kwargs["stream"] = True

                # Send a HTTP request to the URL
                response = requests.get(url, **request_kwargs)
                response.raise_for_status()

                # If the HTTP request was successful
                content_type = response.headers.get("content-type", "")

                # Text or HTML
                if "text/" in content_type.lower():
                    res = self._mdconvert.convert_response(response)
                    self.page_title = res.title
                    self._set_page_content(res.text_content)
                # A download
                else:
                    # Try producing a safe filename
                    fname = None
                    download_path = None
                    try:
                        fname = pathvalidate.sanitize_filename(os.path.basename(urlparse(url).path)).strip()
                        download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))

                        suffix = 0
                        while os.path.exists(download_path) and suffix < 1000:
                            suffix += 1
                            base, ext = os.path.splitext(fname)
                            new_fname = f"{base}__{suffix}{ext}"
                            download_path = os.path.abspath(os.path.join(self.downloads_folder, new_fname))

                    except NameError:
                        pass

                    # No suitable name, so make one
                    if fname is None:
                        extension = mimetypes.guess_extension(content_type)
                        if extension is None:
                            extension = ".download"
                        fname = str(uuid.uuid4()) + extension
                        download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))

                    # Open a file for writing
                    with open(download_path, "wb") as fh:
                        for chunk in response.iter_content(chunk_size=512):
                            fh.write(chunk)

                    # Render it
                    local_uri = pathlib.Path(download_path).as_uri()
                    self.set_address(local_uri)

        except UnsupportedFormatException as e:
            print(e)
            self.page_title = "Download complete."
            self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
        except FileConversionException as e:
            print(e)
            self.page_title = "Download complete."
            self._set_page_content(f"# Download complete\n\nSaved file to '{download_path}'")
        except FileNotFoundError:
            self.page_title = "Error 404"
            self._set_page_content(f"## Error 404\n\nFile not found: {download_path}")
        except requests.exceptions.RequestException as request_exception:
            try:
                self.page_title = f"Error {response.status_code}"

                # If the error was rendered in HTML we might as well render it
                content_type = response.headers.get("content-type", "")
                if content_type is not None and "text/html" in content_type.lower():
                    res = self._mdconvert.convert(response)
                    self.page_title = f"Error {response.status_code}"
                    self._set_page_content(f"## Error {response.status_code}\n\n{res.text_content}")
                else:
                    text = ""
                    for chunk in response.iter_content(chunk_size=512, decode_unicode=True):
                        text += chunk
                    self.page_title = f"Error {response.status_code}"
                    self._set_page_content(f"## Error {response.status_code}\n\n{text}")
            except NameError:
                self.page_title = "Error"
                self._set_page_content(f"## Error\n\n{str(request_exception)}")

    def _state(self) -> tuple[str, str]:
        header = f"Address: {self.address}\n"
        if self.page_title is not None:
            header += f"Title: {self.page_title}\n"

        current_page = self.viewport_current_page
        total_pages = len(self.viewport_pages)

        address = self.address
        for i in range(len(self.history) - 2, -1, -1):  # Start from the second last
            if self.history[i][0] == address:
                header += f"You previously visited this page {round(time.time() - self.history[i][1])} seconds ago.\n"
                break

        header += f"Viewport position: Showing page {current_page + 1} of {total_pages}.\n"
        return (header, self.viewport)


class SearchInformationTool(Tool):
    name = "web_search"
    description = "Perform a web search query (think a Google search) and return the search results."
    inputs = {"query": {"type": "string", "description": "The web search query to perform."}}
    inputs["filter_year"] = {
        "type": "string",
        "description": "[Optional parameter]: filter the search results to only include pages from a specific year. For example, '2020' will only include pages from 2020. Make sure to use this parameter if you're trying to search for articles from a specific date!",
        "nullable": True,
    }
    output_type = "string"

    def __init__(self, browser):
        super().__init__()
        self.browser = browser

    def forward(self, query: str, filter_year: int | None = None) -> str:
        self.browser.visit_page(f"google: {query}", filter_year=filter_year)
        header, content = self.browser._state()
        return header.strip() + "\n=======================\n" + content


class VisitTool(Tool):
    name = "visit_page"
    description = "Visit a webpage at a given URL and return its text. Given a url to a YouTube video, this returns the transcript."
    inputs = {"url": {"type": "string", "description": "The relative or absolute url of the webpage to visit."}}
    output_type = "string"

    def __init__(self, browser=None):
        super().__init__()
        self.browser = browser

    def forward(self, url: str) -> str:
        self.browser.visit_page(url)
        header, content = self.browser._state()
        return header.strip() + "\n=======================\n" + content


class DownloadTool(Tool):
    name = "download_file"
    description = """
Download a file at a given URL. The file should be of this format: [".xlsx", ".pptx", ".wav", ".mp3", ".m4a", ".png", ".docx"]
After using this tool, for further inspection of this page you should return the download path to your manager via final_answer, and they will be able to inspect it.
DO NOT use this tool for .pdf or .txt or .htm files: for these types of files use visit_page with the file url instead."""
    inputs = {"url": {"type": "string", "description": "The relative or absolute url of the file to be downloaded."}}
    output_type = "string"

    def __init__(self, browser):
        super().__init__()
        self.browser = browser

    def forward(self, url: str) -> str:
        import requests

        if "arxiv" in url:
            url = url.replace("abs", "pdf")
        response = requests.get(url)
        content_type = response.headers.get("content-type", "")
        extension = mimetypes.guess_extension(content_type)
        if extension and isinstance(extension, str):
            new_path = f"./downloads/file{extension}"
        else:
            new_path = "./downloads/file.object"

        with open(new_path, "wb") as f:
            f.write(response.content)

        # Guard against a missing extension before checking the disallowed formats
        if extension and ("pdf" in extension or "txt" in extension or "htm" in extension):
            raise Exception("Do not use this tool for pdf or txt or html files: use visit_page instead.")

        return f"File was downloaded and saved under path {new_path}."


class ArchiveSearchTool(Tool):
    name = "find_archived_url"
    description = "Given a url, searches the Wayback Machine and returns the archived version of the url that's closest in time to the desired date."
    inputs = {
        "url": {"type": "string", "description": "The url you need the archive for."},
        "date": {
            "type": "string",
            "description": "The date that you want to find the archive for. Give this date in the format 'YYYYMMDD', for instance '27 June 2008' is written as '20080627'.",
        },
    }
    output_type = "string"

    def __init__(self, browser=None):
        super().__init__()
        self.browser = browser

    def forward(self, url, date) -> str:
        import requests

        no_timestamp_url = f"https://archive.org/wayback/available?url={url}"
        archive_url = no_timestamp_url + f"&timestamp={date}"
        response = requests.get(archive_url).json()
        response_notimestamp = requests.get(no_timestamp_url).json()
        if "archived_snapshots" in response and "closest" in response["archived_snapshots"]:
            closest = response["archived_snapshots"]["closest"]
            print("Archive found!", closest)

        elif "archived_snapshots" in response_notimestamp and "closest" in response_notimestamp["archived_snapshots"]:
            closest = response_notimestamp["archived_snapshots"]["closest"]
            print("Archive found!", closest)
        else:
            raise Exception(f"Your {url=} was not archived on Wayback Machine, try a different url.")
        target_url = closest["url"]
        self.browser.visit_page(target_url)
        header, content = self.browser._state()
        return (
            f"Web archive for url {url}, snapshot taken at date {closest['timestamp'][:8]}:\n"
            + header.strip()
            + "\n=======================\n"
            + content
        )


class PageUpTool(Tool):
    name = "page_up"
    description = "Scroll the viewport UP one page-length in the current webpage and return the new viewport content."
    inputs = {}
    output_type = "string"

    def __init__(self, browser=None):
        super().__init__()
        self.browser = browser

    def forward(self) -> str:
        self.browser.page_up()
        header, content = self.browser._state()
        return header.strip() + "\n=======================\n" + content


class PageDownTool(Tool):
    name = "page_down"
    description = (
        "Scroll the viewport DOWN one page-length in the current webpage and return the new viewport content."
    )
    inputs = {}
    output_type = "string"

    def __init__(self, browser=None):
        super().__init__()
        self.browser = browser

    def forward(self) -> str:
        self.browser.page_down()
        header, content = self.browser._state()
        return header.strip() + "\n=======================\n" + content


class FinderTool(Tool):
    name = "find_on_page_ctrl_f"
    description = "Scroll the viewport to the first occurrence of the search string. This is equivalent to Ctrl+F."
    inputs = {
        "search_string": {
            "type": "string",
            "description": "The string to search for on the page. This search string supports wildcards like '*'",
        }
    }
    output_type = "string"

    def __init__(self, browser=None):
        super().__init__()
        self.browser = browser

    def forward(self, search_string: str) -> str:
        find_result = self.browser.find_on_page(search_string)
        header, content = self.browser._state()

        if find_result is None:
            return (
                header.strip()
                + f"\n=======================\nThe search string '{search_string}' was not found on this page."
            )
        else:
            return header.strip() + "\n=======================\n" + content


class FindNextTool(Tool):
    name = "find_next"
    description = "Scroll the viewport to the next occurrence of the search string. This is equivalent to finding the next match in a Ctrl+F search."
    inputs = {}
    output_type = "string"

    def __init__(self, browser=None):
        super().__init__()
        self.browser = browser

    def forward(self) -> str:
        find_result = self.browser.find_next()
        header, content = self.browser._state()

        if find_result is None:
            return header.strip() + "\n=======================\nThe search string was not found on this page."
        else:
            return header.strip() + "\n=======================\n" + content
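All of the tools above only wrap state transitions on a shared `SimpleTextBrowser`, so they must be instantiated against the same browser object. The sketch below shows one way to wire them together; it is illustrative only (the repository's actual wiring lives in `agent_open_search.py`), and the `SERPAPI_API_KEY` variable name, the User-Agent string, and the package-style import path are assumptions.

```python
# Illustrative wiring only; see agent_open_search.py for the repository's setup.
import os

from scripts.text_web_browser import (  # assumes the scripts directory is importable as a package
    ArchiveSearchTool,
    FinderTool,
    FindNextTool,
    PageDownTool,
    PageUpTool,
    SearchInformationTool,
    SimpleTextBrowser,
    VisitTool,
)

browser = SimpleTextBrowser(
    viewport_size=1024 * 8,
    downloads_folder="downloads",
    serpapi_key=os.getenv("SERPAPI_API_KEY"),  # assumed environment variable name
    request_kwargs={"headers": {"User-Agent": "Mozilla/5.0 (compatible; research-agent)"}},
)

# Every navigation tool shares the same browser instance, so page_down/find_next
# operate on whatever page web_search or visit_page loaded last.
tools = [
    SearchInformationTool(browser),
    VisitTool(browser),
    PageUpTool(browser),
    PageDownTool(browser),
    FinderTool(browser),
    FindNextTool(browser),
    ArchiveSearchTool(browser),
]

print(tools[0].forward("site:huggingface.co smolagents text browser"))
```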
scripts/visual_qa.py
ADDED
@@ -0,0 +1,313 @@
"""
Visual QA Tool - A tool for answering questions about images

This module provides functionality to analyze images and answer questions about them.
It leverages powerful vision-language models (VLMs) to understand image content and
respond to natural language questions about the images.

The module offers two implementations:
1. VisualQATool class - Uses Hugging Face's IDEFICS-2 model
2. visualizer function - Uses OpenAI's GPT-4o model with vision capabilities

Both implementations handle image loading, processing, and API communication to
provide detailed responses about image content.

Environment variables required:
- OPENAI_API_KEY: API key for OpenAI (for the visualizer function)
"""

import base64
import json
import mimetypes
import os
import uuid
from io import BytesIO

import PIL.Image
import requests
from dotenv import load_dotenv
from huggingface_hub import InferenceClient

from smolagents import Tool, tool


# Load environment variables from .env file
load_dotenv(override=True)


def process_images_and_text(image_path, query, client):
    """
    Process images and text using the IDEFICS-2 model from Hugging Face.

    This function handles the formatting of prompts and images for the IDEFICS-2 model,
    which is a powerful vision-language model capable of understanding images and text.

    Args:
        image_path (str): Path to the image file to analyze
        query (str): The question or instruction about the image
        client (InferenceClient): Hugging Face inference client for the model

    Returns:
        str: The model's response to the query about the image
    """
    from transformers import AutoProcessor

    # Format messages for the chat template
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": query},
            ],
        },
    ]

    # Load the processor for the IDEFICS-2 model
    idefics_processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-chatty")
    prompt_with_template = idefics_processor.apply_chat_template(messages, add_generation_prompt=True)

    # Define a nested function to encode local images
    def encode_local_image(image_path):
        """
        Encode a local image file to a base64 string for API transmission.

        Args:
            image_path (str): Path to the local image file

        Returns:
            str: Base64-encoded image with proper formatting for the API
        """
        # Load image and convert to RGB format
        image = PIL.Image.open(image_path).convert("RGB")

        # Convert the image to a base64 string
        buffer = BytesIO()
        image.save(buffer, format="JPEG")  # Use the appropriate format (e.g., JPEG, PNG)
        base64_image = base64.b64encode(buffer.getvalue()).decode("utf-8")

        # Add string formatting required by the endpoint
        image_string = f"data:image/jpeg;base64,{base64_image}"

        return image_string

    # Encode the image and insert it into the prompt template
    image_string = encode_local_image(image_path)
    prompt_with_images = prompt_with_template.replace("<image>", " ").format(image_string)

    # Prepare the payload for the API request
    payload = {
        "inputs": prompt_with_images,
        "parameters": {
            "return_full_text": False,
            "max_new_tokens": 200,  # Limit response length
        },
    }

    # Send the request to the API and parse the response
    return json.loads(client.post(json=payload).decode())[0]


# Function to encode images for API transmission
def encode_image(image_path):
    """
    Encode an image for API transmission, handling both URLs and local files.

    If the image_path is a URL, the function will download the image first.

    Args:
        image_path (str): Path or URL to the image

    Returns:
        str: Base64-encoded image string
    """
    # Handle URL-based images by downloading them first
    if image_path.startswith("http"):
        # Set up a user agent to avoid being blocked by websites
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
        request_kwargs = {
            "headers": {"User-Agent": user_agent},
            "stream": True,  # Stream the download for large files
        }

        # Send a HTTP request to the URL
        response = requests.get(image_path, **request_kwargs)
        response.raise_for_status()  # Raise an exception for HTTP errors
        content_type = response.headers.get("content-type", "")

        # Determine the file extension from the content type
        extension = mimetypes.guess_extension(content_type)
        if extension is None:
            extension = ".download"  # Default extension if unknown

        # Generate a unique filename and save the downloaded image
        fname = str(uuid.uuid4()) + extension
        download_path = os.path.abspath(os.path.join("downloads", fname))

        with open(download_path, "wb") as fh:
            for chunk in response.iter_content(chunk_size=512):
                fh.write(chunk)

        # Update the image_path to the local downloaded file
        image_path = download_path

    # Encode the local image file to base64
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def resize_image(image_path):
    """
    Resize an image to half its original dimensions.

    This function is used when the original image is too large for the API.

    Args:
        image_path (str): Path to the image file

    Returns:
        str: Path to the resized image
    """
    # Open and get dimensions of the image
    img = PIL.Image.open(image_path)
    width, height = img.size

    # Resize to half the original dimensions
    img = img.resize((int(width / 2), int(height / 2)))

    # Save with a new filename
    new_image_path = f"resized_{image_path}"
    img.save(new_image_path)

    return new_image_path


class VisualQATool(Tool):
    """
    A tool that can answer questions about images using the IDEFICS-2 model.

    This class implements the Tool interface from smolagents and provides
    functionality to analyze images and answer questions about them.
    """

    name = "visualizer"
    description = "A tool that can answer questions about attached images."
    inputs = {
        "image_path": {
            "description": "The path to the image on which to answer the question",
            "type": "string",
        },
        "question": {"description": "the question to answer", "type": "string", "nullable": True},
    }
    output_type = "string"

    # Initialize the Hugging Face inference client for IDEFICS-2
    client = InferenceClient("HuggingFaceM4/idefics2-8b-chatty")

    def forward(self, image_path: str, question: str | None = None) -> str:
        """
        Process an image and answer a question about it.

        If no question is provided, the function will generate a detailed caption.

        Args:
            image_path (str): Path to the image file
            question (str, optional): Question to answer about the image

        Returns:
            str: Answer to the question or a caption for the image
        """
        output = ""
        add_note = False

        # If no question is provided, default to generating a caption
        if not question:
            add_note = True
            question = "Please write a detailed caption for this image."

        try:
            # Try to process the image and question
            output = process_images_and_text(image_path, question, self.client)
        except Exception as e:
            print(e)
            # If the image is too large, resize it and try again
            if "Payload Too Large" in str(e):
                new_image_path = resize_image(image_path)
                output = process_images_and_text(new_image_path, question, self.client)

        # Add a note if we generated a caption instead of answering a question
        if add_note:
            output = (
                f"You did not provide a particular question, so here is a detailed caption for the image: {output}"
            )

        return output


@tool
def visualizer(image_path: str, question: str | None = None) -> str:
    """
    A tool that can answer questions about attached images using OpenAI's GPT-4o model.

    This function provides an alternative implementation using OpenAI's vision capabilities
    instead of the Hugging Face model used in VisualQATool.

    Args:
        image_path: The path to the image on which to answer the question. This should be a local path to the downloaded image.
        question: The question to answer.

    Returns:
        str: Answer to the question or a caption for the image
    """
    import mimetypes
    import os

    import requests

    from .visual_qa import encode_image

    # If no question is provided, default to generating a caption
    add_note = False
    if not question:
        add_note = True
        question = "Please write a detailed caption for this image."

    # Validate input
    if not isinstance(image_path, str):
        raise Exception("You should provide at least `image_path` string argument to this tool!")

    # Determine the MIME type and encode the image
    mime_type, _ = mimetypes.guess_type(image_path)
    base64_image = encode_image(image_path)

    # Prepare the payload for the OpenAI API request
    payload = {
        "model": "gpt-4o",  # Using GPT-4o with vision capabilities
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_image}"}},
                ],
            }
        ],
        "max_tokens": 1000,  # Limit response length
    }

    # Set up headers with API key
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"}

    # Send the request to the OpenAI API
    response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)

    # Parse the response
    try:
        output = response.json()["choices"][0]["message"]["content"]
    except Exception:
        raise Exception(f"Response format unexpected: {response.json()}")

    # Add a note if we generated a caption instead of answering a question
    if add_note:
        output = f"You did not provide a particular question, so here is a detailed caption for the image: {output}"

    return output
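Both entry points answer the same kind of question, so they can be swapped without changing the calling agent. A minimal usage sketch, assuming a local image file and, for the GPT-4o path, an `OPENAI_API_KEY` in the environment:

```python
# Minimal usage sketch; the image path is a hypothetical placeholder.
from scripts.visual_qa import VisualQATool, visualizer

image = "downloads/sample_chart.png"  # placeholder local file
question = "What does the y-axis measure?"

# IDEFICS-2 through the Hugging Face Inference API:
idefics_tool = VisualQATool()
print(idefics_tool.forward(image_path=image, question=question))

# GPT-4o through the OpenAI API (reads OPENAI_API_KEY from the environment):
print(visualizer(image_path=image, question=question))
```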