timeki commited on
Commit
2619e14
·
1 Parent(s): 4e4a906

Merge remote-tracking branch 'origin/feature/improve_parsing_and_retrieval' into pr/20

Browse files
.gitignore CHANGED
@@ -12,7 +12,9 @@ notebooks/
12
  data/
13
  sandbox/
14
 
 
15
  *.db
16
- .vscode/
 
 
17
  *old/
18
- data_ingestion/
 
12
  data/
13
  sandbox/
14
 
15
+ climateqa/talk_to_data/database/
16
  *.db
17
+
18
+ data_ingestion/
19
+ .vscode
20
  *old/
 
app.py CHANGED
@@ -64,7 +64,7 @@ user_id = create_user_id()
64
  embeddings_function = get_embeddings_function()
65
  vectorstore = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv("PINECONE_API_INDEX"))
66
  vectorstore_graphs = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv("PINECONE_API_INDEX_OWID"), text_key="description")
67
- vectorstore_region = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv("PINECONE_API_INDEX_REGION"))
68
 
69
  llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
70
  if os.getenv("ENV")=="GRADIO_ENV":
@@ -73,7 +73,7 @@ else:
73
  reranker = get_reranker("large")
74
 
75
  agent = make_graph_agent(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, vectorstore_region = vectorstore_region, reranker=reranker, threshold_docs=0.2)
76
- agent_poc = make_graph_agent_poc(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, vectorstore_region = vectorstore_region, reranker=reranker, threshold_docs=0)#TODO put back default 0.2
77
 
78
 
79
  async def chat(query, history, audience, sources, reports, relevant_content_sources_selection, search_only):
@@ -268,7 +268,6 @@ def event_handling(
268
  for component in [textbox, examples_hidden]:
269
  component.submit(find_papers, [component, after, dropdown_external_sources], [papers_html, citations_network, papers_summary])
270
 
271
-
272
 
273
 
274
  def main_ui():
 
64
  embeddings_function = get_embeddings_function()
65
  vectorstore = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv("PINECONE_API_INDEX"))
66
  vectorstore_graphs = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv("PINECONE_API_INDEX_OWID"), text_key="description")
67
+ vectorstore_region = get_pinecone_vectorstore(embeddings_function, index_name=os.getenv("PINECONE_API_INDEX_LOCAL_V2"))
68
 
69
  llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)
70
  if os.getenv("ENV")=="GRADIO_ENV":
 
73
  reranker = get_reranker("large")
74
 
75
  agent = make_graph_agent(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, vectorstore_region = vectorstore_region, reranker=reranker, threshold_docs=0.2)
76
+ agent_poc = make_graph_agent_poc(llm=llm, vectorstore_ipcc=vectorstore, vectorstore_graphs=vectorstore_graphs, vectorstore_region = vectorstore_region, reranker=reranker, threshold_docs=0, version="v4")#TODO put back default 0.2
77
 
78
 
79
  async def chat(query, history, audience, sources, reports, relevant_content_sources_selection, search_only):
 
268
  for component in [textbox, examples_hidden]:
269
  component.submit(find_papers, [component, after, dropdown_external_sources], [papers_html, citations_network, papers_summary])
270
 
 
271
 
272
 
273
  def main_ui():
climateqa/engine/chains/prompts.py CHANGED
@@ -66,10 +66,11 @@ You are ClimateQ&A, an AI Assistant created by Ekimetrics. You are given a quest
66
  Guidelines:
67
  - If the passages have useful facts or numbers, use them in your answer.
68
  - When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.
69
- - You will receive passages from different reports, eg IPCC and PPCP, make separate paragraphs and specify the source of the information in your answer, eg "According to IPCC, ...".
70
- - The different sources are IPCC, IPBES, PPCP (for Plan Climat Air Energie Territorial de Paris), PBDP (for Plan Biodiversité de Paris), Acclimaterra.
 
71
  - Do not mention that you are using specific extract documents, but mention only the source information. "According to IPCC, ..." rather than "According to the provided document from IPCC ..."
72
- - Make a clear distinction between information from IPCC, IPBES, Acclimaterra that are scientific reports and PPCP, PBDP that are strategic reports. Strategic reports should not be taken has verified facts, but as political or strategic decisions.
73
  - If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
74
  - Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
75
  - If it makes sense, use bullet points and lists to make your answers easier to understand.
@@ -197,4 +198,54 @@ Graphs and their HTML embedding:
197
  {format_instructions}
198
 
199
  Output the result as json with a key "graphs" containing a list of dictionaries of the relevant graphs with keys 'embedding', 'category', and 'source'. Do not modify the graph HTML embedding, the category or the source. Do not put any message or text before or after the JSON output.
200
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  Guidelines:
67
  - If the passages have useful facts or numbers, use them in your answer.
68
  - When you use information from a passage, mention where it came from by using [Doc i] at the end of the sentence. i stands for the number of the document.
69
+ - You will receive passages from different reports, e.g., IPCC and PPCP. Make separate paragraphs and specify the source of the information in your answer, e.g., "According to IPCC, ...".
70
+ - The different sources are IPCC, IPBES, PPCP (for Plan Climat Air Energie Territorial de Paris), PBDP (for Plan Biodiversité de Paris), Acclimaterra (Rapport scientifique de la région Nouvelle Aquitaine en France).
71
+ - If the reports are local (like PPCP, PBDP, Acclimaterra), consider that the information is specific to the region and not global. If the document is about a nearby region (for example, an extract from Acclimaterra for a question about Britain), explicitly state the concerned region.
72
  - Do not mention that you are using specific extract documents, but mention only the source information. "According to IPCC, ..." rather than "According to the provided document from IPCC ..."
73
+ - Make a clear distinction between information from IPCC, IPBES, Acclimaterra that are scientific reports and PPCP, PBDP that are strategic reports. Strategic reports should not be taken as verified facts, but as political or strategic decisions.
74
  - If the same thing is said in more than one document, you can mention all of them like this: [Doc i, Doc j, Doc k]
75
  - Do not just summarize each passage one by one. Group your summaries to highlight the key parts in the explanation.
76
  - If it makes sense, use bullet points and lists to make your answers easier to understand.
 
198
  {format_instructions}
199
 
200
  Output the result as json with a key "graphs" containing a list of dictionaries of the relevant graphs with keys 'embedding', 'category', and 'source'. Do not modify the graph HTML embedding, the category or the source. Do not put any message or text before or after the JSON output.
201
+ """
202
+
203
+ retrieve_chapter_prompt_template = """Given the user question and a list of documents with their table of contents, retrieve the 5 most relevant level 0 chapters which could help to answer to the question while taking account their sub-chapters.
204
+
205
+ The table of contents is structured like that :
206
+ {{
207
+ "level": 0,
208
+ "Chapter 1": {{}},
209
+ "Chapter 2" : {{
210
+ "level": 1,
211
+ "Chapter 2.1": {{
212
+ ...
213
+ }}
214
+ }},
215
+ }}
216
+
217
+ Here level is the level of the chapter. For example, Chapter 1 and Chapter 2 are at level 0, and Chapter 2.1 is at level 1.
218
+
219
+ ### Guidelines ###
220
+ - Keep all the list of documents that is given to you
221
+ - Each chapter must keep **EXACTLY** its assigned level in the table of contents. **DO NOT MODIFY THE LEVELS. **
222
+ - Check systematically the level of a chapter before including it in the answer.
223
+ - Return **valid JSON** result.
224
+
225
+ --------------------
226
+ User question :
227
+ {query}
228
+
229
+ List of documents with their table of contents :
230
+ {doc_list}
231
+
232
+ --------------------
233
+
234
+ Return a JSON result with a list of relevant chapters with the following keys **WITHOUT** the json markdown indicator ```json at the beginning:
235
+ - "document" : the document in which we can find the chapter
236
+ - "chapter" : the title of the chapter
237
+
238
+ **IMPORTANT : Make sure that the levels of the answer are exactly the same as the ones in the table of contents**
239
+
240
+ Example of a JSON response:
241
+ [
242
+ {{
243
+ "document": "Document A",
244
+ "chapter": "Chapter 1",
245
+ }},
246
+ {{
247
+ "document": "Document B",
248
+ "chapter": "Chapter 5",
249
+ }}
250
+ ]
251
+ """
climateqa/engine/chains/query_transformation.py CHANGED
@@ -293,6 +293,8 @@ def make_query_transform_node(llm,k_final=15):
293
  "n_questions":n_questions,
294
  "handled_questions_index":[],
295
  }
 
 
296
  return new_state
297
 
298
  return transform_query
 
293
  "n_questions":n_questions,
294
  "handled_questions_index":[],
295
  }
296
+ print("New questions")
297
+ print(new_questions)
298
  return new_state
299
 
300
  return transform_query
climateqa/engine/chains/retrieve_documents.py CHANGED
@@ -15,6 +15,14 @@ from ..utils import log_event
15
  from langchain_core.vectorstores import VectorStore
16
  from typing import List
17
  from langchain_core.documents.base import Document
 
 
 
 
 
 
 
 
18
  import asyncio
19
 
20
  from typing import Any, Dict, List, Tuple
@@ -119,6 +127,21 @@ def remove_duplicates_chunks(docs):
119
  result.append(doc)
120
  return result
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  async def get_POC_relevant_documents(
123
  query: str,
124
  vectorstore:VectorStore,
@@ -169,6 +192,86 @@ async def get_POC_relevant_documents(
169
  "docs_question" : docs_question,
170
  "docs_images" : docs_images
171
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
172
 
173
 
174
  async def get_IPCC_relevant_documents(
@@ -271,6 +374,7 @@ def concatenate_documents(index, source_type, docs_question_dict, k_by_question,
271
  return docs_question, images_question
272
 
273
 
 
274
  # The chain callback is not necessary, but it propagates the langchain callbacks to the astream_events logger to display intermediate results
275
  # @chain
276
  async def retrieve_documents(
@@ -279,6 +383,7 @@ async def retrieve_documents(
279
  source_type: str,
280
  vectorstore: VectorStore,
281
  reranker: Any,
 
282
  search_figures: bool = False,
283
  search_only: bool = False,
284
  reports: list = [],
@@ -286,7 +391,9 @@ async def retrieve_documents(
286
  k_images_by_question: int = 5,
287
  k_before_reranking: int = 100,
288
  k_by_question: int = 5,
289
- k_summary_by_question: int = 3
 
 
290
  ) -> Tuple[List[Document], List[Document]]:
291
  """
292
  Unpack the first question of the remaining questions, and retrieve and rerank corresponding documents, based on the question and selected_sources
@@ -316,6 +423,7 @@ async def retrieve_documents(
316
 
317
  print(f"""---- Retrieve documents from {current_question["source_type"]}----""")
318
 
 
319
  if source_type == "IPx":
320
  docs_question_dict = await get_IPCC_relevant_documents(
321
  query = question,
@@ -331,19 +439,36 @@ async def retrieve_documents(
331
  reports = reports,
332
  )
333
 
334
- if source_type == "POC":
335
- docs_question_dict = await get_POC_relevant_documents(
336
- query = question,
337
- vectorstore=vectorstore,
338
- search_figures = search_figures,
339
- sources = sources,
340
- threshold = 0.5,
341
- search_only = search_only,
342
- reports = reports,
343
- min_size= 200,
344
- k_documents= k_before_reranking,
345
- k_images= k_by_question
346
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
 
348
  # Rerank
349
  if reranker is not None and rerank_by_question:
@@ -369,24 +494,44 @@ async def retrieve_documents(
369
  return docs_question, images_question
370
 
371
 
372
- async def retrieve_documents_for_all_questions(state, config, source_type, to_handle_questions_index, vectorstore, reranker, rerank_by_question=True, k_final=15, k_before_reranking=100):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
373
  """
374
  Retrieve documents in parallel for all questions.
375
  """
376
  # to_handle_questions_index = [x for x in state["questions_list"] if x["source_type"] == "IPx"]
377
 
378
  # TODO split les questions selon le type de sources dans le state question + conditions sur le nombre de questions traités par type de source
379
- docs = state.get("documents", [])
380
- related_content = state.get("related_content", [])
381
- search_figures = "Figures (IPCC/IPBES)" in state["relevant_content_sources_selection"]
382
- search_only = state["search_only"]
383
- reports = state["reports"]
384
-
385
- k_by_question = k_final // state["n_questions"]["total"]
386
- k_summary_by_question = _get_k_summary_by_question(state["n_questions"]["total"])
387
- k_images_by_question = _get_k_images_by_question(state["n_questions"]["total"])
 
 
388
  k_before_reranking=100
389
 
 
390
  tasks = [
391
  retrieve_documents(
392
  current_question=question,
@@ -401,9 +546,12 @@ async def retrieve_documents_for_all_questions(state, config, source_type, to_ha
401
  k_images_by_question=k_images_by_question,
402
  k_before_reranking=k_before_reranking,
403
  k_by_question=k_by_question,
404
- k_summary_by_question=k_summary_by_question
 
 
 
405
  )
406
- for i, question in enumerate(state["questions_list"]) if i in to_handle_questions_index
407
  ]
408
  results = await asyncio.gather(*tasks)
409
  # Combine results
@@ -413,16 +561,50 @@ async def retrieve_documents_for_all_questions(state, config, source_type, to_ha
413
  new_state["related_contents"].extend(images_question)
414
  return new_state
415
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
416
  def make_IPx_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
417
 
418
  async def retrieve_IPx_docs(state, config):
419
  source_type = "IPx"
420
  IPx_questions_index = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "IPx"]
421
 
422
- # return {"documents":[], "related_contents": [], "handled_questions_index": list(range(len(state["questions_list"])))} # TODO Remove
423
-
 
 
 
 
424
  state = await retrieve_documents_for_all_questions(
425
- state=state,
 
 
 
 
426
  config=config,
427
  source_type=source_type,
428
  to_handle_questions_index=IPx_questions_index,
@@ -446,8 +628,18 @@ def make_POC_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_
446
  source_type = "POC"
447
  POC_questions_index = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "POC"]
448
 
 
 
 
 
 
 
449
  state = await retrieve_documents_for_all_questions(
450
- state=state,
 
 
 
 
451
  config=config,
452
  source_type=source_type,
453
  to_handle_questions_index=POC_questions_index,
@@ -462,4 +654,56 @@ def make_POC_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_
462
  return retrieve_POC_docs_node
463
 
464
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
465
 
 
15
  from langchain_core.vectorstores import VectorStore
16
  from typing import List
17
  from langchain_core.documents.base import Document
18
+ from ..llm import get_llm
19
+ from .prompts import retrieve_chapter_prompt_template
20
+ from langchain_core.prompts import ChatPromptTemplate
21
+ from langchain_core.output_parsers import StrOutputParser
22
+ from ..vectorstore import get_pinecone_vectorstore
23
+ from ..embeddings import get_embeddings_function
24
+
25
+
26
  import asyncio
27
 
28
  from typing import Any, Dict, List, Tuple
 
127
  result.append(doc)
128
  return result
129
 
130
def get_ToCs(version: str, index_name: str = "climateqa-v2") -> list:
    """Fetch the table-of-contents chunks for a given parsing version.

    Args:
        version: version tag of the parsed documents (e.g. "v4").
        index_name: Pinecone index to query. Defaults to "climateqa-v2",
            which was previously hard-coded; parameterizing keeps callers
            working while allowing other indexes.

    Returns:
        Deduplicated list of ToC chunks as returned by
        ``similarity_search_with_score`` — i.e. (Document, score) tuples.
        NOTE(review): ``remove_duplicates_chunks`` receives these tuples,
        not bare Documents — confirm it handles tuple elements.
    """
    # Filter selects only the ToC chunks of the requested parsing version.
    filters_text = {
        "chunk_type": "toc",
        "version": version,
    }
    embeddings_function = get_embeddings_function()
    vectorstore = get_pinecone_vectorstore(embeddings_function, index_name=index_name)
    # Empty query: retrieval is driven purely by the metadata filter.
    tocs = vectorstore.similarity_search_with_score(query="", filter=filters_text)

    # remove duplicates or almost duplicates
    tocs = remove_duplicates_chunks(tocs)

    return tocs
144
+
145
  async def get_POC_relevant_documents(
146
  query: str,
147
  vectorstore:VectorStore,
 
192
  "docs_question" : docs_question,
193
  "docs_images" : docs_images
194
  }
195
+
196
+ async def get_POC_documents_by_ToC_relevant_documents(
197
+ query: str,
198
+ tocs: list,
199
+ vectorstore:VectorStore,
200
+ version: str,
201
+ sources:list = ["Acclimaterra","PCAET","Plan Biodiversite"],
202
+ search_figures:bool = False,
203
+ search_only:bool = False,
204
+ k_documents:int = 10,
205
+ threshold:float = 0.6,
206
+ k_images: int = 5,
207
+ reports:list = [],
208
+ min_size:int = 200,
209
+ proportion: float = 0.5,
210
+ ) :
211
+ """
212
+ Args:
213
+ - tocs : list with the table of contents of each document
214
+ - version : version of the parsed documents (e.g. "v4")
215
+ - proportion : share of documents retrieved using ToCs
216
+ """
217
+ # Prepare base search kwargs
218
+ filters = {}
219
+ docs_question = []
220
+ docs_images = []
221
+
222
+ # TODO add source selection
223
+ # if len(reports) > 0:
224
+ # filters["short_name"] = {"$in":reports}
225
+ # else:
226
+ # filters["source"] = { "$in": sources}
227
+
228
+ k_documents_toc = round(k_documents * proportion)
229
+
230
+ relevant_tocs = await get_relevant_toc_level_for_query(query, tocs)
231
+
232
+ print(f"Relevant ToCs : {relevant_tocs}")
233
+ # Transform the ToC dict {"document": str, "chapter": str} into a list of string
234
+ toc_filters = [toc['chapter'] for toc in relevant_tocs]
235
+
236
+ filters_text_toc = {
237
+ **filters,
238
+ "chunk_type":"text",
239
+ "toc_level0": {"$in": toc_filters},
240
+ "version": version
241
+ # "report_type": {}, # TODO to be completed to choose the right documents / chapters according to the analysis of the question
242
+ }
243
+
244
+ docs_question = vectorstore.similarity_search_with_score(query=query,filter = filters_text_toc,k = k_documents_toc)
245
+
246
+ filters_text = {
247
+ **filters,
248
+ "chunk_type":"text",
249
+ "version": version
250
+ # "report_type": {}, # TODO to be completed to choose the right documents / chapters according to the analysis of the question
251
+ }
252
+
253
+ docs_question += vectorstore.similarity_search_with_score(query=query,filter = filters_text,k = k_documents - k_documents_toc)
254
+
255
+ # remove duplicates or almost duplicates
256
+ docs_question = remove_duplicates_chunks(docs_question)
257
+ docs_question = [x for x in docs_question if x[1] > threshold]
258
+
259
+ if search_figures:
260
+ # Images
261
+ filters_image = {
262
+ **filters,
263
+ "chunk_type":"image"
264
+ }
265
+ docs_images = vectorstore.similarity_search_with_score(query=query,filter = filters_image,k = k_images)
266
+
267
+ docs_question, docs_images = _add_metadata_and_score(docs_question), _add_metadata_and_score(docs_images)
268
+
269
+ docs_question = [x for x in docs_question if len(x.page_content) > min_size]
270
+
271
+ return {
272
+ "docs_question" : docs_question,
273
+ "docs_images" : docs_images
274
+ }
275
 
276
 
277
  async def get_IPCC_relevant_documents(
 
374
  return docs_question, images_question
375
 
376
 
377
+
378
  # The chain callback is not necessary, but it propagates the langchain callbacks to the astream_events logger to display intermediate results
379
  # @chain
380
  async def retrieve_documents(
 
383
  source_type: str,
384
  vectorstore: VectorStore,
385
  reranker: Any,
386
+ version: str = "",
387
  search_figures: bool = False,
388
  search_only: bool = False,
389
  reports: list = [],
 
391
  k_images_by_question: int = 5,
392
  k_before_reranking: int = 100,
393
  k_by_question: int = 5,
394
+ k_summary_by_question: int = 3,
395
+ tocs: list = [],
396
+ by_toc=False
397
  ) -> Tuple[List[Document], List[Document]]:
398
  """
399
  Unpack the first question of the remaining questions, and retrieve and rerank corresponding documents, based on the question and selected_sources
 
423
 
424
  print(f"""---- Retrieve documents from {current_question["source_type"]}----""")
425
 
426
+
427
  if source_type == "IPx":
428
  docs_question_dict = await get_IPCC_relevant_documents(
429
  query = question,
 
439
  reports = reports,
440
  )
441
 
442
+ if source_type == 'POC':
443
+ if by_toc == True:
444
+ print("---- Retrieve documents by ToC----")
445
+ docs_question_dict = await get_POC_documents_by_ToC_relevant_documents(
446
+ query=question,
447
+ tocs = tocs,
448
+ vectorstore=vectorstore,
449
+ version=version,
450
+ search_figures = search_figures,
451
+ sources = sources,
452
+ threshold = 0.5,
453
+ search_only = search_only,
454
+ reports = reports,
455
+ min_size= 200,
456
+ k_documents= k_before_reranking,
457
+ k_images= k_by_question
458
+ )
459
+ else :
460
+ docs_question_dict = await get_POC_relevant_documents(
461
+ query = question,
462
+ vectorstore=vectorstore,
463
+ search_figures = search_figures,
464
+ sources = sources,
465
+ threshold = 0.5,
466
+ search_only = search_only,
467
+ reports = reports,
468
+ min_size= 200,
469
+ k_documents= k_before_reranking,
470
+ k_images= k_by_question
471
+ )
472
 
473
  # Rerank
474
  if reranker is not None and rerank_by_question:
 
494
  return docs_question, images_question
495
 
496
 
497
+ async def retrieve_documents_for_all_questions(
498
+ search_figures,
499
+ search_only,
500
+ reports,
501
+ questions_list,
502
+ n_questions,
503
+ config,
504
+ source_type,
505
+ to_handle_questions_index,
506
+ vectorstore,
507
+ reranker,
508
+ rerank_by_question=True,
509
+ k_final=15,
510
+ k_before_reranking=100,
511
+ version: str = "",
512
+ tocs: list[dict] = [],
513
+ by_toc: bool = False
514
+ ):
515
  """
516
  Retrieve documents in parallel for all questions.
517
  """
518
  # to_handle_questions_index = [x for x in state["questions_list"] if x["source_type"] == "IPx"]
519
 
520
  # TODO split les questions selon le type de sources dans le state question + conditions sur le nombre de questions traités par type de source
521
+ # search_figures = "Figures (IPCC/IPBES)" in state["relevant_content_sources_selection"]
522
+ # search_only = state["search_only"]
523
+ # reports = state["reports"]
524
+ # questions_list = state["questions_list"]
525
+
526
+ # k_by_question = k_final // state["n_questions"]["total"]
527
+ # k_summary_by_question = _get_k_summary_by_question(state["n_questions"]["total"])
528
+ # k_images_by_question = _get_k_images_by_question(state["n_questions"]["total"])
529
+ k_by_question = k_final // n_questions
530
+ k_summary_by_question = _get_k_summary_by_question(n_questions)
531
+ k_images_by_question = _get_k_images_by_question(n_questions)
532
  k_before_reranking=100
533
 
534
+ print(f"Source type here is {source_type}")
535
  tasks = [
536
  retrieve_documents(
537
  current_question=question,
 
546
  k_images_by_question=k_images_by_question,
547
  k_before_reranking=k_before_reranking,
548
  k_by_question=k_by_question,
549
+ k_summary_by_question=k_summary_by_question,
550
+ tocs=tocs,
551
+ version=version,
552
+ by_toc=by_toc
553
  )
554
+ for i, question in enumerate(questions_list) if i in to_handle_questions_index
555
  ]
556
  results = await asyncio.gather(*tasks)
557
  # Combine results
 
561
  new_state["related_contents"].extend(images_question)
562
  return new_state
563
 
564
# ToC Retriever
async def get_relevant_toc_level_for_query(
    query: str,
    tocs: list[Document],
    ) -> list[dict] :
    """Ask an LLM which level-0 chapters of the given ToCs are relevant to the query.

    Args:
        query: the user question.
        tocs: ToC chunks as (Document, score) tuples; each Document's
            page_content is a table of contents and its metadata carries
            the source document's ``name``.

    Returns:
        List of dicts with keys "document" and "chapter"; empty list when
        the LLM output cannot be parsed (previously this path raised a
        NameError because ``relevant_tocs`` was never bound).
    """
    import ast

    doc_list = []
    for doc in tocs:
        doc_name = doc[0].metadata['name']
        toc = doc[0].page_content
        doc_list.append({'document': doc_name, 'toc': toc})

    llm = get_llm(provider="openai",max_tokens = 1024,temperature = 0.0)

    prompt = ChatPromptTemplate.from_template(retrieve_chapter_prompt_template)
    chain = prompt | llm | StrOutputParser()
    response = chain.invoke({"query": query, "doc_list": doc_list})

    # Default so a parse failure returns an empty result instead of crashing.
    relevant_tocs: list[dict] = []
    try:
        # ast.literal_eval instead of eval: parses the same literal list/dict
        # structures (including trailing commas, as in the prompt's example)
        # without executing arbitrary code from the LLM response.
        relevant_tocs = ast.literal_eval(response)
    except Exception as e:
        print(f" Failed to parse the result because of : {e}")

    return relevant_tocs
588
+
589
+
590
  def make_IPx_retriever_node(vectorstore,reranker,llm,rerank_by_question=True, k_final=15, k_before_reranking=100, k_summary=5):
591
 
592
  async def retrieve_IPx_docs(state, config):
593
  source_type = "IPx"
594
  IPx_questions_index = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "IPx"]
595
 
596
+ search_figures = "Figures (IPCC/IPBES)" in state["relevant_content_sources_selection"]
597
+ search_only = state["search_only"]
598
+ reports = state["reports"]
599
+ questions_list = state["questions_list"]
600
+ n_questions=state["n_questions"]["total"]
601
+
602
  state = await retrieve_documents_for_all_questions(
603
+ search_figures=search_figures,
604
+ search_only=search_only,
605
+ reports=reports,
606
+ questions_list=questions_list,
607
+ n_questions=n_questions,
608
  config=config,
609
  source_type=source_type,
610
  to_handle_questions_index=IPx_questions_index,
 
628
  source_type = "POC"
629
  POC_questions_index = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "POC"]
630
 
631
+ search_figures = "Figures (IPCC/IPBES)" in state["relevant_content_sources_selection"]
632
+ search_only = state["search_only"]
633
+ reports = state["reports"]
634
+ questions_list = state["questions_list"]
635
+ n_questions=state["n_questions"]["total"]
636
+
637
  state = await retrieve_documents_for_all_questions(
638
+ search_figures=search_figures,
639
+ search_only=search_only,
640
+ reports=reports,
641
+ questions_list=questions_list,
642
+ n_questions=n_questions,
643
  config=config,
644
  source_type=source_type,
645
  to_handle_questions_index=POC_questions_index,
 
654
  return retrieve_POC_docs_node
655
 
656
 
657
def make_POC_by_ToC_retriever_node(
    vectorstore: VectorStore,
    reranker,
    llm,
    version: str = "",
    rerank_by_question=True,
    k_final=15,
    k_before_reranking=100,
    k_summary=5,
    ):
    """Build a graph node retrieving POC documents guided by tables of contents.

    Args:
        vectorstore: vector store holding the parsed POC chunks.
        reranker: reranker passed through to document retrieval.
        llm: kept for signature parity with the other retriever factories
            (not used directly here).
        version: version tag of the parsed documents (e.g. "v4").
        rerank_by_question / k_final / k_before_reranking: forwarded to
            ``retrieve_documents_for_all_questions``.
        k_summary: kept for signature parity (not used directly here).

    Returns:
        An async node function ``retrieve_POC_docs_node(state, config)``.
    """

    async def retrieve_POC_docs_node(state, config):
        # Skip entirely unless the user selected the POC regional sources.
        if "POC region" not in state["relevant_content_sources_selection"] :
            return {}

        search_figures = "Figures (IPCC/IPBES)" in state["relevant_content_sources_selection"]
        # Fix: the original assigned search_only twice; one assignment suffices.
        search_only = state["search_only"]
        reports = state["reports"]
        questions_list = state["questions_list"]
        n_questions = state["n_questions"]["total"]

        # Fetch the ToC chunks once per node invocation; they are shared
        # across all questions handled below.
        tocs = get_ToCs(version=version)

        source_type = "POC"
        POC_questions_index = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "POC"]

        state = await retrieve_documents_for_all_questions(
            search_figures=search_figures,
            search_only=search_only,
            config=config,
            reports=reports,
            questions_list=questions_list,
            n_questions=n_questions,
            source_type=source_type,
            to_handle_questions_index=POC_questions_index,
            vectorstore=vectorstore,
            reranker=reranker,
            rerank_by_question=rerank_by_question,
            k_final=k_final,
            k_before_reranking=k_before_reranking,
            tocs=tocs,
            version=version,
            by_toc=True
        )
        return state

    return retrieve_POC_docs_node
705
+
706
+
707
+
708
+
709
 
climateqa/engine/graph.py CHANGED
@@ -11,7 +11,7 @@ from typing import List, Dict
11
 
12
  import operator
13
  from typing import Annotated
14
-
15
  from IPython.display import display, HTML, Image
16
 
17
  from .chains.answer_chitchat import make_chitchat_node
@@ -19,7 +19,7 @@ from .chains.answer_ai_impact import make_ai_impact_node
19
  from .chains.query_transformation import make_query_transform_node
20
  from .chains.translation import make_translation_node
21
  from .chains.intent_categorization import make_intent_categorization_node
22
- from .chains.retrieve_documents import make_IPx_retriever_node, make_POC_retriever_node
23
  from .chains.answer_rag import make_rag_node
24
  from .chains.graph_retriever import make_graph_retriever_node
25
  from .chains.chitchat_categorization import make_chitchat_intent_categorization_node
@@ -39,14 +39,14 @@ class GraphState(TypedDict):
39
  n_questions : int
40
  answer: str
41
  audience: str = "experts"
42
- sources_input: List[str] = ["IPCC","IPBES"]
43
  relevant_content_sources_selection: List[str] = ["Figures (IPCC/IPBES)"]
44
  sources_auto: bool = True
45
  min_year: int = 1960
46
  max_year: int = None
47
  documents: Annotated[List[Document], operator.add]
48
- related_contents : Annotated[List[Document], operator.add]
49
- recommended_content : List[Document]
50
  search_only : bool = False
51
  reports : List[str] = []
52
 
@@ -72,7 +72,7 @@ def route_intent(state):
72
  def chitchat_route_intent(state):
73
  intent = state["search_graphs_chitchat"]
74
  if intent is True:
75
- return "retrieve_graphs_chitchat"
76
  elif intent is False:
77
  return END
78
 
@@ -95,20 +95,10 @@ def route_based_on_relevant_docs(state,threshold_docs=0.2):
95
  def route_continue_retrieve_documents(state):
96
  index_question_ipx = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "IPx"]
97
  questions_ipx_finished = all(elem in state["handled_questions_index"] for elem in index_question_ipx)
98
- # if questions_ipx_finished and state["search_only"]:
99
- # return END
100
  if questions_ipx_finished:
101
  return "end_retrieve_IPx_documents"
102
  else:
103
  return "retrieve_documents"
104
-
105
-
106
- # if state["n_questions"]["IPx"] == len(state["handled_questions_index"]) and state["search_only"] :
107
- # return END
108
- # elif state["n_questions"]["IPx"] == len(state["handled_questions_index"]):
109
- # return "answer_search"
110
- # else :
111
- # return "retrieve_documents"
112
 
113
  def route_continue_retrieve_local_documents(state):
114
  index_question_poc = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "POC"]
@@ -120,20 +110,6 @@ def route_continue_retrieve_local_documents(state):
120
  else:
121
  return "retrieve_local_data"
122
 
123
- # if state["n_questions"]["POC"] == len(state["handled_questions_index"]) and state["search_only"] :
124
- # return END
125
- # elif state["n_questions"]["POC"] == len(state["handled_questions_index"]):
126
- # return "answer_search"
127
- # else :
128
- # return "retrieve_local_data"
129
-
130
- # if len(state["remaining_questions"]) == 0 and state["search_only"] :
131
- # return END
132
- # elif len(state["remaining_questions"]) > 0:
133
- # return "retrieve_documents"
134
- # else:
135
- # return "answer_search"
136
-
137
  def route_retrieve_documents(state):
138
  sources_to_retrieve = []
139
 
@@ -232,8 +208,23 @@ def make_graph_agent(llm, vectorstore_ipcc, vectorstore_graphs, vectorstore_regi
232
  app = workflow.compile()
233
  return app
234
 
235
- def make_graph_agent_poc(llm, vectorstore_ipcc, vectorstore_graphs, vectorstore_region, reranker, threshold_docs=0.2):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
236
 
 
237
  workflow = StateGraph(GraphState)
238
 
239
  # Define the node functions
@@ -244,7 +235,8 @@ def make_graph_agent_poc(llm, vectorstore_ipcc, vectorstore_graphs, vectorstore_
244
  answer_ai_impact = make_ai_impact_node(llm)
245
  retrieve_documents = make_IPx_retriever_node(vectorstore_ipcc, reranker, llm)
246
  retrieve_graphs = make_graph_retriever_node(vectorstore_graphs, reranker)
247
- retrieve_local_data = make_POC_retriever_node(vectorstore_region, reranker, llm)
 
248
  answer_rag = make_rag_node(llm, with_docs=True)
249
  answer_rag_no_docs = make_rag_node(llm, with_docs=False)
250
  chitchat_categorize_intent = make_chitchat_intent_categorization_node(llm)
@@ -315,6 +307,10 @@ def make_graph_agent_poc(llm, vectorstore_ipcc, vectorstore_graphs, vectorstore_
315
  workflow.add_edge("retrieve_local_data", "answer_search")
316
  workflow.add_edge("retrieve_documents", "answer_search")
317
 
 
 
 
 
318
  # Compile
319
  app = workflow.compile()
320
  return app
 
11
 
12
  import operator
13
  from typing import Annotated
14
+ import pandas as pd
15
  from IPython.display import display, HTML, Image
16
 
17
  from .chains.answer_chitchat import make_chitchat_node
 
19
  from .chains.query_transformation import make_query_transform_node
20
  from .chains.translation import make_translation_node
21
  from .chains.intent_categorization import make_intent_categorization_node
22
+ from .chains.retrieve_documents import make_IPx_retriever_node, make_POC_retriever_node, make_POC_by_ToC_retriever_node
23
  from .chains.answer_rag import make_rag_node
24
  from .chains.graph_retriever import make_graph_retriever_node
25
  from .chains.chitchat_categorization import make_chitchat_intent_categorization_node
 
39
  n_questions : int
40
  answer: str
41
  audience: str = "experts"
42
+ sources_input: List[str] = ["IPCC","IPBES"] # Deprecated -> used only graphs that can only be OWID
43
  relevant_content_sources_selection: List[str] = ["Figures (IPCC/IPBES)"]
44
  sources_auto: bool = True
45
  min_year: int = 1960
46
  max_year: int = None
47
  documents: Annotated[List[Document], operator.add]
48
+ related_contents : Annotated[List[Document], operator.add] # Images
49
+ recommended_content : List[Document] # OWID Graphs # TODO merge with related_contents
50
  search_only : bool = False
51
  reports : List[str] = []
52
 
 
72
  def chitchat_route_intent(state):
73
  intent = state["search_graphs_chitchat"]
74
  if intent is True:
75
+ return END #TODO
76
  elif intent is False:
77
  return END
78
 
 
95
  def route_continue_retrieve_documents(state):
96
  index_question_ipx = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "IPx"]
97
  questions_ipx_finished = all(elem in state["handled_questions_index"] for elem in index_question_ipx)
 
 
98
  if questions_ipx_finished:
99
  return "end_retrieve_IPx_documents"
100
  else:
101
  return "retrieve_documents"
 
 
 
 
 
 
 
 
102
 
103
  def route_continue_retrieve_local_documents(state):
104
  index_question_poc = [i for i, x in enumerate(state["questions_list"]) if x["source_type"] == "POC"]
 
110
  else:
111
  return "retrieve_local_data"
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  def route_retrieve_documents(state):
114
  sources_to_retrieve = []
115
 
 
208
  app = workflow.compile()
209
  return app
210
 
211
+ def make_graph_agent_poc(llm, vectorstore_ipcc, vectorstore_graphs, vectorstore_region, reranker, version:str, threshold_docs=0.2):
212
+ """_summary_
213
+
214
+ Args:
215
+ llm (_type_): _description_
216
+ vectorstore_ipcc (_type_): _description_
217
+ vectorstore_graphs (_type_): _description_
218
+ vectorstore_region (_type_): _description_
219
+ reranker (_type_): _description_
220
+ version (str): version of the parsed documents (e.g "v4")
221
+ threshold_docs (float, optional): _description_. Defaults to 0.2.
222
+
223
+ Returns:
224
+ _type_: _description_
225
+ """
226
 
227
+
228
  workflow = StateGraph(GraphState)
229
 
230
  # Define the node functions
 
235
  answer_ai_impact = make_ai_impact_node(llm)
236
  retrieve_documents = make_IPx_retriever_node(vectorstore_ipcc, reranker, llm)
237
  retrieve_graphs = make_graph_retriever_node(vectorstore_graphs, reranker)
238
+ # retrieve_local_data = make_POC_retriever_node(vectorstore_region, reranker, llm)
239
+ retrieve_local_data = make_POC_by_ToC_retriever_node(vectorstore_region, reranker, llm, version=version)
240
  answer_rag = make_rag_node(llm, with_docs=True)
241
  answer_rag_no_docs = make_rag_node(llm, with_docs=False)
242
  chitchat_categorize_intent = make_chitchat_intent_categorization_node(llm)
 
307
  workflow.add_edge("retrieve_local_data", "answer_search")
308
  workflow.add_edge("retrieve_documents", "answer_search")
309
 
310
+ # workflow.add_edge("transform_query", "retrieve_drias_data")
311
+ # workflow.add_edge("retrieve_drias_data", END)
312
+
313
+
314
  # Compile
315
  app = workflow.compile()
316
  return app
climateqa/engine/talk_to_data/main.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from climateqa.engine.talk_to_data.myVanna import MyVanna
2
+ from climateqa.engine.talk_to_data.utils import loc2coords, detect_location_with_openai, detectTable, nearestNeighbourSQL, detect_relevant_tables, replace_coordonates
3
+ import sqlite3
4
+ import os
5
+ import pandas as pd
6
+ from climateqa.engine.llm import get_llm
7
+
8
+ from dotenv import load_dotenv
9
+ import ast
10
+
11
+ load_dotenv()
12
+
13
+
14
+ OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
15
+ PC_API_KEY = os.getenv('VANNA_PINECONE_API_KEY')
16
+ INDEX_NAME = os.getenv('VANNA_INDEX_NAME')
17
+ VANNA_MODEL = os.getenv('VANNA_MODEL')
18
+
19
+
20
+ #Vanna object
21
+ vn = MyVanna(config = {"temperature": 0, "api_key": OPENAI_API_KEY, 'model': VANNA_MODEL, 'pc_api_key': PC_API_KEY, 'index_name': INDEX_NAME, "top_k" : 4})
22
+ db_vanna_path = os.path.join(os.path.dirname(__file__), "database/drias.db")
23
+ vn.connect_to_sqlite(db_vanna_path)
24
+
25
+ llm = get_llm(provider="openai")
26
+
27
+ def ask_llm_to_add_table_names(sql_query, llm):
28
+ sql_with_table_names = llm.invoke(f"Make the following sql query display the source table in the rows {sql_query}. Just answer the query. The answer should not include ```sql\n").content
29
+ return sql_with_table_names
30
+
31
+ def ask_llm_column_names(sql_query, llm):
32
+ columns = llm.invoke(f"From the given sql query, list the columns that are being selected. The answer should only be a python list. Just answer the list. The SQL query : {sql_query}").content
33
+ columns_list = ast.literal_eval(columns.strip("```python\n").strip())
34
+ return columns_list
35
+
36
+ def ask_vanna(query):
37
+ try :
38
+ location = detect_location_with_openai(OPENAI_API_KEY, query)
39
+ if location:
40
+
41
+ coords = loc2coords(location)
42
+ user_input = query.lower().replace(location.lower(), f"lat, long : {coords}")
43
+
44
+ relevant_tables = detect_relevant_tables(user_input, llm)
45
+ coords_tables = [nearestNeighbourSQL(db_vanna_path, coords, relevant_tables[i]) for i in range(len(relevant_tables))]
46
+ user_input_with_coords = replace_coordonates(coords, user_input, coords_tables)
47
+
48
+ sql_query, result_dataframe, figure = vn.ask(user_input_with_coords, print_results=False, allow_llm_to_see_data=True, auto_train=False)
49
+
50
+ return sql_query, result_dataframe, figure
51
+
52
+ else :
53
+ empty_df = pd.DataFrame()
54
+ empty_fig = {}
55
+ return "", empty_df, empty_fig
56
+ except Exception as e:
57
+ print(f"Error: {e}")
58
+ empty_df = pd.DataFrame()
59
+ empty_fig = {}
60
+ return "", empty_df, empty_fig
front/tabs/chat_interface.py CHANGED
@@ -20,12 +20,31 @@ Please note that we log your questions for meta-analysis purposes, so avoid shar
20
  What do you want to learn ?
21
  """
22
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
 
25
  # UI Layout Components
26
- def create_chat_interface():
 
27
  chatbot = gr.Chatbot(
28
- value=[ChatMessage(role="assistant", content=init_prompt)],
29
  type="messages",
30
  show_copy_button=True,
31
  show_label=False,
 
20
  What do you want to learn ?
21
  """
22
 
23
+ init_prompt_poc = """
24
+ Hello, I am ClimateQ&A, a conversational assistant designed to help you understand climate change and biodiversity loss. I will answer your questions by **sifting through the IPCC and IPBES scientific reports, PCAET of Paris, the Plan Biodiversité 2018-2024, and Acclimaterra reports from la Région Nouvelle-Aquitaine **.
25
+
26
+ ❓ How to use
27
+ - **Language**: You can ask me your questions in any language.
28
+ - **Audience**: You can specify your audience (children, general public, experts) to get a more adapted answer.
29
+ - **Sources**: You can choose to search in the IPCC or IPBES reports, and POC sources for local documents (PCAET, Plan Biodiversité, Acclimaterra).
30
+ - **Relevant content sources**: You can choose to search for figures, papers, or graphs that can be relevant for your question.
31
+
32
+ ⚠️ Limitations
33
+ *Please note that the AI is not perfect and may sometimes give irrelevant answers. If you are not satisfied with the answer, please ask a more specific question or report your feedback to help us improve the system.*
34
+
35
+ 🛈 Information
36
+ Please note that we log your questions for meta-analysis purposes, so avoid sharing any sensitive or personal information.
37
+
38
+ What do you want to learn ?
39
+ """
40
+
41
 
42
 
43
  # UI Layout Components
44
+ def create_chat_interface(tab):
45
+ init_prompt_message = init_prompt_poc if tab == "Beta - POC Adapt'Action" else init_prompt
46
  chatbot = gr.Chatbot(
47
+ value=[ChatMessage(role="assistant", content=init_prompt_message)],
48
  type="messages",
49
  show_copy_button=True,
50
  show_label=False,
front/tabs/main_tab.py CHANGED
@@ -3,7 +3,6 @@ from .chat_interface import create_chat_interface
3
  from .tab_examples import create_examples_tab
4
  from .tab_papers import create_papers_tab
5
  from .tab_figures import create_figures_tab
6
- from .chat_interface import create_chat_interface
7
 
8
  def cqa_tab(tab_name):
9
  # State variables
@@ -12,7 +11,7 @@ def cqa_tab(tab_name):
12
  with gr.Row(elem_id="chatbot-row"):
13
  # Left column - Chat interface
14
  with gr.Column(scale=2):
15
- chatbot, textbox, config_button = create_chat_interface()
16
 
17
  # Right column - Content panels
18
  with gr.Column(scale=2, variant="panel", elem_id="right-panel"):
 
3
  from .tab_examples import create_examples_tab
4
  from .tab_papers import create_papers_tab
5
  from .tab_figures import create_figures_tab
 
6
 
7
  def cqa_tab(tab_name):
8
  # State variables
 
11
  with gr.Row(elem_id="chatbot-row"):
12
  # Left column - Chat interface
13
  with gr.Column(scale=2):
14
+ chatbot, textbox, config_button = create_chat_interface(tab_name)
15
 
16
  # Right column - Content panels
17
  with gr.Column(scale=2, variant="panel", elem_id="right-panel"):