update-github
This view is limited to 50 files because the commit contains too many changes. The rendered diff also elides the changed portions of some removed lines; those truncations are preserved as-is below.
- README.md +3 -4
- app.py +2 -3
- src/__pycache__/pipeline.cpython-311.pyc +0 -0
- src/__pycache__/pipeline.cpython-39.pyc +0 -0
- src/config.yaml +3 -2
- src/generate_memory.py +0 -181
- src/models/__init__.py +1 -1
- src/models/__pycache__/__init__.cpython-311.pyc +0 -0
- src/models/__pycache__/__init__.cpython-37.pyc +0 -0
- src/models/__pycache__/__init__.cpython-39.pyc +0 -0
- src/models/__pycache__/llm_def.cpython-311.pyc +0 -0
- src/models/__pycache__/llm_def.cpython-37.pyc +0 -0
- src/models/__pycache__/llm_def.cpython-39.pyc +0 -0
- src/models/__pycache__/prompt_example.cpython-311.pyc +0 -0
- src/models/__pycache__/prompt_example.cpython-39.pyc +0 -0
- src/models/__pycache__/prompt_template.cpython-311.pyc +0 -0
- src/models/__pycache__/prompt_template.cpython-39.pyc +0 -0
- src/models/llm_def.py +80 -13
- src/models/prompt_example.py +7 -7
- src/models/prompt_template.py +22 -1
- src/models/vllm_serve.py +34 -0
- src/modules/__pycache__/__init__.cpython-311.pyc +0 -0
- src/modules/__pycache__/__init__.cpython-39.pyc +0 -0
- src/modules/__pycache__/extraction_agent.cpython-311.pyc +0 -0
- src/modules/__pycache__/extraction_agent.cpython-39.pyc +0 -0
- src/modules/__pycache__/reflection_agent.cpython-311.pyc +0 -0
- src/modules/__pycache__/reflection_agent.cpython-39.pyc +0 -0
- src/modules/__pycache__/schema_agent.cpython-311.pyc +0 -0
- src/modules/__pycache__/schema_agent.cpython-39.pyc +0 -0
- src/modules/extraction_agent.py +28 -7
- src/modules/knowledge_base/__pycache__/case_repository.cpython-311.pyc +0 -0
- src/modules/knowledge_base/__pycache__/case_repository.cpython-39.pyc +0 -0
- src/modules/knowledge_base/__pycache__/schema_repository.cpython-311.pyc +0 -0
- src/modules/knowledge_base/__pycache__/schema_repository.cpython-39.pyc +0 -0
- src/modules/knowledge_base/case_repository.py +135 -336
- src/modules/knowledge_base/schema_repository.py +1 -1
- src/modules/schema_agent.py +0 -3
- src/pipeline.py +43 -23
- src/run.py +21 -67
- src/utils/__pycache__/__init__.cpython-311.pyc +0 -0
- src/utils/__pycache__/__init__.cpython-39.pyc +0 -0
- src/utils/__pycache__/data_def.cpython-311.pyc +0 -0
- src/utils/__pycache__/data_def.cpython-39.pyc +0 -0
- src/utils/__pycache__/process.cpython-311.pyc +0 -0
- src/utils/__pycache__/process.cpython-39.pyc +0 -0
- src/utils/data_def.py +0 -1
- src/utils/process.py +58 -1
- src/{main.py → webui.py} +1 -5
- src/webui/__init__.py +0 -1
- src/webui/__pycache__/__init__.cpython-39.pyc +0 -0
README.md CHANGED

```diff
@@ -1,13 +1,12 @@
 ---
 title: OneKE
 emoji: 👌🏻
-colorFrom:
+colorFrom: blue
 colorTo: indigo
 sdk: gradio
 sdk_version: 5.8.0
 app_file: app.py
 pinned: false
 license: mit
-
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+short_description: Schema-Guided LLM Agent-based Knowledge Extraction System
+---
```
app.py CHANGED

```diff
@@ -1,7 +1,6 @@
-import nltk
 import subprocess
-
+import nltk
 nltk.download('punkt')
 nltk.download('punkt_tab')
 
-subprocess.run(["python", "src/main.py"])
+subprocess.run(["python", "src/webui.py"])
```

(The tail of the removed `subprocess.run` line is truncated in the rendered diff; `src/main.py` is inferred from the `src/{main.py → webui.py}` rename in this same commit.)
src/__pycache__/pipeline.cpython-311.pyc DELETED
Binary file (5.34 kB)

src/__pycache__/pipeline.cpython-39.pyc CHANGED
Binary files a/src/__pycache__/pipeline.cpython-39.pyc and b/src/__pycache__/pipeline.cpython-39.pyc differ
src/config.yaml CHANGED

```diff
@@ -1,3 +1,6 @@
+model:
+  embedding_model: all-MiniLM-L6-v2
+
 agent:
   default_schema: The final extraction result should be formatted as a JSON object.
   default_ner: Extract the Named Entities in the given text.
@@ -15,5 +18,3 @@ agent:
   customized:
     schema_agent: get_retrieved_schema
     extraction_agent: extract_information_direct
-
-
```
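The new `model.embedding_model` key is consumed by the restored `CaseRepository` (see `case_repository.py` below), which falls back to it when the embedding model bundled into the Docker image is absent. A minimal sketch of that resolution order, assuming the YAML is loaded with PyYAML and the script runs from the repo root:

```python
import yaml
from sentence_transformers import SentenceTransformer

with open("src/config.yaml") as f:  # path relative to the repo root (assumption)
    config = yaml.safe_load(f)

# Mirrors CaseRepository.__init__ in this commit: prefer the embedding model
# baked into the Docker image, else download it by the configured name.
docker_model_path = "/app/model/all-MiniLM-L6-v2"
try:
    embedder = SentenceTransformer(docker_model_path)
except Exception:
    embedder = SentenceTransformer(config["model"]["embedding_model"])
```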
src/generate_memory.py DELETED

The commit removes a one-off seeding script (181 lines) that pushed a hand-written "Ocean Festival" example into the case repository. The original file repeated the demo story literal twice (once for `data.instruction`, once for `data.chunk_text_list`); it is kept once here and reused, which is behavior-identical:

```python
from typing import Literal
from models import *
from utils import *
from modules import *


class Pipeline:
    def __init__(self, llm: BaseEngine):
        self.llm = llm
        self.case_repo = CaseRepositoryHandler(llm = llm)
        self.schema_agent = SchemaAgent(llm = llm)
        self.extraction_agent = ExtractionAgent(llm = llm, case_repo = self.case_repo)
        self.reflection_agent = ReflectionAgent(llm = llm, case_repo = self.case_repo)

    def __init_method(self, data: DataPoint, process_method):
        default_order = ["schema_agent", "extraction_agent", "reflection_agent"]
        if "schema_agent" not in process_method:
            process_method["schema_agent"] = "get_default_schema"
        if data.task != "Base":
            process_method["schema_agent"] = "get_retrieved_schema"
        if "extraction_agent" not in process_method:
            process_method["extraction_agent"] = "extract_information_direct"
        sorted_process_method = {key: process_method[key] for key in default_order if key in process_method}
        return sorted_process_method

    def __init_data(self, data: DataPoint):
        if data.task == "NER":
            data.instruction = config['agent']['default_ner']
            data.output_schema = "EntityList"
        elif data.task == "RE":
            data.instruction = config['agent']['default_re']
            data.output_schema = "RelationList"
        elif data.task == "EE":
            data.instruction = config['agent']['default_ee']
            data.output_schema = "EventList"
        return data

    # main entry
    def get_extract_result(self,
                           task: TaskType,
                           instruction: str = "",
                           text: str = "",
                           output_schema: str = "",
                           constraint: str = "",
                           use_file: bool = False,
                           truth: str = "",
                           mode: str = "quick",
                           update_case: bool = False
                           ):

        data = DataPoint(task=task, instruction=instruction, text=text, output_schema=output_schema, constraint=constraint, use_file=use_file, truth=truth)
        data = self.__init_data(data)
        # Hard-coded demo document used to seed the case repository.
        data.instruction = "In the tranquil seaside town, the summer evening cast a golden glow over everything. The townsfolk gathered at the café by the pier, enjoying the sea breeze while eagerly anticipating the annual Ocean Festival's opening ceremony. \nFirst to arrive was Mayor William, dressed in a deep blue suit, holding a roll of his speech. He smiled and greeted the residents, who held deep respect for their community-minded mayor. Beside him trotted Max, his loyal golden retriever, wagging his tail excitedly at every familiar face he saw. \nFollowing closely was Emily, the town’s high school teacher, accompanied by a group of students ready to perform a musical piece they'd rehearsed. One of the girls carried Polly, a vibrant green parrot, on her shoulder. Polly occasionally chimed in with cheerful squawks, adding to the lively atmosphere. \nNot far away, Captain Jack, with his trusty pipe in hand, chatted with old friends about this year's catch. His fleet was the town’s economic backbone, and his seasoned face and towering presence were complemented by the presence of Whiskers, his orange tabby cat, who loved lounging on the dock, attentively watching the gentle waves. \nInside the café, Kate was bustling about, serving guests. As the owner, with her fiery red curls and vivacious spirit, she was the heart of the place. Her friend Susan, an artist living in a tiny cottage nearby, was helping her prepare refreshing beverages. Slinky, Susan's mischievous ferret, darted playfully between the tables, much to the delight of the children present. \nLeaning on the café's railing, a young boy named Tommy watched the sea with wide, gleaming eyes, filled with dreams of the future. By his side sat Daisy, a spirited little dachshund, barking excitedly at the seagulls flying overhead. Tommy's mother, Lucy, stood beside him, smiling softly as she held a seashell he had just found on the beach. \nAmong the crowd, a group of unnamed tourists snapped photos, capturing memories of the charming festival. Street vendors called out, selling their wares—handmade jewelry and sweet confections—as the scent of grilled seafood wafted through the air. \nSuddenly, a burst of laughter erupted—it was James and his band making their grand entrance. Accompanying them was Benny, a friendly border collie who \"performed\" with the band, delighting the crowd with his antics. Set to play a big concert after the opening ceremony, James, the town's star musician, had won the hearts of locals with his soulful tunes. \nAs dusk settled, lights were strung across the streets, casting a magical glow over the town. Mayor William took the stage to deliver his speech, with Max sitting proudly by his side. The festival atmosphere reached its vibrant peak, and in this small town, each person—and animal—carried their own dreams and stories, yet at this moment, they were united by the shared celebration."
        data.chunk_text_list.append(data.instruction)
        data.distilled_text = "This text is from the field of Slice of Life and represents the genre of Novel."
        data.pred = {
            "characters": [
                {"name": "Mayor William", "role": "Mayor"},
                {"name": "Max", "role": "Golden Retriever, Mayor William's dog"},
                {"name": "Emily", "role": "High school teacher"},
                {"name": "Polly", "role": "Parrot, accompanying a student"},
                {"name": "Captain Jack", "role": "Captain"},
                {"name": "Whiskers", "role": "Orange tabby cat, Captain Jack's pet"},
                {"name": "Kate", "role": "Café owner"},
                {"name": "Susan", "role": "Artist, Kate's friend"},
                {"name": "Slinky", "role": "Ferret, Susan's pet"},
                {"name": "Tommy", "role": "Young boy"},
                {"name": "Daisy", "role": "Dachshund, Tommy's pet"},
                {"name": "Lucy", "role": "Tommy's mother"},
                {"name": "James", "role": "Musician, band leader"},
                {"name": "Benny", "role": "Border Collie, accompanying James and his band"},
                {"name": "Unnamed Tourists", "role": "Visitors at the festival"},
                {"name": "Street Vendors", "role": "Sellers at the festival"}
            ]
        }

        data.truth = {
            "characters": [
                {"name": "Mayor William", "role": "The friendly and respected mayor of the seaside town."},
                {"name": "Emily", "role": "A high school teacher guiding students in a festival performance."},
                {"name": "Captain Jack", "role": "A seasoned sailor whose fleet supports the town."},
                {"name": "Kate", "role": "The welcoming owner of the local café."},
                {"name": "Susan", "role": "An artist known for her ocean-themed paintings."},
                {"name": "Tommy", "role": "A young boy with dreams of the sea."},
                {"name": "Lucy", "role": "Tommy's caring and supportive mother."},
                {"name": "James", "role": "A charismatic musician and band leader."}
            ]
        }

        # Case Update
        if update_case:
            if (data.truth == ""):  # never true here, since data.truth was just hard-coded above
                truth = input("Please enter the correct answer you prefer, or press Enter to accept the current answer: ")
                if truth.strip() == "":
                    data.truth = data.pred
                else:
                    data.truth = extract_json_dict(truth)
            self.case_repo.update_case(data)

        # return result
        result = data.pred
        trajectory = data.get_result_trajectory()

        return result, trajectory, "a", "b"


model = DeepSeek(model_name_or_path="deepseek-chat", api_key="")
pipeline = Pipeline(model)
result, trajectory, *_ = pipeline.get_extract_result(update_case=True, task="Base")
```
src/models/__init__.py CHANGED

```diff
@@ -1,3 +1,3 @@
-from .llm_def import
+from .llm_def import *
 from .prompt_example import *
 from .prompt_template import *
```
src/models/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (434 Bytes)

src/models/__pycache__/__init__.cpython-37.pyc DELETED
Binary file (315 Bytes)

src/models/__pycache__/__init__.cpython-39.pyc CHANGED
Binary files a/src/models/__pycache__/__init__.cpython-39.pyc and b/src/models/__pycache__/__init__.cpython-39.pyc differ

src/models/__pycache__/llm_def.cpython-311.pyc DELETED
Binary file (11.8 kB)

src/models/__pycache__/llm_def.cpython-37.pyc DELETED
Binary file (7.14 kB)

src/models/__pycache__/llm_def.cpython-39.pyc CHANGED
Binary files a/src/models/__pycache__/llm_def.cpython-39.pyc and b/src/models/__pycache__/llm_def.cpython-39.pyc differ

src/models/__pycache__/prompt_example.cpython-311.pyc DELETED
Binary file (5.67 kB)

src/models/__pycache__/prompt_example.cpython-39.pyc CHANGED
Binary files a/src/models/__pycache__/prompt_example.cpython-39.pyc and b/src/models/__pycache__/prompt_example.cpython-39.pyc differ

src/models/__pycache__/prompt_template.cpython-311.pyc DELETED
Binary file (5.42 kB)

src/models/__pycache__/prompt_template.cpython-39.pyc CHANGED
Binary files a/src/models/__pycache__/prompt_template.cpython-39.pyc and b/src/models/__pycache__/prompt_template.cpython-39.pyc differ
src/models/llm_def.py CHANGED

```diff
@@ -6,7 +6,7 @@ Supports:
 """
 
 from transformers import pipeline
-from transformers import AutoTokenizer, AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig, GenerationConfig
 import torch
 import openai
 import os
@@ -21,7 +21,8 @@ class BaseEngine:
         self.temperature = 0.2
         self.top_p = 0.9
         self.max_tokens = 1024
-
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
     def get_chat_response(self, prompt):
         raise NotImplementedError
 
@@ -29,7 +30,7 @@
         self.temperature = temperature
         self.top_p = top_p
         self.max_tokens = max_tokens
-
+
 class LLaMA(BaseEngine):
     def __init__(self, model_name_or_path: str):
         super().__init__(model_name_or_path)
@@ -60,7 +61,7 @@ class LLaMA(BaseEngine):
             top_p=self.top_p,
         )
         return outputs[0]["generated_text"][-1]['content'].strip()
-
+
 class Qwen(BaseEngine):
     def __init__(self, model_name_or_path: str):
         super().__init__(model_name_or_path)
@@ -71,7 +72,7 @@ class Qwen(BaseEngine):
             torch_dtype="auto",
             device_map="auto"
         )
-
+
     def get_chat_response(self, prompt):
         messages = [
             {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
@@ -82,7 +83,7 @@ class Qwen(BaseEngine):
             tokenize=False,
             add_generation_prompt=True
         )
-        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.
+        model_inputs = self.tokenizer([text], return_tensors="pt").to(self.device)
         generated_ids = self.model.generate(
             **model_inputs,
             temperature=self.temperature,
@@ -93,7 +94,7 @@ class Qwen(BaseEngine):
             output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
         ]
         response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
-
+
         return response
 
 class MiniCPM(BaseEngine):
@@ -113,7 +114,7 @@ class MiniCPM(BaseEngine):
             {"role": "system", "content": "You are a helpful assistant."},
             {"role": "user", "content": prompt}
         ]
-        model_inputs = self.tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(self.
+        model_inputs = self.tokenizer.apply_chat_template(messages, return_tensors="pt", add_generation_prompt=True).to(self.device)
         model_outputs = self.model.generate(
             model_inputs,
             temperature=self.temperature,
@@ -124,7 +125,7 @@ class MiniCPM(BaseEngine):
             model_outputs[i][len(model_inputs[i]):] for i in range(len(model_inputs))
         ]
         response = self.tokenizer.batch_decode(output_token_ids, skip_special_tokens=True)[0].strip()
-
+
         return response
 
 class ChatGLM(BaseEngine):
@@ -145,7 +146,7 @@ class ChatGLM(BaseEngine):
             {"role": "system", "content": "You are a helpful assistant."},
             {"role": "user", "content": prompt}
         ]
-        model_inputs = self.tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True, add_generation_prompt=True, tokenize=True).to(self.
+        model_inputs = self.tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True, add_generation_prompt=True, tokenize=True).to(self.device)
         model_outputs = self.model.generate(
             **model_inputs,
             temperature=self.temperature,
@@ -154,9 +155,45 @@ class ChatGLM(BaseEngine):
         )
         model_outputs = model_outputs[:, model_inputs['input_ids'].shape[1]:]
         response = self.tokenizer.batch_decode(model_outputs, skip_special_tokens=True)[0].strip()
-
+
         return response
 
+class OneKE(BaseEngine):
+    def __init__(self, model_name_or_path: str):
+        super().__init__(model_name_or_path)
+        self.name = "OneKE"
+        self.model_id = model_name_or_path
+        config = AutoConfig.from_pretrained(self.model_id, trust_remote_code=True)
+        quantization_config=BitsAndBytesConfig(
+            load_in_4bit=True,
+            llm_int8_threshold=6.0,
+            llm_int8_has_fp16_weight=False,
+            bnb_4bit_compute_dtype=torch.bfloat16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type="nf4",
+        )
+        self.model = AutoModelForCausalLM.from_pretrained(
+            self.model_id,
+            config=config,
+            device_map="auto",
+            quantization_config=quantization_config,
+            torch_dtype=torch.bfloat16,
+            trust_remote_code=True,
+        )
+
+    def get_chat_response(self, prompt):
+        system_prompt = '<<SYS>>\nYou are a helpful assistant. 你是一个乐于助人的助手。\n<</SYS>>\n\n'
+        sintruct = '[INST] ' + system_prompt + prompt + '[/INST]'
+        input_ids = self.tokenizer.encode(prompt, return_tensors='pt')
+        input_ids = self.tokenizer.encode(sintruct, return_tensors="pt").to(self.device)
+        input_length = input_ids.size(1)
+        generation_output = self.model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_length=1024, max_new_tokens=512, return_dict_in_generate=True, pad_token_id=self.tokenizer.pad_token_id, eos_token_id=self.tokenizer.eos_token_id))
+        generation_output = generation_output.sequences[0]
+        generation_output = generation_output[input_length:]
+        response = self.tokenizer.decode(generation_output, skip_special_tokens=True)
+
+        return response
+
 class ChatGPT(BaseEngine):
     def __init__(self, model_name_or_path: str, api_key: str, base_url=openai.base_url):
         self.name = "ChatGPT"
@@ -170,7 +207,7 @@ class ChatGPT(BaseEngine):
         else:
             self.api_key = os.environ["OPENAI_API_KEY"]
         self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)
-
+
     def get_chat_response(self, input):
         response = self.client.chat.completions.create(
             model=self.model,
@@ -197,7 +234,7 @@ class DeepSeek(BaseEngine):
         else:
             self.api_key = os.environ["DEEPSEEK_API_KEY"]
         self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)
-
+
     def get_chat_response(self, input):
         response = self.client.chat.completions.create(
             model=self.model,
@@ -210,3 +247,33 @@ class DeepSeek(BaseEngine):
             stop=None
         )
         return response.choices[0].message.content
+
+class LocalServer(BaseEngine):
+    def __init__(self, model_name_or_path: str, base_url="http://localhost:8000/v1"):
+        self.name = model_name_or_path.split('/')[-1]
+        self.model = model_name_or_path
+        self.base_url = base_url
+        self.temperature = 0.2
+        self.top_p = 0.9
+        self.max_tokens = 1024
+        self.api_key = "EMPTY_API_KEY"
+        self.client = OpenAI(api_key=self.api_key, base_url=self.base_url)
+
+    def get_chat_response(self, input):
+        try:
+            response = self.client.chat.completions.create(
+                model=self.model,
+                messages=[
+                    {"role": "user", "content": input},
+                ],
+                stream=False,
+                temperature=self.temperature,
+                max_tokens=self.max_tokens,
+                stop=None
+            )
+            return response.choices[0].message.content
+        except ConnectionError:
+            print("Error: Unable to connect to the server. Please check if the vllm service is running and the port is 8080.")
+        except Exception as e:
+            print(f"Error: {e}")
+
```
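Two additions carry the commit's main feature: `OneKE` loads an extraction-tuned checkpoint with 4-bit NF4 quantization, and `LocalServer` is an OpenAI-compatible client for a vLLM endpoint. A minimal usage sketch; the model ids are assumptions, `OneKE` needs a CUDA GPU with `bitsandbytes` installed, and it should be run from `src/` so the `models` package resolves:

```python
from models.llm_def import OneKE, LocalServer

# 4-bit quantized local inference. "zjunlp/OneKE" is an assumption about the
# intended checkpoint; any compatible local path can be passed instead.
engine = OneKE(model_name_or_path="zjunlp/OneKE")
print(engine.get_chat_response("Extract the named entities: Mayor William greeted the residents."))

# Client for a vLLM server such as the one started by src/models/vllm_serve.py
# (which listens on port 8000, matching the default base_url here).
server = LocalServer(model_name_or_path="Qwen/Qwen2.5-7B-Instruct")
print(server.get_chat_response("Hello"))
```

Note that `LocalServer.get_chat_response` defaults to port 8000 while its connection-error message mentions port 8080; that mismatch is in the committed code itself.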
src/models/prompt_example.py CHANGED

````diff
@@ -95,13 +95,13 @@ class Event(BaseModel):
     process: Optional[str] = Field(description="Details of the event process")
     result: Optional[str] = Field(default=None, description="Result or outcome of the event")
 
-class
-    title: str = Field(description="The title or headline of the news
-    summary: str = Field(description="A brief summary of the news
-    publication_date: Optional[str] = Field(description="The publication date of the
-    keywords: Optional[List[str]] = Field(description="List of keywords or topics covered in the
-    events: List[Event] = Field(description="Events covered in the
-    quotes: Optional[
+class NewsReport(BaseModel):
+    title: str = Field(description="The title or headline of the news report")
+    summary: str = Field(description="A brief summary of the news report")
+    publication_date: Optional[str] = Field(description="The publication date of the report")
+    keywords: Optional[List[str]] = Field(description="List of keywords or topics covered in the news report")
+    events: List[Event] = Field(description="Events covered in the news report")
+    quotes: Optional[dict] = Field(default=None, description="Quotes related to the news, with keys as the citation sources and values as the quoted content. ")
     viewpoints: Optional[List[str]] = Field(default=None, description="Different viewpoints regarding the news")
 ```
````
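In the file these classes live inside a prompt string, but they are written as valid pydantic models. For reference, a sketch of how the completed `NewsReport` schema behaves as real code; the `Event` class is trimmed to the two fields visible in the diff context, defaults are added so optional fields validate, and the sample values are invented:

```python
from typing import List, Optional
from pydantic import BaseModel, Field

class Event(BaseModel):
    # Trimmed to the two fields shown in the diff context above.
    process: Optional[str] = Field(default=None, description="Details of the event process")
    result: Optional[str] = Field(default=None, description="Result or outcome of the event")

class NewsReport(BaseModel):
    title: str = Field(description="The title or headline of the news report")
    summary: str = Field(description="A brief summary of the news report")
    publication_date: Optional[str] = Field(default=None, description="The publication date of the report")
    keywords: Optional[List[str]] = Field(default=None, description="Keywords or topics covered in the news report")
    events: List[Event] = Field(description="Events covered in the news report")
    quotes: Optional[dict] = Field(default=None, description="Quotes keyed by citation source")
    viewpoints: Optional[List[str]] = Field(default=None, description="Different viewpoints regarding the news")

report = NewsReport(
    title="Ocean Festival opens",
    summary="The seaside town kicks off its annual festival.",
    events=[Event(process="Opening ceremony", result="Festival begins")],
    quotes={"Mayor William": "Welcome, everyone!"},
)
print(report.model_dump())  # pydantic v2; use report.dict() on v1
```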
src/models/prompt_template.py CHANGED

```diff
@@ -76,6 +76,25 @@ extract_instruction = PromptTemplate(
     template=EXTRACT_INSTRUCTION,
 )
 
+instruction_mapper = {
+    'NER': "You are an expert in named entity recognition. Please extract entities that match the schema definition from the input. Return an empty list if the entity type does not exist. Please respond in the format of a JSON string.",
+    'RE': "You are an expert in relationship extraction. Please extract relationship triples that match the schema definition from the input. Return an empty list for relationships that do not exist. Please respond in the format of a JSON string.",
+    'EE': "You are an expert in event extraction. Please extract events from the input that conform to the schema definition. Return an empty list for events that do not exist, and return NAN for arguments that do not exist. If an argument has multiple values, please return a list. Respond in the format of a JSON string.",
+}
+
+EXTRACT_INSTRUCTION_JSON = """
+{{
+    "instruction": {instruction},
+    "schema": {constraint},
+    "input": {input},
+}}
+"""
+
+extract_instruction_json = PromptTemplate(
+    input_variables=["instruction", "constraint", "input"],
+    template=EXTRACT_INSTRUCTION_JSON,
+)
+
 SUMMARIZE_INSTRUCTION = """
 **Instruction**: Below is a list of results obtained after segmenting and extracting information from a long article. Please consolidate all the answers to generate a final response.
 {examples}
@@ -84,7 +103,7 @@ SUMMARIZE_INSTRUCTION = """
 **Result List**: {answer_list}
 
 **Output Schema**: {schema}
-Now summarize all the information from the Result List.
+Now summarize all the information from the Result List. Filter or merge the redundant information.
 """
 summarize_instruction = PromptTemplate(
     input_variables=["instruction", "examples", "answer_list", "schema"],
@@ -92,6 +111,8 @@ summarize_instruction = PromptTemplate(
 )
 
 
+
+
 # ==================================================================== #
 #                            REFLECION AGENT                           #
 # ==================================================================== #
```
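The JSON-style template is the native input format of the OneKE checkpoint: instruction, schema constraint, and input are serialized into one JSON object. A quick render, assuming the same LangChain `PromptTemplate` used elsewhere in the file (the import sits outside the visible hunk) and invented sample values:

```python
import json
from langchain.prompts import PromptTemplate

EXTRACT_INSTRUCTION_JSON = """
{{
    "instruction": {instruction},
    "schema": {constraint},
    "input": {input},
}}
"""
prompt = PromptTemplate(
    input_variables=["instruction", "constraint", "input"],
    template=EXTRACT_INSTRUCTION_JSON,
)

# json.dumps quotes the values so the rendered prompt stays valid JSON-ish text.
print(prompt.format(
    instruction=json.dumps(instruction_mapper_text := "You are an expert in named entity recognition. ..."),
    constraint=json.dumps(["person", "location"]),
    input=json.dumps("Mayor William greeted the residents."),
))
```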
src/models/vllm_serve.py ADDED

```python
import argparse
import warnings
import subprocess
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils import *

def main():
    # Create command-line argument parser
    parser = argparse.ArgumentParser(description='Run the extraction model.')
    parser.add_argument('--config', type=str, required=True,
                        help='Path to the YAML configuration file.')
    parser.add_argument('--tensor-parallel-size', type=int, default=2,
                        help='Tensor parallel size for the VLLM server.')
    parser.add_argument('--max-model-len', type=int, default=32768,
                        help='Maximum model length for the VLLM server.')

    # Parse command-line arguments
    args = parser.parse_args()

    # Load configuration
    config = load_extraction_config(args.config)
    # Model config
    model_config = config['model']
    if model_config['vllm_serve'] == False:
        warnings.warn("VLLM-deployed model will not be used for extraction. To enable VLLM, set vllm_serve to true in the configuration file.")
    model_name_or_path = model_config['model_name_or_path']
    command = f"vllm serve {model_name_or_path} --tensor-parallel-size {args.tensor_parallel_size} --max-model-len {args.max_model_len} --enforce-eager --port 8000"
    subprocess.run(command, shell=True)

if __name__ == "__main__":
    main()
```
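The script reads exactly two keys from the YAML passed via `--config`: `model.model_name_or_path` and `model.vllm_serve`. Those keys do not appear in the `src/config.yaml` diff above, so this snippet is an illustrative guess at the expected shape:

```yaml
# extraction config for vllm_serve.py (illustrative; only the two keys
# read by the script above are certain)
model:
  model_name_or_path: Qwen/Qwen2.5-7B-Instruct
  vllm_serve: true
```

Launching it with `python src/models/vllm_serve.py --config extraction.yaml --tensor-parallel-size 1` starts `vllm serve` on port 8000, which is where the new `LocalServer` engine in `llm_def.py` expects its OpenAI-compatible endpoint.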
src/modules/__pycache__/__init__.cpython-311.pyc DELETED
Binary file (459 Bytes)

src/modules/__pycache__/__init__.cpython-39.pyc CHANGED
Binary files a/src/modules/__pycache__/__init__.cpython-39.pyc and b/src/modules/__pycache__/__init__.cpython-39.pyc differ

src/modules/__pycache__/extraction_agent.cpython-311.pyc DELETED
Binary file (6.66 kB)

src/modules/__pycache__/extraction_agent.cpython-39.pyc CHANGED
Binary files a/src/modules/__pycache__/extraction_agent.cpython-39.pyc and b/src/modules/__pycache__/extraction_agent.cpython-39.pyc differ

src/modules/__pycache__/reflection_agent.cpython-311.pyc DELETED
Binary file (6.98 kB)

src/modules/__pycache__/reflection_agent.cpython-39.pyc CHANGED
Binary files a/src/modules/__pycache__/reflection_agent.cpython-39.pyc and b/src/modules/__pycache__/reflection_agent.cpython-39.pyc differ

src/modules/__pycache__/schema_agent.cpython-311.pyc DELETED
Binary file (10.7 kB)

src/modules/__pycache__/schema_agent.cpython-39.pyc CHANGED
Binary files a/src/modules/__pycache__/schema_agent.cpython-39.pyc and b/src/modules/__pycache__/schema_agent.cpython-39.pyc differ
src/modules/extraction_agent.py CHANGED

```diff
@@ -11,9 +11,13 @@ class InformationExtractor:
         prompt = extract_instruction.format(instruction=instruction, examples=examples, text=text, additional_info=additional_info, schema=schema)
         response = self.llm.get_chat_response(prompt)
         response = extract_json_dict(response)
-
-
-
+        return response
+
+    def extract_information_compatible(self, task="", text="", constraint=""):
+        instruction = instruction_mapper.get(task)
+        prompt = extract_instruction_json.format(instruction=instruction, constraint=constraint, input=text)
+        response = self.llm.get_chat_response(prompt)
+        response = extract_json_dict(response)
         return response
 
     def summarize_answer(self, instruction="", answer_list="", schema="", additional_info=""):
@@ -34,26 +38,43 @@ class ExtractionAgent:
             return data
         if data.task == "NER":
             constraint = json.dumps(data.constraint)
-            if "**Entity Type Constraint**" in constraint:
+            if "**Entity Type Constraint**" in constraint or self.llm.name == "OneKE":
                 return data
             data.constraint = f"\n**Entity Type Constraint**: The type of entities must be chosen from the following list.\n{constraint}\n"
         elif data.task == "RE":
             constraint = json.dumps(data.constraint)
-            if "**Relation Type Constraint**" in constraint:
+            if "**Relation Type Constraint**" in constraint or self.llm.name == "OneKE":
                 return data
             data.constraint = f"\n**Relation Type Constraint**: The type of relations must be chosen from the following list.\n{constraint}\n"
         elif data.task == "EE":
             constraint = json.dumps(data.constraint)
             if "**Event Extraction Constraint**" in constraint:
                 return data
-
+            if self.llm.name != "OneKE":
+                data.constraint = f"\n**Event Extraction Constraint**: The event type must be selected from the following dictionary keys, and its event arguments should be chosen from its corresponding dictionary values. \n{constraint}\n"
+            else:
+                try:
+                    result = [
+                        {
+                            "event_type": key,
+                            "trigger": True,
+                            "arguments": value
+                        }
+                        for key, value in data.constraint.items()
+                    ]
+                    data.constraint = json.dumps(result)
+                except:
+                    print("Invalid Constraint: Event Extraction constraint must be a dictionary with event types as keys and lists of arguments as values.", data.constraint)
         return data
 
     def extract_information_direct(self, data: DataPoint):
         data = self.__get_constraint(data)
         result_list = []
         for chunk_text in data.chunk_text_list:
-
+            if self.llm.name != "OneKE":
+                extract_direct_result = self.module.extract_information(instruction=data.instruction, text=chunk_text, schema=data.output_schema, examples="", additional_info=data.constraint)
+            else:
+                extract_direct_result = self.module.extract_information_compatible(task=data.task, text=chunk_text, constraint=data.constraint)
             result_list.append(extract_direct_result)
         function_name = current_function_name()
         data.set_result_list(result_list)
```

(The bodies of the removed lines in this file are blanked by the renderer; they are shown here as bare `-` lines.)
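The OneKE branch of `__get_constraint` rewrites a user-supplied EE constraint from a dict of event types into the list-of-event-objects form the checkpoint expects. A worked example with invented event types:

```python
import json

# User-supplied EE constraint: event types mapped to argument roles.
constraint = {
    "Marriage": ["bride", "groom", "date"],
    "Resignation": ["person", "position"],
}

# What the OneKE branch produces before serializing into the JSON prompt:
result = [
    {"event_type": key, "trigger": True, "arguments": value}
    for key, value in constraint.items()
]
print(json.dumps(result, indent=2))
# [
#   {"event_type": "Marriage", "trigger": true, "arguments": ["bride", "groom", "date"]},
#   {"event_type": "Resignation", "trigger": true, "arguments": ["person", "position"]}
# ]
```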
src/modules/knowledge_base/__pycache__/case_repository.cpython-311.pyc DELETED
Binary file (4.64 kB)

src/modules/knowledge_base/__pycache__/case_repository.cpython-39.pyc CHANGED
Binary files a/src/modules/knowledge_base/__pycache__/case_repository.cpython-39.pyc and b/src/modules/knowledge_base/__pycache__/case_repository.cpython-39.pyc differ

src/modules/knowledge_base/__pycache__/schema_repository.cpython-311.pyc DELETED
Binary file (9.25 kB)

src/modules/knowledge_base/__pycache__/schema_repository.cpython-39.pyc CHANGED
Binary files a/src/modules/knowledge_base/__pycache__/schema_repository.cpython-39.pyc and b/src/modules/knowledge_base/__pycache__/schema_repository.cpython-39.pyc differ
src/modules/knowledge_base/case_repository.py
CHANGED
@@ -1,192 +1,3 @@
|
|
1 |
-
# import json
|
2 |
-
# import os
|
3 |
-
# import torch
|
4 |
-
# import numpy as np
|
5 |
-
# from utils import *
|
6 |
-
# from sentence_transformers import SentenceTransformer
|
7 |
-
# from rapidfuzz import process
|
8 |
-
# from models import *
|
9 |
-
# import copy
|
10 |
-
|
11 |
-
# import warnings
|
12 |
-
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
13 |
-
# warnings.filterwarnings("ignore", category=FutureWarning, message=r".*clean_up_tokenization_spaces*")
|
14 |
-
|
15 |
-
# class CaseRepository:
|
16 |
-
# def __init__(self):
|
17 |
-
# self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
|
18 |
-
# self.embedder.to(device)
|
19 |
-
# self.corpus = self.load_corpus()
|
20 |
-
# self.embedded_corpus = self.embed_corpus()
|
21 |
-
|
22 |
-
# def load_corpus(self):
|
23 |
-
# with open(os.path.join(os.path.dirname(__file__), "case_repository.json")) as file:
|
24 |
-
# corpus = json.load(file)
|
25 |
-
# return corpus
|
26 |
-
|
27 |
-
# def update_corpus(self):
|
28 |
-
# try:
|
29 |
-
# with open(os.path.join(os.path.dirname(__file__), "case_repository.json"), "w") as file:
|
30 |
-
# json.dump(self.corpus, file, indent=2)
|
31 |
-
# except Exception as e:
|
32 |
-
# print(f"Error when updating corpus: {e}")
|
33 |
-
|
34 |
-
# def embed_corpus(self):
|
35 |
-
# embedded_corpus = {}
|
36 |
-
# for key, content in self.corpus.items():
|
37 |
-
# good_index = [item['index']['embed_index'] for item in content['good']]
|
38 |
-
# encoded_good_index = self.embedder.encode(good_index, convert_to_tensor=True).to(device)
|
39 |
-
# bad_index = [item['index']['embed_index'] for item in content['bad']]
|
40 |
-
# encoded_bad_index = self.embedder.encode(bad_index, convert_to_tensor=True).to(device)
|
41 |
-
# embedded_corpus[key] = {"good": encoded_good_index, "bad": encoded_bad_index}
|
42 |
-
# return embedded_corpus
|
43 |
-
|
44 |
-
# def get_similarity_scores(self, task: TaskType, embed_index="", str_index="", case_type="", top_k=2):
|
45 |
-
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
46 |
-
# # Embedding similarity match
|
47 |
-
# encoded_embed_query = self.embedder.encode(embed_index, convert_to_tensor=True).to(device)
|
48 |
-
# embedding_similarity_matrix = self.embedder.similarity(encoded_embed_query, self.embedded_corpus[task][case_type])
|
49 |
-
# embedding_similarity_scores = embedding_similarity_matrix[0].to(device)
|
50 |
-
|
51 |
-
# # String similarity match
|
52 |
-
# str_match_corpus = [item['index']['str_index'] for item in self.corpus[task][case_type]]
|
53 |
-
# str_similarity_results = process.extract(str_index, str_match_corpus, limit=len(str_match_corpus))
|
54 |
-
# scores_dict = {match[0]: match[1] for match in str_similarity_results}
|
55 |
-
# scores_in_order = [scores_dict[candidate] for candidate in str_match_corpus]
|
56 |
-
# str_similarity_scores = torch.tensor(scores_in_order, dtype=torch.float32).to(device)
|
57 |
-
|
58 |
-
# # Normalize scores
|
59 |
-
# embedding_score_range = embedding_similarity_scores.max() - embedding_similarity_scores.min()
|
60 |
-
# str_score_range = str_similarity_scores.max() - str_similarity_scores.min()
|
61 |
-
# if embedding_score_range > 0:
|
62 |
-
# embed_norm_scores = (embedding_similarity_scores - embedding_similarity_scores.min()) / embedding_score_range
|
63 |
-
# else:
|
64 |
-
# embed_norm_scores = embedding_similarity_scores
|
65 |
-
# if str_score_range > 0:
|
66 |
-
# str_norm_scores = (str_similarity_scores - str_similarity_scores.min()) / str_score_range
|
67 |
-
# else:
|
68 |
-
# str_norm_scores = str_similarity_scores / 100
|
69 |
-
|
70 |
-
# # Combine the scores with weights
|
71 |
-
# combined_scores = 0.5 * embed_norm_scores + 0.5 * str_norm_scores
|
72 |
-
# original_combined_scores = 0.5 * embedding_similarity_scores + 0.5 * str_similarity_scores / 100
|
73 |
-
|
74 |
-
# scores, indices = torch.topk(combined_scores, k=min(top_k, combined_scores.size(0)))
|
75 |
-
# original_scores, original_indices = torch.topk(original_combined_scores, k=min(top_k, original_combined_scores.size(0)))
|
76 |
-
# return scores, indices, original_scores, original_indices
|
77 |
-
|
78 |
-
# def query_case(self, task: TaskType, embed_index="", str_index="", case_type="", top_k=2) -> list:
|
79 |
-
# _, indices, _, _ = self.get_similarity_scores(task, embed_index, str_index, case_type, top_k)
|
80 |
-
# top_matches = [self.corpus[task][case_type][idx]["content"] for idx in indices]
|
81 |
-
# return top_matches
|
82 |
-
|
83 |
-
# def update_case(self, task: TaskType, embed_index="", str_index="", content="" ,case_type=""):
|
84 |
-
# self.corpus[task][case_type].append({"index": {"embed_index": embed_index, "str_index": str_index}, "content": content})
|
85 |
-
# self.embedded_corpus[task][case_type] = torch.cat([self.embedded_corpus[task][case_type], self.embedder.encode([embed_index], convert_to_tensor=True).to(device)], dim=0)
|
86 |
-
# print(f"Case updated for {task} task.")
|
87 |
-
|
88 |
-
# class CaseRepositoryHandler:
|
89 |
-
# def __init__(self, llm: BaseEngine):
|
90 |
-
# self.repository = CaseRepository()
|
91 |
-
# self.llm = llm
|
92 |
-
|
93 |
-
# def __get_good_case_analysis(self, instruction="", text="", result="", additional_info=""):
|
94 |
-
# prompt = good_case_analysis_instruction.format(
|
95 |
-
# instruction=instruction, text=text, result=result, additional_info=additional_info
|
96 |
-
# )
|
97 |
-
# for _ in range(3):
|
98 |
-
# response = self.llm.get_chat_response(prompt)
|
99 |
-
# response = extract_json_dict(response)
|
100 |
-
# if not isinstance(response, dict):
|
101 |
-
# return response
|
102 |
-
# return None
|
103 |
-
|
104 |
-
# def __get_bad_case_reflection(self, instruction="", text="", original_answer="", correct_answer="", additional_info=""):
|
105 |
-
# prompt = bad_case_reflection_instruction.format(
|
106 |
-
# instruction=instruction, text=text, original_answer=original_answer, correct_answer=correct_answer, additional_info=additional_info
|
107 |
-
# )
|
108 |
-
# for _ in range(3):
|
109 |
-
# response = self.llm.get_chat_response(prompt)
|
110 |
-
# response = extract_json_dict(response)
|
111 |
-
# if not isinstance(response, dict):
|
112 |
-
# return response
|
113 |
-
# return None
|
114 |
-
|
115 |
-
# def __get_index(self, data: DataPoint, case_type: str):
|
116 |
-
# # set embed_index
|
117 |
-
# embed_index = f"**Text**: {data.distilled_text}\n{data.chunk_text_list[0]}"
|
118 |
-
|
119 |
-
# # set str_index
|
120 |
-
# if data.task == "Base":
|
121 |
-
# str_index = f"**Task**: {data.instruction}"
|
122 |
-
# else:
|
123 |
-
# str_index = f"{data.constraint}"
|
124 |
-
|
125 |
-
# if case_type == "bad":
|
126 |
-
# str_index += f"\n\n**Original Result**: {json.dumps(data.pred)}"
|
127 |
-
|
128 |
-
# return embed_index, str_index
|
129 |
-
|
130 |
-
# def query_good_case(self, data: DataPoint):
|
131 |
-
# embed_index, str_index = self.__get_index(data, "good")
|
132 |
-
# return self.repository.query_case(task=data.task, embed_index=embed_index, str_index=str_index, case_type="good")
|
133 |
-
|
134 |
-
# def query_bad_case(self, data: DataPoint):
|
135 |
-
# embed_index, str_index = self.__get_index(data, "bad")
|
136 |
-
# return self.repository.query_case(task=data.task, embed_index=embed_index, str_index=str_index, case_type="bad")
|
137 |
-
|
138 |
-
# def update_good_case(self, data: DataPoint):
|
139 |
-
# if data.truth == "" :
|
140 |
-
# print("No truth value provided.")
|
141 |
-
# return
|
142 |
-
# embed_index, str_index = self.__get_index(data, "good")
|
143 |
-
# _, _, original_scores, _ = self.repository.get_similarity_scores(data.task, embed_index, str_index, "good", 1)
|
144 |
-
# original_scores = original_scores.tolist()
|
145 |
-
# if original_scores[0] >= 0.9:
|
146 |
-
# print("The similar good case is already in the corpus. Similarity Score: ", original_scores[0])
|
147 |
-
# return
|
148 |
-
# good_case_alaysis = self.__get_good_case_analysis(instruction=data.instruction, text=data.distilled_text, result=data.truth, additional_info=data.constraint)
|
149 |
-
# wrapped_good_case_analysis = f"**Analysis**: {good_case_alaysis}"
|
150 |
-
# wrapped_instruction = f"**Task**: {data.instruction}"
|
151 |
-
# wrapped_text = f"**Text**: {data.distilled_text}\n{data.chunk_text_list[0]}"
|
152 |
-
# wrapped_answer = f"**Correct Answer**: {json.dumps(data.truth)}"
|
153 |
-
# if data.task == "Base":
|
154 |
-
# content = f"{wrapped_instruction}\n\n{wrapped_text}\n\n{wrapped_good_case_analysis}\n\n{wrapped_answer}"
|
155 |
-
# else:
|
156 |
-
# content = f"{wrapped_text}\n\n{data.constraint}\n\n{wrapped_good_case_analysis}\n\n{wrapped_answer}"
|
157 |
-
# self.repository.update_case(data.task, embed_index, str_index, content, "good")
|
158 |
-
|
159 |
-
# def update_bad_case(self, data: DataPoint):
|
160 |
-
# if data.truth == "" :
|
161 |
-
# print("No truth value provided.")
|
162 |
-
# return
|
163 |
-
# if normalize_obj(data.pred) == normalize_obj(data.truth):
|
164 |
-
# return
|
165 |
-
# embed_index, str_index = self.__get_index(data, "bad")
|
166 |
-
# _, _, original_scores, _ = self.repository.get_similarity_scores(data.task, embed_index, str_index, "bad", 1)
|
167 |
-
# original_scores = original_scores.tolist()
|
168 |
-
# if original_scores[0] >= 0.9:
|
169 |
-
# print("The similar bad case is already in the corpus. Similarity Score: ", original_scores[0])
|
170 |
-
# return
|
171 |
-
# bad_case_reflection = self.__get_bad_case_reflection(instruction=data.instruction, text=data.distilled_text, original_answer=data.pred, correct_answer=data.truth, additional_info=data.constraint)
|
172 |
-
# wrapped_bad_case_reflection = f"**Reflection**: {bad_case_reflection}"
|
173 |
-
# wrapper_original_answer = f"**Original Answer**: {json.dumps(data.pred)}"
|
174 |
-
# wrapper_correct_answer = f"**Correct Answer**: {json.dumps(data.truth)}"
|
175 |
-
# wrapped_instruction = f"**Task**: {data.instruction}"
|
176 |
-
# wrapped_text = f"**Text**: {data.distilled_text}\n{data.chunk_text_list[0]}"
|
177 |
-
# if data.task == "Base":
|
178 |
-
# content = f"{wrapped_instruction}\n\n{wrapped_text}\n\n{wrapper_original_answer}\n\n{wrapped_bad_case_reflection}\n\n{wrapper_correct_answer}"
|
179 |
-
# else:
|
180 |
-
# content = f"{wrapped_text}\n\n{data.constraint}\n\n{wrapper_original_answer}\n\n{wrapped_bad_case_reflection}\n\n{wrapper_correct_answer}"
|
181 |
-
# self.repository.update_case(data.task, embed_index, str_index, content, "bad")
|
182 |
-
|
183 |
-
# def update_case(self, data: DataPoint):
|
184 |
-
# self.update_good_case(data)
|
185 |
-
# self.update_bad_case(data)
|
186 |
-
# self.repository.update_corpus()
|
187 |
-
|
188 |
-
|
189 |
-
|
190 |
import json
|
191 |
import os
|
192 |
import torch
|
@@ -199,87 +10,84 @@ import copy
|
|
199 |
|
200 |
import warnings
|
201 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
202 |
warnings.filterwarnings("ignore", category=FutureWarning, message=r".*clean_up_tokenization_spaces*")
|
203 |
|
204 |
class CaseRepository:
|
205 |
def __init__(self):
|
206 |
-
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
|
|
|
|
211 |
|
212 |
def load_corpus(self):
|
213 |
-
|
214 |
-
|
215 |
-
|
216 |
-
pass
|
217 |
|
218 |
def update_corpus(self):
|
219 |
-
|
220 |
-
|
221 |
-
|
222 |
-
|
223 |
-
|
224 |
-
pass
|
225 |
|
226 |
def embed_corpus(self):
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
231 |
-
|
232 |
-
|
233 |
-
|
234 |
-
|
235 |
-
pass
|
236 |
|
237 |
def get_similarity_scores(self, task: TaskType, embed_index="", str_index="", case_type="", top_k=2):
|
238 |
-
|
239 |
-
#
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
#
|
245 |
-
|
246 |
-
|
247 |
-
|
248 |
-
|
249 |
-
|
250 |
|
251 |
-
#
|
252 |
-
|
253 |
-
|
254 |
-
|
255 |
-
|
256 |
-
|
257 |
-
|
258 |
-
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
#
|
264 |
-
|
265 |
-
|
266 |
|
267 |
-
|
268 |
-
|
269 |
-
|
270 |
-
pass
|
271 |
|
272 |
def query_case(self, task: TaskType, embed_index="", str_index="", case_type="", top_k=2) -> list:
|
273 |
-
|
274 |
-
|
275 |
-
|
276 |
-
pass
|
277 |
|
278 |
def update_case(self, task: TaskType, embed_index="", str_index="", content="" ,case_type=""):
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
pass
|
283 |
|
284 |
class CaseRepositoryHandler:
|
285 |
def __init__(self, llm: BaseEngine):
|
@@ -287,105 +95,96 @@ class CaseRepositoryHandler:
         self.llm = llm
 
     def __get_good_case_analysis(self, instruction="", text="", result="", additional_info=""):
-        …
-        pass
 
     def __get_bad_case_reflection(self, instruction="", text="", original_answer="", correct_answer="", additional_info=""):
-        …
-        pass
 
     def __get_index(self, data: DataPoint, case_type: str):
         # set embed_index
-        …
-        pass
 
     def query_good_case(self, data: DataPoint):
-        …
-        pass
 
     def query_bad_case(self, data: DataPoint):
-        …
-        pass
 
     def update_good_case(self, data: DataPoint):
-        …
-        pass
 
     def update_bad_case(self, data: DataPoint):
-        …
-        pass
 
     def update_case(self, data: DataPoint):
-        …
-        pass
-
 import json
 import os
 import torch
 
 import warnings
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+docker_model_path = "/app/model/all-MiniLM-L6-v2"
 warnings.filterwarnings("ignore", category=FutureWarning, message=r".*clean_up_tokenization_spaces*")
 
 class CaseRepository:
     def __init__(self):
+        try:
+            self.embedder = SentenceTransformer(docker_model_path)
+        except:
+            self.embedder = SentenceTransformer(config['model']['embedding_model'])
+        self.embedder.to(device)
+        self.corpus = self.load_corpus()
+        self.embedded_corpus = self.embed_corpus()
 
     def load_corpus(self):
+        with open(os.path.join(os.path.dirname(__file__), "case_repository.json")) as file:
+            corpus = json.load(file)
+        return corpus
 
     def update_corpus(self):
+        try:
+            with open(os.path.join(os.path.dirname(__file__), "case_repository.json"), "w") as file:
+                json.dump(self.corpus, file, indent=2)
+        except Exception as e:
+            print(f"Error when updating corpus: {e}")
 
     def embed_corpus(self):
+        embedded_corpus = {}
+        for key, content in self.corpus.items():
+            good_index = [item['index']['embed_index'] for item in content['good']]
+            encoded_good_index = self.embedder.encode(good_index, convert_to_tensor=True).to(device)
+            bad_index = [item['index']['embed_index'] for item in content['bad']]
+            encoded_bad_index = self.embedder.encode(bad_index, convert_to_tensor=True).to(device)
+            embedded_corpus[key] = {"good": encoded_good_index, "bad": encoded_bad_index}
+        return embedded_corpus
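(Aside: from load_corpus, embed_corpus, and update_case, the persisted case_repository.json is expected to be a task-keyed dict of "good"/"bad" case lists. A minimal sketch of that layout follows; the task key and all values are illustrative, inferred from the code above rather than copied from the shipped file:)

    {
      "NER": {
        "good": [
          {
            "index": {
              "embed_index": "**Text**: <distilled text>\n<first chunk>",
              "str_index": "<constraint, or **Task**: <instruction> for Base tasks>"
            },
            "content": "<wrapped example later injected into prompts>"
          }
        ],
        "bad": []
      }
    }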
 
     def get_similarity_scores(self, task: TaskType, embed_index="", str_index="", case_type="", top_k=2):
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        # Embedding similarity match
+        encoded_embed_query = self.embedder.encode(embed_index, convert_to_tensor=True).to(device)
+        embedding_similarity_matrix = self.embedder.similarity(encoded_embed_query, self.embedded_corpus[task][case_type])
+        embedding_similarity_scores = embedding_similarity_matrix[0].to(device)
+
+        # String similarity match
+        str_match_corpus = [item['index']['str_index'] for item in self.corpus[task][case_type]]
+        str_similarity_results = process.extract(str_index, str_match_corpus, limit=len(str_match_corpus))
+        scores_dict = {match[0]: match[1] for match in str_similarity_results}
+        scores_in_order = [scores_dict[candidate] for candidate in str_match_corpus]
+        str_similarity_scores = torch.tensor(scores_in_order, dtype=torch.float32).to(device)
+
+        # Normalize scores
+        embedding_score_range = embedding_similarity_scores.max() - embedding_similarity_scores.min()
+        str_score_range = str_similarity_scores.max() - str_similarity_scores.min()
+        if embedding_score_range > 0:
+            embed_norm_scores = (embedding_similarity_scores - embedding_similarity_scores.min()) / embedding_score_range
+        else:
+            embed_norm_scores = embedding_similarity_scores
+        if str_score_range > 0:
+            str_norm_scores = (str_similarity_scores - str_similarity_scores.min()) / str_score_range
+        else:
+            str_norm_scores = str_similarity_scores / 100
+
+        # Combine the scores with weights
+        combined_scores = 0.5 * embed_norm_scores + 0.5 * str_norm_scores
+        original_combined_scores = 0.5 * embedding_similarity_scores + 0.5 * str_similarity_scores / 100
+
+        scores, indices = torch.topk(combined_scores, k=min(top_k, combined_scores.size(0)))
+        original_scores, original_indices = torch.topk(original_combined_scores, k=min(top_k, original_combined_scores.size(0)))
+        return scores, indices, original_scores, original_indices
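(Aside: the core of get_similarity_scores is easy to verify in isolation: min-max normalize the embedding-similarity and fuzzy-string score vectors, average them with equal weight, then take the top-k. A self-contained sketch with invented dummy scores:)

    import torch

    def combine_scores(embed_scores, str_scores, top_k=2):
        # Min-max normalize one score vector; fall back to raw values on zero range.
        def norm(t):
            r = t.max() - t.min()
            return (t - t.min()) / r if r > 0 else t
        # Equal-weight combination, then top-k, mirroring get_similarity_scores.
        combined = 0.5 * norm(embed_scores) + 0.5 * norm(str_scores)
        return torch.topk(combined, k=min(top_k, combined.size(0)))

    # Dummy inputs: cosine similarities in [0, 1], fuzzy-match scores in [0, 100].
    embed = torch.tensor([0.82, 0.41, 0.77])
    fuzzy = torch.tensor([55.0, 90.0, 60.0])
    print(combine_scores(embed, fuzzy))  # values and indices of the two best cases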
 
     def query_case(self, task: TaskType, embed_index="", str_index="", case_type="", top_k=2) -> list:
+        _, indices, _, _ = self.get_similarity_scores(task, embed_index, str_index, case_type, top_k)
+        top_matches = [self.corpus[task][case_type][idx]["content"] for idx in indices]
+        return top_matches
 
     def update_case(self, task: TaskType, embed_index="", str_index="", content="", case_type=""):
+        self.corpus[task][case_type].append({"index": {"embed_index": embed_index, "str_index": str_index}, "content": content})
+        self.embedded_corpus[task][case_type] = torch.cat([self.embedded_corpus[task][case_type], self.embedder.encode([embed_index], convert_to_tensor=True).to(device)], dim=0)
+        print(f"A {case_type} case updated for {task} task.")
 
 class CaseRepositoryHandler:
     def __init__(self, llm: BaseEngine):
         self.llm = llm
 
     def __get_good_case_analysis(self, instruction="", text="", result="", additional_info=""):
+        prompt = good_case_analysis_instruction.format(
+            instruction=instruction, text=text, result=result, additional_info=additional_info
+        )
+        for _ in range(3):
+            response = self.llm.get_chat_response(prompt)
+            response = extract_json_dict(response)
+            if not isinstance(response, dict):
+                return response
+        return None
 
     def __get_bad_case_reflection(self, instruction="", text="", original_answer="", correct_answer="", additional_info=""):
+        prompt = bad_case_reflection_instruction.format(
+            instruction=instruction, text=text, original_answer=original_answer, correct_answer=correct_answer, additional_info=additional_info
+        )
+        for _ in range(3):
+            response = self.llm.get_chat_response(prompt)
+            response = extract_json_dict(response)
+            if not isinstance(response, dict):
+                return response
+        return None
 
     def __get_index(self, data: DataPoint, case_type: str):
         # set embed_index
+        embed_index = f"**Text**: {data.distilled_text}\n{data.chunk_text_list[0]}"
 
+        # set str_index
+        if data.task == "Base":
+            str_index = f"**Task**: {data.instruction}"
+        else:
+            str_index = f"{data.constraint}"
 
+        if case_type == "bad":
+            str_index += f"\n\n**Original Result**: {json.dumps(data.pred)}"
 
+        return embed_index, str_index
 
     def query_good_case(self, data: DataPoint):
+        embed_index, str_index = self.__get_index(data, "good")
+        return self.repository.query_case(task=data.task, embed_index=embed_index, str_index=str_index, case_type="good")
 
     def query_bad_case(self, data: DataPoint):
+        embed_index, str_index = self.__get_index(data, "bad")
+        return self.repository.query_case(task=data.task, embed_index=embed_index, str_index=str_index, case_type="bad")
 
     def update_good_case(self, data: DataPoint):
+        if data.truth == "":
+            print("No truth value provided.")
+            return
+        embed_index, str_index = self.__get_index(data, "good")
+        _, _, original_scores, _ = self.repository.get_similarity_scores(data.task, embed_index, str_index, "good", 1)
+        original_scores = original_scores.tolist()
+        if original_scores[0] >= 0.9:
+            print("The similar good case is already in the corpus. Similarity Score: ", original_scores[0])
+            return
+        good_case_analysis = self.__get_good_case_analysis(instruction=data.instruction, text=data.distilled_text, result=data.truth, additional_info=data.constraint)
+        wrapped_good_case_analysis = f"**Analysis**: {good_case_analysis}"
+        wrapped_instruction = f"**Task**: {data.instruction}"
+        wrapped_text = f"**Text**: {data.distilled_text}\n{data.chunk_text_list[0]}"
+        wrapped_answer = f"**Correct Answer**: {json.dumps(data.truth)}"
+        if data.task == "Base":
+            content = f"{wrapped_instruction}\n\n{wrapped_text}\n\n{wrapped_good_case_analysis}\n\n{wrapped_answer}"
+        else:
+            content = f"{wrapped_text}\n\n{data.constraint}\n\n{wrapped_good_case_analysis}\n\n{wrapped_answer}"
+        self.repository.update_case(data.task, embed_index, str_index, content, "good")
 
     def update_bad_case(self, data: DataPoint):
+        if data.truth == "":
+            print("No truth value provided.")
+            return
+        if normalize_obj(data.pred) == normalize_obj(data.truth):
+            return
+        embed_index, str_index = self.__get_index(data, "bad")
+        _, _, original_scores, _ = self.repository.get_similarity_scores(data.task, embed_index, str_index, "bad", 1)
+        original_scores = original_scores.tolist()
+        if original_scores[0] >= 0.9:
+            print("The similar bad case is already in the corpus. Similarity Score: ", original_scores[0])
+            return
+        bad_case_reflection = self.__get_bad_case_reflection(instruction=data.instruction, text=data.distilled_text, original_answer=data.pred, correct_answer=data.truth, additional_info=data.constraint)
+        wrapped_bad_case_reflection = f"**Reflection**: {bad_case_reflection}"
+        wrapper_original_answer = f"**Original Answer**: {json.dumps(data.pred)}"
+        wrapper_correct_answer = f"**Correct Answer**: {json.dumps(data.truth)}"
+        wrapped_instruction = f"**Task**: {data.instruction}"
+        wrapped_text = f"**Text**: {data.distilled_text}\n{data.chunk_text_list[0]}"
+        if data.task == "Base":
+            content = f"{wrapped_instruction}\n\n{wrapped_text}\n\n{wrapper_original_answer}\n\n{wrapped_bad_case_reflection}\n\n{wrapper_correct_answer}"
+        else:
+            content = f"{wrapped_text}\n\n{data.constraint}\n\n{wrapper_original_answer}\n\n{wrapped_bad_case_reflection}\n\n{wrapper_correct_answer}"
+        self.repository.update_case(data.task, embed_index, str_index, content, "bad")
 
     def update_case(self, data: DataPoint):
+        self.update_good_case(data)
+        self.update_bad_case(data)
+        self.repository.update_corpus()
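(Aside: a hedged sketch of the full handler loop; `model` stands for any BaseEngine implementation and `data` for a DataPoint populated as in src/pipeline.py, so the names here are placeholders rather than a fixed API surface:)

    handler = CaseRepositoryHandler(llm=model)
    good_examples = handler.query_good_case(data)   # similar past successes, usable as prompt examples
    bad_examples = handler.query_bad_case(data)     # similar past failures, usable for reflection
    # ... run extraction, then set data.pred (and data.truth when available) ...
    handler.update_case(data)                       # appends good/bad cases and rewrites case_repository.json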
src/modules/knowledge_base/schema_repository.py
CHANGED
@@ -85,7 +85,7 @@ class NewsReport(BaseModel):
     publication_date: Optional[str] = Field(description="The publication date of the report")
     keywords: Optional[List[str]] = Field(description="List of keywords or topics covered in the news report")
     events: List[Event] = Field(description="Events covered in the news report")
-    quotes: Optional[…
+    quotes: Optional[dict] = Field(default=None, description="Quotes related to the news, with keys as the citation sources and values as the quoted content. ")
     viewpoints: Optional[List[str]] = Field(default=None, description="Different viewpoints regarding the news")
 
 # --------- You can customize new extraction schemas below -------- #
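(Aside: the reworked quotes field maps citation sources to quoted content. A tiny runnable stand-in that mirrors just this field, not the full NewsReport schema; the class name and values are invented:)

    from typing import Optional
    from pydantic import BaseModel, Field

    class QuotedReport(BaseModel):
        quotes: Optional[dict] = Field(default=None, description="Citation source -> quoted content")

    r = QuotedReport(quotes={"Jane Doe, CEO": "We expect the rollout to finish this quarter."})
    print(r.quotes)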
src/modules/schema_agent.py
CHANGED
@@ -48,9 +48,6 @@ class SchemaAnalyzer:
     def get_deduced_schema_code(self, instruction: str, text: str, distilled_text: str):
         prompt = deduced_schema_code_instruction.format(examples=example_wrapper(code_schema_examples), instruction=instruction, distilled_text=distilled_text, text=text)
         response = self.llm.get_chat_response(prompt)
-        print(f"schema prompt: {prompt}")
-        print("========================================")
-        print(f"schema response: {response}")
         code_blocks = re.findall(r'```[^\n]*\n(.*?)\n```', response, re.DOTALL)
         if code_blocks:
             try:
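(Aside: with the debug prints removed, the method depends only on the fenced-code regex that remains. A quick standalone check of that exact pattern against an invented response:)

    import re

    response = "Here is the schema:\n```python\nclass Person(BaseModel):\n    name: str\n```\nDone."
    code_blocks = re.findall(r'```[^\n]*\n(.*?)\n```', response, re.DOTALL)
    print(code_blocks[0])  # -> class Person(BaseModel):\n    name: str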
src/pipeline.py
CHANGED
@@ -3,6 +3,7 @@ from models import *
 from utils import *
 from modules import *
 
+
 class Pipeline:
     def __init__(self, llm: BaseEngine):
         self.llm = llm
@@ -11,17 +12,26 @@ class Pipeline:
         self.extraction_agent = ExtractionAgent(llm = llm, case_repo = self.case_repo)
         self.reflection_agent = ReflectionAgent(llm = llm, case_repo = self.case_repo)
 
-    def …
+    def __check_consistancy(self, llm, task, mode, update_case):
+        if llm.name == "OneKE":
+            if task == "Base":
+                raise ValueError("The finetuned OneKE only supports quick extraction mode for NER, RE and EE Task.")
+            else:
+                mode = "quick"
+                update_case = False
+                print("The fine-tuned OneKE defaults to quick extraction mode without case update.")
+            return mode, update_case
+        return mode, update_case
+
+    def __init_method(self, data: DataPoint, process_method2):
         default_order = ["schema_agent", "extraction_agent", "reflection_agent"]
-        if "schema_agent" not in …
-        if data.task == "Base":
-            process_method["schema_agent"] = "get_deduced_schema"
+        if "schema_agent" not in process_method2:
+            process_method2["schema_agent"] = "get_default_schema"
         if data.task != "Base":
+            process_method2["schema_agent"] = "get_retrieved_schema"
-        if "extraction_agent" not in …
+        if "extraction_agent" not in process_method2:
+            process_method2["extraction_agent"] = "extract_information_direct"
-        sorted_process_method = {key: …
+        sorted_process_method = {key: process_method2[key] for key in default_order if key in process_method2}
         return sorted_process_method
 
     def __init_data(self, data: DataPoint):
@@ -36,8 +46,6 @@ class Pipeline:
             data.output_schema = "EventList"
         return data
 
-
-
     # main entry
     def get_extract_result(self,
                            task: TaskType,
@@ -49,23 +57,29 @@ class Pipeline:
                            file_path: str = "",
                            truth: str = "",
                            mode: str = "quick",
-                           update_case: bool = False
-                           …
+                           update_case: bool = False,
+                           show_trajectory: bool = False
+                           ):
         print(f" task: {task},\n instruction: {instruction},\n text: {text},\n output_schema: {output_schema},\n constraint: {constraint},\n use_file: {use_file},\n file_path: {file_path},\n truth: {truth},\n mode: {mode},\n update_case: {update_case}")
+
+        # Check Consistency
+        mode, update_case = self.__check_consistancy(self.llm, task, mode, update_case)
+
+        # Load Data
         data = DataPoint(task=task, instruction=instruction, text=text, output_schema=output_schema, constraint=constraint, use_file=use_file, file_path=file_path, truth=truth)
         data = self.__init_data(data)
         if mode in config['agent']['mode'].keys():
-            process_method = config['agent']['mode'][mode]
+            process_method = config['agent']['mode'][mode].copy()
         else:
             process_method = mode
-        print(f"data=================: {data.task}")
-        print(f"process_method=================: {process_method}")
         sorted_process_method = self.__init_method(data, process_method)
+        print("Process Method: ", sorted_process_method)
+
+        print_schema = False #
+        frontend_schema = "" #
+        frontend_res = "" #
+
         # Information Extract
-        print(f"sorted_process_method=================: {sorted_process_method}")
         for agent_name, method_name in sorted_process_method.items():
             agent = getattr(self, agent_name, None)
             if not agent:
@@ -74,17 +88,23 @@ class Pipeline:
             if not method:
                 raise AttributeError(f"Method '{method_name}' not found in {agent_name}.")
             data = method(data)
-            if not print_schema and data.print_schema:
+            if not print_schema and data.print_schema: #
                 print("Schema: \n", data.print_schema)
                 frontend_schema = data.print_schema
                 print_schema = True
         data = self.extraction_agent.summarize_answer(data)
+
+        # show result
+        if show_trajectory:
+            print("Extraction Trajectory: \n", json.dumps(data.get_result_trajectory(), indent=2))
         print("Extraction Result: \n", json.dumps(data.pred, indent=2))
-
+
+        frontend_res = data.pred #
+
         # Case Update
         if update_case:
             if (data.truth == ""):
-                truth = input("Please enter the correct answer you prefer, or press Enter to accept the current answer: ")
+                truth = input("Please enter the correct answer you prefer, or just press Enter to accept the current answer: ")
                 if truth.strip() == "":
                     data.truth = data.pred
                 else:
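(Aside: a sketch of driving the expanded entry point directly; it mirrors the keyword arguments and two-value unpacking used in src/run.py below, while `model`, the text, and all other values are placeholders:)

    pipeline = Pipeline(model)  # model: any engine accepted by Pipeline
    result, trajectory = pipeline.get_extract_result(
        task="NER",
        instruction="",
        text="Finally, every other year, ELRA organizes a major conference LREC.",
        output_schema="",
        constraint="",
        use_file=False,
        file_path="",
        truth="",
        mode="quick",
        update_case=False,
        show_trajectory=True,  # newly added switch: also print the extraction trajectory
    )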
src/run.py
CHANGED
@@ -8,81 +8,35 @@ from models import *
 from utils import *
 from modules import *
 
-def load_extraction_config(yaml_path):
-    # Read YAML content from the file path
-    if not os.path.exists(yaml_path):
-        print(f"Error: The config file '{yaml_path}' does not exist.")
-        return {}
-
-    with open(yaml_path, 'r') as file:
-        config = yaml.safe_load(file)
-
-    # Extract the 'extraction' configuration dictionary
-    model_config = config.get('model', {})
-    extraction_config = config.get('extraction', {})
-    # model config
-    model_name_or_path = model_config.get('model_name_or_path', "")
-    model_category = model_config.get('category', "")
-    api_key = model_config.get('api_key', "")
-    base_url = model_config.get('base_url', "")
-
-    # extraction config
-    task = extraction_config.get('task', "")
-    instruction = extraction_config.get('instruction', "")
-    text = extraction_config.get('text', "")
-    output_schema = extraction_config.get('output_schema', "")
-    constraint = extraction_config.get('constraint', "")
-    truth = extraction_config.get('truth', "")
-    use_file = extraction_config.get('use_file', False)
-    mode = extraction_config.get('mode', "quick")
-    update_case = extraction_config.get('update_case', False)
-
-    # Return a dictionary containing these variables
-    return {
-        "model": {
-            "model_name_or_path": model_name_or_path,
-            "category": model_category,
-            "api_key": api_key,
-            "base_url": base_url
-        },
-        "extraction": {
-            "task": task,
-            "instruction": instruction,
-            "text": text,
-            "output_schema": output_schema,
-            "constraint": constraint,
-            "truth": truth,
-            "use_file": use_file,
-            "mode": mode,
-            "update_case": update_case
-        }
-    }
-
-
 def main():
-    # …
-    parser = argparse.ArgumentParser(description='Run the extraction …
-    parser.add_argument('--config', type=str, required=True,
+    # Create command-line argument parser
+    parser = argparse.ArgumentParser(description='Run the extraction framework.')
+    parser.add_argument('--config', type=str, required=True,
                         help='Path to the YAML configuration file.')
 
-    # …
+    # Parse command-line arguments
     args = parser.parse_args()
 
-    # …
+    # Load configuration
     config = load_extraction_config(args.config)
+    # Model config
     model_config = config['model']
+    if model_config['vllm_serve'] == True:
+        model = LocalServer(model_config['model_name_or_path'])
-    …
-    if clazz is None:
-        print(f"Error: The model category '{model_config['category']}' is not supported.")
-        return
-    if model_config['api_key'] == "":
-        model = clazz(model_config['model_name_or_path'])
     else:
-        …
+        clazz = getattr(models, model_config['category'], None)
+        if clazz is None:
+            print(f"Error: The model category '{model_config['category']}' is not supported.")
+            return
+        if model_config['api_key'] == "":
+            model = clazz(model_config['model_name_or_path'])
+        else:
+            model = clazz(model_config['model_name_or_path'], model_config['api_key'], model_config['base_url'])
     pipeline = Pipeline(model)
-    …
+    # Extraction config
+    extraction_config = config['extraction']
+    result, trajectory = pipeline.get_extract_result(task=extraction_config['task'], instruction=extraction_config['instruction'], text=extraction_config['text'], output_schema=extraction_config['output_schema'], constraint=extraction_config['constraint'], use_file=extraction_config['use_file'], file_path=extraction_config['file_path'], truth=extraction_config['truth'], mode=extraction_config['mode'], update_case=extraction_config['update_case'], show_trajectory=extraction_config['show_trajectory'])
+    return
 
 if __name__ == "__main__":
     main()
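(Aside: an illustrative YAML configuration matching the keys read by load_extraction_config in src/utils/process.py below; every value is a placeholder:)

    model:
      model_name_or_path: gpt-4o-mini   # placeholder
      category: ChatGPT                 # class name looked up in `models`; placeholder
      api_key: ""                       # empty -> model constructed without credentials
      base_url: ""
      vllm_serve: false                 # true -> model served via LocalServer

    extraction:
      task: NER
      instruction: ""
      text: "Example sentence to extract from."
      output_schema: ""
      constraint: ""
      truth: ""
      use_file: false
      file_path: ""
      mode: quick
      update_case: false
      show_trajectory: false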
src/utils/__pycache__/__init__.cpython-311.pyc
DELETED
Binary file (274 Bytes)

src/utils/__pycache__/__init__.cpython-39.pyc
CHANGED
Binary files a/src/utils/__pycache__/__init__.cpython-39.pyc and b/src/utils/__pycache__/__init__.cpython-39.pyc differ

src/utils/__pycache__/data_def.cpython-311.pyc
DELETED
Binary file (3.07 kB)

src/utils/__pycache__/data_def.cpython-39.pyc
CHANGED
Binary files a/src/utils/__pycache__/data_def.cpython-39.pyc and b/src/utils/__pycache__/data_def.cpython-39.pyc differ

src/utils/__pycache__/process.cpython-311.pyc
DELETED
Binary file (10.7 kB)

src/utils/__pycache__/process.cpython-39.pyc
CHANGED
Binary files a/src/utils/__pycache__/process.cpython-39.pyc and b/src/utils/__pycache__/process.cpython-39.pyc differ
src/utils/data_def.py
CHANGED
@@ -3,7 +3,6 @@ from models import *
 from .process import *
 # predefined processing logic for routine extraction tasks
 TaskType = Literal["NER", "RE", "EE", "Base"]
-ModelType = Literal["gpt-3.5-turbo", "gpt-4o"]
 
 class DataPoint:
     def __init__(self,
src/utils/process.py
CHANGED
@@ -17,7 +17,65 @@ import inspect
 import ast
 with open(os.path.join(os.path.dirname(__file__), "..", "config.yaml")) as file:
     config = yaml.safe_load(file)
+
+# Load configuration
+def load_extraction_config(yaml_path):
+    # Read YAML content from the file path
+    if not os.path.exists(yaml_path):
+        print(f"Error: The config file '{yaml_path}' does not exist.")
+        return {}
+
+    with open(yaml_path, 'r') as file:
+        config = yaml.safe_load(file)
+
+    # Extract the 'extraction' configuration dictionary
+    model_config = config.get('model', {})
+    extraction_config = config.get('extraction', {})
+
+    # Model config
+    model_name_or_path = model_config.get('model_name_or_path', "")
+    model_category = model_config.get('category', "")
+    api_key = model_config.get('api_key', "")
+    base_url = model_config.get('base_url', "")
+    vllm_serve = model_config.get('vllm_serve', False)
+
+    # Extraction config
+    task = extraction_config.get('task', "")
+    instruction = extraction_config.get('instruction', "")
+    text = extraction_config.get('text', "")
+    output_schema = extraction_config.get('output_schema', "")
+    constraint = extraction_config.get('constraint', "")
+    truth = extraction_config.get('truth', "")
+    use_file = extraction_config.get('use_file', False)
+    file_path = extraction_config.get('file_path', "")
+    mode = extraction_config.get('mode', "quick")
+    update_case = extraction_config.get('update_case', False)
+    show_trajectory = extraction_config.get('show_trajectory', False)
+
+    # Return a dictionary containing these variables
+    return {
+        "model": {
+            "model_name_or_path": model_name_or_path,
+            "category": model_category,
+            "api_key": api_key,
+            "base_url": base_url,
+            "vllm_serve": vllm_serve
+        },
+        "extraction": {
+            "task": task,
+            "instruction": instruction,
+            "text": text,
+            "output_schema": output_schema,
+            "constraint": constraint,
+            "truth": truth,
+            "use_file": use_file,
+            "file_path": file_path,
+            "mode": mode,
+            "update_case": update_case,
+            "show_trajectory": show_trajectory
+        }
+    }
+
 # Split the string text into chunks
 def chunk_str(text):
     sentences = sent_tokenize(text)
@@ -165,7 +223,6 @@ def normalize_obj(value):
     if isinstance(value, dict):
         return frozenset((k, normalize_obj(v)) for k, v in value.items())
     elif isinstance(value, (list, set, tuple)):
-        # Convert Counter to a tuple so it can be hashed
         return tuple(Counter(map(normalize_obj, value)).items())
     elif isinstance(value, str):
         return format_string(value)
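(Aside: the retained Counter branch compares sequences by element multiplicity, while the dict branch is order-insensitive via frozenset. A simplified, runnable stand-in; format_string is approximated by strip/lower here:)

    from collections import Counter

    def normalize(value):
        # Simplified version of normalize_obj: dicts -> frozenset, sequences -> Counter items, strings lowered.
        if isinstance(value, dict):
            return frozenset((k, normalize(v)) for k, v in value.items())
        if isinstance(value, (list, set, tuple)):
            return tuple(Counter(map(normalize, value)).items())
        if isinstance(value, str):
            return value.strip().lower()
        return value

    print(normalize({"b": 1, "a": 2}) == normalize({"a": 2, "b": 1}))  # True: dict key order ignored
    print(normalize(["a", "a"]) == normalize(["a"]))                   # False: multiplicity preserved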
src/{main.py → webui.py}
RENAMED
@@ -147,6 +147,7 @@ def create_interface():
                 use_file=use_file,
                 file_path=file_path,
                 text=text,
+                show_trajectory=False,
             )
 
             ger_frontend_schema = str(ger_frontend_schema)
@@ -159,8 +160,6 @@ def create_interface():
 
         def clear_all():
             return (
-                gr.update(value=""),  # model
-                gr.update(value=""),  # API Key
                 gr.update(value=""),  # task
                 gr.update(value="", visible=False),  # instruction
                 gr.update(value="", visible=False),  # constraint
@@ -223,9 +222,6 @@ def create_interface():
         clear_button.click(
             fn=clear_all,
             outputs=[
-                model_gr,
-                api_key_gr,
-                base_url_gr,
                 task_gr,
                 instruction_gr,
                 constraint_gr,
src/webui/__init__.py
DELETED
@@ -1 +0,0 @@
-from .interface import InterFace

src/webui/__pycache__/__init__.cpython-39.pyc
DELETED
Binary file (197 Bytes)