|
import gradio as gr |
|
import os |
|
import json |
|
import shutil |
|
import html |
|
from datetime import datetime |
|
from retriever import retriever, reload_retriever |
|
from generator import answer_query |
|
from langchain_community.document_loaders import ( |
|
PyPDFLoader, TextLoader, CSVLoader, UnstructuredWordDocumentLoader |
|
) |
|
from langchain.text_splitter import RecursiveCharacterTextSplitter |
|
from langchain_huggingface import HuggingFaceEmbeddings |
|
from langchain_community.vectorstores import FAISS |
|
|
|
|
|
# Path of the custom stylesheet used when the Gradio UI is built.
CUSTOM_CSS_PATH = "gradio_theme.css"

# JSON file in which the list of processed documents is persisted.
UPLOADED_FILES_JSON = "uploaded_files.json"
# In-memory list of processed documents; each entry is a dict with the keys
# "name" and "timestamp". It is loaded from / saved to UPLOADED_FILES_JSON.
uploaded_files = []
|
|
|
def save_uploaded_files_to_json():
    """Write the in-memory ``uploaded_files`` list to ``UPLOADED_FILES_JSON``.

    The file is overwritten on every call. Non-ASCII characters are stored
    verbatim (``ensure_ascii=False``) and the output is indented by two spaces.
    """
    with open(UPLOADED_FILES_JSON, mode="w", encoding="utf-8") as handle:
        handle.write(json.dumps(uploaded_files, ensure_ascii=False, indent=2))
|
|
|
def load_uploaded_files_from_json():
    """Reload the module-level ``uploaded_files`` list from ``UPLOADED_FILES_JSON``.

    The list is reset to empty when the file does not exist, cannot be decoded
    as UTF-8 JSON, or does not contain a list. Entries that are not dicts with
    string ``"name"`` and ``"timestamp"`` values are dropped, because the
    listing code indexes and slices those two fields directly.

    Other I/O errors (for example missing read permission) still propagate.
    """
    global uploaded_files

    try:
        with open(UPLOADED_FILES_JSON, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # First run: nothing has been persisted yet.
        data = []
    except (json.JSONDecodeError, UnicodeDecodeError):
        # A corrupt state file must not stop the application from starting;
        # this function is called at import time.
        data = []

    if not isinstance(data, list):
        data = []

    uploaded_files = [
        entry
        for entry in data
        if isinstance(entry, dict)
        and isinstance(entry.get("name"), str)
        and isinstance(entry.get("timestamp"), str)
    ]
|
|
|
def update_uploaded_files():
    """Render the processed-document list as a Markdown string.

    Returns a placeholder sentence when ``uploaded_files`` is empty; otherwise
    a heading followed by one bullet per entry showing its name and the first
    19 characters of its timestamp.
    """
    if uploaded_files:
        bullet_lines = []
        for entry in uploaded_files:
            # 19 characters of an ISO timestamp cover "YYYY-MM-DDTHH:MM:SS".
            uploaded_at = entry["timestamp"][:19]
            bullet_lines.append(f"- {entry['name']} (Uploaded: {uploaded_at})")
        return "### 📚 Danh sách tài liệu đã xử lý:\n" + "\n".join(bullet_lines)

    return "_Chưa có tài liệu nào được tải lên._"
|
|
|
|
|
# Populate `uploaded_files` from disk at import time, before the UI below
# renders the initial document list.
load_uploaded_files_from_json()
|
|
|
def _select_loader(file_path):
    """Return a document loader for *file_path*, or ``None`` if the extension is unsupported."""
    # Compare case-insensitively so that e.g. "REPORT.PDF" is accepted as well.
    lowered = file_path.lower()
    if lowered.endswith(".pdf"):
        return PyPDFLoader(file_path)
    if lowered.endswith(".csv"):
        return CSVLoader(file_path)
    if lowered.endswith(".txt"):
        return TextLoader(file_path, autodetect_encoding=True)
    if lowered.endswith((".docx", ".doc")):
        return UnstructuredWordDocumentLoader(file_path)
    return None


def process_document(file):
    """Index an uploaded document and register it in the processed-file list.

    The document is loaded, split into chunks of 1000 characters with an
    overlap of 100, embedded, and stored as a FAISS index in the local
    ``vectorstore`` directory. Afterwards ``reload_retriever()`` is called
    (presumably so the retriever picks up the new index -- confirm in
    ``retriever``) and the file is appended to ``uploaded_files`` and persisted.

    Args:
        file: The value delivered by the ``gr.File`` input: an object whose
            ``name`` attribute is the path of the uploaded file. A plain path
            string is accepted too, and ``None`` (nothing selected) is handled.

    Returns:
        A ``(status_message, file_list_markdown)`` tuple for the status textbox
        and the document list.

    NOTE(review): the index is rebuilt from this single document, replacing
    whatever was indexed before, while ``uploaded_files`` keeps accumulating
    entries. The displayed list can therefore name documents that are no
    longer searchable -- confirm whether that is intended.
    """
    if file is None:
        return "Chưa chọn tệp nào để tải lên.", update_uploaded_files()

    # Use the object's path when it has one, otherwise treat the value itself
    # as the path.
    file_path = getattr(file, "name", file)
    display_name = os.path.basename(file_path)

    try:
        loader = _select_loader(file_path)
        if loader is None:
            return "Định dạng file không hỗ trợ.", update_uploaded_files()
        documents = loader.load()
    except Exception as e:
        # Loader errors are reported in the UI instead of crashing the handler.
        return f"Lỗi khi tải tài liệu: {e}", update_uploaded_files()

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    docs = splitter.split_documents(documents)

    if not docs:
        return "Không trích xuất được nội dung từ tài liệu.", update_uploaded_files()

    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    db = FAISS.from_documents(docs, embeddings)

    # Remove the previous index only once the new one has been built, so an
    # unsupported, unreadable or empty upload leaves the working index intact.
    if os.path.exists("vectorstore"):
        shutil.rmtree("vectorstore")
    db.save_local("vectorstore")
    reload_retriever()

    uploaded_files.append({"name": display_name, "timestamp": datetime.now().isoformat()})
    save_uploaded_files_to_json()

    # Report the bare file name rather than the server-side temporary path.
    return f"Đã xử lý {len(docs)} đoạn từ **{display_name}**", update_uploaded_files()
|
|
|
def delete_file(filename):
    """Remove every entry named *filename* from the processed-document list.

    Args:
        filename: Name typed by the user; surrounding whitespace is ignored.
            ``None`` is treated like an empty string.

    Returns:
        The refreshed Markdown listing of processed documents.

    NOTE(review): only the bookkeeping list is changed here; the ``vectorstore``
    directory is not touched, so the document's content may remain searchable.
    """
    global uploaded_files

    target = (filename or "").strip()
    remaining = [entry for entry in uploaded_files if entry["name"] != target]

    # Rewrite the JSON file only when something was actually removed.
    if len(remaining) != len(uploaded_files):
        uploaded_files = remaining
        save_uploaded_files_to_json()

    return update_uploaded_files()
|
|
|
def clear_inputs():
    """Return two empty strings, one for each component the clear button resets."""
    blank = ""
    return blank, blank
|
|
|
def query_function(question, temperature, include_sources):
    """Answer *question* and optionally append the list of sources used.

    Args:
        question: The user's question. Empty, whitespace-only or ``None``
            input is answered with a prompt to enter a question, without
            calling the backend.
        temperature: Value forwarded to ``answer_query`` as ``temperature``.
        include_sources: When true and documents were returned, a
            de-duplicated, sorted bullet list of sources is appended.

    Returns:
        A Markdown string: the HTML-escaped answer, followed by the source
        list if requested. A source is labelled by its ``section`` metadata
        when present, otherwise by the base name of its ``source`` metadata
        (``"Unknown"`` when missing).
    """
    if not question or not question.strip():
        return "Vui lòng nhập câu hỏi."

    # NOTE(review): this is passed as `model=`; what answer_query does with it
    # is not visible here -- confirm it expects this identifier.
    fixed_model = "sentence-transformers/all-MiniLM-L6-v2"
    answer, docs = answer_query(question, model=fixed_model, temperature=temperature)
    answer = html.escape(answer)

    if not (include_sources and docs):
        return answer

    unique_sources = set()
    for doc in docs:
        metadata = doc.metadata
        # str() guards against non-string metadata values; a blank section
        # falls back to the file name instead of producing an empty bullet.
        label = str(metadata.get("section") or "").strip()
        if not label:
            label = os.path.basename(str(metadata.get("source") or "Unknown")).strip()
        if label:
            unique_sources.add(label)

    if unique_sources:
        # Labels come from document metadata, so escape them like the answer.
        sources_list = [f"- {html.escape(src)}" for src in sorted(unique_sources)]
        answer += "\n\n**Nguồn tham khảo:**\n" + "\n".join(sources_list)

    return answer
|
|
|
|
|
def _read_custom_css(path):
    """Return the text of the stylesheet at *path*, or ``None`` if it cannot be read."""
    try:
        with open(path, "r", encoding="utf-8") as css_file:
            return css_file.read()
    except OSError:
        return None


# Build the two-tab UI: one tab for searching, one for managing documents.
# The stylesheet's contents are passed instead of its path, so the theme is
# applied whether or not the installed Gradio treats `css` as a file path.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from the order of the calls -- confirm it against the intended layout.
with gr.Blocks(css=_read_custom_css(CUSTOM_CSS_PATH)) as demo:
    with gr.Row():
        with gr.Column(scale=5):
            gr.Markdown(
                "## 🔍 RAGFlow Enterprise Search\nTìm kiếm thông minh từ tài liệu nội bộ",
                elem_classes="container-box",
            )

    with gr.Tabs():

        # --- Search tab -----------------------------------------------------
        with gr.TabItem("🔍 Tìm kiếm"):
            with gr.Column(elem_classes="container-box"):
                question = gr.Textbox(lines=3, label="Câu hỏi")
                with gr.Row():
                    temperature = gr.Slider(0, 1, value=0.2, step=0.1, label="Temperature")
                    include_sources = gr.Checkbox(label="Hiển thị nguồn", value=True)
                with gr.Row():
                    search_btn = gr.Button("🔍 Tìm kiếm", variant="primary", elem_classes="button-primary")
                    clear_btn = gr.Button("🗑️ Xóa", variant="secondary", elem_classes="button-secondary")
                output = gr.Markdown(elem_classes="output-box")

            # The answer (plus optional sources) is rendered as Markdown.
            search_btn.click(
                query_function,
                inputs=[question, temperature, include_sources],
                outputs=[output],
            )
            # Resets both the question box and the rendered answer.
            clear_btn.click(
                clear_inputs,
                outputs=[question, output],
            )

        # --- Document management tab ----------------------------------------
        with gr.TabItem("📚 Quản lý tài liệu"):
            with gr.Column(elem_classes="container-box"):
                upload_file = gr.File(
                    label="Tải lên tài liệu",
                    file_types=[".pdf", ".docx", ".doc", ".csv", ".txt"],
                )
                upload_btn = gr.Button("📄 Tải lên và xử lý", variant="primary")
                upload_status = gr.Textbox(label="Trạng thái", lines=3, interactive=False)
                uploaded_files_list = gr.Markdown(value=update_uploaded_files(), elem_classes="scroll-box")
            with gr.Column(elem_classes="container-box"):
                delete_filename = gr.Textbox(label="Tên file muốn xóa")
                delete_btn = gr.Button("🗑️ Xóa tài liệu", variant="secondary")

            # Returns a status message and the refreshed document list.
            upload_btn.click(
                process_document,
                inputs=[upload_file],
                outputs=[upload_status, uploaded_files_list],
            )
            delete_btn.click(
                delete_file,
                inputs=[delete_filename],
                outputs=[uploaded_files_list],
            )

# NOTE(review): share=True requests a publicly reachable share link. The UI
# describes itself as a search over internal documents -- confirm that public
# exposure is intended before deploying.
demo.launch(share=True)
|
|