# vectordb_relank_law.py
#
# Loads a persistent Chroma collection of Korean legal documents, embeds
# queries with a selectable SentenceTransformer model, and exposes
# `search_documents`, which retrieves top-k candidates and re-orders them
# with a cross-encoder style reranker.

import faiss
import numpy as np
import os

from chromadb import PersistentClient
from chromadb.utils import embedding_functions
from sentence_transformers import SentenceTransformer

from retriever.reranker import rerank_documents

# chroma vector config v2
# Candidate Korean embedding models; index 0 is the active choice.
embedding_models = [
    "upskyy/bge-m3-korean",
    "jhgan/ko-sbert-sts",
    "BM-K/KoSimCSE-roberta",
    "BM-K/KoSimCSE-v2-multitask",
    "snunlp/KR-SBERT-V40K-klueNLI-augSTS",
    "beomi/KcELECTRA-small-v2022",
]

# law_db config v2
CHROMA_PATH = os.path.abspath("data/index/law_db")
COLLECTION_NAME = "law_all"
EMBEDDING_MODEL_NAME = embedding_models[0]  # model selected for use

# 1. Load the embedding model (v2)
# embedding_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embedding_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# 2. Configure the embedding function used by Chroma
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_MODEL_NAME
)

# 3. Load the Chroma client and collection
client = PersistentClient(path=CHROMA_PATH)
collection = client.get_collection(name=COLLECTION_NAME, embedding_function=embedding_fn)


# 4. Search function
def search_documents(query: str, top_k: int = 5):
    """Retrieve top-k documents for *query* from the collection, rerank them,
    print each hit, and return the reranked results.

    Args:
        query: Natural-language search query.
        top_k: Number of results to retrieve and rerank.

    Returns:
        List of (document, metadata, distance) tuples in reranked order.
        Distance is the original vector-search distance (smaller = closer);
        the printed score is ``1 - distance``.
    """
    print(f"\n🔍 검색어: '{query}'")

    results = collection.query(
        query_texts=[query],
        n_results=top_k,
        include=["documents", "metadatas", "distances"],
    )

    # Chroma returns one inner list per query text; we sent exactly one.
    docs = results["documents"][0]
    metadatas = results["metadatas"][0]
    distances = results["distances"][0]

    if not docs:
        # Nothing retrieved — avoid reranking an empty candidate set.
        return []

    # Rerank the candidate documents.
    reranked_docs = rerank_documents(query, docs, top_k=top_k)

    # Re-align metadata/distance with the reranked order.
    # BUGFIX: the previous `docs.index(doc)` always found the FIRST occurrence,
    # so duplicate document texts were paired with the wrong (and repeated)
    # metadata/distance. Map each text to its queue of original indices and
    # consume them in order instead.
    positions = {}
    for idx, doc in enumerate(docs):
        positions.setdefault(doc, []).append(idx)

    reranked_data = []
    for doc in reranked_docs:
        idx = positions[doc].pop(0)
        reranked_data.append((doc, metadatas[idx], distances[idx]))

    for i, (doc, meta, dist) in enumerate(reranked_data):
        print(f"\n📄 결과 {i+1} (유사도: {1 - dist:.2f})")
        print(f"문서: {doc[:150]}...")
        print("메타데이터:")
        print(meta)

    return reranked_data  # returned for callers that need the results