# traning/powersearchtraning.py
import os
import torch
import pandas as pd
import logging
import re
import faiss
import numpy as np
import time
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from huggingface_hub import login
from sentence_transformers import SentenceTransformer
from joblib import Parallel, delayed
from tqdm import tqdm
# Pydantic model definition (API request body)
class RecommendRequest(BaseModel):
    search_query: str
    top_k: int = 10
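# Illustrative request payload for the /api/recommend endpoint defined below
# (field values are hypothetical examples, not part of the original script):
#
#   {"search_query": "나이키 운동화", "top_k": 5}
#
# FastAPI validates the body against RecommendRequest, so a missing
# search_query or a non-integer top_k is rejected with a 422 response.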
# 🔹 Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info(f"✅ NumPy version: {np.__version__}")
logger.info(f"✅ FAISS version: {faiss.__version__}")
# 🔹 Create the FastAPI instance
app = FastAPI(title="🚀 Korean LLAMA 3.2 Recommendation System API", version="1.3")
# ✅ Model info
MODEL_NAME = "Bllossom/llama-3.2-Korean-Bllossom-3B"
HF_API_TOKEN = os.getenv("HF_API_TOKEN")
# ✅ Hugging Face login
if HF_API_TOKEN:
    logger.info("🔑 Logging in with the Hugging Face API token...")
    login(token=HF_API_TOKEN)
else:
    logger.warning("⚠️ The 'HF_API_TOKEN' environment variable is not set!")
# ✅ Check whether a GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"🚀 Running on device: {device.upper()}")
# ✅ Load the model and tokenizer
logger.info(f"🔄 Loading model {MODEL_NAME}...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_API_TOKEN)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        token=HF_API_TOKEN,
        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        device_map="auto" if device == "cuda" else None
    )
    logger.info("✅ Korean LLAMA 3.2 model loaded!")
except Exception as e:
    logger.error(f"❌ Error while loading the model: {e}")
    model = None
# ✅ Dataset loading helper
def load_huggingface_jsonl(dataset_name, split="train"):
    if HF_API_TOKEN:
        login(token=HF_API_TOKEN)
    try:
        repo_id = f"aikobay/{dataset_name}"
        dataset = load_dataset(repo_id, split=split)
        df = dataset.to_pandas().dropna()  # ✅ Drop rows with NaN values
        return df
    except Exception as e:
        logger.error(f"❌ Error while loading the dataset: {e}")
        return pd.DataFrame()
# ✅ Load the active auction item data
try:
    active_sale_items = load_huggingface_jsonl("initial_saleitem_dataset")
    logger.info(f"✅ Active auction item data loaded! {len(active_sale_items)} items in total")
except Exception as e:
    logger.error(f"❌ Error while loading item data: {e}")
    active_sale_items = pd.DataFrame()
# ✅ Embedding model used to build the FAISS vectors
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# Module-level defaults so search_faiss can run its initialization check even
# if load_faiss_index()/rebuild_faiss_index() has not been called yet
faiss_index = None
indexed_items = None
# ✅ Multi-core vectorization helper
def encode_texts_parallel(texts, batch_size=512):
    """Speed up vectorization by encoding batches in parallel processes."""
    num_cores = os.cpu_count()  # number of available CPU cores
    logger.info(f"🔄 Running multi-core vectorization (cores: {num_cores})")
    def encode_batch(batch):
        return embedding_model.encode(batch, convert_to_numpy=True)
    # Split the texts into batches and encode them in parallel
    text_batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
    embeddings = Parallel(n_jobs=num_cores)(delayed(encode_batch)(batch) for batch in text_batches)
    return np.vstack(embeddings).astype("float32")
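# Usage sketch (illustrative, not part of the original script; the item names
# below are made-up placeholders). paraphrase-multilingual-MiniLM-L12-v2
# produces 384-dimensional sentence embeddings, so the result should be an
# (N, 384) float32 array that FAISS can index directly.
#
#   sample_names = ["무선 이어폰", "캠핑 의자", "nike running shoes"]
#   sample_vectors = encode_texts_parallel(sample_names, batch_size=2)
#   assert sample_vectors.shape == (3, 384)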
"""
def encode_texts_parallel(texts, batch_size=128):
# μ½”μ–΄ 수 μ œν•œ (GPU λ©”λͺ¨λ¦¬ λΆ€μ‘± λ°©μ§€)
num_cores = min(2, os.cpu_count())
logger.info(f"πŸ”„ λ©€ν‹°μ½”μ–΄ 벑터화 μ§„ν–‰ (μ½”μ–΄ 수: {num_cores}, 배치 크기: {batch_size})")
def encode_batch(batch):
try:
# 각 배치 처리 μ‹œμž‘/μ’…λ£Œ λ‘œκΉ… μΆ”κ°€
batch_id = hash(str(batch[0]))[:6] if batch else "empty"
logger.debug(f"배치 {batch_id} 처리 μ‹œμž‘ (크기: {len(batch)})")
result = embedding_model.encode(batch, convert_to_numpy=True)
logger.debug(f"배치 {batch_id} 처리 μ™„λ£Œ")
return result
except Exception as e:
logger.error(f"배치 인코딩 쀑 였λ₯˜ λ°œμƒ: {e}")
# 였λ₯˜ λ°œμƒ μ‹œ 빈 λ°°μ—΄ λŒ€μ‹  더미 λ°μ΄ν„°λ‘œ μ±„μ›Œ 전체 μ‹€νŒ¨ λ°©μ§€
return np.zeros((len(batch), 384), dtype=np.float32) # SentenceTransformer λͺ¨λΈμ˜ 좜λ ₯ 차원에 맞게 μ‘°μ •
# 전체 데이터 크기 확인
total_batches = (len(texts) + batch_size - 1) // batch_size
logger.info(f"총 {total_batches}개 배치둜 λ‚˜λˆ„μ–΄ μ²˜λ¦¬ν•©λ‹ˆλ‹€")
# 배치 λ‹¨μœ„λ‘œ λ‚˜λˆ„κΈ°
text_batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
# νƒ€μž„μ•„μ›ƒ 증가 및 verbose ν™œμ„±ν™”
embeddings = Parallel(n_jobs=num_cores, timeout=600, verbose=10)(
delayed(encode_batch)(batch) for batch in text_batches
)
# κ²°κ³Ό ν•©μΉ˜κΈ°
try:
result = np.vstack(embeddings).astype("float32")
logger.info(f"βœ… 벑터화 μ™„λ£Œ: {result.shape[0]}개 ν•­λͺ©, 차원: {result.shape[1]}")
return result
except Exception as e:
logger.error(f"❌ 벑터 κ²°ν•© 쀑 였λ₯˜ λ°œμƒ: {e}")
# μ‹¬κ°ν•œ 였λ₯˜ μ‹œ μž¬μ‹œλ„ 둜직
logger.warning("비상 λŒ€μ•ˆμœΌλ‘œ 순차 처리λ₯Ό μ‹œλ„ν•©λ‹ˆλ‹€...")
# 순차 처리둜 λŒ€μ²΄ (λŠλ¦¬μ§€λ§Œ μ•ˆμ •μ )
single_embeddings = []
for item in tqdm(texts, desc="πŸ”„ 순차 μž„λ² λ”© μ§„ν–‰", ncols=80):
try:
emb = embedding_model.encode(item, convert_to_numpy=True)
single_embeddings.append(emb)
except Exception as item_err:
logger.error(f"ν•­λͺ© 인코딩 쀑 였λ₯˜: {item_err}")
# 였λ₯˜ λ°œμƒ μ‹œ κΈ°λ³Έ μž„λ² λ”© μ‚¬μš©
single_embeddings.append(np.zeros(384, dtype=np.float32))
return np.vstack(single_embeddings).astype("float32")
"""
# ✅ Save & load the FAISS index
def save_faiss_index():
    """Persist the FAISS index to the Hugging Face Hub so it survives server restarts."""
    from huggingface_hub import HfApi, create_repo
    import tempfile
    try:
        # Repository ID (from the environment, with a default)
        repo_id = os.getenv("HF_INDEX_REPO", "aikobay/saleitem_faiss_index")
        # Create an HfApi client
        api = HfApi()
        # Check that the repository exists, creating it if necessary
        try:
            # Try to fetch the repository info
            api.repo_info(repo_id=repo_id, repo_type="dataset")
            logger.info(f"✅ Using existing repository: {repo_id}")
        except Exception:
            # The repository does not exist, so create it
            logger.info(f"🔄 Repository not found, creating it: {repo_id}")
            create_repo(
                repo_id=repo_id,
                repo_type="dataset",
                private=True,   # keep the repository private
                exist_ok=True   # do not fail if it already exists
            )
            logger.info(f"✅ Repository created: {repo_id}")
        # Write to temporary local files first
        with tempfile.TemporaryDirectory() as temp_dir:
            index_path = os.path.join(temp_dir, "faiss_index.bin")
            items_path = os.path.join(temp_dir, "indexed_items.txt")
            # Save the FAISS index
            faiss.write_index(faiss_index, index_path)
            # Save the item list
            with open(items_path, "w", encoding="utf-8") as f:
                f.write("\n".join(indexed_items))
            # Create a README file describing the repository
            readme_path = os.path.join(temp_dir, "README.md")
            with open(readme_path, "w", encoding="utf-8") as f:
                f.write(f"""# FAISS index repository
This repository contains the FAISS index and related data used for item search.
- Last updated: {pd.Timestamp.now()}
- Number of indexed items: {len(indexed_items)}
- Model: {MODEL_NAME}
This repository was created automatically to store the vector index built from the item data in 'aikobay/initial_saleitem_dataset'.
""")
            # Upload the files
            for file_path, file_name in [
                (index_path, "faiss_index.bin"),
                (items_path, "indexed_items.txt"),
                (readme_path, "README.md")
            ]:
                api.upload_file(
                    path_or_fileobj=file_path,
                    path_in_repo=file_name,
                    repo_id=repo_id,
                    repo_type="dataset"
                )
            logger.info(f"✅ FAISS index saved to the Hugging Face Hub. Repo: {repo_id}")
    except Exception as e:
        logger.error(f"❌ Error while saving the FAISS index to the Hub: {e}")
        # Try to save a local backup instead
        try:
            local_path = os.path.join(os.getcwd(), "faiss_index.bin")
            faiss.write_index(faiss_index, local_path)
            with open("indexed_items.txt", "w", encoding="utf-8") as f:
                f.write("\n".join(indexed_items))
            logger.info(f"✅ FAISS index backed up locally: {local_path}")
        except Exception as local_err:
            logger.error(f"❌ Local backup also failed: {local_err}")
def load_faiss_index():
    """Load the FAISS index from the Hugging Face Hub to speed up search."""
    from huggingface_hub import hf_hub_download, HfApi
    global faiss_index, indexed_items, active_sale_items
    # Repository ID (from the environment, with a default)
    repo_id = os.getenv("HF_INDEX_REPO", "aikobay/saleitem_faiss_index")
    try:
        # Check that the repository exists
        api = HfApi()
        try:
            api.repo_info(repo_id=repo_id, repo_type="dataset")
            logger.info(f"✅ FAISS index repository found: {repo_id}")
        except Exception as repo_err:
            logger.warning(f"⚠️ The repository does not exist: {repo_err}")
            raise FileNotFoundError("The Hub repository does not exist")
        # Download the files from the Hub
        index_path = hf_hub_download(
            repo_id=repo_id,
            filename="faiss_index.bin",
            repo_type="dataset"
        )
        items_path = hf_hub_download(
            repo_id=repo_id,
            filename="indexed_items.txt",
            repo_type="dataset"
        )
        # Load the files
        faiss_index = faiss.read_index(index_path)
        with open(items_path, "r", encoding="utf-8") as f:
            indexed_items = f.read().splitlines()
        logger.info(f"✅ FAISS index loaded from the Hub. {len(indexed_items)} items in total")
    except Exception as e:
        logger.warning(f"⚠️ Error while loading the FAISS index from the Hub: {e}")
        # Fall back to local files
        try:
            faiss_index = faiss.read_index("faiss_index.bin")
            with open("indexed_items.txt", "r", encoding="utf-8") as f:
                indexed_items = f.read().splitlines()
            logger.info(f"✅ Local FAISS index loaded. {len(indexed_items)} items in total")
        except FileNotFoundError:
            logger.warning("⚠️ No FAISS index files found. Rebuilding the index.")
            rebuild_faiss_index()
        except Exception as local_err:
            logger.error(f"❌ Error while loading the local FAISS index: {local_err}")
            rebuild_faiss_index()
# ✅ Build the FAISS index (with progress reporting)
def rebuild_faiss_index():
    global faiss_index, indexed_items, active_sale_items
    logger.info("🔄 Rebuilding the FAISS index from fresh sale_item data...")
    active_sale_items = load_huggingface_jsonl("initial_saleitem_dataset")
    item_names = active_sale_items["ITEMNAME"].tolist()
    logger.info(f"🔹 Vectorizing {len(item_names)} items...")
    # Vectorize in parallel (faster than a single process)
    item_vectors = encode_texts_parallel(item_names)
    # ✅ Create the FAISS index
    faiss_index = faiss.IndexFlatL2(item_vectors.shape[1])
    faiss_index.add(item_vectors)
    indexed_items = item_names
    logger.info(f"✅ FAISS index rebuilt with {len(indexed_items)} items.")
    save_faiss_index()
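# Design note: IndexFlatL2 performs exact (brute-force) L2 search, which is
# simple and accurate for a catalog of this size; for a much larger catalog an
# approximate index (e.g. faiss.IndexIVFFlat) would likely be the next step.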
# Keyword extraction helper (previous version, kept for reference)
# def extract_keywords(query):
#     """Extract and clean the main keywords from a search query."""
#     try:
#         # Basic cleanup (remove special characters, lowercase, etc.)
#         cleaned_query = re.sub(r'[^\w\s]', ' ', query).strip().lower()
#         # Extract nouns with a morphological analyzer
#         # (note: okt, presumably a KoNLPy Okt instance, is not defined in this file)
#         nouns = okt.nouns(cleaned_query)
#         # Keep only nouns of 2+ characters (more meaningful keywords)
#         keywords = [noun for noun in nouns if len(noun) >= 2]
#         # If no nouns were extracted, fall back to whitespace-separated tokens from the original query
#         if not keywords:
#             keywords = cleaned_query.split()
#         return keywords
#     except Exception as e:
#         logger.error(f"❌ Error while extracting keywords: {e}")
#         # On failure, return the original query as-is
#         return [query]
# Keyword-matching score used to improve search results (previous version)
# def calculate_keyword_score(item_name, keywords):
#     """Compute a matching score between an item name and the keywords."""
#     score = 0
#     item_lower = item_name.lower()
#     # 1. High score if the full query is contained in the item name
#     if ''.join(keywords).lower() in item_lower:
#         score += 10
#     # 2. Per-keyword matching score
#     for keyword in keywords:
#         if keyword.lower() in item_lower:
#             # Exact match gets the highest score
#             if keyword.lower() == item_lower:
#                 score += 15
#             # Item name starts with the keyword
#             elif item_lower.startswith(keyword.lower()):
#                 score += 8
#             # Plain substring match
#             else:
#                 score += 5
#     return score
# Regex-based replacement for the previous extract_keywords function
def extract_keywords_simple(query):
    """Keyword extraction tuned for Korean search queries."""
    # Remove special characters and lowercase
    cleaned_query = re.sub(r'[^\w\s가-힣]', ' ', query).strip().lower()
    # Regex that splits Korean and English words
    #pattern = re.compile(r'[가-힣]+|[a-zA-Z]+')
    # Regex that splits Korean, Chinese, and English words
    pattern = re.compile(r'[가-힣]+|[\u4e00-\u9fff]+|[a-zA-Z]+')
    # Find all matches
    matches = pattern.findall(cleaned_query)
    # Length filter (Korean: at least 1 character, English: at least 2)
    words = []
    for word in matches:
        if re.match(r'[가-힣]+', word) and len(word) >= 1:
            words.append(word)
        elif re.match(r'[a-zA-Z]+', word) and len(word) >= 2:
            words.append(word)
    # Also add the whitespace-separated tokens from the original query (to cover compound terms)
    for token in cleaned_query.split():
        if token not in words and len(token) >= 2:
            words.append(token)
    # If nothing was extracted, fall back to the raw query tokens
    if not words:
        return [w for w in cleaned_query.split() if w]
    return words
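# Worked example (illustrative queries, not from the original script):
#
#   extract_keywords_simple("나이키 에어맥스 270")
#   # -> ['나이키', '에어맥스', '270']   (the digits survive via the token pass)
#   extract_keywords_simple("iphone 케이스")
#   # -> ['iphone', '케이스']
#
# Korean/Chinese/English runs come from the regex, while the extra loop over
# cleaned_query.split() re-adds whole tokens such as "270" that the regex skips.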
# Improved keyword-matching score
def calculate_keyword_score(item_name, keywords):
    """Compute an improved keyword-matching score."""
    score = 0
    item_lower = item_name.lower()
    # An exact match with the full query gets the highest score
    joined_query = ''.join(keywords).lower()
    if item_lower == joined_query:
        return 100  # perfect match
    # The full query is contained in the item name
    if joined_query in item_lower:
        score += 50
    # Per-keyword matching (only meaningful keywords of 2+ characters)
    meaningful_keywords = [k for k in keywords if len(k) >= 2]
    for keyword in meaningful_keywords:
        kw_lower = keyword.lower()
        if kw_lower in item_lower:
            # Check whether the keyword starts at a word boundary (more precise match)
            word_boundary_match = re.search(r'(^|\s|_)' + re.escape(kw_lower), item_lower) is not None
            # Exact match
            if item_lower == kw_lower:
                score += 40
            # Item name starts with the keyword
            elif item_lower.startswith(kw_lower):
                score += 30
            # Word-boundary match
            elif word_boundary_match:
                score += 20
            # Plain substring match
            else:
                score += 10
    # Reward higher coverage of the meaningful keywords
    if meaningful_keywords:
        matched_keywords = sum(1 for k in meaningful_keywords if k.lower() in item_lower)
        coverage_ratio = matched_keywords / len(meaningful_keywords)
        score += coverage_ratio * 15
    return score
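# Scoring walk-through (hypothetical item name and query, for illustration):
#
#   calculate_keyword_score("나이키 운동화", ["나이키", "운동화"])
#   # joined query "나이키운동화" is not a substring (the space breaks it): +0
#   # "나이키" matches at the start of the item name:                        +30
#   # "운동화" matches at a word boundary:                                   +20
#   # both meaningful keywords matched, coverage 2/2:                        +15
#   # -> 65.0
#
# The weights (100/50/40/30/20/10/15) are heuristics; the final ranking below
# blends this score with the FAISS vector similarity at a 0.7 : 0.3 ratio.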
# ✅ FAISS search function with latency monitoring (previous version, kept for reference)
# def search_faiss(query, top_k=10):
#     if faiss_index is None or indexed_items is None:
#         logger.error("❌ The FAISS index has not been initialized.")
#         return []
#     start_time = time.time()  # 🔹 start the search timer
#     query_vector = np.array([embedding_model.encode(query)]).astype("float32")
#     _, indices = faiss_index.search(query_vector, top_k)
#     end_time = time.time()  # 🔹 stop the search timer
#     logger.info(f"🔍 Search complete! Elapsed: {end_time - start_time:.4f}s")
#     recommendations = []
#     for idx in indices[0]:
#         if idx >= len(indexed_items):
#             continue
#         item_name = indexed_items[idx]
#         item_seq = active_sale_items.loc[active_sale_items["ITEMNAME"] == item_name, "ITEMSEQ"].values[0]
#         recommendations.append({"ITEMSEQ": item_seq, "ITEMNAME": item_name})
#     return recommendations
# Updated FAISS search function
# (search_faiss now uses extract_keywords_simple instead of extract_keywords)
def search_faiss(query, top_k=10):
    if faiss_index is None or indexed_items is None:
        logger.error("❌ The FAISS index has not been initialized.")
        return []
    # Extract keywords and log them (using extract_keywords_simple)
    keywords = extract_keywords_simple(query)
    logger.info(f"🔍 Search query: '{query}' → extracted keywords: {keywords}")
    start_time = time.time()
    # Vector search on the raw query
    query_vector = np.array([embedding_model.encode(query)]).astype("float32")
    distances, indices = faiss_index.search(query_vector, top_k * 2)  # retrieve extra candidates
    # Candidate results
    candidates = []
    # Add a keyword-matching score to each search result
    for i, idx in enumerate(indices[0]):
        if idx < 0 or idx >= len(indexed_items):  # FAISS returns -1 when it has fewer results than requested
            continue
        item_name = indexed_items[idx]
        # Keyword-matching score
        keyword_score = calculate_keyword_score(item_name, keywords)
        # Vector similarity score (map the L2 distance onto a 0-100 scale)
        vector_score = max(0, 100 - distances[0][i] * 10)  # smaller distance → higher score
        # Final score (weighted keyword match + weighted vector similarity)
        final_score = (keyword_score * 0.7) + (vector_score * 0.3)
        try:
            item_seq = active_sale_items.loc[active_sale_items["ITEMNAME"] == item_name, "ITEMSEQ"].values[0]
            candidates.append({
                "ITEMSEQ": item_seq,
                "ITEMNAME": item_name,
                "score": final_score,
                "keyword_match": keyword_score > 0
            })
        except (IndexError, KeyError) as e:
            logger.warning(f"⚠️ Failed to map item info (ID: {idx}): {e}")
    # Sort by final score
    candidates.sort(key=lambda x: x["score"], reverse=True)
    # Keep the top_k results
    recommendations = candidates[:top_k]
    end_time = time.time()
    logger.info(f"🔍 Search complete! Elapsed: {end_time - start_time:.4f}s, results: {len(recommendations)}")
    # Log the top 3 results
    for i, rec in enumerate(recommendations[:3]):
        logger.info(f"  #{i+1}: {rec['ITEMNAME']} (score: {rec['score']:.2f}, keyword match: {rec['keyword_match']})")
    # Remove the score fields from the API response
    for rec in recommendations:
        rec.pop("score", None)
        rec.pop("keyword_match", None)
    return recommendations
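# Example call against the endpoint defined below (illustrative values; the
# port matches the uvicorn.run() call at the bottom of this file):
#
#   curl -X POST http://localhost:7860/api/recommend \
#        -H "Content-Type: application/json" \
#        -d '{"search_query": "캠핑 텐트", "top_k": 5}'
#
# The response is {"query": ..., "recommendations": [{"ITEMSEQ": ..., "ITEMNAME": ...}, ...]}.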
# ✅ Recommendation API endpoint
@app.post("/api/recommend")
async def recommend(request: RecommendRequest):
    try:
        recommendations = search_faiss(request.search_query, request.top_k)
        return {"query": request.search_query, "recommendations": recommendations}
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Recommendation error: {str(e)}")
# ✅ Index refresh API endpoint
@app.post("/api/update_index")
async def update_index():
    rebuild_faiss_index()
    return {"message": "✅ FAISS index updated!"}
# ✅ Run FastAPI
if __name__ == "__main__":
    load_faiss_index()
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)
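# To serve the app with the uvicorn CLI instead of running this file directly
# (assuming the module import path below is valid in your environment; note
# that the FAISS index is only loaded in the __main__ block, so in that case it
# has to be built via /api/update_index or a startup hook you add yourself):
#
#   uvicorn traning.powersearchtraning:app --host 0.0.0.0 --port 7860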