# traning/fastRecommend.py
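"""FastAPI recommendation service for Korean auction items.

Combines FAISS vector search over SentenceTransformer embeddings with
related keywords generated by a Korean Llama 3.2 model, and persists the
FAISS index to the Hugging Face Hub.
"""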
import os
import torch
import pandas as pd
import logging
import re
import faiss
import numpy as np
import time
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from huggingface_hub import login
from sentence_transformers import SentenceTransformer
from joblib import Parallel, delayed
from tqdm import tqdm
# Pydantic λͺ¨λΈ μ •μ˜ (API μž…λ ₯용)
class RecommendRequest(BaseModel):
search_query: str
top_k: int = 10
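# Example request body (illustrative values):
#   {"search_query": "캠핑 의자", "top_k": 5}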
# Logging setup
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info(f"βœ… NumPy 버전: {np.__version__}")
logger.info(f"βœ… FAISS 버전: {faiss.__version__}")
# Create the FastAPI instance
app = FastAPI(title="πŸš€ ν•œκΈ€ LLAMA 3.2 μΆ”μ²œ μ‹œμŠ€ν…œ API", version="1.4")
# Model configuration
MODEL_NAME = "Bllossom/llama-3.2-Korean-Bllossom-3B"
HF_API_TOKEN = os.getenv("HF_API_TOKEN")
# Hugging Face login
if HF_API_TOKEN:
logger.info("πŸ”‘ Hugging Face API 토큰을 μ‚¬μš©ν•˜μ—¬ 둜그인 쀑...")
login(token=HF_API_TOKEN)
else:
logger.warning("⚠️ ν™˜κ²½ λ³€μˆ˜ 'HF_API_TOKEN'이 μ„€μ •λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€!")
# Check GPU availability
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"πŸš€ μ‹€ν–‰ λ””λ°”μ΄μŠ€: {device.upper()}")
# Load the model and tokenizer
logger.info(f"πŸ”„ {MODEL_NAME} λͺ¨λΈ λ‘œλ“œ 쀑...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, token=HF_API_TOKEN)
    # Set pad_token_id explicitly (Llama models ship without a pad token)
if tokenizer.pad_token_id is None:
tokenizer.pad_token_id = tokenizer.eos_token_id
model = AutoModelForCausalLM.from_pretrained(
MODEL_NAME,
token=HF_API_TOKEN,
torch_dtype=torch.float16 if device == "cuda" else torch.float32,
device_map="auto" if device == "cuda" else None,
        # Pass the padding token ID explicitly
pad_token_id=tokenizer.pad_token_id
)
logger.info("βœ… ν•œκΈ€ LLAMA 3.2 λͺ¨λΈ λ‘œλ“œ μ™„λ£Œ!")
except Exception as e:
logger.error(f"❌ λͺ¨λΈ λ‘œλ“œ 쀑 였λ₯˜ λ°œμƒ: {e}")
model = None
tokenizer = None
# Related-keyword generation with Llama 3
def generate_related_keywords(query, max_keywords=5):
    """Use the Llama 3 model to generate and normalize keywords related to the query."""
if not model or not tokenizer:
logger.warning("λͺ¨λΈμ΄ λ‘œλ“œλ˜μ§€ μ•Šμ•„ κΈ°λ³Έ ν‚€μ›Œλ“œ μΆ”μΆœλ‘œ λŒ€μ²΄λ©λ‹ˆλ‹€.")
return extract_keywords_simple(query)
try:
# κ°œμ„ λœ ν”„λ‘¬ν”„νŠΈ template ꡬ성 (ν•œκ΅­μ–΄)
prompt = f""""{query}"에 λŒ€ν•΄ κ΄€λ ¨μ„± 높은 μ—°κ΄€ ν‚€μ›Œλ“œ λ˜λŠ” ν™•μž₯ μΆ”λ‘  ν‚€μ›Œλ“œ {max_keywords}개λ₯Ό μ œμ•ˆν•΄.
μˆ«μžμ œμ™Έ
"""
        # Tokenize and prepare model inputs
        inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=True).to(device)
        # Generate text
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,  # pass the mask explicitly (pad and eos tokens are identical)
            max_new_tokens=100,      # number of new tokens to generate
            num_return_sequences=1,
            temperature=0.6,         # balance diversity against accuracy
            do_sample=True,
            repetition_penalty=1.2   # discourage repetition
        )
        # Decode only the newly generated tokens (exclude the prompt)
        generated_text = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
        # Extract candidate keywords (text following "1." style list markers)
        raw_keywords = []
        query_keywords = extract_keywords_simple(query)  # computed once; used for de-duplication below
        for line in generated_text.split('\n'):
            line = line.strip()
            # Strict filtering conditions
            if (line and
                len(line) > 1 and  # at least two characters
                not any(x in line.lower() for x in [
                    '좜λ ₯', 'μ˜ˆμ‹œ', 'κ°€μ΄λ“œ', 'μ œμ•ˆ', '라인', '검색어', 'ν‚€μ›Œλ“œ'
                ])):
                # Strip leading numbering and whitespace
                keyword = re.sub(r'^\d+\.\s*', '', line).strip()
                # Skip keywords that duplicate the original query's own keywords
                if (keyword and
                    keyword not in raw_keywords and
                    keyword not in query_keywords and
                    keyword.lower() not in [k.lower() for k in query_keywords]):
                    raw_keywords.append(keyword)
        # ===== Keyword normalization =====
        normalized_keywords = []
        for keyword in raw_keywords:
            # 1. Remove punctuation
            keyword = re.sub(r'[.,;:!?"\'\(\)\[\]\{\}]', '', keyword)
            # 2. Reduce phrases of more than two words to a single word (Korean & English)
            if len(keyword.split()) > 2:
                # Korean: keep only runs of two or more syllables
                korean_words = re.findall(r'[κ°€-힣]{2,}', keyword)
                # English: keep only words of three or more letters
                english_words = re.findall(r'[a-zA-Z]{3,}', keyword)
                # Combine the extracted words
                extracted_words = korean_words + english_words
                if extracted_words:
                    # Keep the longest word (compounds tend to carry the most information)
                    keyword = max(extracted_words, key=len)
            # 3. Strip trailing Korean particles
            keyword = re.sub(r'(은|λŠ”|이|κ°€|을|λ₯Ό|의|와|κ³Ό|둜|으둜|에|μ—μ„œ)$', '', keyword)
            # 4. Validate length and content
            if len(keyword) >= 2 and not keyword.isspace() and keyword not in normalized_keywords:
                normalized_keywords.append(keyword)
        # De-duplicate and cap the number of keywords
        normalized_keywords = list(dict.fromkeys(normalized_keywords))[:max_keywords]
        # Fall back to simple keyword extraction if nothing survived
        if not normalized_keywords:
            normalized_keywords = extract_keywords_simple(query)
logger.info(f"πŸ” μƒμ„±λœ μ—°κ΄€ ν‚€μ›Œλ“œ: {normalized_keywords}")
return normalized_keywords
except Exception as e:
logger.error(f"❌ μ—°κ΄€ ν‚€μ›Œλ“œ 생성 쀑 였λ₯˜ λ°œμƒ: {e}")
        # Fall back to simple keyword extraction on error
return extract_keywords_simple(query)
# Original keyword extraction function (kept as a fallback)
def extract_keywords_simple(query):
    """Keyword extraction optimized for Korean search queries."""
    # Remove special characters and lowercase
    cleaned_query = re.sub(r'[^\w\sκ°€-힣]', ' ', query).strip().lower()
    # Regex separating runs of Korean, CJK, and English characters
    pattern = re.compile(r'[κ°€-힣]+|[\u4e00-\u9fff]+|[a-zA-Z]+')
    # Collect all matches
    matches = pattern.findall(cleaned_query)
    # Length filtering (Korean: 1+ characters, English: 2+ characters)
words = []
for word in matches:
if re.match(r'[κ°€-힣]+', word) and len(word) >= 1:
words.append(word)
elif re.match(r'[a-zA-Z]+', word) and len(word) >= 2:
words.append(word)
    # Also add whitespace-separated tokens from the original query (covers compound terms)
for token in cleaned_query.split():
if token not in words and len(token) >= 2:
words.append(token)
    # If nothing was extracted, fall back to the raw query tokens
if not words:
return [w for w in cleaned_query.split() if w]
return words
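# Illustrative example: extract_keywords_simple("무선 이어폰 sony") β†’ ['무선', '이어폰', 'sony']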
# FAISS search (combines vector search with Llama 3 related keywords)
def search_faiss(query, top_k=10):
    if faiss_index is None or indexed_items is None:
        logger.error("❌ FAISS μΈλ±μŠ€κ°€ μ΄ˆκΈ°ν™”λ˜μ§€ μ•Šμ•˜μŠ΅λ‹ˆλ‹€.")
        return []
    # Generate related keywords with Llama 3
    keywords = generate_related_keywords(query)
    logger.info(f"πŸ” 검색 쿼리: '{query}' β†’ μ—°κ΄€ ν‚€μ›Œλ“œ: {keywords}")
    start_time = time.time()
    # Vector search on the original query
    query_vector = np.array([embedding_model.encode(query)]).astype("float32")
    distances, indices = faiss_index.search(query_vector, top_k * 2)  # over-fetch candidates for re-ranking
    # Candidate results
    candidates = []
    # Add a keyword-matching score to each vector-search hit
    for i, idx in enumerate(indices[0]):
        # FAISS pads results with -1 when fewer neighbors are available than requested
        if idx < 0 or idx >= len(indexed_items):
            continue
        item_name = indexed_items[idx]
        # Keyword-matching score
        keyword_score = 0
        for keyword in keywords:
            keyword_match_score = calculate_keyword_score(item_name, [keyword])
            keyword_score = max(keyword_score, keyword_match_score)
        # Vector-similarity score (smaller L2 distance maps to a higher 0-100 score)
        vector_score = max(0, 100 - distances[0][i] * 10)
        # Final score (weighted keyword match + weighted vector similarity)
        final_score = (keyword_score * 0.7) + (vector_score * 0.3)
try:
item_seq = active_sale_items.loc[active_sale_items["ITEMNAME"] == item_name, "ITEMSEQ"].values[0]
candidates.append({
"ITEMSEQ": item_seq,
"ITEMNAME": item_name,
"score": final_score,
"keyword_match": keyword_score > 0
})
except (IndexError, KeyError) as e:
logger.warning(f"⚠️ μƒν’ˆ 정보 λ§€ν•‘ 였λ₯˜ (ID: {idx}): {e}")
    # Sort by final score
    candidates.sort(key=lambda x: x["score"], reverse=True)
    # Keep the top_k results
    recommendations = candidates[:top_k]
    end_time = time.time()
    logger.info(f"πŸ” 검색 μˆ˜ν–‰ μ™„λ£Œ! κ±Έλ¦° μ‹œκ°„: {end_time - start_time:.4f}초, κ²°κ³Ό: {len(recommendations)}개")
    # Log the top three results only
    for i, rec in enumerate(recommendations[:3]):
        logger.info(f" #{i+1}: {rec['ITEMNAME']} (점수: {rec['score']:.2f}, ν‚€μ›Œλ“œ λ§€μΉ­: {rec['keyword_match']})")
    # Strip internal scoring fields from the API response
    for rec in recommendations:
        rec.pop("score", None)
        rec.pop("keyword_match", None)
return recommendations
# Keyword-matching score (original logic retained)
def calculate_keyword_score(item_name, keywords):
    """Compute an improved keyword-matching score for an item name."""
score = 0
item_lower = item_name.lower()
    # A complete match against the whole joined query scores highest
    joined_query = ''.join(keywords).lower()
    if item_lower == joined_query:
        return 100  # exact match gets the maximum score
    # Partial match against the whole joined query
    if joined_query in item_lower:
        score += 50
    # Per-keyword matching (only meaningful keywords of two or more characters)
meaningful_keywords = [k for k in keywords if len(k) >= 2]
for keyword in meaningful_keywords:
kw_lower = keyword.lower()
if kw_lower in item_lower:
# 단어 κ²½κ³„μ—μ„œ μ‹œμž‘ν•˜λŠ”μ§€ 확인 (더 μ •ν™•ν•œ λ§€μΉ­)
word_boundary_match = re.search(r'(^|\s|_)' + re.escape(kw_lower), item_lower) is not None
# μ •ν™•νžˆ μΌμΉ˜ν•˜λŠ” 경우
if item_lower == kw_lower:
score += 40
# μƒν’ˆλͺ…이 ν‚€μ›Œλ“œλ‘œ μ‹œμž‘ν•˜λŠ” 경우
elif item_lower.startswith(kw_lower):
score += 30
# 단어 κ²½κ³„μ—μ„œ λ§€μΉ­λ˜λŠ” 경우
elif word_boundary_match:
score += 20
# λ‹¨μˆœ ν¬ν•¨λ˜λŠ” 경우
else:
score += 10
# μœ μ˜λ―Έν•œ ν‚€μ›Œλ“œκ°€ λ§Žμ„μˆ˜λ‘ 더 높은 점수 λΆ€μ—¬
if meaningful_keywords:
matched_keywords = sum(1 for k in meaningful_keywords if k.lower() in item_lower)
coverage_ratio = matched_keywords / len(meaningful_keywords)
score += coverage_ratio * 15
return score
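# Worked example with a hypothetical item name:
#   calculate_keyword_score("캠핑 의자", ["의자"]) = 50 (joined query is a substring)
#   + 20 (word-boundary match) + 15 (1/1 meaningful keywords covered) = 85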
# βœ… Data loading helper
def load_huggingface_jsonl(dataset_name, split="train"):
if HF_API_TOKEN:
login(token=HF_API_TOKEN)
try:
repo_id = f"aikobay/{dataset_name}"
dataset = load_dataset(repo_id, split=split)
        df = dataset.to_pandas().dropna()  # βœ… drop rows containing NaN
return df
except Exception as e:
logger.error(f"❌ 데이터 λ‘œλ“œ 쀑 였λ₯˜ λ°œμƒ: {e}")
return pd.DataFrame()
# βœ… Load the active auction item data
try:
active_sale_items = load_huggingface_jsonl("initial_saleitem_dataset")
logger.info(f"βœ… μ§„ν–‰ 쀑인 κ²½λ§€ μƒν’ˆ 데이터 λ‘œλ“œ μ™„λ£Œ! 총 {len(active_sale_items)}개 μƒν’ˆ")
except Exception as e:
logger.error(f"❌ μƒν’ˆ 데이터 λ‘œλ“œ 쀑 였λ₯˜ λ°œμƒ: {e}")
active_sale_items = pd.DataFrame()
# βœ… Sentence embedding model used for FAISS vectors
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Globals populated by load_faiss_index() / rebuild_faiss_index(); initialized here so
# search_faiss() can detect a missing index instead of raising NameError
faiss_index = None
indexed_items = None
# βœ… Multi-core vectorization helper
def encode_texts_parallel(texts, batch_size=512):
    """Speed up embedding by encoding text batches in parallel."""
    num_cores = os.cpu_count()  # number of available CPU cores
    logger.info(f"πŸ”„ λ©€ν‹°μ½”μ–΄ 벑터화 μ§„ν–‰ (μ½”μ–΄ 수: {num_cores})")
    def encode_batch(batch):
        return embedding_model.encode(batch, convert_to_numpy=True)
    # Encode the batches in parallel
text_batches = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
embeddings = Parallel(n_jobs=num_cores)(delayed(encode_batch)(batch) for batch in text_batches)
return np.vstack(embeddings).astype("float32")
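# Used by rebuild_faiss_index() below to embed every item name before building the index.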
# βœ… Save & load the FAISS index
def save_faiss_index():
    """Persist the FAISS index to the Hugging Face Hub so it survives server restarts."""
from huggingface_hub import HfApi, create_repo
import tempfile
try:
# λ ˆν¬μ§€ν† λ¦¬ ID (ν™˜κ²½ λ³€μˆ˜μ—μ„œ κ°€μ Έμ˜€κ±°λ‚˜ κΈ°λ³Έκ°’ μ‚¬μš©)
repo_id = os.getenv("HF_INDEX_REPO", "aikobay/saleitem_faiss_index")
# HfApi 객체 생성
api = HfApi()
# λ ˆν¬μ§€ν† λ¦¬ 쑴재 μ—¬λΆ€ 확인 및 생성
try:
# λ ˆν¬μ§€ν† λ¦¬ 정보 쑰회 μ‹œλ„
api.repo_info(repo_id=repo_id, repo_type="dataset")
logger.info(f"βœ… κΈ°μ‘΄ λ ˆν¬μ§€ν† λ¦¬ μ‚¬μš©: {repo_id}")
except Exception:
# λ ˆν¬μ§€ν† λ¦¬κ°€ μ—†μœΌλ©΄ μƒˆλ‘œ 생성
logger.info(f"πŸ”„ λ ˆν¬μ§€ν† λ¦¬κ°€ μ‘΄μž¬ν•˜μ§€ μ•Šμ•„ μƒˆλ‘œ μƒμ„±ν•©λ‹ˆλ‹€: {repo_id}")
create_repo(
repo_id=repo_id,
repo_type="dataset",
private=True, # λΉ„κ³΅κ°œ λ ˆν¬μ§€ν† λ¦¬λ‘œ μ„€μ •
exist_ok=True # 이미 μ‘΄μž¬ν•΄λ„ 였λ₯˜ λ°œμƒν•˜μ§€ μ•ŠμŒ
)
logger.info(f"βœ… λ ˆν¬μ§€ν† λ¦¬ 생성 μ™„λ£Œ: {repo_id}")
        # Write the files to a local temporary directory first
with tempfile.TemporaryDirectory() as temp_dir:
index_path = os.path.join(temp_dir, "faiss_index.bin")
items_path = os.path.join(temp_dir, "indexed_items.txt")
            # Save the FAISS index
faiss.write_index(faiss_index, index_path)
# μ•„μ΄ν…œ λͺ©λ‘ μ €μž₯
with open(items_path, "w", encoding="utf-8") as f:
f.write("\n".join(indexed_items))
            # Create a README describing the repository
readme_path = os.path.join(temp_dir, "README.md")
with open(readme_path, "w", encoding="utf-8") as f:
f.write(f"""# FAISS 인덱슀 μ €μž₯μ†Œ
이 μ €μž₯μ†ŒλŠ” μƒν’ˆ 검색을 μœ„ν•œ FAISS μΈλ±μŠ€μ™€ κ΄€λ ¨ 데이터λ₯Ό ν¬ν•¨ν•˜κ³  μžˆμŠ΅λ‹ˆλ‹€.
- μ΅œμ’… μ—…λ°μ΄νŠΈ: {pd.Timestamp.now()}
- 인덱슀 ν•­λͺ© 수: {len(indexed_items)}
- λͺ¨λΈ: {MODEL_NAME}
이 μ €μž₯μ†ŒλŠ”, 'aikobay/initial_saleitem_dataset'의 μƒν’ˆ 데이터λ₯Ό 기반으둜 μƒμ„±λœ 벑터 인덱슀λ₯Ό μ €μž₯ν•˜κΈ° μœ„ν•΄ μžλ™ μƒμ„±λ˜μ—ˆμŠ΅λ‹ˆλ‹€.
""")
            # Upload the files
for file_path, file_name in [
(index_path, "faiss_index.bin"),
(items_path, "indexed_items.txt"),
(readme_path, "README.md")
]:
api.upload_file(
path_or_fileobj=file_path,
path_in_repo=file_name,
repo_id=repo_id,
repo_type="dataset"
)
logger.info(f"βœ… FAISS μΈλ±μŠ€κ°€ Hugging Face Hub에 μ €μž₯λ˜μ—ˆμŠ΅λ‹ˆλ‹€. 레포: {repo_id}")
except Exception as e:
logger.error(f"❌ FAISS 인덱슀 Hub μ €μž₯ 쀑 였λ₯˜ λ°œμƒ: {e}")
# λ‘œμ»¬μ— λ°±μ—… μ €μž₯ μ‹œλ„
try:
local_path = os.path.join(os.getcwd(), "faiss_index.bin")
faiss.write_index(faiss_index, local_path)
with open("indexed_items.txt", "w", encoding="utf-8") as f:
f.write("\n".join(indexed_items))
logger.info(f"βœ… FAISS μΈλ±μŠ€κ°€ λ‘œμ»¬μ— λ°±μ—… μ €μž₯λ˜μ—ˆμŠ΅λ‹ˆλ‹€: {local_path}")
except Exception as local_err:
logger.error(f"❌ 둜컬 λ°±μ—… μ €μž₯도 μ‹€νŒ¨: {local_err}")
def load_faiss_index():
"""Hugging Face Hubμ—μ„œ FAISS 인덱슀λ₯Ό λ‘œλ“œν•˜μ—¬ 검색 속도 ν–₯상"""
from huggingface_hub import hf_hub_download, HfApi
global faiss_index, indexed_items, active_sale_items
# λ ˆν¬μ§€ν† λ¦¬ ID (ν™˜κ²½ λ³€μˆ˜μ—μ„œ κ°€μ Έμ˜€κ±°λ‚˜ κΈ°λ³Έκ°’ μ‚¬μš©)
repo_id = os.getenv("HF_INDEX_REPO", "aikobay/saleitem_faiss_index")
try:
# λ ˆν¬μ§€ν† λ¦¬ 쑴재 확인
api = HfApi()
try:
api.repo_info(repo_id=repo_id, repo_type="dataset")
logger.info(f"βœ… FAISS 인덱슀 λ ˆν¬μ§€ν† λ¦¬ 확인: {repo_id}")
except Exception as repo_err:
logger.warning(f"⚠️ λ ˆν¬μ§€ν† λ¦¬κ°€ μ‘΄μž¬ν•˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€: {repo_err}")
raise FileNotFoundError("Hub λ ˆν¬μ§€ν† λ¦¬κ°€ μ‘΄μž¬ν•˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€")
# Hubμ—μ„œ 파일 λ‹€μš΄λ‘œλ“œ
index_path = hf_hub_download(
repo_id=repo_id,
filename="faiss_index.bin",
repo_type="dataset"
)
items_path = hf_hub_download(
repo_id=repo_id,
filename="indexed_items.txt",
repo_type="dataset"
)
        # Load the downloaded files
faiss_index = faiss.read_index(index_path)
with open(items_path, "r", encoding="utf-8") as f:
indexed_items = f.read().splitlines()
logger.info(f"βœ… FAISS μΈλ±μŠ€κ°€ Hubμ—μ„œ λ‘œλ“œλ˜μ—ˆμŠ΅λ‹ˆλ‹€. 총 {len(indexed_items)}개 μƒν’ˆ")
except Exception as e:
logger.warning(f"⚠️ Hubμ—μ„œ FAISS 인덱슀 λ‘œλ“œ 쀑 였λ₯˜ λ°œμƒ: {e}")
        # Fall back to local files
try:
faiss_index = faiss.read_index("faiss_index.bin")
with open("indexed_items.txt", "r", encoding="utf-8") as f:
indexed_items = f.read().splitlines()
logger.info(f"βœ… 둜컬 FAISS 인덱슀 λ‘œλ“œ 성곡. 총 {len(indexed_items)}개 μƒν’ˆ")
except FileNotFoundError:
logger.warning("⚠️ FAISS 인덱슀 파일이 μ‘΄μž¬ν•˜μ§€ μ•ŠμŠ΅λ‹ˆλ‹€. μƒˆλ‘œ κ΅¬μΆ•ν•©λ‹ˆλ‹€.")
rebuild_faiss_index()
except Exception as local_err:
logger.error(f"❌ 둜컬 FAISS 인덱슀 λ‘œλ“œ 쀑 였λ₯˜: {local_err}")
rebuild_faiss_index()
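# Fallback order: Hub download β†’ local files β†’ full rebuild via rebuild_faiss_index().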
# βœ… Build (or rebuild) the FAISS index
def rebuild_faiss_index():
global faiss_index, indexed_items, active_sale_items
logger.info("πŸ”„ μƒˆλ‘œμš΄ sale_item λ°μ΄ν„°λ‘œ FAISS 인덱슀λ₯Ό μž¬κ΅¬μΆ•ν•©λ‹ˆλ‹€...")
active_sale_items = load_huggingface_jsonl("initial_saleitem_dataset")
item_names = active_sale_items["ITEMNAME"].tolist()
logger.info(f"πŸ”Ή 총 {len(item_names)}개 μƒν’ˆ 벑터화 μ‹œμž‘...")
    # Vectorize in parallel for faster processing
    item_vectors = encode_texts_parallel(item_names)
    # βœ… Build the FAISS index (flat L2 index)
faiss_index = faiss.IndexFlatL2(item_vectors.shape[1])
faiss_index.add(item_vectors)
indexed_items = item_names
logger.info(f"βœ… FAISS μΈλ±μŠ€κ°€ {len(indexed_items)}개 μƒν’ˆμœΌλ‘œ μƒˆλ‘­κ²Œ κ΅¬μΆ•λ˜μ—ˆμŠ΅λ‹ˆλ‹€.")
save_faiss_index()
# Recommendation API endpoint
@app.post("/api/recommend")
async def recommend(request: RecommendRequest):
try:
recommendations = search_faiss(request.search_query, request.top_k)
return {
"query": request.search_query,
"recommendations": recommendations,
"related_keywords": generate_related_keywords(request.search_query)
}
except Exception as e:
raise HTTPException(status_code=500, detail=f"μΆ”μ²œ 였λ₯˜: {str(e)}")
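# Example call (illustrative; the port matches the uvicorn settings below):
#   curl -X POST http://localhost:7860/api/recommend \
#        -H "Content-Type: application/json" \
#        -d '{"search_query": "캠핑 의자", "top_k": 5}'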
# βœ… Index refresh API
@app.post("/api/update_index")
async def update_index():
rebuild_faiss_index()
return {"message": "βœ… FAISS 인덱슀 μ—…λ°μ΄νŠΈ μ™„λ£Œ!"}
# βœ… Run FastAPI
if __name__ == "__main__":
load_faiss_index()
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=7860)