import spacy
from pathlib import Path
import logging
import itertools
import re
import string

from src.data_management import storage
from src.data_management import loaders

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
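
# The spaCy pipeline and its stop-word list are loaded once at import time.
# The extraction functions below check `nlp` and raise if the model is missing.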
nlp = None
STOP_WORDS = set()
try:
    nlp = spacy.load("en_core_web_lg")
    logging.info("spaCy model 'en_core_web_lg' loaded successfully.")
    STOP_WORDS = nlp.Defaults.stop_words
except OSError:
    logging.error("spaCy model 'en_core_web_lg' not found. Please install it: python -m spacy download en_core_web_lg")
TRUSTED_ENTITY_LABELS = {"PRODUCT", "ORG", "WORK_OF_ART"}
OTHER_ENTITY_LABELS = {"PERSON", "EVENT", "LAW", "NORP", "FAC", "GPE", "LOC"}
NOUN_CHUNK_PATTERNS = re.compile(r".*\b(learning|network|model|algorithm|system|technique|approach|agent|layer|architecture|transformer|attention)\b$", re.IGNORECASE)
MIN_CONCEPT_WORDS = 1
MAX_CONCEPT_WORDS = 6
AI_KEYWORDS = {"artificial intelligence", "machine learning", "deep learning",
               "neural network", "reinforcement learning", "transformer", "llm",
               "large language model", "computer vision", "natural language processing",
               "algorithm", "model", "gpt", "bert", "agent", "attention", "supervised",
               "unsupervised", "classification", "regression", "clustering"}

RELATION_VERBS = {
    "use": "USES", "utilize": "USES", "apply": "USES", "employ": "USES",
    "improve": "IMPROVES", "enhance": "IMPROVES", "extend": "IMPROVES", "outperform": "IMPROVES",
    "base on": "BASED_ON", "rely on": "BASED_ON",
    "compare": "COMPARES_TO", "relate": "RELATED_TO", "associate": "RELATED_TO", "link": "RELATED_TO",
    "propose": "PROPOSES", "introduce": "PROPOSES", "develop": "PROPOSES",
}


def normalize_and_validate_concept(text: str, is_entity: bool = False, entity_label: str = "") -> str | None:
    """Cleans the given text and validates it as a concept candidate; returns the cleaned text or None."""
    cleaned_text = text.strip()
    word_count = len(cleaned_text.split())
    if not (MIN_CONCEPT_WORDS <= word_count <= MAX_CONCEPT_WORDS):
        return None
    if cleaned_text and all(word.lower() in STOP_WORDS for word in re.findall(r'\b\w+\b', cleaned_text)):
        return None
    if cleaned_text.isdigit() or all(c in string.punctuation for c in cleaned_text):
        return None
    generic_phrases = {"this approach", "these models", "this technique", "this system",
                       "the model", "the algorithm", "the method", "the approach",
                       "the system", "the technique", "our model", "our approach"}
    if cleaned_text.lower() in generic_phrases:
        return None
    return cleaned_text
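
# Illustrative behaviour (given the constants above):
#   normalize_and_validate_concept("the model")             -> None  (generic phrase)
#   normalize_and_validate_concept("graph neural network")  -> "graph neural network"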


def find_verb_relation(token1: spacy.tokens.Token, token2: spacy.tokens.Token) -> tuple[str, str] | None:
    """Looks for a verb-mediated relation on the dependency path between two tokens."""
    common_ancestor = None
    ancestors1 = list(token1.ancestors)
    ancestors2 = list(token2.ancestors)
    for t in reversed(ancestors1):
        if t in ancestors2:
            common_ancestor = t
            break
    if not common_ancestor:
        return None

    verb1 = None
    head = token1
    while head != common_ancestor:
        if head.pos_ == "VERB":
            verb1 = head
            break
        head = head.head
    verb2 = None
    head = token2
    while head != common_ancestor:
        if head.pos_ == "VERB":
            verb2 = head
            break
        head = head.head

    verb_token = None
    if common_ancestor.pos_ == "VERB":
        verb_token = common_ancestor
    elif verb1 and verb1 == verb2:
        verb_token = verb1
    elif common_ancestor.head.pos_ == "VERB":
        verb_token = common_ancestor.head

    if verb_token:
        verb_lemma = verb_token.lemma_
        for verb, rel_type in RELATION_VERBS.items():
            if verb_lemma == verb or verb_lemma in verb.split():
                logging.debug(f"Verb relation found: {token1.text}... {verb_lemma} ({rel_type}) ...{token2.text}")
                return rel_type, verb_lemma
    return None
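
# With a typical dependency parse of "Transformers use attention", the shared
# ancestor of "Transformers" and "attention" is the verb "use", whose lemma maps
# to USES, so the function returns ("USES", "use").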


def extract_entities_and_relations(text: str, doc_id: str):
    """Extracts concepts, mentions and improved relations from the text and stores them."""
    if not nlp:
        raise RuntimeError("The spaCy model could not be loaded.")
    spacy_doc = nlp(text)
    potential_concepts = {}
    mentions_in_doc = []
    valid_mentions = {}
    processed_spans = set()
    added_relations = set()

    # Collect candidate spans: named entities first, then noun chunks not already covered by an entity.
    candidates = []
    for ent in spacy_doc.ents:
        if ent.label_ in TRUSTED_ENTITY_LABELS or ent.label_ in OTHER_ENTITY_LABELS:
            candidates.append({"span": ent, "is_entity": True, "label": ent.label_})
    for chunk in spacy_doc.noun_chunks:
        is_covered = any(ent_data["span"].start_char <= chunk.start_char and ent_data["span"].end_char >= chunk.end_char
                         for ent_data in candidates if ent_data["is_entity"])
        if not is_covered:
            candidates.append({"span": chunk, "is_entity": False, "label": ""})

    # Validate each candidate, decide whether it is a concept, and record its mention.
    for data in candidates:
        span = data["span"]
        if span in processed_spans:
            continue
        validated_text = normalize_and_validate_concept(span.text, data["is_entity"], data["label"])
        if not validated_text:
            processed_spans.add(span)
            continue
        concept_lemma = span.lemma_.lower().strip() if span.lemma_ else validated_text.lower()
        is_concept = False
        if data["is_entity"] and data["label"] in TRUSTED_ENTITY_LABELS:
            is_concept = True
        elif NOUN_CHUNK_PATTERNS.match(validated_text):
            is_concept = True
        elif any(keyword in concept_lemma.split() or keyword in validated_text.lower().split() for keyword in AI_KEYWORDS):
            is_concept = True
        elif validated_text.isupper() and 1 < len(validated_text) < 6:
            is_concept = True

        if is_concept:
            concept_id = storage.add_concept(validated_text)
            if concept_id:
                mention_id = storage.add_mention(
                    doc_id=doc_id, concept_id=concept_id,
                    context=span.sent.text, start=span.start_char, end=span.end_char
                )
                if mention_id:
                    mention_data = {
                        "mention_id": mention_id, "concept_id": concept_id,
                        "start_char": span.start_char, "end_char": span.end_char,
                        "sentence": span.sent, "root_token": span.root
                    }
                    mentions_in_doc.append(mention_data)
                    valid_mentions[mention_id] = mention_data
        processed_spans.add(span)

    # Pair up mentions that share a sentence and record one relation per concept pair.
    for sentence in spacy_doc.sents:
        mentions_in_sentence = [m for m in mentions_in_doc if m["sentence"] == sentence]
        if len(mentions_in_sentence) >= 2:
            for m1_data, m2_data in itertools.combinations(mentions_in_sentence, 2):
                c1_id = m1_data["concept_id"]
                c2_id = m2_data["concept_id"]
                if c1_id == c2_id:
                    continue
                rel_pair = tuple(sorted((c1_id, c2_id)))
                if rel_pair in added_relations:
                    continue
                relation_found = False
                relation_info = find_verb_relation(m1_data["root_token"], m2_data["root_token"])
                if relation_info:
                    rel_type, verb = relation_info
                    storage.add_relationship(
                        source_concept_id=c1_id, target_concept_id=c2_id, rel_type=rel_type,
                        mention_id=m1_data["mention_id"], doc_id=doc_id, sentence=sentence.text
                    )
                    relation_found = True
                    added_relations.add(rel_pair)
                if not relation_found:
                    storage.add_relationship(
                        source_concept_id=c1_id, target_concept_id=c2_id, rel_type="RELATED_TO",
                        mention_id=m1_data["mention_id"], doc_id=doc_id, sentence=sentence.text
                    )
                    added_relations.add(rel_pair)
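
# Driver: walks documents whose status is 'added', extracts their text and runs
# the extraction above, updating each document's status along the way.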


def process_documents_for_extraction():
    """Processes pending documents and updates their status (unchanged from the previous version)."""
    if not nlp:
        raise RuntimeError("The spaCy model could not be loaded.")
    logging.info("Processing documents for advanced information extraction...")
    documents_df = storage.load_dataframe('documents', storage.DOC_COLUMNS)
    docs_to_process = documents_df[documents_df['status'] == 'added']
    if docs_to_process.empty:
        logging.info("No documents with status 'added' found to process.")
        return
    processed_count = 0
    failed_count = 0
    for index, doc_row in docs_to_process.iterrows():
        doc_id = doc_row['doc_id']
        filepath = Path(doc_row['filepath'])
        logging.info(f"Processing: {filepath.name} (ID: {doc_id})")
        text = loaders.extract_text_from_pdf(filepath)
        if text:
            try:
                extract_entities_and_relations(text, doc_id)
                storage.update_document_status(doc_id, 'processed_v3')
                processed_count += 1
            except Exception as e:
                logging.exception(f"UNEXPECTED ERROR while processing '{filepath.name}': {e}")
                storage.update_document_status(doc_id, 'extraction_failed_v3')
                failed_count += 1
        else:
            logging.warning(f"Could not extract text: {filepath.name}")
            storage.update_document_status(doc_id, 'text_extraction_failed')
            failed_count += 1
    logging.info(f"Advanced information extraction finished. Succeeded: {processed_count}, Failed: {failed_count}")