# src/extraction/extractor.py (full code with the AttributeError fixed)
import spacy
from pathlib import Path
import logging
import itertools
import re
import string
# Import our local modules
from src.data_management import storage
from src.data_management import loaders  # for extract_text_from_pdf
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# --- spaCy model loading ---
nlp = None
STOP_WORDS = set()
try:
    nlp = spacy.load("en_core_web_lg")
    logging.info("spaCy model 'en_core_web_lg' loaded successfully.")
    STOP_WORDS = nlp.Defaults.stop_words
except OSError:
    logging.error("spaCy model 'en_core_web_lg' not found. Please install it: python -m spacy download en_core_web_lg")
# --- Concept identification criteria (unchanged) ---
TRUSTED_ENTITY_LABELS = {"PRODUCT", "ORG", "WORK_OF_ART"}
OTHER_ENTITY_LABELS = {"PERSON", "EVENT", "LAW", "NORP", "FAC", "GPE", "LOC"}
NOUN_CHUNK_PATTERNS = re.compile(r".*\b(learning|network|model|algorithm|system|technique|approach|agent|layer|architecture|transformer|attention)\b$", re.IGNORECASE)
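# This pattern accepts noun chunks whose final word is one of the ML head nouns
# above, e.g. "convolutional neural network" or "multi-head attention".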
MIN_CONCEPT_WORDS = 1
MAX_CONCEPT_WORDS = 6
AI_KEYWORDS = {"artificial intelligence", "machine learning", "deep learning",
               "neural network", "reinforcement learning", "transformer", "llm",
               "large language model", "computer vision", "natural language processing",
               "algorithm", "model", "gpt", "bert", "agent", "attention", "supervised",
               "unsupervised", "classification", "regression", "clustering"}
# --- Verbs and patterns for relation extraction ---
RELATION_VERBS = {
    "use": "USES", "utilize": "USES", "apply": "USES", "employ": "USES",
    "improve": "IMPROVES", "enhance": "IMPROVES", "extend": "IMPROVES", "outperform": "IMPROVES",
    "base on": "BASED_ON", "rely on": "BASED_ON",
    "compare": "COMPARES_TO", "relate": "RELATED_TO", "associate": "RELATED_TO", "link": "RELATED_TO",
    "propose": "PROPOSES", "introduce": "PROPOSES", "develop": "PROPOSES",
}
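# Note: multi-word keys such as "base on" are matched in find_verb_relation()
# by checking the verb lemma against the individual tokens of the key
# (verb.split()), so a lemma of "base" or "rely" is enough to trigger BASED_ON.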
def normalize_and_validate_concept(text: str, is_entity: bool = False, entity_label: str = "") -> str | None:
    """Clean the given text and validate it as a concept candidate.

    Returns the normalized text, or None if the candidate fails any filter.
    """
    cleaned_text = text.strip()
    word_count = len(cleaned_text.split())
    if not (MIN_CONCEPT_WORDS <= word_count <= MAX_CONCEPT_WORDS): return None
    # Reject candidates made up entirely of stop words
    if cleaned_text and all(word.lower() in STOP_WORDS for word in re.findall(r'\b\w+\b', cleaned_text)): return None
    # Reject pure numbers and pure punctuation
    if cleaned_text.isdigit() or all(c in string.punctuation for c in cleaned_text): return None
    generic_phrases = {"this approach", "these models", "this technique", "this system",
                       "the model", "the algorithm", "the method", "the approach",
                       "the system", "the technique", "our model", "our approach"}
    if cleaned_text.lower() in generic_phrases: return None
    return cleaned_text
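# Illustrative behaviour (derived directly from the filters above):
#   normalize_and_validate_concept("  deep learning ")  -> "deep learning"
#   normalize_and_validate_concept("the model")         -> None  (generic phrase)
#   normalize_and_validate_concept("2023")              -> None  (digits only)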
def find_verb_relation(token1: spacy.tokens.Token, token2: spacy.tokens.Token) -> tuple[str, str] | None:
    """Find a verb relation between two tokens via their dependency path."""
    common_ancestor = None
    ancestors1 = list(token1.ancestors)
    ancestors2 = list(token2.ancestors)
    # token.ancestors yields the nearest ancestor first, so scanning ancestors1
    # in order finds the *lowest* common ancestor (scanning it reversed would
    # almost always return the sentence root instead).
    for t in ancestors1:
        if t in ancestors2:
            common_ancestor = t
            break
    if common_ancestor is None: return None
    # Walk from each token up to the common ancestor, noting the first verb on each path
    verb1 = None
    head = token1
    while head != common_ancestor:
        if head.pos_ == "VERB":
            verb1 = head
            break
        head = head.head
    verb2 = None
    head = token2
    while head != common_ancestor:
        if head.pos_ == "VERB":
            verb2 = head
            break
        head = head.head
    verb_token = None
    if common_ancestor.pos_ == "VERB": verb_token = common_ancestor
    elif verb1 and verb1 == verb2: verb_token = verb1
    # elif verb1: verb_token = verb1  # Ignore one-sided verbs for now
    # elif verb2: verb_token = verb2
    elif common_ancestor.head.pos_ == "VERB": verb_token = common_ancestor.head
    if verb_token:
        verb_lemma = verb_token.lemma_
        # *** BUG FIX: this check raised AttributeError (spaCy tokens have no
        # 'is_aux' attribute), so it is commented out for now ***
        # if verb_token.is_aux or verb_token.is_stop:
        #     return None
        for verb, rel_type in RELATION_VERBS.items():
            if verb_lemma == verb or verb_lemma in verb.split():
                logging.debug(f"Verb relation found: {token1.text}... {verb_lemma} ({rel_type}) ...{token2.text}")
                return rel_type, verb_lemma
    return None
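# Illustrative example (parses can vary by spaCy version): in "GPT models use
# attention.", the roots of "GPT models" and "attention" share the verb "use"
# as their lowest common ancestor, so this returns ("USES", "use").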
def extract_entities_and_relations(text: str, doc_id: str):
    """Extract concepts, mentions, and IMPROVED relations from the text."""
    if not nlp: raise RuntimeError("spaCy model could not be loaded.")
    spacy_doc = nlp(text)
    potential_concepts = {}
    mentions_in_doc = []
    valid_mentions = {}
    processed_spans = set()
    added_relations = set()
    # 1. Find candidates
    candidates = []
    for ent in spacy_doc.ents:
        if ent.label_ in TRUSTED_ENTITY_LABELS or ent.label_ in OTHER_ENTITY_LABELS:
            candidates.append({"span": ent, "is_entity": True, "label": ent.label_})
    for chunk in spacy_doc.noun_chunks:
        # Skip noun chunks fully covered by an already-collected entity span
        is_covered = any(ent_data["span"].start_char <= chunk.start_char and ent_data["span"].end_char >= chunk.end_char
                         for ent_data in candidates if ent_data["is_entity"])
        if not is_covered:
            candidates.append({"span": chunk, "is_entity": False, "label": ""})
    # 2. Filter, normalize, and store candidates
    for data in candidates:
        span = data["span"]
        if span in processed_spans: continue
        validated_text = normalize_and_validate_concept(span.text, data["is_entity"], data["label"])
        if not validated_text:
            processed_spans.add(span)
            continue
        concept_lemma = span.lemma_.lower().strip() if span.lemma_ else validated_text.lower()
        is_concept = False
        if data["is_entity"] and data["label"] in TRUSTED_ENTITY_LABELS: is_concept = True
        elif NOUN_CHUNK_PATTERNS.match(validated_text): is_concept = True
        # Single-word keywords need an exact token match; multi-word keywords
        # ("machine learning", ...) can never appear in a split() list, so they
        # are additionally matched as phrase substrings.
        elif any(keyword in concept_lemma.split() or keyword in validated_text.lower().split()
                 or (" " in keyword and keyword in validated_text.lower())
                 for keyword in AI_KEYWORDS): is_concept = True
        elif validated_text.isupper() and 1 < len(validated_text) < 6: is_concept = True
        if is_concept:
            concept_id = storage.add_concept(validated_text)
            if concept_id:
                mention_id = storage.add_mention(
                    doc_id=doc_id, concept_id=concept_id,
                    context=span.sent.text, start=span.start_char, end=span.end_char
                )
                if mention_id:
                    mention_data = {
                        "mention_id": mention_id, "concept_id": concept_id,
                        "start_char": span.start_char, "end_char": span.end_char,
                        "sentence": span.sent, "root_token": span.root
                    }
                    mentions_in_doc.append(mention_data)
                    valid_mentions[mention_id] = mention_data
        processed_spans.add(span)
    # 3. Extract relations
    for sentence in spacy_doc.sents:
        mentions_in_sentence = [m for m in mentions_in_doc if m["sentence"] == sentence]
        if len(mentions_in_sentence) >= 2:
            for m1_data, m2_data in itertools.combinations(mentions_in_sentence, 2):
                c1_id = m1_data["concept_id"]
                c2_id = m2_data["concept_id"]
                if c1_id == c2_id: continue
                rel_pair = tuple(sorted((c1_id, c2_id)))
                if rel_pair in added_relations: continue
                relation_found = False
                relation_info = find_verb_relation(m1_data["root_token"], m2_data["root_token"])
                if relation_info:
                    rel_type, verb = relation_info
                    storage.add_relationship(
                        source_concept_id=c1_id, target_concept_id=c2_id, rel_type=rel_type,
                        mention_id=m1_data["mention_id"], doc_id=doc_id, sentence=sentence.text
                    )
                    relation_found = True
                    added_relations.add(rel_pair)
                if not relation_found:
                    # Fall back to a generic RELATED_TO link for co-occurring concepts
                    storage.add_relationship(
                        source_concept_id=c1_id, target_concept_id=c2_id, rel_type="RELATED_TO",
                        mention_id=m1_data["mention_id"], doc_id=doc_id, sentence=sentence.text
                    )
                    added_relations.add(rel_pair)
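# Illustrative call (assumes storage is initialized; the doc_id format is arbitrary):
#   extract_entities_and_relations("GPT models use attention.", doc_id="doc_0001")
# would typically register "GPT models" and "attention" as concepts (both hit
# AI_KEYWORDS) and link them with a USES relationship via the verb lemma "use".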
def process_documents_for_extraction():
    """Process pending documents and update their status (same flow as before)."""
    if not nlp: raise RuntimeError("spaCy model could not be loaded.")
    logging.info("Processing documents for advanced information extraction...")
    documents_df = storage.load_dataframe('documents', storage.DOC_COLUMNS)
    docs_to_process = documents_df[documents_df['status'] == 'added']
    if docs_to_process.empty:
        logging.info("No documents with status 'added' were found to process.")
        return
    processed_count = 0
    failed_count = 0
    for index, doc_row in docs_to_process.iterrows():
        doc_id = doc_row['doc_id']
        filepath = Path(doc_row['filepath'])
        logging.info(f"Processing: {filepath.name} (ID: {doc_id})")
        text = loaders.extract_text_from_pdf(filepath)
        if text:
            try:
                extract_entities_and_relations(text, doc_id)
                storage.update_document_status(doc_id, 'processed_v3')  # new version status
                processed_count += 1
            except Exception as e:
                logging.exception(f"UNEXPECTED ERROR while processing '{filepath.name}': {e}")
                storage.update_document_status(doc_id, 'extraction_failed_v3')
                failed_count += 1
        else:
            logging.warning(f"Could not extract text: {filepath.name}")
            storage.update_document_status(doc_id, 'text_extraction_failed')
            failed_count += 1
    logging.info(f"Advanced information extraction finished. Succeeded: {processed_count}, Failed: {failed_count}")