import re
from typing import List

from nltk.tokenize import sent_tokenize  # Requires nltk.download('punkt'); only needed once


def split_by_sections(text: str) -> List[str]:
    """
    Split legal text into chunks based on section headings like 'Section 1' or 'Sec. 2.3'.
    Any text that appears before the first heading is kept as its own chunk.
    """
    section_pattern = re.compile(r'(Section\s+\d+[^\n]*|Sec\.\s*\d+[^\n]*)', re.IGNORECASE)
    parts = section_pattern.split(text)

    # Because the pattern uses a capturing group, parts alternates:
    # [preamble, header1, body1, header2, body2, ...]
    preamble = parts[0].strip()
    chunks = [preamble] if preamble else []
    for i in range(1, len(parts), 2):
        header = parts[i].strip()
        content = parts[i + 1].strip() if i + 1 < len(parts) else ""
        chunks.append(f"{header}\n{content}")
    return chunks


def smart_chunk_text(text: str, max_tokens: int = 128) -> List[str]:
    """
    Split a long legal document into semantically meaningful chunks:
    the text is first split on section headings, then each section is packed
    sentence by sentence up to roughly max_tokens whitespace-separated words.
    """
    final_chunks = []
    for section in split_by_sections(text):
        sentences = sent_tokenize(section)
        current_chunk = ""
        for sentence in sentences:
            if len(current_chunk.split()) + len(sentence.split()) <= max_tokens:
                current_chunk += " " + sentence
            else:
                # Guard against emitting an empty chunk when a single sentence
                # already exceeds the budget.
                if current_chunk:
                    final_chunks.append(current_chunk.strip())
                current_chunk = sentence
        if current_chunk:
            final_chunks.append(current_chunk.strip())
    return final_chunks
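

# A minimal usage sketch. The sample text and print format below are illustrative only,
# not part of the original module; they just show how sections are detected from the
# headings and then packed into word-bounded chunks. Assumes the 'punkt' data is downloaded.
if __name__ == "__main__":
    sample = (
        "Preamble: this agreement is entered into by the parties identified below.\n"
        "Section 1 Definitions\n"
        "In this agreement, 'Supplier' means the party providing the services. "
        "'Client' means the party receiving them.\n"
        "Sec. 2.1 Term\n"
        "This agreement commences on the effective date and continues for twelve months."
    )
    for n, chunk in enumerate(smart_chunk_text(sample, max_tokens=40), start=1):
        print(f"--- chunk {n} ({len(chunk.split())} words) ---")
        print(chunk)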