allenlsl committed
Commit 9eadfc9 · verified · 1 parent: e62a9ed

Update smart_chunk.py

Files changed (1):
  1. smart_chunk.py (+45 / -29)
smart_chunk.py CHANGED
@@ -1,29 +1,45 @@
- # import nltk
- # nltk.download('punkt') # Only needed once
- # from nltk.tokenize import sent_tokenize
-
-
- import nltk
-
-
- # Automatically download 'punkt' if not already available
- try:
-     nltk.data.find('tokenizers/punkt')
- except LookupError:
-     nltk.download('punkt')
-
- from nltk.tokenize import sent_tokenize
-
-
- def smart_chunk_text(text, max_tokens=128):
-     sentences = sent_tokenize(text)
-     chunks, current_chunk = [], ""
-     for sentence in sentences:
-         if len(current_chunk.split()) + len(sentence.split()) <= max_tokens:
-             current_chunk += " " + sentence
-         else:
-             chunks.append(current_chunk.strip())
-             current_chunk = sentence
-     if current_chunk:
-         chunks.append(current_chunk.strip())
-     return chunks
+ # import nltk
+ # nltk.download('punkt') # Only needed once
+ # from nltk.tokenize import sent_tokenize
+
+
+ from typing import List
+ import re
+ from nltk.tokenize import sent_tokenize
+
+ def split_by_sections(text: str) -> List[str]:
+     """
+     Splits legal text into chunks based on section headings like 'Section 1' or 'Sec. 2.3'.
+     """
+     section_pattern = re.compile(r'(Section\s+\d+[^\n]*|Sec\.\s*\d+[^\n]*)', re.IGNORECASE)
+     parts = section_pattern.split(text)
+
+     chunks = []
+     for i in range(1, len(parts), 2):
+         header = parts[i].strip()
+         content = parts[i + 1].strip() if i + 1 < len(parts) else ""
+         chunks.append(f"{header}\n{content}")
+     return chunks
+
+ def smart_chunk_text(text: str, max_tokens: int = 128) -> List[str]:
+     """
+     Split a long legal document into semantically meaningful chunks: split on
+     section headers first, then pack each section's sentences into chunks of at most max_tokens words.
+     """
+     final_chunks = []
+     section_chunks = split_by_sections(text)
+
+     for section in section_chunks:
+         sentences = sent_tokenize(section)
+         current_chunk = ""
+         for sentence in sentences:
+             if len(current_chunk.split()) + len(sentence.split()) <= max_tokens:
+                 current_chunk += " " + sentence
+             else:
+                 final_chunks.append(current_chunk.strip())
+                 current_chunk = sentence
+         if current_chunk:
+             final_chunks.append(current_chunk.strip())
+
+     return final_chunks
+
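
For reference, below is a minimal usage sketch of the revised chunker. The sample text, the max_tokens value, and the import path (smart_chunk, i.e. this file used as a module) are illustrative assumptions, not part of the repository; it also assumes the NLTK punkt sentence-tokenizer data is already installed, since this revision drops the automatic nltk.download('punkt') step.

# Illustrative usage only; 'sample' is hypothetical text, not repository data.
from smart_chunk import smart_chunk_text  # assumes smart_chunk.py is importable

sample = (
    "Section 1. Definitions\n"
    "In this Agreement, 'Service' means the hosted platform. "
    "Capitalized terms have the meanings given in this section.\n"
    "Section 2. Term\n"
    "This Agreement begins on the Effective Date and continues for one year."
)

# Each 'Section N' heading starts a new section; sentences within a section are
# packed into chunks of at most max_tokens whitespace-separated words.
for i, chunk in enumerate(smart_chunk_text(sample, max_tokens=40)):
    print(f"--- chunk {i} ({len(chunk.split())} words) ---")
    print(chunk)

Note that split_by_sections only emits chunks when the regex finds at least one 'Section N' / 'Sec. N' heading; for text without such headings it returns an empty list, and smart_chunk_text then returns an empty list as well.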