Update smart_chunk.py
Browse files- smart_chunk.py +45 -29
smart_chunk.py
CHANGED
@@ -1,29 +1,45 @@
|
|
1 |
-
# import nltk
|
2 |
-
# nltk.download('punkt') # Only needed once
|
3 |
-
# from nltk.tokenize import sent_tokenize
|
4 |
-
|
5 |
-
|
6 |
-
import
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# import nltk
|
2 |
+
# nltk.download('punkt') # Only needed once
|
3 |
+
# from nltk.tokenize import sent_tokenize
|
4 |
+
|
5 |
+
|
6 |
+
from typing import List
|
7 |
+
import re
|
8 |
+
from nltk.tokenize import sent_tokenize
|
9 |
+
|
10 |
+
def split_by_sections(text: str) -> List[str]:
    """Split legal text into chunks at section headings.

    A heading is the remainder of a line starting at ``Section <number>`` or
    ``Sec. <number>`` (case-insensitive), e.g. ``Section 1`` or ``Sec. 2.3``.

    Args:
        text: The document text to split.

    Returns:
        One ``"<header>\\n<content>"`` string per heading, in document order.
        Non-blank text preceding the first heading is returned as a leading
        chunk of its own, so a document without any heading comes back as a
        single chunk. Blank input yields an empty list.
    """
    # NOTE: the pattern is not anchored to the start of a line, so an inline
    # reference such as "see Section 3 below" also starts a new chunk.
    section_pattern = re.compile(
        r'(Section\s+\d+[^\n]*|Sec\.\s*\d+[^\n]*)', re.IGNORECASE
    )

    # With one capturing group, re.split returns
    # [preamble, header1, body1, header2, body2, ...].
    parts = section_pattern.split(text)

    chunks = []

    # Keep whatever precedes the first heading instead of dropping it.
    preamble = parts[0].strip()
    if preamble:
        chunks.append(preamble)

    for header, body in zip(parts[1::2], parts[2::2]):
        chunks.append(f"{header.strip()}\n{body.strip()}")
    return chunks
|
23 |
+
|
24 |
+
def smart_chunk_text(text: str, max_tokens: int = 128) -> List[str]:
    """Split a long legal document into sentence-aligned chunks.

    The text is first divided with ``split_by_sections``; each section is then
    split into sentences with ``sent_tokenize`` and the sentences are packed
    greedily into chunks. Chunks never span two sections.

    Args:
        text: The document text to chunk.
        max_tokens: Maximum chunk size, counted in whitespace-separated words
            (not model tokens). A single sentence longer than this limit is
            emitted as its own oversized chunk rather than being cut.

    Returns:
        The chunks in document order; no chunk is an empty string produced by
        flushing before any sentence was collected.
    """
    # Fall back to treating the whole text as one section when no section
    # heading is found, so such documents are still chunked.
    sections = split_by_sections(text) or [text]

    final_chunks: List[str] = []
    for section in sections:
        pending: List[str] = []  # sentences collected for the current chunk
        pending_words = 0
        for sentence in sent_tokenize(section):
            n_words = len(sentence.split())
            # Flush only when something has been collected; otherwise an
            # over-long first sentence would emit an empty chunk.
            if pending and pending_words + n_words > max_tokens:
                final_chunks.append(" ".join(pending).strip())
                pending, pending_words = [], 0
            pending.append(sentence)
            pending_words += n_words
        if pending:
            final_chunks.append(" ".join(pending).strip())

    return final_chunks
|
45 |
+
|