allenlsl committed
Commit 9eadfc9 · verified · 1 parent: e62a9ed

Update smart_chunk.py

Files changed (1):
  1. smart_chunk.py (+45 / -29)
smart_chunk.py CHANGED
@@ -1,29 +1,45 @@
- # import nltk
- # nltk.download('punkt') # Only needed once
- # from nltk.tokenize import sent_tokenize
-
-
- import nltk
-
-
- # Automatically download 'punkt' if not already available
- try:
-     nltk.data.find('tokenizers/punkt')
- except LookupError:
-     nltk.download('punkt')
-
- from nltk.tokenize import sent_tokenize
-
-
- def smart_chunk_text(text, max_tokens=128):
-     sentences = sent_tokenize(text)
-     chunks, current_chunk = [], ""
-     for sentence in sentences:
-         if len(current_chunk.split()) + len(sentence.split()) <= max_tokens:
-             current_chunk += " " + sentence
-         else:
-             chunks.append(current_chunk.strip())
-             current_chunk = sentence
-     if current_chunk:
-         chunks.append(current_chunk.strip())
-     return chunks
+ # import nltk
+ # nltk.download('punkt') # Only needed once
+ # from nltk.tokenize import sent_tokenize
+
+
+ from typing import List
+ import re
+ from nltk.tokenize import sent_tokenize
+
+ def split_by_sections(text: str) -> List[str]:
+     """
+     Splits legal text into chunks based on section headings like 'Section 1' or 'Sec. 2.3'.
+     """
+     section_pattern = re.compile(r'(Section\s+\d+[^\n]*|Sec\.\s*\d+[^\n]*)', re.IGNORECASE)
+     parts = section_pattern.split(text)
+
+     chunks = []
+     for i in range(1, len(parts), 2):
+         header = parts[i].strip()
+         content = parts[i + 1].strip() if i + 1 < len(parts) else ""
+         chunks.append(f"{header}\n{content}")
+     return chunks
+
+ def smart_chunk_text(text: str, max_tokens: int = 128) -> List[str]:
+     """
+     Split a long legal document into semantically meaningful chunks: split on
+     section headers first, then pack each section's sentences into chunks of at most max_tokens words.
+     """
+     final_chunks = []
+     section_chunks = split_by_sections(text)
+
+     for section in section_chunks:
+         sentences = sent_tokenize(section)
+         current_chunk = ""
+         for sentence in sentences:
+             if len(current_chunk.split()) + len(sentence.split()) <= max_tokens:
+                 current_chunk += " " + sentence
+             else:
+                 final_chunks.append(current_chunk.strip())
+                 current_chunk = sentence
+         if current_chunk:
+             final_chunks.append(current_chunk.strip())
+
+     return final_chunks
+
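
For reference, below is a minimal usage sketch of the revised chunker. The sample text, the max_tokens value, and the import path (smart_chunk, i.e. this file used as a module) are illustrative assumptions, not part of the repository; it also assumes the NLTK punkt sentence-tokenizer data is already installed, since this revision drops the automatic nltk.download('punkt') step.

# Illustrative usage only; 'sample' is hypothetical text, not repository data.
from smart_chunk import smart_chunk_text  # assumes smart_chunk.py is importable

sample = (
    "Section 1. Definitions\n"
    "In this Agreement, 'Service' means the hosted platform. "
    "Capitalized terms have the meanings given in this section.\n"
    "Section 2. Term\n"
    "This Agreement begins on the Effective Date and continues for one year."
)

# Each 'Section N' heading starts a new section; sentences within a section are
# packed into chunks of at most max_tokens whitespace-separated words.
for i, chunk in enumerate(smart_chunk_text(sample, max_tokens=40)):
    print(f"--- chunk {i} ({len(chunk.split())} words) ---")
    print(chunk)

Note that split_by_sections only emits chunks when the regex finds at least one 'Section N' / 'Sec. N' heading; for text without such headings it returns an empty list, and smart_chunk_text then returns an empty list as well.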