Upload 10 files
- .gitattributes +1 -0
- app.py +21 -0
- legal_chunks.pkl +3 -0
- legal_index.faiss +3 -0
- llama_query.py +45 -0
- main.py +214 -0
- ollama_initial.py +22 -0
- processed_pdfs.pkl +3 -0
- processed_urls.pkl +3 -0
- requirements.txt +9 -0
- smart_chunk.py +29 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+legal_index.faiss filter=lfs diff=lfs merge=lfs -text
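The added rule keeps the FAISS index out of regular Git storage. For reference, the line above is exactly what Git LFS appends to .gitattributes when the file is tracked from the CLI:

git lfs track "legal_index.faiss"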
app.py
ADDED
@@ -0,0 +1,21 @@

# app.py

import gradio as gr
from main import load_index, query_index, ask_llm_with_context

# main.py only loads the saved index inside its __main__ block, so the
# Gradio app must load it explicitly before serving queries.
if not load_index():
    raise RuntimeError("legal_index.faiss / legal_chunks.pkl not found")

def legal_assistant(question):
    if not question.strip():
        return "Please enter a valid question."
    context = query_index(question)
    answer = ask_llm_with_context(question, context)
    return answer

iface = gr.Interface(
    fn=legal_assistant,
    inputs=gr.Textbox(lines=4, placeholder="Ask about BC land survey law..."),
    outputs="text",
    title="📐 BC Land Survey Legal Assistant",
    description="Ask any question related to BC Land Surveying based on laws, regulations, acts, bulletins, and more.",
)

iface.launch()
legal_chunks.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ef6cbd006cf78cbd4f00098f1d34284453e38c5065f8f58d6a8056a7862ded4a
size 1819221
legal_index.faiss
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9b7e5d8e61adabcd00a6aa01fb19f48a124ba8ba309c475294e9d3325e73505b
size 7756845
llama_query.py
ADDED
@@ -0,0 +1,45 @@

# llama_query.py

import openai
import os

# Load your OpenAI key (use an environment variable on Hugging Face later)
openai.api_key = os.getenv("OPENAI_API_KEY")

def ask_llm_with_context(question, context, model="gpt-3.5-turbo"):
    prompt = f"""
You are a knowledgeable teaching assistant for land surveyors in British Columbia.
Your role is to support students, candidates, and practicing professionals (such as BCLS or LSTs)
by answering their questions based strictly on the legal and regulatory framework governing land surveying in British Columbia.

These users are subject to the statutes, regulations, Survey and Plan Rules, bylaws, circular letters, practice bulletins, and the Code of Ethics
of the Association of British Columbia Land Surveyors (ABCLS).
Your task is to provide clear, accurate, and context-grounded answers to support their professional development.

When answering, always:
- Reference relevant Acts, manuals, Survey and Plan Rules, circular letters, or bulletins (include section/page if known)
- Mention the Code of Ethics if applicable to professional conduct
- Avoid speculation; only respond based on the provided legal context
- If an answer is not directly found in the context, recommend where it might be found (e.g., a specific manual or circular)

Context:
\"\"\"
{context}
\"\"\"

Question: {question}
Answer:
"""
    try:
        # Legacy (pre-1.0) openai SDK interface; see the note after this file
        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful legal assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3
        )
        return response['choices'][0]['message']['content'].strip()
    except Exception as e:
        print(f"[!] Error calling OpenAI: {e}")
        return "Sorry, I couldn't generate a response."
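The try block above targets the legacy openai SDK, whose ChatCompletion interface was removed in openai 1.0. A minimal sketch of the equivalent call against the 1.x client, assuming only the client call changes and the prompt construction stays as written:

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def ask_llm_v1(prompt, model="gpt-3.5-turbo"):
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful legal assistant."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.3,
    )
    return response.choices[0].message.content.strip()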
main.py
ADDED
@@ -0,0 +1,214 @@
import os
import requests
import pdfplumber
import trafilatura
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle
import argparse

from ollama_initial import start_ollama_model
from llama_query import ask_llm_with_context
from smart_chunk import smart_chunk_text  # for semantic-aware chunking

# === Config ===
INDEX_FILE = "legal_index.faiss"
DOCS_FILE = "legal_chunks.pkl"
PDF_CACHE_FILE = "processed_pdfs.pkl"
URL_CACHE_FILE = "processed_urls.pkl"

EMBEDDING_MODEL = "intfloat/e5-base-v2"
ALLOWED_DOMAINS = ["gov", "org", "ca"]
PDF_FOLDER = "pdf"
URL_FILE = "urls.txt"

# === CLI args ===
parser = argparse.ArgumentParser()
parser.add_argument("--update", action="store_true", help="Update only new PDFs/URLs (uses cache)")
parser.add_argument("--updateall", action="store_true", help="Force complete reindexing of all documents (ignores cache)")
args = parser.parse_args()

# === Embedding setup ===
model = SentenceTransformer(EMBEDDING_MODEL)
vector_index = faiss.IndexFlatL2(model.get_sentence_embedding_dimension())
documents = []

# === Cache handling ===
def load_cache(file):
    if os.path.exists(file):
        with open(file, "rb") as f:
            return pickle.load(f)
    return set()

def save_cache(data, file):
    with open(file, "wb") as f:
        pickle.dump(data, f)

# === Index persistence ===
def save_index():
    faiss.write_index(vector_index, INDEX_FILE)
    with open(DOCS_FILE, "wb") as f:
        pickle.dump(documents, f)
    print("✅ Vector index and chunks saved.")

def load_index():
    global vector_index, documents
    if os.path.exists(INDEX_FILE) and os.path.exists(DOCS_FILE):
        print("📂 Loading existing vector index and documents...")
        vector_index = faiss.read_index(INDEX_FILE)
        with open(DOCS_FILE, "rb") as f:
            documents = pickle.load(f)
        return True
    return False

# === Chunk + embed ===
def store_text_chunks(text):
    chunks = smart_chunk_text(text)
    for chunk in chunks:
        if chunk.strip():
            documents.append(chunk)
            vector = model.encode(chunk)
            vector_index.add(np.array([vector]))

# === Text extraction ===
def get_text_from_pdf_file(filepath):
    try:
        with pdfplumber.open(filepath) as pdf:
            return "\n".join(page.extract_text() or '' for page in pdf.pages)
    except Exception as e:
        print(f"[!] Failed to read PDF: {filepath} - {e}")
        return ""

def get_text_from_pdf_url(url):
    try:
        response = requests.get(url)
        filename = "temp.pdf"
        with open(filename, "wb") as f:
            f.write(response.content)
        text = get_text_from_pdf_file(filename)
        os.remove(filename)
        return text
    except Exception as e:
        print(f"[!] Failed to fetch PDF from URL: {url} - {e}")
        return ""

def get_text_from_html(url):
    try:
        html = requests.get(url).text
        return trafilatura.extract(html, include_comments=False, include_tables=False) or ""
    except Exception as e:
        print(f"[!] Failed HTML: {url} - {e}")
        return ""

def is_valid_link(link, base_url):
    full_url = urljoin(base_url, link)
    parsed = urlparse(full_url)
    # Substring match: "gov" matches any host that merely contains "gov"
    return parsed.scheme.startswith("http") and any(tld in parsed.netloc for tld in ALLOWED_DOMAINS)

# === Processing ===
def process_pdf_folder(folder_path=PDF_FOLDER, processed_files=None):
    if processed_files is None:
        processed_files = set()
    for filename in os.listdir(folder_path):
        if not filename.lower().endswith(".pdf"):
            continue  # ignore non-PDF files silently
        if filename not in processed_files:
            full_path = os.path.join(folder_path, filename)
            print(f"📄 Reading new PDF: {full_path}")
            text = get_text_from_pdf_file(full_path)
            store_text_chunks(text)
            processed_files.add(filename)
        else:
            print(f"✅ Skipping already processed PDF: {filename}")

def crawl_url(url, depth=1, processed_urls=None):
    if processed_urls is None:
        processed_urls = set()
    if url in processed_urls:
        print(f"✅ Skipping already crawled URL: {url}")
        return

    print(f"🌐 Crawling: {url}")
    visited = set()
    to_visit = [url]

    # depth is decremented once per page fetched, so depth=1 indexes the
    # seed page and queues (but does not follow) its outgoing links
    while to_visit and depth > 0:
        current = to_visit.pop()
        visited.add(current)

        if current.endswith(".pdf"):
            text = get_text_from_pdf_url(current)
        else:
            text = get_text_from_html(current)

        store_text_chunks(text)
        processed_urls.add(current)

        try:
            page = requests.get(current).text
            soup = BeautifulSoup(page, "html.parser")
            for a in soup.find_all("a", href=True):
                href = a["href"]
                full_url = urljoin(current, href)
                if full_url not in visited and is_valid_link(href, current):
                    to_visit.append(full_url)
        except Exception:
            continue  # skips the depth decrement for pages that fail to fetch

        depth -= 1

# === Retrieval ===
def load_urls(file_path=URL_FILE):
    with open(file_path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]

def query_index(question, top_k=3):
    if not documents:
        return "No documents found in the index."
    query = f"query: {question}"  # E5 models expect this prefix on queries
    q_vector = model.encode(query)
    D, I = vector_index.search(np.array([q_vector]), top_k)
    return "\n---\n".join([documents[i] for i in I[0]])

# === Main Execution ===
if __name__ == "__main__":
    if args.updateall:
        print("🔄 Rebuilding index from scratch...")
        processed_pdfs = set()
        processed_urls = set()
    else:
        processed_pdfs = load_cache(PDF_CACHE_FILE)
        processed_urls = load_cache(URL_CACHE_FILE)

    started = start_ollama_model()
    if not started:
        print("❌ Could not connect to Ollama")
        exit(1)

    if args.updateall or not load_index() or args.update:
        print("🔄 Updating or creating index...")
        process_pdf_folder(processed_files=processed_pdfs)
        for url in load_urls():
            crawl_url(url, depth=1, processed_urls=processed_urls)
        save_index()
        save_cache(processed_pdfs, PDF_CACHE_FILE)
        save_cache(processed_urls, URL_CACHE_FILE)
    else:
        print("✅ Loaded existing index. Ready to query.")

    print("\n❓ Ready to query your legal database (type 'exit' to quit)")
    while True:
        question = input("\n💬 Your question: ")
        if question.strip().lower() in ["exit", "quit", "q"]:
            print("👋 Exiting. See you next time!")
            break
        context = query_index(question)
        answer = ask_llm_with_context(question, context)
        print("\n🧠 LLaMA 3 Answer:")
        print(answer)

# This version includes all three enhancements:
# - Smart chunking via smart_chunk.py
# - A high-quality embedding model (E5)
# - A structured prompt with legal-assistant context and disclaimer
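One caveat on the E5 embeddings: the e5 model family is trained with asymmetric "query: " / "passage: " prefixes, and query_index adds the query prefix while store_text_chunks embeds raw chunks. A sketch of the symmetric convention, reusing the module-level model above; adopting it would require rebuilding legal_index.faiss:

def embed_passage(chunk):
    # "passage: " is the document-side prefix the E5 models were trained with
    return model.encode(f"passage: {chunk}")

def embed_query(question):
    return model.encode(f"query: {question}")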
ollama_initial.py
ADDED
@@ -0,0 +1,22 @@
import subprocess
import time
import requests

def is_ollama_running():
    try:
        r = requests.get("http://localhost:11434")
        return r.status_code == 200
    except Exception:
        return False

def start_ollama_model(model_name="llama3"):
    print(f"🚀 Starting Ollama model: {model_name}")
    subprocess.Popen(["ollama", "run", model_name], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    print("⏳ Waiting for Ollama to be ready...")
    for _ in range(20):  # wait up to ~10 seconds
        if is_ollama_running():
            print("✅ Ollama is up!")
            return True
        time.sleep(0.5)
    print("❌ Ollama failed to start or is not responding.")
    return False
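Note that main.py starts this local llama3 instance but then generates answers through OpenAI in llama_query.py. If the intent is to answer with the local model instead, a minimal sketch against Ollama's documented REST endpoint (the prompt string is whatever the caller assembles):

import requests

def ask_local_llama(prompt, model="llama3"):
    # Non-streaming call to Ollama's generate endpoint; returns one JSON object
    r = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": model, "prompt": prompt, "stream": False},
        timeout=120,
    )
    r.raise_for_status()
    return r.json()["response"]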
processed_pdfs.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3013d2b193e976388ff0e46a0dece10d3a6fbf44fb741c45bea556d57d0d9dc9
size 1349
processed_urls.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f804961b5bd976bfafce25a083bcce0527feea4de5964d8000e4ae063009fd20
size 776
requirements.txt
ADDED
@@ -0,0 +1,9 @@
gradio
pdfplumber
requests
beautifulsoup4
trafilatura
sentence-transformers
faiss-cpu
numpy
openai
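None of these are pinned, and llama_query.py uses the legacy openai.ChatCompletion interface that was removed in openai 1.0, so a fresh install of this file will pull an incompatible SDK. nltk, imported by smart_chunk.py, is also not listed directly (it may arrive transitively via sentence-transformers, but listing it is safer). One way to keep the code as written working, assuming no other constraint on the SDK version:

openai<1.0
nltk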
smart_chunk.py
ADDED
@@ -0,0 +1,29 @@
import nltk

# Automatically download 'punkt' if not already available
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

from nltk.tokenize import sent_tokenize


def smart_chunk_text(text, max_tokens=128):
    # Greedy sentence packing: whitespace word count stands in for tokens
    sentences = sent_tokenize(text)
    chunks, current_chunk = [], ""
    for sentence in sentences:
        if len(current_chunk.split()) + len(sentence.split()) <= max_tokens:
            current_chunk += " " + sentence
        else:
            if current_chunk:  # avoid emitting an empty chunk for oversized sentences
                chunks.append(current_chunk.strip())
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
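A quick usage sketch; the exact split depends on NLTK's punkt sentence boundaries, and max_tokens is lowered here so the packing behaviour is visible on a short passage:

from smart_chunk import smart_chunk_text

text = (
    "The Land Surveyors Act governs the profession in British Columbia. "
    "Registration requirements are set out by the ABCLS. "
    "Practice bulletins provide additional guidance."
)
for i, chunk in enumerate(smart_chunk_text(text, max_tokens=16)):
    print(i, chunk)
# expected: two chunks - sentence 1 alone, then sentences 2 and 3 packed together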