allenlsl committed on
Commit
8c37f9b
·
verified ·
1 Parent(s): 87304c3

Upload 10 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ legal_index.faiss filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+
3
+ import gradio as gr
4
+ from main import query_index, ask_llm_with_context
5
+
6
def legal_assistant(question):
    """Answer a question for the Gradio UI using retrieved legal context.

    Args:
        question: Text typed by the user; may be empty or None.

    Returns:
        The LLM answer, or a prompt to enter a valid question when the
        input is blank.
    """
    if not question or not question.strip():
        return "Please enter a valid question."

    # `main` only loads the persisted index under its `__main__` guard, so
    # when it is imported here the document list starts out empty. Load the
    # saved index lazily on first use; `import main` is a cheap cache hit
    # because the module was already imported at the top of this file.
    import main as _main
    if not _main.documents:
        _main.load_index()

    context = query_index(question)
    return ask_llm_with_context(question, context)
12
+
13
# Four-line free-text input for the user's question.
question_box = gr.Textbox(lines=4, placeholder="Ask about BC land survey law...")

# Single-input, plain-text-output Gradio app wrapped around legal_assistant.
iface = gr.Interface(
    fn=legal_assistant,
    inputs=question_box,
    outputs="text",
    title="πŸ“˜ BC Land Survey Legal Assistant",
    description="Ask any question related to BC Land Surveying based on laws, regulations, acts, bulletins, and more.",
)

# Start the web UI as soon as this module is executed.
iface.launch()
legal_chunks.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ef6cbd006cf78cbd4f00098f1d34284453e38c5065f8f58d6a8056a7862ded4a
3
+ size 1819221
legal_index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b7e5d8e61adabcd00a6aa01fb19f48a124ba8ba309c475294e9d3325e73505b
3
+ size 7756845
llama_query.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # llama_query.py
2
+
3
+ import openai
4
+ import os
5
+
6
+ # Load your OpenAI key (use environment variable for Hugging Face later)
7
# Read the key from the environment; stays None when the variable is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY")
8
+
9
def ask_llm_with_context(question, context, model="gpt-3.5-turbo"):
    """Ask an OpenAI chat model to answer `question` using `context`.

    Args:
        question: The user's question, inserted verbatim into the prompt.
        context: Retrieved text passages, inserted verbatim into the prompt.
        model: Name of the OpenAI chat model to call.

    Returns:
        The model's reply with surrounding whitespace removed, or a fixed
        apology string if the API call fails for any reason.
    """
    prompt = f"""
You are a knowledgeable teaching assistant for land surveyors in British Columbia.
Your role is to support students, candidates, and practicing professionals (such as BCLS or LSTs)
by answering their questions based strictly on the legal and regulatory framework governing land surveying in British Columbia.

These users are subject to the statutes, regulations, Survey and Plan Rules, bylaws, circular letters, practice bulletins, and the Code of Ethics
of the Association of British Columbia Land Surveyors (ABCLS).
Your task is to provide clear, accurate, and context-grounded answers to support their professional development.

When answering, always:
- Reference relevant Acts, manuals, Survey and Plan Rules, circular letters, or bulletins (include section/page if known)
- Mention the Code of Ethics if applicable to professional conduct
- Avoid speculation β€” only respond based on the provided legal context
- If an answer is not directly found in the context, recommend where it might be found (e.g., specific manual or circular)

Context:
\"\"\"
{context}
\"\"\"

Question: {question}
Answer:
"""
    messages = [
        {"role": "system", "content": "You are a helpful legal assistant."},
        {"role": "user", "content": prompt},
    ]
    try:
        if hasattr(openai, "OpenAI"):
            # openai>=1.0 removed `openai.ChatCompletion`; requests go
            # through a client object and responses are typed objects.
            client = openai.OpenAI(api_key=openai.api_key)
            response = client.chat.completions.create(
                model=model,
                messages=messages,
                temperature=0.3,
            )
            content = response.choices[0].message.content
        else:
            # Legacy (<1.0) module-level API.
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                temperature=0.3,
            )
            content = response['choices'][0]['message']['content']
        # `content` can be None on the 1.x API; treat that as empty text.
        return (content or "").strip()
    except Exception as e:
        # Best-effort by design: the UI should show a message, not crash.
        print(f"[!] Error calling OpenAI: {e}")
        return "Sorry, I couldn't generate a response."
main.py ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import requests
3
+ import pdfplumber
4
+ import trafilatura
5
+ from bs4 import BeautifulSoup
6
+ from urllib.parse import urljoin, urlparse
7
+ from sentence_transformers import SentenceTransformer
8
+ import faiss
9
+ import numpy as np
10
+ import pickle
11
+ import argparse
12
+
13
+ from ollama_initial import start_ollama_model
14
+ from llama_query import ask_llm_with_context
15
+ from smart_chunk import smart_chunk_text # for semantic-aware chunking
16
+
17
+ # === Config ===
18
# Persisted artefacts: FAISS index, pickled chunk list, and the two caches
# recording which PDFs / URLs have already been ingested.
INDEX_FILE = "legal_index.faiss"
DOCS_FILE = "legal_chunks.pkl"
PDF_CACHE_FILE = "processed_pdfs.pkl"
URL_CACHE_FILE = "processed_urls.pkl"

# Sentence-embedding model name passed to SentenceTransformer.
EMBEDDING_MODEL = "intfloat/e5-base-v2"
# Domain fragments a crawled link must match to be followed.
ALLOWED_DOMAINS = ["gov", "org", "ca"]
# Input locations: local PDF directory and newline-separated seed URL list.
PDF_FOLDER = "pdf"
URL_FILE = "urls.txt"
27
+
28
+ # === CLI args ===
29
parser = argparse.ArgumentParser()
parser.add_argument("--update", action="store_true", help="Update only new PDFs/URLs (uses cache)")
parser.add_argument("--updateall", action="store_true", help="Force complete reindexing of all documents (ignores cache)")
# This module is also imported by other entry points (e.g. the Gradio app).
# Only consume the real command line when run as a script; on import fall
# back to the defaults so a foreign argv cannot abort the import.
args = parser.parse_args() if __name__ == "__main__" else parser.parse_args([])
33
+
34
+ # === Embedding setup ===
35
# Embedding model shared by indexing and querying.
model = SentenceTransformer(EMBEDDING_MODEL)
# Flat L2 index sized to the model's embedding dimension.
_embedding_dim = model.get_sentence_embedding_dimension()
vector_index = faiss.IndexFlatL2(_embedding_dim)
# Chunk texts, appended alongside the vectors added to `vector_index`.
documents = []
38
+
39
+ # === Cache handling ===
40
def load_cache(file):
    """Return the pickled object stored at `file`, or an empty set.

    Args:
        file: Path of the cache file.

    Returns:
        Whatever object was pickled into the file; `set()` when the file
        does not exist.
    """
    if not os.path.exists(file):
        return set()
    # NOTE: pickle can execute arbitrary code; only load trusted cache files.
    with open(file, "rb") as cache_handle:
        return pickle.load(cache_handle)
45
+
46
def save_cache(data, file):
    """Pickle `data` to `file`, overwriting any existing content.

    Args:
        data: Object to persist.
        file: Destination path.
    """
    with open(file, "wb") as cache_handle:
        pickle.Pickler(cache_handle).dump(data)
49
+
50
+ # === Index persistence ===
51
def save_index():
    """Write the FAISS index and the chunk list to their configured files."""
    faiss.write_index(vector_index, INDEX_FILE)
    with open(DOCS_FILE, "wb") as docs_handle:
        pickle.Pickler(docs_handle).dump(documents)
    print("βœ… Vector index and chunks saved.")
56
+
57
def load_index():
    """Load the saved FAISS index and chunk list into the module globals.

    Returns:
        True when both files existed and were loaded, otherwise False (the
        globals are left untouched in that case).
    """
    global vector_index, documents
    both_present = all(os.path.exists(path) for path in (INDEX_FILE, DOCS_FILE))
    if not both_present:
        return False

    print("πŸ“‚ Loading existing vector index and documents...")
    vector_index = faiss.read_index(INDEX_FILE)
    # NOTE: pickle can execute arbitrary code; only load trusted files.
    with open(DOCS_FILE, "rb") as docs_handle:
        documents = pickle.load(docs_handle)
    return True
66
+
67
+ # === Chunk + embed ===
68
def store_text_chunks(text):
    """Chunk `text`, embed the chunks, and add them to the index.

    Blank chunks are skipped. All chunks are embedded in a single batched
    `model.encode` call, and `documents` is only extended after the vectors
    were added, so the chunk list and the FAISS index cannot drift apart if
    encoding fails part-way.

    Args:
        text: Raw document text; empty text is a no-op.
    """
    chunks = [chunk for chunk in smart_chunk_text(text) if chunk.strip()]
    if not chunks:
        return

    # NOTE(review): the query side prefixes its input with "query: "; E5
    # models are documented to expect a matching "passage: " prefix on
    # indexed text. Adding it requires a full re-index, so it is left as-is.
    vectors = np.asarray(model.encode(chunks), dtype="float32")
    vector_index.add(vectors)
    documents.extend(chunks)
75
+
76
+ # === Text extraction ===
77
def get_text_from_pdf_file(filepath):
    """Extract the text of every page of a PDF, joined by newlines.

    Args:
        filepath: Path of the PDF to read.

    Returns:
        The concatenated page text (pages without text contribute an empty
        string), or "" if the file cannot be read.
    """
    try:
        with pdfplumber.open(filepath) as pdf:
            page_texts = []
            for page in pdf.pages:
                extracted = page.extract_text()
                page_texts.append(extracted if extracted else '')
            return "\n".join(page_texts)
    except Exception as e:
        print(f"[!] Failed to read PDF: {filepath} β€” {e}")
        return ""
84
+
85
def get_text_from_pdf_url(url):
    """Download a PDF and return its extracted text.

    The download goes to a uniquely named temporary file that is always
    removed afterwards, so concurrent runs do not clobber each other.

    Args:
        url: Address of the PDF.

    Returns:
        The extracted text, or "" when the download fails, the server
        answers with an HTTP error, or the file cannot be parsed.
    """
    import tempfile

    tmp_path = None
    try:
        # Bound the request so one dead host cannot stall the whole crawl.
        response = requests.get(url, timeout=30)
        # Do not feed 4xx/5xx error pages to the PDF parser.
        response.raise_for_status()
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            tmp_path = tmp.name
            tmp.write(response.content)
        return get_text_from_pdf_file(tmp_path)
    except Exception as e:
        print(f"[!] Failed to fetch PDF from URL: {url} β€” {e}")
        return ""
    finally:
        if tmp_path is not None and os.path.exists(tmp_path):
            os.remove(tmp_path)
97
+
98
def get_text_from_html(url):
    """Fetch a web page and return its main text content.

    Args:
        url: Address of the page.

    Returns:
        Text extracted by trafilatura (comments and tables excluded), or ""
        when the request fails, the server answers with an HTTP error, or
        nothing could be extracted.
    """
    try:
        # Bound the request so one dead host cannot stall the whole crawl.
        response = requests.get(url, timeout=30)
        # Keep error pages (404, 500, ...) out of the index.
        response.raise_for_status()
        return trafilatura.extract(response.text, include_comments=False, include_tables=False) or ""
    except Exception as e:
        print(f"[!] Failed HTML: {url} β€” {e}")
        return ""
105
+
106
def is_valid_link(link, base_url):
    """Tell whether `link` is an http(s) URL on an allowed top-level domain.

    Args:
        link: The href as found in the page; may be relative.
        base_url: URL of the page containing the link, used to resolve it.

    Returns:
        True when the resolved URL uses http or https and the last label of
        its host name is listed in ALLOWED_DOMAINS.
    """
    parsed = urlparse(urljoin(base_url, link))
    if parsed.scheme not in ("http", "https"):
        return False
    # Compare the actual top-level label. A substring test would accept
    # hosts such as "facebook.com" (contains "ca") or "organic.com".
    hostname = (parsed.hostname or "").rstrip(".")
    top_level = hostname.rpartition(".")[2]
    return top_level in ALLOWED_DOMAINS
110
+
111
+ # === Processing ===
112
def process_pdf_folder(folder_path=PDF_FOLDER, processed_files=None):
    """Index every not-yet-processed PDF found directly in `folder_path`.

    Args:
        folder_path: Directory to scan (not recursive).
        processed_files: Set of file names already indexed. It is updated
            in place with each newly processed name; a fresh set is used
            when omitted.
    """
    if processed_files is None:
        processed_files = set()

    if not os.path.isdir(folder_path):
        # A missing folder just means there are no local PDFs to ingest.
        print(f"[!] PDF folder not found: {folder_path}")
        return

    # Sorted so indexing order does not depend on the file system.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            # Not a PDF: ignore silently instead of reporting it as skipped.
            continue
        if filename in processed_files:
            print(f"βœ… Skipping already processed PDF: {filename}")
            continue

        full_path = os.path.join(folder_path, filename)
        print(f"πŸ“„ Reading new PDF: {full_path}")
        text = get_text_from_pdf_file(full_path)
        store_text_chunks(text)
        processed_files.add(filename)
124
+
125
def crawl_url(url, depth=1, processed_urls=None):
    """Index `url` and, for depth > 1, the pages it links to.

    `depth` is the number of link levels to visit: 1 indexes only `url`, 2
    also indexes the valid links found on it, and so on. A depth of 0 or
    less indexes nothing.

    Args:
        url: Start address (HTML page or PDF).
        depth: Number of levels to crawl, counting the start page.
        processed_urls: Set of addresses already indexed. It is updated in
            place; a fresh set is used when omitted.
    """
    if processed_urls is None:
        processed_urls = set()
    if url in processed_urls:
        print(f"βœ… Skipping already crawled URL: {url}")
        return

    print(f"πŸ”— Crawling: {url}")
    visited = set()
    frontier = [url]

    for level in range(depth):
        is_last_level = level == depth - 1
        next_frontier = []

        for current in frontier:
            # Skip duplicates within this crawl and pages from earlier runs.
            if current in visited or current in processed_urls:
                continue
            visited.add(current)

            # Decide on the URL path so "file.PDF" and "file.pdf?x=1" count.
            is_pdf = urlparse(current).path.lower().endswith(".pdf")
            if is_pdf:
                text = get_text_from_pdf_url(current)
            else:
                text = get_text_from_html(current)

            store_text_chunks(text)
            processed_urls.add(current)

            # PDFs have no anchors to follow, and links found on the last
            # level would never be visited, so skip the extra download.
            if is_pdf or is_last_level:
                continue

            try:
                page = requests.get(current, timeout=30).text
                soup = BeautifulSoup(page, "html.parser")
                for a in soup.find_all("a", href=True):
                    href = a["href"]
                    full_url = urljoin(current, href)
                    if full_url not in visited and is_valid_link(href, current):
                        next_frontier.append(full_url)
            except Exception as e:
                # Link discovery is best-effort; the page itself is indexed.
                print(f"[!] Failed to extract links: {current} - {e}")

        if not next_frontier:
            break
        frontier = next_frontier
160
+
161
+ # === Retrieval ===
162
def load_urls(file_path=URL_FILE):
    """Read seed URLs from a UTF-8 text file, one per line.

    Args:
        file_path: Path of the URL list.

    Returns:
        The stripped, non-blank lines in file order.
    """
    urls = []
    with open(file_path, "r", encoding="utf-8") as url_file:
        for raw_line in url_file:
            cleaned = raw_line.strip()
            if cleaned:
                urls.append(cleaned)
    return urls
165
+
166
def query_index(question, top_k=3):
    """Return the indexed chunks closest to `question`.

    Args:
        question: Natural-language query.
        top_k: Maximum number of chunks to return.

    Returns:
        The matching chunks joined by a "---" separator line, or a fixed
        message when nothing has been indexed.
    """
    if not documents:
        return "No documents found in the index."

    query = f"query: {question}"
    q_vector = model.encode(query)

    # Never ask for more neighbours than there are chunks: FAISS pads the
    # result with -1, which would silently select documents[-1].
    k = min(top_k, len(documents))
    _, indices = vector_index.search(np.asarray([q_vector], dtype="float32"), k)

    # Defensive bounds check in case the index and the chunk list differ.
    hits = [documents[i] for i in indices[0] if 0 <= i < len(documents)]
    return "\n---\n".join(hits)
173
+
174
+ # === Main Execution ===
175
if __name__ == "__main__":
    # Decide first whether a persisted index is being reused, because the
    # caches are only meaningful together with that index.
    if args.updateall:
        print("πŸ” Rebuilding index from scratch...")
        index_loaded = False
    else:
        index_loaded = load_index()

    if index_loaded:
        processed_pdfs = load_cache(PDF_CACHE_FILE)
        processed_urls = load_cache(URL_CACHE_FILE)
    else:
        # No usable index: ignore stale caches, otherwise every source would
        # be skipped as "already processed" and the new index stays empty.
        processed_pdfs = set()
        processed_urls = set()

    started = start_ollama_model()
    if not started:
        print("❌ Could not connect to Ollama")
        # `exit` is an interactive helper from `site`; raise directly.
        raise SystemExit(1)

    if not index_loaded or args.update:
        print("πŸ”„ Updating or creating index...")
        process_pdf_folder(processed_files=processed_pdfs)
        for url in load_urls():
            crawl_url(url, depth=1, processed_urls=processed_urls)
        save_index()
        save_cache(processed_pdfs, PDF_CACHE_FILE)
        save_cache(processed_urls, URL_CACHE_FILE)
    else:
        print("βœ… Loaded existing index. Ready to query.")

    print("\n❓ Ready to query your legal database (type 'exit' to quit)")
    while True:
        try:
            question = input("\nπŸ”Ž Your question: ")
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C at the prompt is a normal way to leave.
            print("πŸ‘‹ Exiting. See you next time!")
            break
        if question.strip().lower() in ("exit", "quit", "q"):
            print("πŸ‘‹ Exiting. See you next time!")
            break
        if not question.strip():
            # Nothing to ask; prompt again instead of querying the LLM.
            continue
        context = query_index(question)
        answer = ask_llm_with_context(question, context)
        print("\n🧠 LLaMA 3 Answer:")
        print(answer)
210
+
211
+ # This version includes all 3 enhancements:
212
+ # - Smart chunking via smart_chunk.py
213
+ # - High-quality embedding model (E5)
214
+ # - Structured prompt with legal assistant context and disclaimer
ollama_initial.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import subprocess
2
+ import time
3
+ import requests
4
+
5
def is_ollama_running():
    """Report whether an HTTP server answers 200 on localhost:11434.

    Returns:
        True on an HTTP 200 response; False on any other status, on a
        connection error, or when no answer arrives within two seconds.
    """
    try:
        # Without a timeout a half-open socket could block the caller's
        # polling loop indefinitely.
        r = requests.get("http://localhost:11434", timeout=2)
    except requests.RequestException:
        return False
    return r.status_code == 200
11
+
12
def start_ollama_model(model_name="llama3"):
    """Launch `ollama run <model_name>` in the background and wait for it.

    Args:
        model_name: Name of the Ollama model to run.

    Returns:
        True once the local Ollama endpoint responds, False if it does not
        respond within 20 polls spaced 0.5 s apart.
    """
    print(f"πŸš€ Starting Ollama model: {model_name}")
    try:
        subprocess.Popen(
            ["ollama", "run", model_name],
            # Detach stdin so the child cannot compete with the parent's
            # interactive prompt for terminal input.
            stdin=subprocess.DEVNULL,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError as e:
        # Executable missing or not runnable. Keep polling anyway: a server
        # started by other means may already be listening.
        print(f"[!] Could not launch the ollama executable: {e}")

    print("⏳ Waiting for Ollama to be ready...")
    for _ in range(20):  # 20 polls, 0.5 s apart
        if is_ollama_running():
            print("βœ… Ollama is up!")
            return True
        time.sleep(0.5)
    print("❌ Ollama failed to start or is not responding.")
    return False
processed_pdfs.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3013d2b193e976388ff0e46a0dece10d3a6fbf44fb741c45bea556d57d0d9dc9
3
+ size 1349
processed_urls.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f804961b5bd976bfafce25a083bcce0527feea4de5964d8000e4ae063009fd20
3
+ size 776
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ pdfplumber
3
+ requests
4
+ beautifulsoup4
5
+ trafilatura
6
+ sentence-transformers
7
+ faiss-cpu
8
+ numpy
9
+ openai
smart_chunk.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # import nltk
2
+ # nltk.download('punkt') # Only needed once
3
+ # from nltk.tokenize import sent_tokenize
4
+
5
+
6
+ import nltk
7
+
8
+
9
+ # Automatically download 'punkt' if not already available
10
# Older NLTK releases load the pickled "punkt" models, newer ones load
# "punkt_tab" instead. Make sure whichever one the installed version needs
# is present, downloading only what is missing.
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource)
14
+
15
+ from nltk.tokenize import sent_tokenize
16
+
17
+
18
def smart_chunk_text(text, max_tokens=128):
    """Group the sentences of `text` into chunks of bounded word count.

    Sentences are never split: a single sentence longer than `max_tokens`
    becomes a chunk of its own. Word count is measured by whitespace
    splitting, not by a model tokenizer.

    Args:
        text: Text to split.
        max_tokens: Maximum number of whitespace-separated words per chunk.

    Returns:
        List of non-empty chunk strings in document order.
    """
    chunks = []
    current_sentences = []
    current_len = 0  # running word count, avoids re-splitting the chunk

    for sentence in sent_tokenize(text):
        sentence_len = len(sentence.split())
        # Flush only when something has been collected, so an over-long
        # first sentence does not produce an empty leading chunk.
        if current_sentences and current_len + sentence_len > max_tokens:
            chunks.append(" ".join(current_sentences).strip())
            current_sentences = []
            current_len = 0
        current_sentences.append(sentence)
        current_len += sentence_len

    if current_sentences:
        chunks.append(" ".join(current_sentences).strip())
    return [chunk for chunk in chunks if chunk]