Upload 10 files
- .gitattributes +1 -0
- app.py +21 -0
- legal_chunks.pkl +3 -0
- legal_index.faiss +3 -0
- llama_query.py +45 -0
- main.py +214 -0
- ollama_initial.py +22 -0
- processed_pdfs.pkl +3 -0
- processed_urls.pkl +3 -0
- requirements.txt +9 -0
- smart_chunk.py +29 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+legal_index.faiss filter=lfs diff=lfs merge=lfs -text
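The added rule keeps the FAISS index out of regular Git storage. For reference, the line above is exactly what Git LFS appends to .gitattributes when the file is tracked from the CLI:

git lfs track "legal_index.faiss"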
app.py
ADDED
@@ -0,0 +1,21 @@

# app.py

import gradio as gr
from main import load_index, query_index, ask_llm_with_context

# main.py only loads the saved index inside its __main__ block, so the
# Gradio app must load it explicitly before serving queries.
if not load_index():
    raise RuntimeError("legal_index.faiss / legal_chunks.pkl not found")

def legal_assistant(question):
    if not question.strip():
        return "Please enter a valid question."
    context = query_index(question)
    answer = ask_llm_with_context(question, context)
    return answer

iface = gr.Interface(
    fn=legal_assistant,
    inputs=gr.Textbox(lines=4, placeholder="Ask about BC land survey law..."),
    outputs="text",
    title="📐 BC Land Survey Legal Assistant",
    description="Ask any question related to BC Land Surveying based on laws, regulations, acts, bulletins, and more.",
)

iface.launch()
legal_chunks.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ef6cbd006cf78cbd4f00098f1d34284453e38c5065f8f58d6a8056a7862ded4a
size 1819221
legal_index.faiss
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9b7e5d8e61adabcd00a6aa01fb19f48a124ba8ba309c475294e9d3325e73505b
size 7756845
llama_query.py
ADDED
@@ -0,0 +1,45 @@

# llama_query.py

import openai
import os

# Load your OpenAI key (use an environment variable on Hugging Face later)
openai.api_key = os.getenv("OPENAI_API_KEY")

def ask_llm_with_context(question, context, model="gpt-3.5-turbo"):
    prompt = f"""
You are a knowledgeable teaching assistant for land surveyors in British Columbia.
Your role is to support students, candidates, and practicing professionals (such as BCLS or LSTs)
by answering their questions based strictly on the legal and regulatory framework governing land surveying in British Columbia.

These users are subject to the statutes, regulations, Survey and Plan Rules, bylaws, circular letters, practice bulletins, and the Code of Ethics
of the Association of British Columbia Land Surveyors (ABCLS).
Your task is to provide clear, accurate, and context-grounded answers to support their professional development.

When answering, always:
- Reference relevant Acts, manuals, Survey and Plan Rules, circular letters, or bulletins (include section/page if known)
- Mention the Code of Ethics if applicable to professional conduct
- Avoid speculation; only respond based on the provided legal context
- If an answer is not directly found in the context, recommend where it might be found (e.g., a specific manual or circular)

Context:
\"\"\"
{context}
\"\"\"

Question: {question}
Answer:
"""
    try:
        # Legacy (pre-1.0) openai SDK interface; see the note after this file
        response = openai.ChatCompletion.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful legal assistant."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3
        )
        return response['choices'][0]['message']['content'].strip()
    except Exception as e:
        print(f"[!] Error calling OpenAI: {e}")
        return "Sorry, I couldn't generate a response."
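The try block above targets the legacy openai SDK, whose ChatCompletion interface was removed in openai 1.0. A minimal sketch of the equivalent call against the 1.x client, assuming only the client call changes and the prompt construction stays as written:

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def ask_llm_v1(prompt, model="gpt-3.5-turbo"):
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful legal assistant."},
            {"role": "user", "content": prompt},
        ],
        temperature=0.3,
    )
    return response.choices[0].message.content.strip()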
main.py
ADDED
@@ -0,0 +1,214 @@
import os
import requests
import pdfplumber
import trafilatura
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import pickle
import argparse

from ollama_initial import start_ollama_model
from llama_query import ask_llm_with_context
from smart_chunk import smart_chunk_text  # for semantic-aware chunking

# === Config ===
INDEX_FILE = "legal_index.faiss"
DOCS_FILE = "legal_chunks.pkl"
PDF_CACHE_FILE = "processed_pdfs.pkl"
URL_CACHE_FILE = "processed_urls.pkl"

EMBEDDING_MODEL = "intfloat/e5-base-v2"
ALLOWED_DOMAINS = ["gov", "org", "ca"]
PDF_FOLDER = "pdf"
URL_FILE = "urls.txt"

# === CLI args ===
parser = argparse.ArgumentParser()
parser.add_argument("--update", action="store_true", help="Update only new PDFs/URLs (uses cache)")
parser.add_argument("--updateall", action="store_true", help="Force complete reindexing of all documents (ignores cache)")
args = parser.parse_args()

# === Embedding setup ===
model = SentenceTransformer(EMBEDDING_MODEL)
vector_index = faiss.IndexFlatL2(model.get_sentence_embedding_dimension())
documents = []

# === Cache handling ===
def load_cache(file):
    if os.path.exists(file):
        with open(file, "rb") as f:
            return pickle.load(f)
    return set()

def save_cache(data, file):
    with open(file, "wb") as f:
        pickle.dump(data, f)

# === Index persistence ===
def save_index():
    faiss.write_index(vector_index, INDEX_FILE)
    with open(DOCS_FILE, "wb") as f:
        pickle.dump(documents, f)
    print("✅ Vector index and chunks saved.")

def load_index():
    global vector_index, documents
    if os.path.exists(INDEX_FILE) and os.path.exists(DOCS_FILE):
        print("📂 Loading existing vector index and documents...")
        vector_index = faiss.read_index(INDEX_FILE)
        with open(DOCS_FILE, "rb") as f:
            documents = pickle.load(f)
        return True
    return False

# === Chunk + embed ===
def store_text_chunks(text):
    chunks = smart_chunk_text(text)
    for chunk in chunks:
        if chunk.strip():
            documents.append(chunk)
            vector = model.encode(chunk)
            vector_index.add(np.array([vector]))

# === Text extraction ===
def get_text_from_pdf_file(filepath):
    try:
        with pdfplumber.open(filepath) as pdf:
            return "\n".join(page.extract_text() or '' for page in pdf.pages)
    except Exception as e:
        print(f"[!] Failed to read PDF: {filepath} - {e}")
        return ""

def get_text_from_pdf_url(url):
    try:
        response = requests.get(url)
        filename = "temp.pdf"
        with open(filename, "wb") as f:
            f.write(response.content)
        text = get_text_from_pdf_file(filename)
        os.remove(filename)
        return text
    except Exception as e:
        print(f"[!] Failed to fetch PDF from URL: {url} - {e}")
        return ""

def get_text_from_html(url):
    try:
        html = requests.get(url).text
        return trafilatura.extract(html, include_comments=False, include_tables=False) or ""
    except Exception as e:
        print(f"[!] Failed HTML: {url} - {e}")
        return ""

def is_valid_link(link, base_url):
    full_url = urljoin(base_url, link)
    parsed = urlparse(full_url)
    # Substring match: "gov" matches any host that merely contains "gov"
    return parsed.scheme.startswith("http") and any(tld in parsed.netloc for tld in ALLOWED_DOMAINS)

# === Processing ===
def process_pdf_folder(folder_path=PDF_FOLDER, processed_files=None):
    if processed_files is None:
        processed_files = set()
    for filename in os.listdir(folder_path):
        if not filename.lower().endswith(".pdf"):
            continue  # ignore non-PDF files silently
        if filename not in processed_files:
            full_path = os.path.join(folder_path, filename)
            print(f"📄 Reading new PDF: {full_path}")
            text = get_text_from_pdf_file(full_path)
            store_text_chunks(text)
            processed_files.add(filename)
        else:
            print(f"✅ Skipping already processed PDF: {filename}")

def crawl_url(url, depth=1, processed_urls=None):
    if processed_urls is None:
        processed_urls = set()
    if url in processed_urls:
        print(f"✅ Skipping already crawled URL: {url}")
        return

    print(f"🌐 Crawling: {url}")
    visited = set()
    to_visit = [url]

    # depth is decremented once per page fetched, so depth=1 indexes the
    # seed page and queues (but does not follow) its outgoing links
    while to_visit and depth > 0:
        current = to_visit.pop()
        visited.add(current)

        if current.endswith(".pdf"):
            text = get_text_from_pdf_url(current)
        else:
            text = get_text_from_html(current)

        store_text_chunks(text)
        processed_urls.add(current)

        try:
            page = requests.get(current).text
            soup = BeautifulSoup(page, "html.parser")
            for a in soup.find_all("a", href=True):
                href = a["href"]
                full_url = urljoin(current, href)
                if full_url not in visited and is_valid_link(href, current):
                    to_visit.append(full_url)
        except Exception:
            continue  # skips the depth decrement for pages that fail to fetch

        depth -= 1

# === Retrieval ===
def load_urls(file_path=URL_FILE):
    with open(file_path, "r", encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]

def query_index(question, top_k=3):
    if not documents:
        return "No documents found in the index."
    query = f"query: {question}"  # E5 models expect this prefix on queries
    q_vector = model.encode(query)
    D, I = vector_index.search(np.array([q_vector]), top_k)
    return "\n---\n".join([documents[i] for i in I[0]])

# === Main Execution ===
if __name__ == "__main__":
    if args.updateall:
        print("🔄 Rebuilding index from scratch...")
        processed_pdfs = set()
        processed_urls = set()
    else:
        processed_pdfs = load_cache(PDF_CACHE_FILE)
        processed_urls = load_cache(URL_CACHE_FILE)

    started = start_ollama_model()
    if not started:
        print("❌ Could not connect to Ollama")
        exit(1)

    if args.updateall or not load_index() or args.update:
        print("🔄 Updating or creating index...")
        process_pdf_folder(processed_files=processed_pdfs)
        for url in load_urls():
            crawl_url(url, depth=1, processed_urls=processed_urls)
        save_index()
        save_cache(processed_pdfs, PDF_CACHE_FILE)
        save_cache(processed_urls, URL_CACHE_FILE)
    else:
        print("✅ Loaded existing index. Ready to query.")

    print("\n❓ Ready to query your legal database (type 'exit' to quit)")
    while True:
        question = input("\n💬 Your question: ")
        if question.strip().lower() in ["exit", "quit", "q"]:
            print("👋 Exiting. See you next time!")
            break
        context = query_index(question)
        answer = ask_llm_with_context(question, context)
        print("\n🧠 LLaMA 3 Answer:")
        print(answer)

# This version includes all three enhancements:
# - Smart chunking via smart_chunk.py
# - A high-quality embedding model (E5)
# - A structured prompt with legal-assistant context and disclaimer
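One caveat on the E5 embeddings: the e5 model family is trained with asymmetric "query: " / "passage: " prefixes, and query_index adds the query prefix while store_text_chunks embeds raw chunks. A sketch of the symmetric convention, reusing the module-level model above; adopting it would require rebuilding legal_index.faiss:

def embed_passage(chunk):
    # "passage: " is the document-side prefix the E5 models were trained with
    return model.encode(f"passage: {chunk}")

def embed_query(question):
    return model.encode(f"query: {question}")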
ollama_initial.py
ADDED
@@ -0,0 +1,22 @@
import subprocess
import time
import requests

def is_ollama_running():
    try:
        r = requests.get("http://localhost:11434")
        return r.status_code == 200
    except Exception:
        return False

def start_ollama_model(model_name="llama3"):
    print(f"🚀 Starting Ollama model: {model_name}")
    subprocess.Popen(["ollama", "run", model_name], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    print("⏳ Waiting for Ollama to be ready...")
    for _ in range(20):  # wait up to ~10 seconds
        if is_ollama_running():
            print("✅ Ollama is up!")
            return True
        time.sleep(0.5)
    print("❌ Ollama failed to start or is not responding.")
    return False
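Note that main.py starts this local llama3 instance but then generates answers through OpenAI in llama_query.py. If the intent is to answer with the local model instead, a minimal sketch against Ollama's documented REST endpoint (the prompt string is whatever the caller assembles):

import requests

def ask_local_llama(prompt, model="llama3"):
    # Non-streaming call to Ollama's generate endpoint; returns one JSON object
    r = requests.post(
        "http://localhost:11434/api/generate",
        json={"model": model, "prompt": prompt, "stream": False},
        timeout=120,
    )
    r.raise_for_status()
    return r.json()["response"]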
processed_pdfs.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3013d2b193e976388ff0e46a0dece10d3a6fbf44fb741c45bea556d57d0d9dc9
size 1349
processed_urls.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f804961b5bd976bfafce25a083bcce0527feea4de5964d8000e4ae063009fd20
size 776
requirements.txt
ADDED
@@ -0,0 +1,9 @@
gradio
pdfplumber
requests
beautifulsoup4
trafilatura
sentence-transformers
faiss-cpu
numpy
openai
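None of these are pinned, and llama_query.py uses the legacy openai.ChatCompletion interface that was removed in openai 1.0, so a fresh install of this file will pull an incompatible SDK. nltk, imported by smart_chunk.py, is also not listed directly (it may arrive transitively via sentence-transformers, but listing it is safer). One way to keep the code as written working, assuming no other constraint on the SDK version:

openai<1.0
nltk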
smart_chunk.py
ADDED
@@ -0,0 +1,29 @@
import nltk

# Automatically download 'punkt' if not already available
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

from nltk.tokenize import sent_tokenize


def smart_chunk_text(text, max_tokens=128):
    # Greedy sentence packing: whitespace word count stands in for tokens
    sentences = sent_tokenize(text)
    chunks, current_chunk = [], ""
    for sentence in sentences:
        if len(current_chunk.split()) + len(sentence.split()) <= max_tokens:
            current_chunk += " " + sentence
        else:
            if current_chunk:  # avoid emitting an empty chunk for oversized sentences
                chunks.append(current_chunk.strip())
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
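A quick usage sketch; the exact split depends on NLTK's punkt sentence boundaries, and max_tokens is lowered here so the packing behaviour is visible on a short passage:

from smart_chunk import smart_chunk_text

text = (
    "The Land Surveyors Act governs the profession in British Columbia. "
    "Registration requirements are set out by the ABCLS. "
    "Practice bulletins provide additional guidance."
)
for i, chunk in enumerate(smart_chunk_text(text, max_tokens=16)):
    print(i, chunk)
# expected: two chunks - sentence 1 alone, then sentences 2 and 3 packed together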