import os

# Read the Groq API key from the environment (stored under the name 'groq2')
groq_api_key = os.environ.get('groq2')

## LLM used for RAG
from langchain_groq import ChatGroq

# llm = ChatGroq(model="llama-3.3-70b-specdec", api_key=groq_api_key)
llm = ChatGroq(model="qwen-qwq-32b", api_key=groq_api_key)  # Groq model IDs are lowercase

from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import ResponseSchema, StructuredOutputParser

import PyPDF2
# Initialize required components
TEMPLATE = """
You are a helpful agent. Your task is to generate a meaningful question and an answer using the following provided context: "{context}"

You MUST obey the following criteria:
- No preamble.
- Restrict the question to the information provided in the context, and give the answer with its details in summary form.
- Do NOT create a question that cannot be answered from the context.
- Phrase the question so that it does NOT refer to the specific context.
- For instance, do NOT use phrases like 'given the provided context' or 'in this work' in the question, or 'according to the text' in the answer, because if the question is asked elsewhere the specific context would not be available. Replace such phrases with specific details.
- Do NOT repeat the provided context.
- Only generate a question and an answer, without any leading sentence such as "Here is the generated question and answer:".
- Follow the recommended JSON format below.
- Ensure that the output is a valid JSON object.
{format_instructions}
"""

prompt = ChatPromptTemplate.from_template(template=TEMPLATE)
# StructuredOutputParser expects ResponseSchema objects, not plain dicts
response_schemas = [
    ResponseSchema(name="Question", description="The generated question from the provided context"),
    ResponseSchema(name="Answer", description="The corresponding answer from the provided context"),
]
output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = output_parser.get_format_instructions(only_json=True)
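
# With only_json=True, the format instructions ask the model for a bare JSON
# object, so the parser expects output shaped like this (illustrative values):
#   {"Question": "...", "Answer": "..."}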

# Folder containing PDF files
folder_path = "./"

# List to store questions and answers as tuples
data = []

# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text() or ""  # extract_text() can return None for image-only pages
        return text

# Process each PDF in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".pdf"):
        pdf_path = os.path.join(folder_path, filename)
        try:
            # Extract text from the PDF
            context = extract_text_from_pdf(pdf_path)

            # Split the context into manageable chunks for question generation
            chunks = [context[i:i+200] for i in range(0, len(context), 200)]

            for chunk in chunks:
                # Format the messages
                messages = prompt.format_messages(context=chunk, format_instructions=format_instructions)

                # Invoke the LLM
                response = llm.invoke(messages)

                # Parse the response
                output_dict = output_parser.parse(response.content)

                # Extract question and answer
                question = output_dict["Question"]
                answer = output_dict["Answer"]

                # Append question and answer as a tuple to the list
                data.append((question, answer))

        except Exception as e:
            print(f"Error processing file {filename}: {e}")


# Function to chunk text into pieces of max_length
def chunk_text(text, max_length=500):
    return [text[i:i + max_length] for i in range(0, len(text), max_length)]
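
# Note: fixed-width slicing can split words and sentences mid-stream. A common
# alternative (not used in this script) is LangChain's recursive splitter,
# sketched here for reference:
# from langchain_text_splitters import RecursiveCharacterTextSplitter
# splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# chunks = splitter.split_text(pdf_text)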

# Specify the path to the PDF file
pdf_path = "./LAW Nº 59 ON THE CRIME OF GENOCIDE IDEOLOGY AND RELATED CRIMES.pdf"
# List to hold context data
context_data = []

try:
    # Extract text from the PDF
    pdf_text = extract_text_from_pdf(pdf_path)

    if pdf_text:
        # Create chunks of 500 characters
        chunks = chunk_text(pdf_text, max_length=500)

        # Store each chunk in context_data as a plain string
        context_data.extend(chunks)

        # Print the chunks for inspection
        for entry in context_data:
            print(entry)
            print("-" * 40)  # Separator for readability
    else:
        print("No text found in the PDF.")
except Exception as e:
    print(f"Error reading the PDF: {e}")

# Append the generated (question, answer) tuples to the PDF chunks
context_data.extend(data)

processed_texts = []

# Flatten context_data into plain strings for embedding
for element in context_data:
    if isinstance(element, tuple):
        question, answer = element
        processed_texts.append(f"Question: {question} Answer: {answer}")
    elif isinstance(element, str):
        processed_texts.append(element)
    else:
        processed_texts.append(str(element))

## Embedding model!
from langchain_huggingface import HuggingFaceEmbeddings
embed_model = HuggingFaceEmbeddings(model_name="mixedbread-ai/mxbai-embed-large-v1")
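
# Quick illustrative check (added note, not in the original script): embed_query
# returns one dense vector; mxbai-embed-large-v1 embeddings are 1024-dimensional.
# print(len(embed_model.embed_query("genocide ideology")))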



# create vector store!
from langchain_chroma import Chroma

vectorstore = Chroma(
    collection_name="laws_dataset",  # must satisfy Chroma's collection-naming rules
    embedding_function=embed_model,
    persist_directory="./",
)

# Inspect the collection structure
print(vectorstore.get().keys())

# Add the processed texts to the vector store
vectorstore.add_texts(processed_texts)
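
# Illustrative sanity check (assumed example query, not from the original
# script): a quick similarity search confirms the texts were embedded and stored.
for hit in vectorstore.similarity_search("punishment for genocide ideology", k=2):
    print(hit.page_content[:100])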

from langchain_core.prompts import PromptTemplate

# Define the template
template = ("""
    You are a friendly and intelligent chatbot designed to assist users in a conversational and human-like manner. Your goal is to provide accurate, helpful, and engaging responses from the provided context: {context} while maintaining a natural tone. Follow these guidelines:

    1. **Greetings:** If the user greets you (e.g., "Morning," "Hello," "Hi"), respond warmly and acknowledge the greeting. For example:
       - "😊 Good morning! How can I assist you today?"
       - "Hello! What can I do for you? 🚀"
    2. **Extract Information:** If the user asks for specific information, extract only the relevant details from the provided context: {context}.
    3. **Human-like Interaction:** Respond in a warm, conversational tone. Use emojis occasionally to make the interaction more engaging (e.g., 😊, 🚀).
    4. **Stay Updated:** Acknowledge the current date and time to show you are aware of real-time updates.
    5. **No Extra Content:** If no information matches the user's request, respond politely: "I don't have that information at the moment, but I'm happy to help with something else! 😊"
    6. **Personalized Interaction:** Use the user's historical interactions (if available) to tailor your responses and make the conversation more personalized.
    7. **Direct Data Only:** If the user requests specific data, provide only the requested information without additional explanations unless asked.

    Context: {context}
    User's Question: {question}
    Your Response:
""")

rag_prompt = PromptTemplate.from_template(template)


retriever = vectorstore.as_retriever()

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | rag_prompt
    | llm
    | StrOutputParser()
)
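
# Illustrative one-off call (assumed example question): .invoke() runs the full
# retrieve-prompt-generate chain and returns the answer as one string, while
# .stream() (used below) yields it incrementally.
# print(rag_chain.invoke("What is the main purpose of Law Nº 59/2018?"))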

import gradio as gr

def rag_memory_stream(message, history):
    # Stream the chain's output token by token so the UI updates incrementally
    partial_text = ""
    for new_text in rag_chain.stream(message):
        partial_text += new_text
        yield partial_text

# Example questions shown in the chat interface
examples = [
    "What is the main purpose of Law Nº 59/2018 of 22/8/2018?",
    "What happens to a person who deliberately conceals or destroys evidence related to genocide?",
    "What are the penalties for violating a specific article?"
]


description = (
    "This Legal AI Assistant specializes in LAW Nº 59/2018 OF 22/8/2018 "
    "ON THE CRIME OF GENOCIDE IDEOLOGY AND RELATED CRIMES."
)

title = "⚖️ Chat with me and learn Laws! ⚖️"

# Custom CSS for styling the interface
custom_css = """

body {
    font-family: "Times New Roman", serif;
}

.gradio-container {

    font-family: "Times New Roman", serif;
}

.gr-button {
    background-color: #007bff; /* Blue button */
    color: white;
    border: none;
    border-radius: 5px;
    font-size: 16px;
    padding: 10px 20px;
    cursor: pointer;
}

.gr-textbox:focus, .gr-button:focus {
    outline: none; /* Remove the focus outline for a cleaner look */
}
"""


# Create the Chat Interface
demo = gr.ChatInterface(
    fn=rag_memory_stream,
    type="messages",
    title=title,
    description=description,
    fill_height=True,
    examples=examples,
    theme="soft",
    # css=custom_css,  # Uncomment to apply the custom CSS defined above
)

if __name__ == "__main__":
    demo.launch(share=True, inbrowser=True, height=800, width="100%")