demo_lowcode_llm / run_model.py
from llama_cpp import Llama
import sys

# Path to the GGUF model file
MODEL_PATH = "llama-3.1-8B.gguf"

# Load the model
print("Loading the model...")
try:
    llama = Llama(model_path=MODEL_PATH, n_ctx=1024, n_threads=4)
    print("Model loaded successfully!")
except Exception as e:
    print(f"Failed to load the model: {e}")
    sys.exit(1)
# Chat loop
print("Chat with the model! Type 'exit' to end the conversation.")
while True:
    user_input = input("You: ").strip()
    if user_input.lower() == "exit":
        print("Exiting chat. Goodbye!")
        break

    # Query the model
    print("Thinking...")
    response = llama(
        user_input,
        max_tokens=50,    # Limit response length
        temperature=0.7,  # Control randomness
        top_p=0.9,        # Top-p sampling
        stop=["You:"],    # Stop at the next user prompt
    )

    # Extract and clean the response text
    response_text = response["choices"][0]["text"].strip()
    print(f"Model: {response_text}")