from llama_cpp import Llama

# Path to the GGUF model file
MODEL_PATH = "llama-3.1-8B.gguf"

# Load the model
print("Loading the model...")
try:
    llama = Llama(model_path=MODEL_PATH, n_ctx=1024, n_threads=4)
    print("Model loaded successfully!")
except Exception as e:
    print(f"Failed to load the model: {e}")
    exit(1)

# Chat loop
print("Chat with the model! Type 'exit' to end the conversation.")
while True:
    user_input = input("You: ").strip()
    if user_input.lower() == "exit":
        print("Exiting chat. Goodbye!")
        break

    # Query the model
    print("Thinking...")
    response = llama(
        user_input,
        max_tokens=50,    # Limit response length
        temperature=0.7,  # Control randomness
        top_p=0.9,        # Top-p sampling
        stop=["You:"]     # Stop at the next user prompt
    )

    # Extract and clean response text
    response_text = response['choices'][0]['text'].strip()
    print(f"Model: {response_text}")
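Note that this loop sends each prompt to the model in isolation, so earlier turns are forgotten. Below is a minimal sketch of a history-aware variant, assuming the same MODEL_PATH as above and that your llama-cpp-python build exposes create_chat_completion, which applies the model's chat template to a list of role/content messages; the system prompt and max_tokens value are illustrative choices, not part of the original script.

from llama_cpp import Llama

MODEL_PATH = "llama-3.1-8B.gguf"  # assumed: same GGUF file as in the script above
llama = Llama(model_path=MODEL_PATH, n_ctx=1024, n_threads=4)

# Keep the whole conversation so every turn sees the prior context
messages = [{"role": "system", "content": "You are a helpful assistant."}]

while True:
    user_input = input("You: ").strip()
    if user_input.lower() == "exit":
        break
    messages.append({"role": "user", "content": user_input})

    # create_chat_completion formats the message list with the model's chat template
    response = llama.create_chat_completion(
        messages=messages,
        max_tokens=256,
        temperature=0.7,
    )
    reply = response["choices"][0]["message"]["content"].strip()
    messages.append({"role": "assistant", "content": reply})
    print(f"Model: {reply}")

Because the full message list is resent on every turn, long conversations will eventually exceed n_ctx; trimming or summarizing old turns is left out of this sketch.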