from llama_cpp import Llama

# Path to the GGUF model file
MODEL_PATH = "llama-3.1-8B.gguf"

# Load the model
print("Loading the model...")
try:
    llama = Llama(model_path=MODEL_PATH, n_ctx=1024, n_threads=4)
    print("Model loaded successfully!")
except Exception as e:
    print(f"Failed to load the model: {e}")
    exit(1)

# Chat loop
print("Chat with the model! Type 'exit' to end the conversation.")
while True:
    user_input = input("You: ").strip()
    if user_input.lower() == "exit":
        print("Exiting chat. Goodbye!")
        break

    # Query the model
    print("Thinking...")
    response = llama(
        user_input,
        max_tokens=50,    # Limit response length
        temperature=0.7,  # Control randomness
        top_p=0.9,        # Top-p sampling
        stop=["You:"]     # Stop at the next user prompt
    )

    # Extract and clean response text
    response_text = response['choices'][0]['text'].strip()
    print(f"Model: {response_text}")
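Note that this loop sends each prompt to the model in isolation, so earlier turns are forgotten. Below is a minimal sketch of a history-aware variant, assuming the same MODEL_PATH as above and that your llama-cpp-python build exposes create_chat_completion, which applies the model's chat template to a list of role/content messages; the system prompt and max_tokens value are illustrative choices, not part of the original script.

from llama_cpp import Llama

MODEL_PATH = "llama-3.1-8B.gguf"  # assumed: same GGUF file as in the script above
llama = Llama(model_path=MODEL_PATH, n_ctx=1024, n_threads=4)

# Keep the whole conversation so every turn sees the prior context
messages = [{"role": "system", "content": "You are a helpful assistant."}]

while True:
    user_input = input("You: ").strip()
    if user_input.lower() == "exit":
        break
    messages.append({"role": "user", "content": user_input})

    # create_chat_completion formats the message list with the model's chat template
    response = llama.create_chat_completion(
        messages=messages,
        max_tokens=256,
        temperature=0.7,
    )
    reply = response["choices"][0]["message"]["content"].strip()
    messages.append({"role": "assistant", "content": reply})
    print(f"Model: {reply}")

Because the full message list is resent on every turn, long conversations will eventually exceed n_ctx; trimming or summarizing old turns is left out of this sketch.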