demo_lowcode_llm / run_model.py
from llama_cpp import Llama
import sys

# Path to the GGUF model file
MODEL_PATH = "llama-3.1-8B.gguf"

# Load the model
print("Loading the model...")
try:
    llama = Llama(model_path=MODEL_PATH, n_ctx=1024, n_threads=4)
    print("Model loaded successfully!")
except Exception as e:
    print(f"Failed to load the model: {e}")
    sys.exit(1)
# Chat loop
print("Chat with the model! Type 'exit' to end the conversation.")
while True:
    user_input = input("You: ").strip()
    if user_input.lower() == "exit":
        print("Exiting chat. Goodbye!")
        break

    # Query the model
    print("Thinking...")
    response = llama(
        user_input,
        max_tokens=50,    # Limit response length
        temperature=0.7,  # Control randomness
        top_p=0.9,        # Top-p sampling
        stop=["You:"],    # Stop at the next user prompt
    )

    # Extract and clean the response text
    response_text = response["choices"][0]["text"].strip()
    print(f"Model: {response_text}")