Spaces: Running on Zero
import gradio as gr
import subprocess
import sys
import os
import spaces
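# NOTE: mamba-ssm and causal-conv1d build CUDA extensions when they are installed,
# which is presumably why they are installed at startup here rather than at build time.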
# Install the necessary packages that require CUDA
try:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "causal-conv1d>=1.4.0", "--no-build-isolation"])
    subprocess.check_call([sys.executable, "-m", "pip", "install", "mamba-ssm"])
except Exception as e:
    print(f"Warning: Could not install CUDA extensions: {e}")
    print("The model might not work correctly or will be slower.")
# Now import the required libraries
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
# Define model repository
repo_name = "hanzla/Falcon3-Mamba-R1-v0"
# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(repo_name)
# Load model with appropriate settings
print("Loading model... (this may take some time)")
model = None
try:
    # Try to load the model with GPU acceleration
    model = AutoModelForCausalLM.from_pretrained(
        repo_name,
        device_map="auto",
        torch_dtype=torch.float16,
    )
except Exception as e:
    print(f"Error loading model with GPU: {e}")
    print("Attempting to load with CPU only...")
    try:
        model = AutoModelForCausalLM.from_pretrained(
            repo_name,
            device_map="cpu",
            torch_dtype=torch.float32,
        )
    except Exception as e2:
        print(f"Error loading model with CPU: {e2}")
if model is None:
    print("Could not load the model. Please check the logs.")
else:
    print("Model loaded successfully!")
@spaces.GPU
def generate_response(message, history):
    print(message)
    if model is None:
        return "Sorry, the model could not be loaded. Please check the logs."

    messages = [
        {"role": "system", "content": "You are a helpful assistant. You think before answering"},
    ]
    # Add chat history to messages
    for h in history:
        messages.append({"role": "user", "content": h[0]})
        messages.append({"role": "assistant", "content": h[1]})
    # Add current message
    messages.append({"role": "user", "content": message})
    # Generate input text using chat template
    input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # Tokenize input
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(model.device)
    # Generate response
    outputs = model.generate(
        input_ids,
        max_new_tokens=8000,
        temperature=0.7,
        do_sample=True,
    )
    # Decode only the newly generated tokens (skip the prompt)
    generated_tokens = outputs[0][len(input_ids[0]):]
    response = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return response
# Create Gradio interface
demo = gr.ChatInterface(
    generate_response,
    title="Falcon3-Mamba-R1-v0 Chat",
    description="Chat with the Falcon3-Mamba-R1-v0 model.",
    examples=[
        "How does the surface area of the Moon compare with that of the Earth?",
        "Why does it take 8 minutes for sunlight to reach the Earth?",
    ],
    theme="soft",
)
# Launch the interface
demo.launch()