mjavaid committed
Commit 6aea303 · 1 Parent(s): 6ec1159
Files changed (2)
  1. app.py +48 -10
  2. requirements.txt +0 -2
app.py CHANGED
@@ -1,21 +1,59 @@
 import gradio as gr
+import subprocess
+import sys
+import os
+
+# Install the necessary packages that require CUDA
+try:
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "causal-conv1d>=1.4.0", "--no-build-isolation"])
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "mamba-ssm"])
+except Exception as e:
+    print(f"Warning: Could not install CUDA extensions: {e}")
+    print("The model might not work correctly or will be slower.")
+
+# Now import the required libraries
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
-import spaces
 
 # Define model repository
 repo_name = "hanzla/Falcon3-Mamba-R1-v0"
 
-# Load tokenizer and model
+# Load tokenizer
+print("Loading tokenizer...")
 tokenizer = AutoTokenizer.from_pretrained(repo_name)
-model = AutoModelForCausalLM.from_pretrained(
-    repo_name,
-    device_map="auto", # Auto place layers across available GPUs
-    torch_dtype=torch.float16,
-)
 
-@spaces.GPU
+# Load model with appropriate settings
+print("Loading model... (this may take some time)")
+model = None
+
+try:
+    # Try to load the model with GPU acceleration
+    model = AutoModelForCausalLM.from_pretrained(
+        repo_name,
+        device_map="auto",
+        torch_dtype=torch.float16,
+    )
+except Exception as e:
+    print(f"Error loading model with GPU: {e}")
+    print("Attempting to load with CPU only...")
+    try:
+        model = AutoModelForCausalLM.from_pretrained(
+            repo_name,
+            device_map="cpu",
+            torch_dtype=torch.float32,
+        )
+    except Exception as e2:
+        print(f"Error loading model with CPU: {e2}")
+
+if model is None:
+    print("Could not load the model. Please check the logs.")
+else:
+    print("Model loaded successfully!")
+
 def generate_response(message, history):
+    if model is None:
+        return "Sorry, the model could not be loaded. Please check the logs."
+
     messages = [
         {"role": "system", "content": "You are a helpful assistant. You think before answering"},
     ]
@@ -37,7 +75,7 @@ def generate_response(message, history):
     # Generate response
     outputs = model.generate(
         input_ids,
-        max_new_tokens=1024,
+        max_new_tokens=512, # Reduced from 1024 to improve speed
         temperature=0.7,
         do_sample=True,
     )
@@ -52,7 +90,7 @@ def generate_response(message, history):
 demo = gr.ChatInterface(
     generate_response,
     title="Falcon3-Mamba-R1-v0 Chat",
-    description="Chat with the Falcon3-Mamba-R1-v0 model..",
+    description="Chat with the Falcon3-Mamba-R1-v0 model. This is a hybrid Falcon-Mamba architecture.",
     examples=["Tell me about yourself",
               "Explain quantum computing like I'm 10",
               "Write a short poem about AI"],
requirements.txt CHANGED
@@ -2,5 +2,3 @@ gradio>=4.0.0
 transformers>=4.34.0
 torch
 accelerate
-causal-conv1d>=1.4.0
-mamba-ssm
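
Note: the two packages dropped here, causal-conv1d and mamba-ssm, are the ones app.py now installs at runtime, since the commit flags them as requiring CUDA. A rough sketch of how that runtime install could be gated so it is skipped on CPU-only hardware follows; it is not part of the commit, the helper name and gating logic are illustrative, and it assumes torch is importable before the extra packages are installed, which holds because torch stays in requirements.txt.

import subprocess
import sys

import torch

def install_cuda_extensions():
    # Hypothetical helper: only attempt the CUDA-only builds when a GPU is present.
    if not torch.cuda.is_available():
        print("No CUDA device detected; skipping causal-conv1d / mamba-ssm install.")
        return
    for args in (
        ["causal-conv1d>=1.4.0", "--no-build-isolation"],
        ["mamba-ssm"],
    ):
        subprocess.check_call([sys.executable, "-m", "pip", "install", *args])

Calling a helper like this before the transformers import would keep the commit's intent while avoiding a doomed pip build on CPU-only Spaces.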