mjavaid committed
Commit 6aea303 · 1 Parent(s): 6ec1159
Files changed (2)
  1. app.py +48 -10
  2. requirements.txt +0 -2
app.py CHANGED
@@ -1,21 +1,59 @@
 import gradio as gr
+import subprocess
+import sys
+import os
+
+# Install the necessary packages that require CUDA
+try:
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "causal-conv1d>=1.4.0", "--no-build-isolation"])
+    subprocess.check_call([sys.executable, "-m", "pip", "install", "mamba-ssm"])
+except Exception as e:
+    print(f"Warning: Could not install CUDA extensions: {e}")
+    print("The model might not work correctly or will be slower.")
+
+# Now import the required libraries
 from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
-import spaces
 
 # Define model repository
 repo_name = "hanzla/Falcon3-Mamba-R1-v0"
 
-# Load tokenizer and model
+# Load tokenizer
+print("Loading tokenizer...")
 tokenizer = AutoTokenizer.from_pretrained(repo_name)
-model = AutoModelForCausalLM.from_pretrained(
-    repo_name,
-    device_map="auto", # Auto place layers across available GPUs
-    torch_dtype=torch.float16,
-)
 
-@spaces.GPU
+# Load model with appropriate settings
+print("Loading model... (this may take some time)")
+model = None
+
+try:
+    # Try to load the model with GPU acceleration
+    model = AutoModelForCausalLM.from_pretrained(
+        repo_name,
+        device_map="auto",
+        torch_dtype=torch.float16,
+    )
+except Exception as e:
+    print(f"Error loading model with GPU: {e}")
+    print("Attempting to load with CPU only...")
+    try:
+        model = AutoModelForCausalLM.from_pretrained(
+            repo_name,
+            device_map="cpu",
+            torch_dtype=torch.float32,
+        )
+    except Exception as e2:
+        print(f"Error loading model with CPU: {e2}")
+
+if model is None:
+    print("Could not load the model. Please check the logs.")
+else:
+    print("Model loaded successfully!")
+
 def generate_response(message, history):
+    if model is None:
+        return "Sorry, the model could not be loaded. Please check the logs."
+
     messages = [
         {"role": "system", "content": "You are a helpful assistant. You think before answering"},
     ]
@@ -37,7 +75,7 @@ def generate_response(message, history):
     # Generate response
     outputs = model.generate(
         input_ids,
-        max_new_tokens=1024,
+        max_new_tokens=512, # Reduced from 1024 to improve speed
         temperature=0.7,
         do_sample=True,
     )
@@ -52,7 +90,7 @@ def generate_response(message, history):
 demo = gr.ChatInterface(
     generate_response,
     title="Falcon3-Mamba-R1-v0 Chat",
-    description="Chat with the Falcon3-Mamba-R1-v0 model..",
+    description="Chat with the Falcon3-Mamba-R1-v0 model. This is a hybrid Falcon-Mamba architecture.",
     examples=["Tell me about yourself",
               "Explain quantum computing like I'm 10",
               "Write a short poem about AI"],
requirements.txt CHANGED
@@ -2,5 +2,3 @@ gradio>=4.0.0
 transformers>=4.34.0
 torch
 accelerate
-causal-conv1d>=1.4.0
-mamba-ssm
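
Note: the two packages dropped here, causal-conv1d and mamba-ssm, are the ones app.py now installs at runtime, since the commit flags them as requiring CUDA. A rough sketch of how that runtime install could be gated so it is skipped on CPU-only hardware follows; it is not part of the commit, the helper name and gating logic are illustrative, and it assumes torch is importable before the extra packages are installed, which holds because torch stays in requirements.txt.

import subprocess
import sys

import torch

def install_cuda_extensions():
    # Hypothetical helper: only attempt the CUDA-only builds when a GPU is present.
    if not torch.cuda.is_available():
        print("No CUDA device detected; skipping causal-conv1d / mamba-ssm install.")
        return
    for args in (
        ["causal-conv1d>=1.4.0", "--no-build-isolation"],
        ["mamba-ssm"],
    ):
        subprocess.check_call([sys.executable, "-m", "pip", "install", *args])

Calling a helper like this before the transformers import would keep the commit's intent while avoiding a doomed pip build on CPU-only Spaces.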