lunarflu (HF staff) committed (verified)
Commit af3e6a9 · Parent: 4a88289

use simplified code instead

Files changed (1): app.py (+16 −38)
app.py CHANGED
@@ -1,20 +1,14 @@
 import gradio as gr
 import spaces
-import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import pipeline
 
-# Load the model and tokenizer
-tokenizer = AutoTokenizer.from_pretrained("TheBloke/Chronoboros-33B-GPTQ")
-model = AutoModelForCausalLM.from_pretrained("TheBloke/Chronoboros-33B-GPTQ", device_map="auto")
-
-# Set a valid pad_token_id to avoid generation errors
-model.generation_config.pad_token_id = tokenizer.eos_token_id
-
-model.eval()  # Ensure the model is in evaluation mode
+# Create the text generation pipeline.
+# If you're running on GPU, you can specify device=0 (or use device_map="auto" if supported).
+pipe = pipeline("text-generation", model="TheBloke/Chronoboros-33B-GPTQ", device=0)
 
 @spaces.GPU
 def respond(message, history: list[tuple[str, str]], system_message, max_tokens, temperature, top_p):
-    # Build the prompt using conversation history
+    # Build the prompt from system message and conversation history.
     prompt = f"{system_message}\n"
     for user_text, assistant_text in history:
         if user_text:
@@ -22,44 +16,28 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
         if assistant_text:
             prompt += f"Assistant: {assistant_text}\n"
     prompt += f"User: {message}\nAssistant: "
-
-    # Tokenize the prompt
-    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
 
-    # Generate the response with no gradients
-    with torch.no_grad():
-        output_ids = model.generate(
-            input_ids,
-            max_new_tokens=max_tokens,
-            temperature=temperature,
-            top_p=top_p,
-            do_sample=True,
-            pad_token_id=tokenizer.eos_token_id,  # also pass it here to be safe
-        )
+    # Generate a response using the pipeline.
+    # The pipeline returns a list of dictionaries; we take the generated text from the first output.
+    output = pipe(prompt, max_new_tokens=max_tokens, temperature=temperature, top_p=top_p)
+    full_text = output[0]["generated_text"]
 
-    # Extract the new tokens (tokens generated after the prompt)
-    new_tokens = output_ids[0][input_ids.shape[1]:]
+    # Remove the prompt from the generated text to isolate the response.
+    response_text = full_text[len(prompt):]
 
-    # Stream output in chunks (here yielding every 5 tokens)
+    # Simulate streaming output in chunks (e.g., 5 characters at a time).
     chunk_size = 5
-    for i in range(0, new_tokens.shape[0], chunk_size):
-        current_response = tokenizer.decode(new_tokens[: i + chunk_size], skip_special_tokens=True)
-        yield current_response
+    for i in range(0, len(response_text), chunk_size):
+        yield response_text[: i + chunk_size]
 
-# Configure the ChatInterface with additional inputs
+# Configure the ChatInterface with additional inputs.
 demo = gr.ChatInterface(
     respond,
     additional_inputs=[
         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
     ],
 )
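
For anyone verifying the simplified approach locally, here is a minimal sketch of how the new pipeline call behaves. The tiny model sshleifer/tiny-gpt2 is only an illustrative stand-in for a quick smoke test, since Chronoboros-33B is too large for a casual check; the Space itself loads TheBloke/Chronoboros-33B-GPTQ on GPU as shown in the diff above.

from transformers import pipeline

# Placeholder model for a quick local check; the Space uses
# "TheBloke/Chronoboros-33B-GPTQ" with device=0 instead.
pipe = pipeline("text-generation", model="sshleifer/tiny-gpt2")

prompt = "You are a friendly Chatbot.\nUser: hello\nAssistant: "
output = pipe(prompt, max_new_tokens=8, temperature=0.7, top_p=0.95, do_sample=True)

# The pipeline returns a list of dicts, and "generated_text" includes the
# prompt itself, which is why respond() slices it off with full_text[len(prompt):].
print(output[0]["generated_text"][len(prompt):])

Two caveats on the committed version: temperature and top_p only take effect when sampling is enabled, so it may be worth passing do_sample=True explicitly, as the previous version did; and the chunked yield only simulates streaming, since the full response is generated before the first chunk is yielded. True token-by-token streaming would need something like transformers' TextIteratorStreamer.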