xfcxcxcdfdfd committed
Commit f4672c6 · verified · 1 Parent(s): aade08d

Update app.py

Files changed (1): app.py (+107 -39)
app.py CHANGED
@@ -13,6 +13,11 @@ from threading import Thread
 import psutil
 import gc
 import torch
+import numpy as np
+from PIL import Image
+import stable_diffusion_cpp as sdcpp
+import base64
+import io

 load_dotenv()
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
@@ -217,9 +222,11 @@ global_data = {
     'end': {},
     'llama_perf_context_print': {},
     'llm_load_print_meta': {},
-    'model_type': {}
+    'model_type': {},
+    'image_model': {}
 }

+
 model_configs = [
     {
         "repo_id": "Hjgugugjhuhjggg/testing_semifinal-Q2_K-GGUF",
@@ -231,57 +238,63 @@ model_configs = [
         "filename": "Llama-3.2-3B-Instruct-uncensored-Q2_K.gguf",
         "name": "Llama-3.2-3B-Instruct"
     },
-    {
-        "repo_id": "Ffftdtd5dtft/Mistral-Nemo-Instruct-2407-Q2_K-GGUF",
-        "filename": "mistral-nemo-instruct-2407-q2_k.gguf",
-        "name": "Mistral-Nemo-Instruct"
-    },
-    {
-        "repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Instruct-Q2_K-GGUF",
-        "filename": "meta-llama-3.1-70b-instruct-q2_k.gguf",
-        "name": "Meta-Llama-3.1-70B-Instruct"
-    },
-    {
-        "repo_id": "Ffftdtd5dtft/Qwen2-Math-72B-Instruct-Q2_K-GGUF",
-        "filename": "qwen2-math-72b-instruct-q2_k.gguf",
-        "name": "Qwen2-Math-72B-Instruct"
-    },
-    {
-        "repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF",
-        "filename": "meta-llama-3.1-70b-q2_k.gguf",
-        "name": "Meta-Llama-3.1-70B"
-    }
+    {
+        "repo_id": "city96/FLUX.1-schnell-gguf",
+        "filename": "flux1-schnell-Q2_K.gguf",
+        "name": "flux1-schnell"
+    },
+
 ]

 class ModelManager:
     def __init__(self):
         self.models = {}
+        self.image_model = None

     def load_model(self, model_config):
-        if model_config['name'] not in self.models:
-            try:
-                self.models[model_config['name']] = Llama.from_pretrained(
-                    repo_id=model_config['repo_id'],
-                    filename=model_config['filename'],
-                    use_auth_token=HUGGINGFACE_TOKEN,
-                    n_threads=20,
-                    use_gpu=False
-                )
-            except Exception as e:
-                pass
+        if model_config['name'] not in self.models and model_config['name'] != "flux1-schnell":
+            try:
+                self.models[model_config['name']] = Llama.from_pretrained(
+                    repo_id=model_config['repo_id'],
+                    filename=model_config['filename'],
+                    use_auth_token=HUGGINGFACE_TOKEN,
+                    n_threads=20,
+                    use_gpu=False
+                )
+            except Exception as e:
+                pass
+
+    def load_image_model(self, model_config):
+        try:
+            self.image_model = sdcpp.StableDiffusionCpp(
+                repo_id=model_config['repo_id'],
+                filename=model_config['filename'],
+                use_auth_token=HUGGINGFACE_TOKEN,
+                n_threads=20,
+                use_gpu=False
+            )
+        except Exception as e:
+            print(f"Error loading image model: {e}")

     def load_all_models(self):
         with ThreadPoolExecutor() as executor:
             for config in model_configs:
-                executor.submit(self.load_model, config)
-        return self.models
+                if config['name'] == "flux1-schnell":
+                    executor.submit(self.load_image_model, config)
+                else:
+                    executor.submit(self.load_model, config)
+        return self.models, self.image_model
+

 model_manager = ModelManager()
-global_data['models'] = model_manager.load_all_models()
+global_data['models'], global_data['image_model'] = model_manager.load_all_models()

 class ChatRequest(BaseModel):
     message: str

+class ImageRequest(BaseModel):
+    prompt: str
+
 def normalize_input(input_text):
     return input_text.strip()

@@ -305,6 +318,7 @@ def cache_response(func):
         return response
     return wrapper

+
 @cache_response
 def generate_model_response(model, inputs):
     try:
@@ -332,9 +346,33 @@ async def process_message(message):
         for model_name, future in zip(global_data['models'].keys(), as_completed(futures))
     ]
     unique_responses = remove_repetitive_responses(responses)
-    formatted_response = next(iter(unique_responses.values()))  # Get a single response
+    formatted_response = next(iter(unique_responses.values()))
     return formatted_response

+async def generate_image(prompt: str):
+    if global_data['image_model']:
+        try:
+            image_bytes = global_data['image_model'].generate(
+                prompt=prompt,
+                negative_prompt="ugly, deformed, disfigured",
+                steps=25,
+                cfg_scale=7.0,
+                width=512,
+                height=512,
+                seed=-1,
+                return_type='bytes'
+            )
+
+            image = Image.open(io.BytesIO(image_bytes))
+            return image
+        except Exception as e:
+            print(f"Error generating image: {e}")
+            return None
+    else:
+        print("No image model loaded.")
+        return None
+
+
 app = FastAPI()

 @app.post("/generate")
@@ -345,6 +383,22 @@ async def generate(request: ChatRequest):
     except Exception as e:
         return JSONResponse(content={"error": str(e)})

+@app.post("/generate_image")
+async def generate_image_endpoint(request: ImageRequest):
+    try:
+        image = await generate_image(request.prompt)
+        if image:
+            buffered = io.BytesIO()
+            image.save(buffered, format="PNG")
+            image_base64 = base64.b64encode(buffered.getvalue()).decode()
+
+            return JSONResponse(content={"image": image_base64})
+        else:
+            return JSONResponse(content={"error": "Image generation failed or no model loaded"})
+    except Exception as e:
+        return JSONResponse(content={"error": str(e)})
+
+
 def run_uvicorn():
     try:
         uvicorn.run(app, host="0.0.0.0", port=7860)
@@ -355,14 +409,28 @@ iface = gr.Interface(
     fn=process_message,
     inputs=gr.Textbox(lines=2, placeholder="Enter your message here..."),
     outputs=gr.Markdown(),
-    title="Multi-Model LLM API (CPU Optimized)",
+    title="Multi-Model LLM & Image API (CPU Optimized)",
     description="Optimized version using GPU and memory management techniques."
 )
+iface_image = gr.Interface(
+    fn=generate_image,
+    inputs=gr.Textbox(lines=2, placeholder="Enter image prompt here..."),
+    outputs=gr.Image(),
+    title="Stable Diffusion Image Generator",
+    description="Generate images using the specified stable diffusion model."
+)
+

 def run_gradio():
-    iface.launch(server_port=7862, prevent_thread_lock=True)
+    with gr.Blocks(title="Multi-Model LLM & Image API (CPU Optimized)") as demo:
+        with gr.Tab("LLM"):
+            iface.render()
+        with gr.Tab("Image Generator"):
+            iface_image.render()
+    demo.launch(server_port=7862, prevent_thread_lock=True)
+

 if __name__ == "__main__":
     Thread(target=run_uvicorn).start()
     Thread(target=run_gradio).start()
-    asyncio.get_event_loop().run_forever()
+    asyncio.get_event_loop().run_forever()
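
For quick manual testing of this revision, a minimal client sketch for the two FastAPI endpoints follows. It is an illustration only: it assumes the app is running locally on the uvicorn port from the diff (7860) and uses the third-party requests package, which is not part of app.py; the field names (message, prompt, image, error) are taken from the diff above.

import base64
import requests  # third-party HTTP client, assumed installed

BASE = "http://localhost:7860"  # host/port from run_uvicorn() in the diff

# Chat endpoint: ChatRequest has a single "message" field.
chat = requests.post(f"{BASE}/generate", json={"message": "Hello!"})
print(chat.json())

# Image endpoint: ImageRequest has a single "prompt" field. On success the
# response JSON carries a base64-encoded PNG under "image"; otherwise "error".
resp = requests.post(f"{BASE}/generate_image", json={"prompt": "a lighthouse at dusk"})
payload = resp.json()
if "image" in payload:
    with open("out.png", "wb") as f:
        f.write(base64.b64decode(payload["image"]))
else:
    print(payload.get("error"))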