Update app.py
app.py
CHANGED
@@ -13,6 +13,11 @@ from threading import Thread
 import psutil
 import gc
 import torch
+import numpy as np
+from PIL import Image
+import stable_diffusion_cpp as sdcpp
+import base64
+import io
 
 load_dotenv()
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
@@ -217,9 +222,11 @@ global_data = {
     'end': {},
     'llama_perf_context_print': {},
     'llm_load_print_meta': {},
-    'model_type': {}
+    'model_type': {},
+    'image_model': {}
 }
 
+
 model_configs = [
     {
         "repo_id": "Hjgugugjhuhjggg/testing_semifinal-Q2_K-GGUF",
@@ -231,57 +238,63 @@ model_configs = [
         "filename": "Llama-3.2-3B-Instruct-uncensored-Q2_K.gguf",
         "name": "Llama-3.2-3B-Instruct"
     },
-    {
-        "repo_id": "
-        "filename": "
-        "name": "
-    },
-    {
-        "repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Instruct-Q2_K-GGUF",
-        "filename": "meta-llama-3.1-70b-instruct-q2_k.gguf",
-        "name": "Meta-Llama-3.1-70B-Instruct"
-    },
-    {
-        "repo_id": "Ffftdtd5dtft/Qwen2-Math-72B-Instruct-Q2_K-GGUF",
-        "filename": "qwen2-math-72b-instruct-q2_k.gguf",
-        "name": "Qwen2-Math-72B-Instruct"
-    },
-    {
-        "repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF",
-        "filename": "meta-llama-3.1-70b-q2_k.gguf",
-        "name": "Meta-Llama-3.1-70B"
-    }
+    {
+        "repo_id": "city96/FLUX.1-schnell-gguf",
+        "filename": "flux1-schnell-Q2_K.gguf",
+        "name": "flux1-schnell"
+    },
+
 ]
 
 class ModelManager:
     def __init__(self):
         self.models = {}
+        self.image_model = None
 
     def load_model(self, model_config):
-        if model_config['name'] not in self.models:
-
-
-
-
-
-
-
-
-
-
+        if model_config['name'] not in self.models and model_config['name'] != "flux1-schnell":
+            try:
+                self.models[model_config['name']] = Llama.from_pretrained(
+                    repo_id=model_config['repo_id'],
+                    filename=model_config['filename'],
+                    use_auth_token=HUGGINGFACE_TOKEN,
+                    n_threads=20,
+                    use_gpu=False
+                )
+            except Exception as e:
+                pass
+
+    def load_image_model(self, model_config):
+        try:
+            self.image_model = sdcpp.StableDiffusionCpp(
+                repo_id=model_config['repo_id'],
+                filename=model_config['filename'],
+                use_auth_token=HUGGINGFACE_TOKEN,
+                n_threads=20,
+                use_gpu=False
+            )
+        except Exception as e:
+            print(f"Error loading image model: {e}")
 
     def load_all_models(self):
         with ThreadPoolExecutor() as executor:
             for config in model_configs:
-
-
+                if config['name'] == "flux1-schnell":
+                    executor.submit(self.load_image_model, config)
+                else:
+                    executor.submit(self.load_model, config)
+        return self.models, self.image_model
+
 
 model_manager = ModelManager()
-global_data['models'] = model_manager.load_all_models()
+global_data['models'], global_data['image_model'] = model_manager.load_all_models()
 
 class ChatRequest(BaseModel):
     message: str
 
+class ImageRequest(BaseModel):
+    prompt: str
+
 def normalize_input(input_text):
     return input_text.strip()
 
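Note on load_all_models above: because the loader tasks are submitted inside a "with ThreadPoolExecutor() as executor:" block, the executor waits for every submitted task when the block exits, so self.models and self.image_model are already populated by the time they are returned and stored in global_data. A minimal standard-library sketch of that behaviour (fake_load and the model names are illustrative, not from app.py):

import time
from concurrent.futures import ThreadPoolExecutor

results = {}

def fake_load(name):
    # Stand-in for a slow model download/load
    time.sleep(0.1)
    results[name] = f"{name} loaded"

with ThreadPoolExecutor() as executor:
    for name in ["model-a", "model-b", "flux1-schnell"]:
        executor.submit(fake_load, name)

# Exiting the with-block calls shutdown(wait=True), so all three
# fake loads have finished before this line runs.
print(results)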
@@ -305,6 +318,7 @@ def cache_response(func):
         return response
     return wrapper
 
+
 @cache_response
 def generate_model_response(model, inputs):
     try:
@@ -332,9 +346,33 @@ async def process_message(message):
         for model_name, future in zip(global_data['models'].keys(), as_completed(futures))
     ]
     unique_responses = remove_repetitive_responses(responses)
-    formatted_response = next(iter(unique_responses.values()))
+    formatted_response = next(iter(unique_responses.values()))
     return formatted_response
 
+async def generate_image(prompt: str):
+    if global_data['image_model']:
+        try:
+            image_bytes = global_data['image_model'].generate(
+                prompt=prompt,
+                negative_prompt="ugly, deformed, disfigured",
+                steps=25,
+                cfg_scale=7.0,
+                width=512,
+                height=512,
+                seed=-1,
+                return_type='bytes'
+            )
+
+            image = Image.open(io.BytesIO(image_bytes))
+            return image
+        except Exception as e:
+            print(f"Error generating image: {e}")
+            return None
+    else:
+        print("No image model loaded.")
+        return None
+
+
 app = FastAPI()
 
 @app.post("/generate")
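The generate_image helper added above assumes the image model returns raw PNG bytes (return_type='bytes') and wraps them in a PIL Image; the /generate_image endpoint in the next hunk then re-encodes that image as base64 for the JSON response. A self-contained sketch of that round trip, with a tiny in-memory PNG standing in for the model output:

import base64
import io
from PIL import Image

# Stand-in for the bytes the image model would return
buf = io.BytesIO()
Image.new("RGB", (64, 64), "purple").save(buf, format="PNG")
image_bytes = buf.getvalue()

# What generate_image does: wrap the raw bytes in a PIL Image
image = Image.open(io.BytesIO(image_bytes))

# What the endpoint does: re-encode the image as a base64 PNG string
out = io.BytesIO()
image.save(out, format="PNG")
image_base64 = base64.b64encode(out.getvalue()).decode()
print(image_base64[:32], "...")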
@@ -345,6 +383,22 @@ async def generate(request: ChatRequest):
     except Exception as e:
         return JSONResponse(content={"error": str(e)})
 
+@app.post("/generate_image")
+async def generate_image_endpoint(request: ImageRequest):
+    try:
+        image = await generate_image(request.prompt)
+        if image:
+            buffered = io.BytesIO()
+            image.save(buffered, format="PNG")
+            image_base64 = base64.b64encode(buffered.getvalue()).decode()
+
+            return JSONResponse(content={"image": image_base64})
+        else:
+            return JSONResponse(content={"error": "Image generation failed or no model loaded"})
+    except Exception as e:
+        return JSONResponse(content={"error": str(e)})
+
+
 def run_uvicorn():
     try:
         uvicorn.run(app, host="0.0.0.0", port=7860)
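A client only needs to POST a JSON body matching ImageRequest to the new /generate_image route and decode the base64 PNG from the response. A minimal client sketch, assuming the FastAPI app is served locally on port 7860 as in run_uvicorn (the prompt and output filename are placeholders):

import base64
import requests

resp = requests.post(
    "http://localhost:7860/generate_image",
    json={"prompt": "a lighthouse at sunset"},
)
payload = resp.json()

if "image" in payload:
    # The endpoint returns the PNG as a base64 string under the "image" key
    with open("output.png", "wb") as f:
        f.write(base64.b64decode(payload["image"]))
else:
    print("Server reported an error:", payload.get("error"))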
@@ -355,14 +409,28 @@ iface = gr.Interface(
     fn=process_message,
     inputs=gr.Textbox(lines=2, placeholder="Enter your message here..."),
     outputs=gr.Markdown(),
-    title="Multi-Model LLM API (CPU Optimized)",
+    title="Multi-Model LLM & Image API (CPU Optimized)",
     description="Optimized version using GPU and memory management techniques."
 )
+iface_image = gr.Interface(
+    fn=generate_image,
+    inputs=gr.Textbox(lines=2, placeholder="Enter image prompt here..."),
+    outputs=gr.Image(),
+    title="Stable Diffusion Image Generator",
+    description="Generate images using the specified stable diffusion model."
+)
+
 
 def run_gradio():
-
+    with gr.Blocks(title="Multi-Model LLM & Image API (CPU Optimized)") as demo:
+        with gr.Tab("LLM"):
+            iface.render()
+        with gr.Tab("Image Generator"):
+            iface_image.render()
+    demo.launch(server_port=7862, prevent_thread_lock=True)
+
 
 if __name__ == "__main__":
     Thread(target=run_uvicorn).start()
     Thread(target=run_gradio).start()
-    asyncio.get_event_loop().run_forever()
+    asyncio.get_event_loop().run_forever()
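The reworked run_gradio mounts both interfaces as tabs of a single Blocks app rather than launching iface directly, and passes server_port=7862 with prevent_thread_lock=True so Gradio can coexist with the uvicorn thread on port 7860. A standalone sketch of the same Blocks/Tab/render pattern, with a trivial echo function standing in for process_message and generate_image:

import gradio as gr

def echo(text):
    # Placeholder for the real handlers in app.py
    return text

text_iface = gr.Interface(fn=echo, inputs="text", outputs="text")
image_iface = gr.Interface(fn=echo, inputs="text", outputs="text")

with gr.Blocks(title="Tabbed demo") as demo:
    with gr.Tab("LLM"):
        text_iface.render()        # embed an existing Interface inside the Blocks layout
    with gr.Tab("Image Generator"):
        image_iface.render()

if __name__ == "__main__":
    demo.launch()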