Update app.py
app.py
CHANGED
@@ -1,28 +1,17 @@
-import os
-import gc
-import psutil
 import cachetools
 from pydantic import BaseModel
 from llama_cpp import Llama
 from concurrent.futures import ThreadPoolExecutor, as_completed
-import re
-import httpx
 import asyncio
 import gradio as gr
-import
+import os
 from dotenv import load_dotenv
-from fastapi import FastAPI,
-from fastapi.responses import JSONResponse
+from fastapi import FastAPI, JSONResponse
 import uvicorn
 from threading import Thread
-import
-
-
-import nltk
-from nltk.corpus import stopwords
-import wget
-
-nltk.download('stopwords')
+import psutil
+import gc
+import torch

 load_dotenv()
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
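A note on the consolidated import above: in the FastAPI releases I am aware of, JSONResponse is exported from fastapi.responses rather than the top-level fastapi package, so the endpoint code later in this diff would need the two-line form that the old file used. A minimal sketch, not part of the commit:

from fastapi import FastAPI
from fastapi.responses import JSONResponse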
@@ -34,7 +23,6 @@ global_data = {
     'tokensxx': {
         'eos': '<|end_of-text|>',
         'pad': '<pad>',
-        'padding': '<pad>',
         'unk': '<unk>',
         'bos': '<|begin_of_text|>',
         'sep': '<|sep|>',
@@ -47,7 +35,6 @@ global_data = {
     'tokens': {
         'eos': 'eos_token',
         'pad': 'pad_token',
-        'padding': 'padding_token',
         'unk': 'unk_token',
         'bos': 'bos_token',
         'sep': 'sep_token',
@@ -200,7 +187,6 @@ global_data = {
     "n_layer_dense_lead": {},
     "expert_weights_scale": {},
     "rope_yarn_log_mul": {},
-    'model_type': {},
     'eval': {},
     'time': {},
     'token': {},
@@ -229,111 +215,135 @@ global_data = {
     'load_model': {},
     'end': {},
     'llama_perf_context_print': {},
-    'llm_load_print_meta': {}
+    'llm_load_print_meta': {},
+    'model_type': {}
 }

 model_configs = [
-    {
-
-
+    {
+        "repo_id": "Hjgugugjhuhjggg/testing_semifinal-Q2_K-GGUF",
+        "filename": "testing_semifinal-q2_k.gguf",
+        "name": "testing"
+    },
+    {
+        "repo_id": "bartowski/Llama-3.2-3B-Instruct-uncensored-GGUF",
+        "filename": "Llama-3.2-3B-Instruct-uncensored-Q2_K.gguf",
+        "name": "Llama-3.2-3B-Instruct"
+    }
 ]

+class ModelManager:
+    def __init__(self):
+        self.models = {}
+
+    def load_model(self, model_config):
+        if model_config['name'] not in self.models:
+            try:
+                self.models[model_config['name']] = Llama.from_pretrained(
+                    repo_id=model_config['repo_id'],
+                    filename=model_config['filename'],
+                    use_auth_token=HUGGINGFACE_TOKEN,
+                    n_threads=8,
+                    use_gpu=False
+                )
+            except Exception as e:
+                pass
+
+    def load_all_models(self):
+        with ThreadPoolExecutor() as executor:
+            for config in model_configs:
+                executor.submit(self.load_model, config)
+        return self.models
+
+model_manager = ModelManager()
+global_data['models'] = model_manager.load_all_models()
+
+class ChatRequest(BaseModel):
+    message: str
+
 def normalize_input(input_text):
-
-    words = input_text.split()
-    filtered_words = [word for word in words if word.lower() not in stop_words]
-    return " ".join(filtered_words)
+    return input_text.strip()

-
-
-
-
-
-
-
-
-
-    model_path = os.path.join("models", model["filename"])
-    global_data['models'][model["name"]] = Llama(model_path)
+def remove_duplicates(text):
+    lines = text.split('\n')
+    unique_lines = []
+    seen_lines = set()
+    for line in lines:
+        if line not in seen_lines:
+            unique_lines.append(line)
+            seen_lines.add(line)
+    return '\n'.join(unique_lines)

-
-
+def cache_response(func):
+    def wrapper(*args, **kwargs):
+        cache_key = f"{args}-{kwargs}"
+        if cache_key in cache:
+            return cache[cache_key]
+        response = func(*args, **kwargs)
+        cache[cache_key] = response
+        return response
+    return wrapper

-
+@cache_response
+def generate_model_response(model, inputs):
     try:
-        response =
-        return response
+        response = model(inputs)
+        return remove_duplicates(response['choices'][0]['text'])
     except Exception as e:
-        return
+        return ""

-def
-
-
-
-
-    return
+def remove_repetitive_responses(responses):
+    unique_responses = {}
+    for response in responses:
+        if response['model'] not in unique_responses:
+            unique_responses[response['model']] = response['response']
+    return unique_responses

 async def process_message(message):
     inputs = normalize_input(message)
-
-
-
-
+    with ThreadPoolExecutor() as executor:
+        futures = [
+            executor.submit(generate_model_response, model, inputs)
+            for model in global_data['models'].values()
+        ]
+        responses = [
+            {'model': model_name, 'response': future.result()}
+            for model_name, future in zip(global_data['models'].keys(), as_completed(futures))
+        ]
+    unique_responses = remove_repetitive_responses(responses)
+    formatted_response = ""
+    for model, response in unique_responses.items():
+        formatted_response += f"**{model}:**\n{response}\n\n"
+    return formatted_response
+
+app = FastAPI()
+
+@app.post("/generate")
+async def generate(request: ChatRequest):
+    try:
+        response = await process_message(request.message)
+        return JSONResponse(content={"response": response})
+    except Exception as e:
+        return JSONResponse(content={"error": str(e)})

 def run_uvicorn():
-
+    try:
+        uvicorn.run(app, host="0.0.0.0", port=7860)
+    except Exception as e:
+        print(f"Error running uvicorn: {e}")

 iface = gr.Interface(
     fn=process_message,
     inputs=gr.Textbox(lines=2, placeholder="Enter your message here..."),
     outputs=gr.Markdown(),
     title="Multi-Model LLM API (CPU Optimized)",
-    description=""
+    description="Optimized version using GPU and memory management techniques."
 )

 def run_gradio():
     iface.launch(server_port=7862, prevent_thread_lock=True)

-def release_resources():
-    try:
-        torch.cuda.empty_cache()
-        gc.collect()
-    except Exception as e:
-        print(f"Failed to release resources: {e}")
-
-def resource_manager():
-    MAX_RAM_PERCENT = 1
-    MAX_CPU_PERCENT = 1
-    MAX_GPU_PERCENT = 1
-    MAX_RAM_MB = 1
-
-    while True:
-        try:
-            virtual_mem = psutil.virtual_memory()
-            current_ram_percent = virtual_mem.percent
-            current_ram_mb = virtual_mem.used / (1024 * 1024)
-
-            if current_ram_percent > MAX_RAM_PERCENT or current_ram_mb > MAX_RAM_MB:
-                release_resources()
-
-            current_cpu_percent = psutil.cpu_percent()
-            if current_cpu_percent > MAX_CPU_PERCENT:
-                psutil.Process(os.getpid()).nice()
-
-            if torch.cuda.is_available():
-                gpu = torch.cuda.current_device()
-                gpu_mem = torch.cuda.memory_percent(gpu)
-
-                if gpu_mem > MAX_GPU_PERCENT:
-                    release_resources()
-
-        except Exception as e:
-            print(f"Error in resource manager: {e}")
-
-resource_manager()
-
 if __name__ == "__main__":
-    asyncio.run(load_models())
     Thread(target=run_uvicorn).start()
     Thread(target=run_gradio).start()
     asyncio.get_event_loop().run_forever()
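The cache_response decorator added above reads and writes a module-level cache object that does not appear in the changed lines, so it presumably lives in an unchanged part of app.py. If a definition is still needed, a bounded cachetools structure along these lines would satisfy the decorator; the name matches the code above, but the maxsize and ttl values are illustrative assumptions, not taken from the commit:

import cachetools

# Assumed definition: any mapping works for cache_response; a bounded TTL cache
# keeps memory use flat on a CPU-only Space. maxsize/ttl are placeholder values.
cache = cachetools.TTLCache(maxsize=256, ttl=600)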
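For reference, a minimal client sketch for the new /generate endpoint. It assumes the app is reachable on localhost at port 7860 (the port passed to uvicorn.run above) and that the response body has the {"response": ...} shape returned by the endpoint; the prompt text and the choice of the standard-library urllib client are illustrative, not part of the commit.

import json
import urllib.request

# POST a chat message to the FastAPI endpoint started by run_uvicorn().
payload = json.dumps({"message": "Hello"}).encode("utf-8")
req = urllib.request.Request(
    "http://localhost:7860/generate",
    data=payload,
    headers={"Content-Type": "application/json"},
    method="POST",
)
with urllib.request.urlopen(req) as resp:
    body = json.loads(resp.read().decode("utf-8"))
    # The endpoint returns {"response": ...} on success or {"error": ...} on failure.
    print(body.get("response", body))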