Hjgugugjhuhjggg committed on
Commit cbbd51f · verified · 1 Parent(s): 796e287

Update app.py

Files changed (1):
  1. app.py +105 -95
app.py CHANGED
@@ -1,28 +1,17 @@
-import os
-import gc
-import psutil
 import cachetools
 from pydantic import BaseModel
 from llama_cpp import Llama
 from concurrent.futures import ThreadPoolExecutor, as_completed
-import re
-import httpx
 import asyncio
 import gradio as gr
-import torch
+import os
 from dotenv import load_dotenv
-from fastapi import FastAPI, Request
+from fastapi import FastAPI
 from fastapi.responses import JSONResponse
 import uvicorn
 from threading import Thread
-import gptcache
-from sklearn.metrics.pairwise import cosine_similarity
-from sklearn.feature_extraction.text import TfidfVectorizer
-import nltk
-from nltk.corpus import stopwords
-import wget
-
-nltk.download('stopwords')
+import psutil
+import gc
+import torch

 load_dotenv()
 HUGGINGFACE_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
@@ -34,7 +23,6 @@ global_data = {
     'tokensxx': {
         'eos': '<|end_of-text|>',
         'pad': '<pad>',
-        'padding': '<pad>',
         'unk': '<unk>',
         'bos': '<|begin_of_text|>',
         'sep': '<|sep|>',
@@ -47,7 +35,6 @@ global_data = {
     'tokens': {
         'eos': 'eos_token',
         'pad': 'pad_token',
-        'padding': 'padding_token',
        'unk': 'unk_token',
         'bos': 'bos_token',
         'sep': 'sep_token',
@@ -200,7 +187,6 @@ global_data = {
     "n_layer_dense_lead": {},
     "expert_weights_scale": {},
     "rope_yarn_log_mul": {},
-    'model_type': {},
     'eval': {},
     'time': {},
     'token': {},
@@ -229,111 +215,135 @@ global_data = {
     'load_model': {},
     'end': {},
     'llama_perf_context_print': {},
-    'llm_load_print_meta': {}
+    'llm_load_print_meta': {},
+    'model_type': {}
 }

 model_configs = [
-    {"repo_id": "Hjgugugjhuhjggg/testing_semifinal-Q2_K-GGUF", "filename": "testing_semifinal-q2_k.gguf", "name": "testing"},
-    {"repo_id": "bartowski/Llama-3.2-3B-Instruct-uncensored-GGUF", "filename": "Llama-3.2-3B-Instruct-uncensored-Q2_K.gguf", "name": "Llama-3.2-3B-Instruct"},
-    {"repo_id": "Ffftdtd5dtft/Meta-Llama-3.1-70B-Q2_K-GGUF", "filename": "Meta-Llama-3.1-70B-Q2_K.gguf", "name": "Meta-Llama-3.1-13B"}
+    {
+        "repo_id": "Hjgugugjhuhjggg/testing_semifinal-Q2_K-GGUF",
+        "filename": "testing_semifinal-q2_k.gguf",
+        "name": "testing"
+    },
+    {
+        "repo_id": "bartowski/Llama-3.2-3B-Instruct-uncensored-GGUF",
+        "filename": "Llama-3.2-3B-Instruct-uncensored-Q2_K.gguf",
+        "name": "Llama-3.2-3B-Instruct"
+    }
 ]

+class ModelManager:
+    def __init__(self):
+        self.models = {}
+
+    def load_model(self, model_config):
+        if model_config['name'] not in self.models:
+            try:
+                # n_gpu_layers=0 keeps inference on the CPU; use_gpu/use_auth_token
+                # are not Llama parameters (gated repos authenticate through
+                # huggingface_hub, e.g. via the HF_TOKEN environment variable)
+                self.models[model_config['name']] = Llama.from_pretrained(
+                    repo_id=model_config['repo_id'],
+                    filename=model_config['filename'],
+                    n_threads=8,
+                    n_gpu_layers=0
+                )
+            except Exception as e:
+                print(f"Failed to load {model_config['name']}: {e}")
+
+    def load_all_models(self):
+        with ThreadPoolExecutor() as executor:
+            for config in model_configs:
+                executor.submit(self.load_model, config)
+        return self.models
+
+model_manager = ModelManager()
+global_data['models'] = model_manager.load_all_models()
+
+class ChatRequest(BaseModel):
+    message: str
+
 def normalize_input(input_text):
-    stop_words = set(stopwords.words('english'))
-    words = input_text.split()
-    filtered_words = [word for word in words if word.lower() not in stop_words]
-    return " ".join(filtered_words)
+    return input_text.strip()

-async def load_models():
-    tasks = []
-    for model in model_configs:
-        model_path = os.path.join("models", model["filename"])
-        if not os.path.exists(model_path):
-            url = f"https://huggingface.co/{model['repo_id']}/resolve/main/{model['filename']}"
-            tasks.append(download_model(url, model_path))
-    await asyncio.gather(*tasks)
-    for model in model_configs:
-        model_path = os.path.join("models", model["filename"])
-        global_data['models'][model["name"]] = Llama(model_path)
+def remove_duplicates(text):
+    lines = text.split('\n')
+    unique_lines = []
+    seen_lines = set()
+    for line in lines:
+        if line not in seen_lines:
+            unique_lines.append(line)
+            seen_lines.add(line)
+    return '\n'.join(unique_lines)

-async def download_model(url, model_path):
-    wget.download(url, model_path)
+cache = cachetools.TTLCache(maxsize=128, ttl=3600)  # backing store for cache_response

-async def generate_model_response(model, inputs):
+def cache_response(func):
+    def wrapper(*args, **kwargs):
+        # the key embeds repr(args), so each Llama instance caches separately
+        cache_key = f"{args}-{kwargs}"
+        if cache_key in cache:
+            return cache[cache_key]
+        response = func(*args, **kwargs)
+        cache[cache_key] = response
+        return response
+    return wrapper
+
+@cache_response
+def generate_model_response(model, inputs):
     try:
-        response = await model.generate(inputs)
-        return response
+        response = model(inputs)
+        return remove_duplicates(response['choices'][0]['text'])
     except Exception as e:
-        return {"error": str(e)}
+        return ""

-def get_best_response(responses):
-    if not responses:
-        return {"error": "No valid responses from models."}
-    scores = [response['score'] for response in responses]
-    best_score_index = scores.index(max(scores))
-    return responses[best_score_index]
+def remove_repetitive_responses(responses):
+    unique_responses = {}
+    for response in responses:
+        if response['model'] not in unique_responses:
+            unique_responses[response['model']] = response['response']
+    return unique_responses

 async def process_message(message):
     inputs = normalize_input(message)
-    tasks = [generate_model_response(model, inputs) for model in global_data['models'].values()]
-    responses = await asyncio.gather(*tasks)
-    best_response = get_best_response(responses)
-    return best_response
+    with ThreadPoolExecutor() as executor:
+        futures = [
+            executor.submit(generate_model_response, model, inputs)
+            for model in global_data['models'].values()
+        ]
+        # zip against the submitted futures (not as_completed) so each result
+        # stays paired with the model that produced it
+        responses = [
+            {'model': model_name, 'response': future.result()}
+            for model_name, future in zip(global_data['models'].keys(), futures)
+        ]
+    unique_responses = remove_repetitive_responses(responses)
+    formatted_response = ""
+    for model, response in unique_responses.items():
+        formatted_response += f"**{model}:**\n{response}\n\n"
+    return formatted_response
+
+app = FastAPI()
+
+@app.post("/generate")
+async def generate(request: ChatRequest):
+    try:
+        response = await process_message(request.message)
+        return JSONResponse(content={"response": response})
+    except Exception as e:
+        return JSONResponse(content={"error": str(e)})

 def run_uvicorn():
-    uvicorn.run(app, host="0.0.0.0", port=7860)
+    try:
+        uvicorn.run(app, host="0.0.0.0", port=7860)
+    except Exception as e:
+        print(f"Error running uvicorn: {e}")

 iface = gr.Interface(
     fn=process_message,
     inputs=gr.Textbox(lines=2, placeholder="Enter your message here..."),
     outputs=gr.Markdown(),
     title="Multi-Model LLM API (CPU Optimized)",
-    description=""
+    description="Optimized version using CPU and memory management techniques."
 )

 def run_gradio():
     iface.launch(server_port=7862, prevent_thread_lock=True)

-def release_resources():
-    try:
-        torch.cuda.empty_cache()
-        gc.collect()
-    except Exception as e:
-        print(f"Failed to release resources: {e}")
-
-def resource_manager():
-    MAX_RAM_PERCENT = 1
-    MAX_CPU_PERCENT = 1
-    MAX_GPU_PERCENT = 1
-    MAX_RAM_MB = 1
-
-    while True:
-        try:
-            virtual_mem = psutil.virtual_memory()
-            current_ram_percent = virtual_mem.percent
-            current_ram_mb = virtual_mem.used / (1024 * 1024)
-
-            if current_ram_percent > MAX_RAM_PERCENT or current_ram_mb > MAX_RAM_MB:
-                release_resources()
-
-            current_cpu_percent = psutil.cpu_percent()
-            if current_cpu_percent > MAX_CPU_PERCENT:
-                psutil.Process(os.getpid()).nice()
-
-            if torch.cuda.is_available():
-                gpu = torch.cuda.current_device()
-                gpu_mem = torch.cuda.memory_percent(gpu)
-
-                if gpu_mem > MAX_GPU_PERCENT:
-                    release_resources()
-
-        except Exception as e:
-            print(f"Error in resource manager: {e}")
-
-    resource_manager()
-
 if __name__ == "__main__":
-    asyncio.run(load_models())
     Thread(target=run_uvicorn).start()
     Thread(target=run_gradio).start()
     asyncio.get_event_loop().run_forever()
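
For reference, a minimal client call against the new /generate endpoint (a sketch: it assumes the server is running locally on port 7860 as configured in run_uvicorn, and httpx is just one choice of HTTP client):

import httpx

# POST the ChatRequest payload expected by the /generate route
resp = httpx.post(
    "http://localhost:7860/generate",
    json={"message": "Hello, world"},
    timeout=120.0,  # CPU-only inference across several GGUF models can be slow
)
print(resp.json())  # {"response": "**model-name:** ..."} or {"error": "..."}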